From 0c5a3019372d7756e9707e02174d497c2b3793ee Mon Sep 17 00:00:00 2001
From: Benjamin Capodanno <bencap@outlook.com>
Date: Tue, 28 Apr 2026 15:41:47 -0700
Subject: [PATCH 1/6] feat(mapping): add target-level mapping metadata to
 scoreset output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Introduce `AlignmentQc` schema capturing per-alignment BLAT quality
  metrics (identity, CIGAR, mismatch positions, gap intervals) with
  in-memory-only position lists excluded from serialization
- Add `TargetMapping` schema representing a per-(target, layer) mapping
  row with variant counts, tool parameters, and preferred-layer flag
- Add `VrsMapResult` NamedTuple to pair VRS mappings with their
  `TargetMapping` rows from `vrs_map`
- Rename `annotation_layer` → `alignment_level` on `MappedScore` /
  `ScoreAnnotation` to align with new terminology
- Rename `ident_pct` → `percent_identity` on `AlignmentResult`; add
  `score`, `next_best_score`, `alignment_qc`, `aligner_parameters`, and
  `reference_assembly` fields
- Implement `build_scoreset_mapping` in `annotate.py` to assemble
  `ScoresetMapping` with populated `target_mappings` list, per-variant
  locus-quality flags (`at_mismatched_locus`, `near_gap`), and reference
  sequence metadata
- Restore canonical BLAT PSL scoring (`matches - misMatches -
  qNumInsert - tNumInsert`) in `_get_best_hsp`; previous BioPython port
  used raw identity count, causing noisy alignments to outrank clean ones
- Update JSON schema, API router, CI workflow, and README to reflect new
  output shape
- Add `test_annotate_target_mapping.py` and expand `test_align.py` /
  `test_annotate.py` with unit tests for new logic

Co-authored-by: Copilot <copilot@github.com>
---
 .github/workflows/checks.yaml         |    5 +
 .gitignore                            |    5 +
 README.md                             |   82 +-
 pyproject.toml                        |    2 +-
 schema.json                           | 1076 +++++++++++++++++--------
 scripts/generate_schema.py            |   31 +
 src/api/routers/map.py                |  146 +---
 src/dcd_mapping/align.py              |  989 +++++++++++++++++++----
 src/dcd_mapping/annotate.py           |  734 +++++++++++++++--
 src/dcd_mapping/main.py               |   22 +-
 src/dcd_mapping/schemas.py            |  249 +++++-
 src/dcd_mapping/vrs_map.py            |   70 +-
 tests/fixtures/align_result.json      |   22 +-
 tests/test_align.py                   |  366 ++++++++-
 tests/test_annotate.py                |  138 +++-
 tests/test_annotate_target_mapping.py |  855 ++++++++++++++++++++
 tests/test_transcript.py              |    2 +-
 tests/test_vrs_map.py                 |   12 +-
 18 files changed, 4082 insertions(+), 724 deletions(-)
 create mode 100644 scripts/generate_schema.py
 create mode 100644 tests/test_annotate_target_mapping.py

diff --git a/.github/workflows/checks.yaml b/.github/workflows/checks.yaml
index 9bfd1f8..a55e921 100644
--- a/.github/workflows/checks.yaml
+++ b/.github/workflows/checks.yaml
@@ -39,3 +39,8 @@ jobs:
 
       - name: Check style
         run: python3 -m ruff check . && ruff format --check .
+
+      - name: Verify schema.json is up to date
+        run: |
+          python scripts/generate_schema.py
+          git diff --exit-code schema.json
diff --git a/.gitignore b/.gitignore
index fd06d06..0d25f59 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,3 +166,8 @@ cython_debug/
 # mapping data/output
 notebooks/analysis/analysis_files
 notebooks/analysis/mavedb_files
+
+# debug / ad-hoc mapping output files — prevent accidental commits
+urn:*.json
+tmp:*.json
+*_mapping_*.json
diff --git a/README.md b/README.md
index da3cb1e..3c7ed4a 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ This library implements a novel method for mapping [MaveDB scoreset data](https:
 * Universal Transcript Archive (UTA): see [README](https://github.com/biocommons/uta?tab=readme-ov-file#installing-uta-locally) for setup instructions. Users with access to Docker on their local devices can use the available Docker image; otherwise, start a relatively recent (version 14+) PostgreSQL instance and add data from the available database dump.
 * SeqRepo: see [README](https://github.com/biocommons/biocommons.seqrepo?tab=readme-ov-file#requirements) for setup instructions. The SeqRepo data directory must be writeable; see specific instructions [here](https://github.com/biocommons/biocommons.seqrepo/blob/main/docs/store.rst) for more.
 * Gene Normalizer: see [documentation](https://gene-normalizer.readthedocs.io/0.3.0-dev1/install.html) for data setup instructions.
-* blat: Must be available on the local PATH and executable by the user. Otherwise, its location can be set manually with the `BLAT_BIN_PATH` env var. See the [UCSC Genome Browser FAQ](https://genome.ucsc.edu/FAQ/FAQblat.html#blat3) for download instructions. 
+* blat: Must be available on the local PATH and executable by the user. Otherwise, its location can be set manually with the `BLAT_BIN_PATH` env var. See the [UCSC Genome Browser FAQ](https://genome.ucsc.edu/FAQ/FAQblat.html#blat3) for download instructions.
 
 
 ## Installation
@@ -40,6 +40,86 @@ Output is saved in the format `<URN>_mapping_results_<ISO datetime>.json` in the
 
 Use `dcd-map --help` to see other available options.
 
+## Mapping output
+
+Each mapping run produces a single JSON document conforming to [`schema.json`](schema.json) (the JSON Schema serialization of `ScoresetMapping`). Top-level keys:
+
+* `metadata` — the verbatim MaveDB API scoreset response, stored unchanged so no upstream fields are lost.
+* `mapped_date` — ISO 8601 UTC timestamp of when this run completed.
+* `reference_sequences` — per-target reference sequence info per annotation layer.
+* `mapped_scores` — flat list of per-variant `ScoreAnnotation` records (see below).
+* `target_mappings` — per-`(target, alignment_level)` provenance and alignment QC rows. The MaveDB API consumes these as `target_gene_mappings` and uses them to attribute every `mapped_score` back to the alignment that produced it.
+* `error_message` — populated only when the run failed before producing scores.
+
+### `metadata`
+
+The verbatim MaveDB API scoreset response. Stored unchanged so downstream consumers retain access to every upstream field (URN, title, description, target gene definitions, score-column metadata, etc.) without having to query MaveDB again.
+
+### `reference_sequences`
+
+A `dict[target_gene_name, TargetAnnotation]` describing the reference sequences each target was mapped against, organized by annotation layer. Each `TargetAnnotation` carries:
+
+* `gene_info` — `hgnc_symbol` plus the `selection_method` that picked it (transcript-derived, alignment-overlap-derived, variant-overlap-derived, or metadata fallback).
+* `layers` — a `dict[AnnotationLayer, {computed_reference_sequence, mapped_reference_sequence}]` populated only for layers that actually produced mappings. `computed_reference_sequence` is the in-pipeline sequence (e.g. translated protein); `mapped_reference_sequence` lists the canonical accession(s) the variants were ultimately grounded in. Layers with no usable reference are pruned, not emitted as `null`.
+
+This block is the human-readable "what was used as reference" view; programmatic auditing should use `target_mappings` instead.
+
+### `mapped_scores`
+
+A flat list of per-variant `ScoreAnnotation` records. One entry per `(score_record, emitted annotation_layer)` pair. Key fields:
+
+* `mavedb_id`, `score` — identifier and numeric score copied from the MaveDB record.
+* `relation` — fixed at `"SO:is_homologous_to"` while `pre_mapped` is populated.
+* `target_gene_identifier`, `alignment_level` — composite key linking back to a `target_mappings` row (see below).
+* `pre_mapped`, `post_mapped` — VRS variant objects in the target's coordinate frame and in the reference frame, respectively. Either may be `null` for failed mappings.
+* `vrs_version` — VRS schema version used for this record.
+* `error_message` — populated when `post_mapped` is `null` *or* when mapping succeeded with a caveat (e.g. RLE fallback, ambiguous reference allele).
+* `at_mismatched_locus`, `near_gap` — per-variant audit flags, described below.
+
+### `target_mappings`
+
+Per-`(target, alignment_level)` provenance and alignment QC rows. The MaveDB API consumes these as `target_gene_mappings` and uses them to attribute every `mapped_score` back to the alignment that produced it. (See [`schema.json`](schema.json) `TargetMapping` for the wire format.)
+
+### `error_message`
+
+Populated only when the run failed before producing any scores; otherwise omitted. Per-variant errors live on `mapped_scores[].error_message`, not here.
+
+## Audit and provenance details
+
+### `target_mappings` fields
+
+Each row describes the alignment that one set of mapped variants is grounded in:
+
+| Field | Notes |
+|---|---|
+| `target_gene_identifier`, `alignment_level`, `preferred` | Composite key. `(target_gene_identifier, alignment_level)` is unique per run. Exactly one row per target has `preferred=True`. |
+| `tool_name`, `tool_version`, `tool_parameters` | Aligner provenance. `tool_parameters.aligner` is `"blat"` for sequence-based targets and `"cdot_transcript_placement"` for accession-based targets. |
+| `reference_accession`, `reference_sequence_id`, `vrs_version` | Coordinate-frame and run provenance. |
+| `percent_identity`, `alignment_score`, `next_best_alignment_score`, `alignment_length`, `mismatch_count`, `gap_count` | Aggregate QC for the winning HSP. `alignment_score` is the canonical PSL score (`identities − mismatches − qNumInsert − tNumInsert`). |
+| `alignment_string`, `alignment_metadata` | Pairwise visualization plus a small structured payload (CIGAR, `near_gap_window`, `at_mismatched_locus_evaluated`). |
+| `total_variants`, `variants_mapped_cleanly`, `variants_with_mapping_warnings`, `variants_with_alignment_warnings`, `variants_failed` | Per-row variant counts. `variants_with_alignment_warnings` counts variants whose reference position fell on a mismatched base or near a gap. |
+
+### Per-variant audit flags
+
+Each `ScoreAnnotation` is attributable to exactly one `target_mappings` row via the composite key `(target_gene_identifier, alignment_level)`. The pipeline enforces this as a runtime invariant — orphaned scores raise `RuntimeError` rather than silently corrupting downstream joins.
+
+Per-variant locus flags:
+
+* `at_mismatched_locus` — `True` when any base in the variant's reference span mismatches between the target sequence and the reference; `False` when evaluated and no mismatch was found; `None` when per-base sequence content was unavailable for that layer (see `alignment_metadata.at_mismatched_locus_evaluated`), or when the variant is a `ReferenceLengthExpression` allele (large deletions/duplications, always `None`/`None`).
+* `near_gap` — `True` when the variant lies within `alignment_metadata.near_gap_window` reference bases of any alignment gap; `None` for layers without an alignment (e.g. `cdna`).
+
+Completely-failed variants (`pre_mapped is None` and no annotation layer was determined) are attributed to the target's preferred layer so the join invariant holds.
+
+### Regenerating `schema.json`
+
+`schema.json` is checked in and consumed by downstream services (notably the MaveDB API). After any change to `src/dcd_mapping/schemas.py` that alters the public output contract, regenerate it:
+
+```shell
+python scripts/generate_schema.py
+```
+
+Commit the regenerated `schema.json` in the same change.
+
 ## Notebooks
 
 Notebooks for manuscript data analysis and figure generation are provided within `notebooks/analysis`. See [`notebooks/analysis/README.md`](notebooks/analysis/README.md) for more information.
diff --git a/pyproject.toml b/pyproject.toml
index 3ae7444..c4436a7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -169,4 +169,4 @@ ignore = [
 # ANN102 - missing-type-cls
 # S101 - assert
 # B011 - assert-false
-"tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011", "D103"]
+"tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011", "D101", "D102", "D103"]
diff --git a/schema.json b/schema.json
index 977a590..708e4d1 100644
--- a/schema.json
+++ b/schema.json
@@ -1,18 +1,28 @@
 {
   "$defs": {
+    "AnnotationLayer": {
+      "description": "Create enum for supported annotation layers",
+      "enum": [
+        "p",
+        "c",
+        "g"
+      ],
+      "title": "AnnotationLayer",
+      "type": "string"
+    },
     "ComputedReferenceSequence": {
       "description": "Define metadata describing a computed reference sequence",
       "properties": {
-        "sequence_type": {
-          "$ref": "#/$defs/TargetSequenceType"
+        "sequence": {
+          "title": "Sequence",
+          "type": "string"
         },
         "sequence_id": {
           "title": "Sequence Id",
           "type": "string"
         },
-        "sequence": {
-          "title": "Sequence",
-          "type": "string"
+        "sequence_type": {
+          "$ref": "#/$defs/TargetSequenceType"
         }
       },
       "required": [
@@ -23,6 +33,37 @@
       "title": "ComputedReferenceSequence",
       "type": "object"
     },
+    "GeneInfo": {
+      "description": "Basic gene metadata for a target, including symbol and selection method.",
+      "properties": {
+        "hgnc_symbol": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Hgnc Symbol"
+        },
+        "selection_method": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Selection Method"
+        }
+      },
+      "title": "GeneInfo",
+      "type": "object"
+    },
     "IRI": {
       "description": "An IRI Reference (either an IRI or a relative-reference), according to `RFC3986 section 4.1  <https://datatracker.ietf.org/doc/html/rfc3986#section-4.1>` and `RFC3987 section 2.1 <https://datatracker.ietf.org/doc/html/rfc3987#section-2.1>`. MAY be a JSON Pointer as an IRI fragment, as  described by `RFC6901 section 6 <https://datatracker.ietf.org/doc/html/rfc6901#section-6>`.",
       "title": "IRI",
@@ -31,19 +72,19 @@
     "MappedReferenceSequence": {
       "description": "Define metadata describing a mapped, human reference sequence",
       "properties": {
-        "sequence_type": {
-          "$ref": "#/$defs/TargetSequenceType"
-        },
-        "sequence_id": {
-          "title": "Sequence Id",
-          "type": "string"
-        },
         "sequence_accessions": {
           "items": {
             "type": "string"
           },
           "title": "Sequence Accessions",
           "type": "array"
+        },
+        "sequence_id": {
+          "title": "Sequence Id",
+          "type": "string"
+        },
+        "sequence_type": {
+          "$ref": "#/$defs/TargetSequenceType"
         }
       },
       "required": [
@@ -60,9 +101,6 @@
         "type": {
           "const": "Number",
           "default": "Number",
-          "enum": [
-            "Number"
-          ],
           "title": "Type",
           "type": "string"
         },
@@ -97,7 +135,7 @@
     "ReferenceLengthExpression": {
       "description": "An expression sequence derived from a reference.",
       "properties": {
-        "id": {
+        "description": {
           "anyOf": [
             {
               "type": "string"
@@ -107,23 +145,25 @@
             }
           ],
           "default": null,
-          "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is  unique within a given system. The identified entity may have a different 'id' in a different  system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).",
-          "title": "Id"
+          "description": "A free-text description of the entity.",
+          "title": "Description"
         },
-        "label": {
+        "extensions": {
           "anyOf": [
             {
-              "type": "string"
+              "items": {
+                "$ref": "#/$defs/ga4gh__core___internal__models__Extension"
+              },
+              "type": "array"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "description": "A primary label for the entity.",
-          "title": "Label"
+          "title": "Extensions"
         },
-        "description": {
+        "id": {
           "anyOf": [
             {
               "type": "string"
@@ -133,33 +173,21 @@
             }
           ],
           "default": null,
-          "description": "A free-text description of the entity.",
-          "title": "Description"
+          "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is  unique within a given system. The identified entity may have a different 'id' in a different  system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).",
+          "title": "Id"
         },
-        "extensions": {
+        "label": {
           "anyOf": [
             {
-              "items": {
-                "$ref": "#/$defs/ga4gh__core___internal__models__Extension"
-              },
-              "type": "array"
+              "type": "string"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "title": "Extensions"
-        },
-        "type": {
-          "const": "ReferenceLengthExpression",
-          "default": "ReferenceLengthExpression",
-          "description": "MUST be \"ReferenceLengthExpression\"",
-          "enum": [
-            "ReferenceLengthExpression"
-          ],
-          "title": "Type",
-          "type": "string"
+          "description": "A primary label for the entity.",
+          "title": "Label"
         },
         "length": {
           "anyOf": [
@@ -173,6 +201,12 @@
           "description": "The number of residues of the expressed sequence.",
           "title": "Length"
         },
+        "repeatSubunitLength": {
+          "default": null,
+          "description": "The number of residues of the repeat subunit.",
+          "title": "Repeatsubunitlength",
+          "type": "integer"
+        },
         "sequence": {
           "anyOf": [
             {
@@ -185,11 +219,12 @@
           "default": null,
           "description": "the Sequence encoded by the Reference Length Expression."
         },
-        "repeatSubunitLength": {
-          "default": null,
-          "description": "The number of residues of the repeat subunit.",
-          "title": "Repeatsubunitlength",
-          "type": "integer"
+        "type": {
+          "const": "ReferenceLengthExpression",
+          "default": "ReferenceLengthExpression",
+          "description": "MUST be \"ReferenceLengthExpression\"",
+          "title": "Type",
+          "type": "string"
         }
       },
       "required": [
@@ -209,30 +244,65 @@
     "ScoreAnnotation": {
       "description": "Provide extra annotations on top of mappings for an individual experiment score.\n\nThis model defines what an individual mapping instance looks like in the final JSON.",
       "properties": {
-        "pre_mapped": {
+        "alignment_level": {
           "anyOf": [
             {
-              "$ref": "#/$defs/VariationDescriptor"
+              "$ref": "#/$defs/AnnotationLayer"
             },
             {
-              "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Haplotype"
+              "type": "null"
             }
           ],
-          "title": "Pre Mapped"
+          "default": null
         },
-        "post_mapped": {
+        "at_mismatched_locus": {
           "anyOf": [
             {
-              "$ref": "#/$defs/VariationDescriptor"
+              "type": "boolean"
             },
             {
-              "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Haplotype"
+              "type": "null"
             }
           ],
-          "title": "Post Mapped"
+          "default": null,
+          "title": "At Mismatched Locus"
+        },
+        "error_message": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Error Message"
+        },
+        "mavedb_id": {
+          "title": "Mavedb Id",
+          "type": "string"
+        },
+        "near_gap": {
+          "anyOf": [
+            {
+              "type": "boolean"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Near Gap"
         },
-        "pre_mapped_2_0": {
+        "post_mapped": {
           "anyOf": [
+            {
+              "$ref": "#/$defs/VariationDescriptor"
+            },
+            {
+              "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Haplotype"
+            },
             {
               "$ref": "#/$defs/ga4gh__vrs___internal__models__Allele"
             },
@@ -244,10 +314,16 @@
             }
           ],
           "default": null,
-          "title": "Pre Mapped 2 0"
+          "title": "Post Mapped"
         },
-        "post_mapped_2_0": {
+        "pre_mapped": {
           "anyOf": [
+            {
+              "$ref": "#/$defs/VariationDescriptor"
+            },
+            {
+              "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Haplotype"
+            },
             {
               "$ref": "#/$defs/ga4gh__vrs___internal__models__Allele"
             },
@@ -259,18 +335,11 @@
             }
           ],
           "default": null,
-          "title": "Post Mapped 2 0"
-        },
-        "mavedb_id": {
-          "title": "Mavedb Id",
-          "type": "string"
+          "title": "Pre Mapped"
         },
         "relation": {
           "const": "SO:is_homologous_to",
           "default": "SO:is_homologous_to",
-          "enum": [
-            "SO:is_homologous_to"
-          ],
           "title": "Relation",
           "type": "string"
         },
@@ -283,14 +352,35 @@
               "type": "null"
             }
           ],
+          "default": null,
           "title": "Score"
+        },
+        "target_gene_identifier": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Target Gene Identifier"
+        },
+        "vrs_version": {
+          "anyOf": [
+            {
+              "$ref": "#/$defs/VrsVersion"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null
         }
       },
       "required": [
-        "pre_mapped",
-        "post_mapped",
-        "mavedb_id",
-        "score"
+        "mavedb_id"
       ],
       "title": "ScoreAnnotation",
       "type": "object"
@@ -298,20 +388,17 @@
     "SequenceInterval": {
       "description": "Define VRS 1.3 SequenceInterval.",
       "properties": {
+        "end": {
+          "$ref": "#/$defs/Number"
+        },
+        "start": {
+          "$ref": "#/$defs/Number"
+        },
         "type": {
           "const": "SequenceInterval",
           "default": "SequenceInterval",
-          "enum": [
-            "SequenceInterval"
-          ],
           "title": "Type",
           "type": "string"
-        },
-        "start": {
-          "$ref": "#/$defs/Number"
-        },
-        "end": {
-          "$ref": "#/$defs/Number"
         }
       },
       "required": [
@@ -323,7 +410,7 @@
     },
     "SequenceReference": {
       "properties": {
-        "id": {
+        "description": {
           "anyOf": [
             {
               "type": "string"
@@ -333,23 +420,25 @@
             }
           ],
           "default": null,
-          "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is  unique within a given system. The identified entity may have a different 'id' in a different  system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).",
-          "title": "Id"
+          "description": "A free-text description of the entity.",
+          "title": "Description"
         },
-        "label": {
+        "extensions": {
           "anyOf": [
             {
-              "type": "string"
+              "items": {
+                "$ref": "#/$defs/ga4gh__core___internal__models__Extension"
+              },
+              "type": "array"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "description": "A primary label for the entity.",
-          "title": "Label"
+          "title": "Extensions"
         },
-        "description": {
+        "id": {
           "anyOf": [
             {
               "type": "string"
@@ -359,33 +448,21 @@
             }
           ],
           "default": null,
-          "description": "A free-text description of the entity.",
-          "title": "Description"
+          "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is  unique within a given system. The identified entity may have a different 'id' in a different  system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).",
+          "title": "Id"
         },
-        "extensions": {
+        "label": {
           "anyOf": [
             {
-              "items": {
-                "$ref": "#/$defs/ga4gh__core___internal__models__Extension"
-              },
-              "type": "array"
+              "type": "string"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "title": "Extensions"
-        },
-        "type": {
-          "const": "SequenceReference",
-          "default": "SequenceReference",
-          "description": "MUST be \"SequenceReference\"",
-          "enum": [
-            "SequenceReference"
-          ],
-          "title": "Type",
-          "type": "string"
+          "description": "A primary label for the entity.",
+          "title": "Label"
         },
         "refgetAccession": {
           "description": "A `GA4GH RefGet <http://samtools.github.io/hts-specs/refget.html>` identifier for the referenced sequence, using the sha512t24u digest.",
@@ -403,6 +480,13 @@
             }
           ],
           "default": null
+        },
+        "type": {
+          "const": "SequenceReference",
+          "default": "SequenceReference",
+          "description": "MUST be \"SequenceReference\"",
+          "title": "Type",
+          "type": "string"
         }
       },
       "required": [
@@ -433,34 +517,326 @@
       "title": "Syntax",
       "type": "string"
     },
-    "TargetSequenceType": {
-      "description": "Define target sequence type. Add more definitions as needed.",
-      "enum": [
-        "protein",
-        "dna"
-      ],
-      "title": "TargetSequenceType",
-      "type": "string"
+    "TargetAnnotation": {
+      "description": "Represents annotations associated with a biological target, including optional gene metadata\nand structured annotation layers.\n\nAttributes\n----------\ngene_info : GeneInfo | None\n    Optional metadata describing the gene associated with the target,\n    including identifiers and descriptive information where available.\n\nlayers : dict[AnnotationLayer, dict[str, ComputedReferenceSequence | MappedReferenceSequence | dict | None]]\n    A mapping of annotation layers to keyed layer data. Each layer is identified by an\n    AnnotationLayer key and contains a dictionary where:\n      - keys are string identifiers for items within the layer (e.g., feature names),\n      - values are one of:\n          - ComputedReferenceSequence: a computed sequence representation for the item,\n          - MappedReferenceSequence: a sequence mapped to a reference coordinate system,\n          - dict: a generic dictionary for custom layer-specific payloads,\n          - None: indicating missing or intentionally omitted data.\n\nNotes\n-----\n- The default value for 'layers' is an empty dictionary.\n- This model is intended to standardize layer-based annotations for downstream processing\n  and validation, allowing both computed and mapped sequence data to coexist within the same\n  structure.",
+      "properties": {
+        "gene_info": {
+          "anyOf": [
+            {
+              "$ref": "#/$defs/GeneInfo"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null
+        },
+        "layers": {
+          "additionalProperties": {
+            "additionalProperties": {
+              "anyOf": [
+                {
+                  "$ref": "#/$defs/ComputedReferenceSequence"
+                },
+                {
+                  "$ref": "#/$defs/MappedReferenceSequence"
+                },
+                {
+                  "additionalProperties": true,
+                  "type": "object"
+                },
+                {
+                  "type": "null"
+                }
+              ]
+            },
+            "type": "object"
+          },
+          "default": {},
+          "propertyNames": {
+            "$ref": "#/$defs/AnnotationLayer"
+          },
+          "title": "Layers",
+          "type": "object"
+        }
+      },
+      "title": "TargetAnnotation",
+      "type": "object"
     },
-    "VariationDescriptor": {
-      "description": "Define VRSATILE VariationDescriptor.",
+    "TargetMapping": {
+      "description": "Per-(target, alignment_level) provenance and QC block.\n\nField names mirror the corresponding columns on `target_gene_mappings` in the\nMaveDB API so the API worker can deserialize directly with minimal transformation.\nAny aligner-specific structured details go in ``tool_parameters`` /\n``alignment_metadata`` (JSONB on the API side).\n\n``reference_assembly`` is a top-level column (not nested in ``tool_parameters``)\nbecause it describes the coordinate frame of the mapping result, not aligner\nconfiguration.  It is ``None`` for alignments with no genomic frame (e.g.\nprotein-vs-protein).\n\n``tool_parameters`` shape per aligner\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nBLAT genomic (sequence-based, nucleotide or protein-vs-genome)::\n\n    {\n      \"aligner\":         \"blat\",\n      \"aligner_version\": \"<BLAT version string | null>\",\n      \"min_score\":       <int>,\n      \"out_format\":      \"<pslx|...>\",\n      \"target_args\":     \"<e.g. '-q=prot -t=dnax' | ''>\",\n    }\n\nBLAT protein-vs-protein (sequence-based, protein annotation layer)::\n\n    {\n      \"aligner\":         \"blat\",\n      \"aligner_version\": \"<BLAT version string | null>\",\n      \"min_score\":       <int>,\n      \"out_format\":      \"<pslx|...>\",\n      \"target_args\":     \"-q=prot -t=prot\",\n    }\n\ncdot transcript placement (accession-based)::\n\n    {\n      \"aligner\":           \"cdot_transcript_placement\",\n      \"aligner_version\":   \"<cdot library semver, e.g. '0.2.26'>\",\n      \"cdot_url\":          \"<CDOT REST endpoint>\",\n      \"cdot_data_version\": \"<data version string | null>\",\n    }\n\n``cdot_data_version`` is ``null`` when the cdot REST response did not\ninclude a ``cdot_data_version`` field; its presence (even as ``null``)\ndistinguishes \"cdot run, version unknown\" from \"not a cdot run\".",
       "properties": {
-        "id": {
-          "title": "Id",
+        "alignment_length": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Alignment Length"
+        },
+        "alignment_level": {
+          "$ref": "#/$defs/AnnotationLayer"
+        },
+        "alignment_metadata": {
+          "anyOf": [
+            {
+              "additionalProperties": true,
+              "type": "object"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Alignment Metadata"
+        },
+        "alignment_score": {
+          "anyOf": [
+            {
+              "type": "number"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Alignment Score"
+        },
+        "alignment_string": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Alignment String"
+        },
+        "gap_count": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Gap Count"
+        },
+        "mismatch_count": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Mismatch Count"
+        },
+        "next_best_alignment_score": {
+          "anyOf": [
+            {
+              "type": "number"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Next Best Alignment Score"
+        },
+        "percent_identity": {
+          "anyOf": [
+            {
+              "type": "number"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Percent Identity"
+        },
+        "preferred": {
+          "default": false,
+          "title": "Preferred",
+          "type": "boolean"
+        },
+        "reference_accession": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Reference Accession"
+        },
+        "reference_assembly": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Reference Assembly"
+        },
+        "reference_sequence_id": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Reference Sequence Id"
+        },
+        "target_gene_identifier": {
+          "title": "Target Gene Identifier",
           "type": "string"
         },
-        "type": {
-          "const": "VariationDescriptor",
-          "default": "VariationDescriptor",
-          "enum": [
-            "VariationDescriptor"
+        "tool_name": {
+          "default": "dcd-mapping",
+          "title": "Tool Name",
+          "type": "string"
+        },
+        "tool_parameters": {
+          "anyOf": [
+            {
+              "additionalProperties": true,
+              "type": "object"
+            },
+            {
+              "type": "null"
+            }
           ],
-          "title": "Type",
+          "default": null,
+          "title": "Tool Parameters"
+        },
+        "tool_version": {
+          "default": "2026.1.2",
+          "title": "Tool Version",
           "type": "string"
         },
-        "variation": {
-          "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Allele"
+        "total_variants": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Total Variants"
         },
+        "variants_failed": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Variants Failed"
+        },
+        "variants_failed_pre_layer_selection": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Variants Failed Pre Layer Selection"
+        },
+        "variants_mapped_cleanly": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Variants Mapped Cleanly"
+        },
+        "variants_with_alignment_warnings": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Variants With Alignment Warnings"
+        },
+        "variants_with_mapping_warnings": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Variants With Mapping Warnings"
+        },
+        "vrs_version": {
+          "anyOf": [
+            {
+              "$ref": "#/$defs/VrsVersion"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null
+        }
+      },
+      "required": [
+        "target_gene_identifier",
+        "alignment_level"
+      ],
+      "title": "TargetMapping",
+      "type": "object"
+    },
+    "TargetSequenceType": {
+      "description": "Define target sequence type. Add more definitions as needed.",
+      "enum": [
+        "protein",
+        "dna"
+      ],
+      "title": "TargetSequenceType",
+      "type": "string"
+    },
+    "VariationDescriptor": {
+      "description": "Define VRSATILE VariationDescriptor.",
+      "properties": {
         "expressions": {
           "anyOf": [
             {
@@ -476,16 +852,29 @@
           "default": null,
           "title": "Expressions"
         },
-        "vrs_ref_allele_seq": {
-          "title": "Vrs Ref Allele Seq",
-          "type": "string"
-        },
         "extensions": {
           "items": {
             "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Extension"
           },
           "title": "Extensions",
           "type": "array"
+        },
+        "id": {
+          "title": "Id",
+          "type": "string"
+        },
+        "type": {
+          "const": "VariationDescriptor",
+          "default": "VariationDescriptor",
+          "title": "Type",
+          "type": "string"
+        },
+        "variation": {
+          "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Allele"
+        },
+        "vrs_ref_allele_seq": {
+          "title": "Vrs Ref Allele Seq",
+          "type": "string"
         }
       },
       "required": [
@@ -497,18 +886,18 @@
       "title": "VariationDescriptor",
       "type": "object"
     },
+    "VrsVersion": {
+      "description": "Define VRS versions",
+      "enum": [
+        "1.3",
+        "2"
+      ],
+      "title": "VrsVersion",
+      "type": "string"
+    },
     "dcd_mapping__vrs_v1_schemas__Allele": {
       "description": "Define VRS 1.3 Allele.",
       "properties": {
-        "type": {
-          "const": "Allele",
-          "default": "Allele",
-          "enum": [
-            "Allele"
-          ],
-          "title": "Type",
-          "type": "string"
-        },
         "id": {
           "title": "Id",
           "type": "string"
@@ -518,6 +907,12 @@
         },
         "state": {
           "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__LiteralSequenceExpression"
+        },
+        "type": {
+          "const": "Allele",
+          "default": "Allele",
+          "title": "Type",
+          "type": "string"
         }
       },
       "required": [
@@ -531,25 +926,22 @@
     "dcd_mapping__vrs_v1_schemas__Expression": {
       "description": "Define VRS 1.3 Expression.",
       "properties": {
+        "syntax": {
+          "title": "Syntax",
+          "type": "string"
+        },
+        "syntax_version": {
+          "title": "Syntax Version"
+        },
         "type": {
           "const": "Expression",
           "default": "Expression",
-          "enum": [
-            "Expression"
-          ],
           "title": "Type",
           "type": "string"
         },
-        "syntax": {
-          "title": "Syntax",
-          "type": "string"
-        },
         "value": {
           "title": "Value",
           "type": "string"
-        },
-        "syntax_version": {
-          "title": "Syntax Version"
         }
       },
       "required": [
@@ -563,19 +955,16 @@
     "dcd_mapping__vrs_v1_schemas__Extension": {
       "description": "Define VRS 1.3 Extension.",
       "properties": {
+        "name": {
+          "title": "Name",
+          "type": "string"
+        },
         "type": {
           "const": "Extension",
           "default": "Extension",
-          "enum": [
-            "Extension"
-          ],
           "title": "Type",
           "type": "string"
         },
-        "name": {
-          "title": "Name",
-          "type": "string"
-        },
         "value": {
           "title": "Value"
         }
@@ -590,15 +979,6 @@
     "dcd_mapping__vrs_v1_schemas__Haplotype": {
       "description": "Define VRS 1.3 Haplotype.",
       "properties": {
-        "type": {
-          "const": "Haplotype",
-          "default": "Haplotype",
-          "enum": [
-            "Haplotype"
-          ],
-          "title": "Type",
-          "type": "string"
-        },
         "members": {
           "anyOf": [
             {
@@ -615,6 +995,12 @@
             }
           ],
           "title": "Members"
+        },
+        "type": {
+          "const": "Haplotype",
+          "default": "Haplotype",
+          "title": "Type",
+          "type": "string"
         }
       },
       "required": [
@@ -626,18 +1012,15 @@
     "dcd_mapping__vrs_v1_schemas__LiteralSequenceExpression": {
       "description": "Define VRS 1.3 LiteralSequenceExpression.",
       "properties": {
+        "sequence": {
+          "title": "Sequence",
+          "type": "string"
+        },
         "type": {
           "const": "LiteralSequenceExpression",
           "default": "LiteralSequenceExpression",
-          "enum": [
-            "LiteralSequenceExpression"
-          ],
           "title": "Type",
           "type": "string"
-        },
-        "sequence": {
-          "title": "Sequence",
-          "type": "string"
         }
       },
       "required": [
@@ -660,21 +1043,18 @@
           ],
           "title": "Id"
         },
-        "type": {
-          "const": "SequenceLocation",
-          "default": "SequenceLocation",
-          "enum": [
-            "SequenceLocation"
-          ],
-          "title": "Type",
-          "type": "string"
+        "interval": {
+          "$ref": "#/$defs/SequenceInterval"
         },
         "sequence_id": {
           "title": "Sequence Id",
           "type": "string"
         },
-        "interval": {
-          "$ref": "#/$defs/SequenceInterval"
+        "type": {
+          "const": "SequenceLocation",
+          "default": "SequenceLocation",
+          "title": "Type",
+          "type": "string"
         }
       },
       "required": [
@@ -689,21 +1069,18 @@
       "additionalProperties": true,
       "description": "The Extension class provides VODs with a means to extend descriptions with other\nattributes unique to a content provider. These extensions are not expected to be\nnatively understood under VRSATILE, but may be used for pre-negotiated exchange of\nmessage attributes when needed.",
       "properties": {
+        "name": {
+          "description": "A name for the Extension",
+          "title": "Name",
+          "type": "string"
+        },
         "type": {
           "const": "Extension",
           "default": "Extension",
           "description": "MUST be \"Extension\".",
-          "enum": [
-            "Extension"
-          ],
           "title": "Type",
           "type": "string"
         },
-        "name": {
-          "description": "A name for the Extension",
-          "title": "Name",
-          "type": "string"
-        },
         "value": {
           "anyOf": [
             {
@@ -716,6 +1093,7 @@
               "type": "boolean"
             },
             {
+              "additionalProperties": true,
               "type": "object"
             },
             {
@@ -739,7 +1117,7 @@
     },
     "ga4gh__vrs___internal__models__Allele": {
       "properties": {
-        "id": {
+        "description": {
           "anyOf": [
             {
               "type": "string"
@@ -748,13 +1126,14 @@
               "type": "null"
             }
           ],
-          "default": null,
-          "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is  unique within a given system. The identified entity may have a different 'id' in a different  system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).",
-          "title": "Id"
+          "default": null,
+          "description": "A free-text description of the entity.",
+          "title": "Description"
         },
-        "label": {
+        "digest": {
           "anyOf": [
             {
+              "pattern": "^[0-9A-Za-z_\\-]{32}$",
               "type": "string"
             },
             {
@@ -762,21 +1141,23 @@
             }
           ],
           "default": null,
-          "description": "A primary label for the entity.",
-          "title": "Label"
+          "description": "A sha512t24u digest created using the VRS Computed Identifier algorithm.",
+          "title": "Digest"
         },
-        "description": {
+        "expressions": {
           "anyOf": [
             {
-              "type": "string"
+              "items": {
+                "$ref": "#/$defs/ga4gh__vrs___internal__models__Expression"
+              },
+              "type": "array"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "description": "A free-text description of the entity.",
-          "title": "Description"
+          "title": "Expressions"
         },
         "extensions": {
           "anyOf": [
@@ -793,20 +1174,9 @@
           "default": null,
           "title": "Extensions"
         },
-        "type": {
-          "const": "Allele",
-          "default": "Allele",
-          "description": "MUST be \"Allele\"",
-          "enum": [
-            "Allele"
-          ],
-          "title": "Type",
-          "type": "string"
-        },
-        "digest": {
+        "id": {
           "anyOf": [
             {
-              "pattern": "^[0-9A-Za-z_\\-]{32}$",
               "type": "string"
             },
             {
@@ -814,23 +1184,21 @@
             }
           ],
           "default": null,
-          "description": "A sha512t24u digest created using the VRS Computed Identifier algorithm.",
-          "title": "Digest"
+          "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is  unique within a given system. The identified entity may have a different 'id' in a different  system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).",
+          "title": "Id"
         },
-        "expressions": {
+        "label": {
           "anyOf": [
             {
-              "items": {
-                "$ref": "#/$defs/ga4gh__vrs___internal__models__Expression"
-              },
-              "type": "array"
+              "type": "string"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "title": "Expressions"
+          "description": "A primary label for the entity.",
+          "title": "Label"
         },
         "location": {
           "anyOf": [
@@ -855,6 +1223,13 @@
           ],
           "description": "An expression of the sequence state",
           "title": "State"
+        },
+        "type": {
+          "const": "Allele",
+          "default": "Allele",
+          "description": "MUST be \"Allele\"",
+          "title": "Type",
+          "type": "string"
         }
       },
       "required": [
@@ -870,10 +1245,6 @@
         "syntax": {
           "$ref": "#/$defs/Syntax"
         },
-        "value": {
-          "title": "Value",
-          "type": "string"
-        },
         "syntax_version": {
           "anyOf": [
             {
@@ -885,6 +1256,10 @@
           ],
           "default": null,
           "title": "Syntax Version"
+        },
+        "value": {
+          "title": "Value",
+          "type": "string"
         }
       },
       "required": [
@@ -897,7 +1272,7 @@
     "ga4gh__vrs___internal__models__Haplotype": {
       "description": "A set of non-overlapping Allele members that co-occur on the same molecule.",
       "properties": {
-        "id": {
+        "description": {
           "anyOf": [
             {
               "type": "string"
@@ -907,12 +1282,13 @@
             }
           ],
           "default": null,
-          "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is  unique within a given system. The identified entity may have a different 'id' in a different  system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).",
-          "title": "Id"
+          "description": "A free-text description of the entity.",
+          "title": "Description"
         },
-        "label": {
+        "digest": {
           "anyOf": [
             {
+              "pattern": "^[0-9A-Za-z_\\-]{32}$",
               "type": "string"
             },
             {
@@ -920,21 +1296,23 @@
             }
           ],
           "default": null,
-          "description": "A primary label for the entity.",
-          "title": "Label"
+          "description": "A sha512t24u digest created using the VRS Computed Identifier algorithm.",
+          "title": "Digest"
         },
-        "description": {
+        "expressions": {
           "anyOf": [
             {
-              "type": "string"
+              "items": {
+                "$ref": "#/$defs/ga4gh__vrs___internal__models__Expression"
+              },
+              "type": "array"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "description": "A free-text description of the entity.",
-          "title": "Description"
+          "title": "Expressions"
         },
         "extensions": {
           "anyOf": [
@@ -951,20 +1329,9 @@
           "default": null,
           "title": "Extensions"
         },
-        "type": {
-          "const": "Haplotype",
-          "default": "Haplotype",
-          "description": "MUST be \"Haplotype\"",
-          "enum": [
-            "Haplotype"
-          ],
-          "title": "Type",
-          "type": "string"
-        },
-        "digest": {
+        "id": {
           "anyOf": [
             {
-              "pattern": "^[0-9A-Za-z_\\-]{32}$",
               "type": "string"
             },
             {
@@ -972,23 +1339,21 @@
             }
           ],
           "default": null,
-          "description": "A sha512t24u digest created using the VRS Computed Identifier algorithm.",
-          "title": "Digest"
+          "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is  unique within a given system. The identified entity may have a different 'id' in a different  system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).",
+          "title": "Id"
         },
-        "expressions": {
+        "label": {
           "anyOf": [
             {
-              "items": {
-                "$ref": "#/$defs/ga4gh__vrs___internal__models__Expression"
-              },
-              "type": "array"
+              "type": "string"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "title": "Expressions"
+          "description": "A primary label for the entity.",
+          "title": "Label"
         },
         "members": {
           "description": "A list of Alleles (or IRI references to `Alleles`) that comprise a Haplotype. Since each `Haplotype` member MUST be an `Allele`, and all members MUST share a common `SequenceReference`, implementations MAY use a compact representation of Haplotype that omits type and `SequenceReference` information in individual Haplotype members. Implementations MUST transform compact `Allele` representations into an `Allele` when computing GA4GH identifiers.",
@@ -1005,6 +1370,13 @@
           "minItems": 2,
           "title": "Members",
           "type": "array"
+        },
+        "type": {
+          "const": "Haplotype",
+          "default": "Haplotype",
+          "description": "MUST be \"Haplotype\"",
+          "title": "Type",
+          "type": "string"
         }
       },
       "required": [
@@ -1016,7 +1388,7 @@
     "ga4gh__vrs___internal__models__LiteralSequenceExpression": {
       "description": "An explicit expression of a Sequence.",
       "properties": {
-        "id": {
+        "description": {
           "anyOf": [
             {
               "type": "string"
@@ -1026,23 +1398,25 @@
             }
           ],
           "default": null,
-          "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is  unique within a given system. The identified entity may have a different 'id' in a different  system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).",
-          "title": "Id"
+          "description": "A free-text description of the entity.",
+          "title": "Description"
         },
-        "label": {
+        "extensions": {
           "anyOf": [
             {
-              "type": "string"
+              "items": {
+                "$ref": "#/$defs/ga4gh__core___internal__models__Extension"
+              },
+              "type": "array"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "description": "A primary label for the entity.",
-          "title": "Label"
+          "title": "Extensions"
         },
-        "description": {
+        "id": {
           "anyOf": [
             {
               "type": "string"
@@ -1052,41 +1426,32 @@
             }
           ],
           "default": null,
-          "description": "A free-text description of the entity.",
-          "title": "Description"
+          "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is  unique within a given system. The identified entity may have a different 'id' in a different  system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).",
+          "title": "Id"
         },
-        "extensions": {
+        "label": {
           "anyOf": [
             {
-              "items": {
-                "$ref": "#/$defs/ga4gh__core___internal__models__Extension"
-              },
-              "type": "array"
+              "type": "string"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "title": "Extensions"
+          "description": "A primary label for the entity.",
+          "title": "Label"
+        },
+        "sequence": {
+          "$ref": "#/$defs/SequenceString",
+          "description": "the literal sequence"
         },
         "type": {
           "const": "LiteralSequenceExpression",
           "default": "LiteralSequenceExpression",
           "description": "MUST be \"LiteralSequenceExpression\"",
-          "enum": [
-            "LiteralSequenceExpression"
-          ],
           "title": "Type",
           "type": "string"
-        },
-        "sequence": {
-          "allOf": [
-            {
-              "$ref": "#/$defs/SequenceString"
-            }
-          ],
-          "description": "the literal sequence"
         }
       },
       "required": [
@@ -1098,7 +1463,7 @@
     "ga4gh__vrs___internal__models__SequenceLocation": {
       "description": "A `Location` defined by an interval on a referenced `Sequence`.",
       "properties": {
-        "id": {
+        "description": {
           "anyOf": [
             {
               "type": "string"
@@ -1108,12 +1473,13 @@
             }
           ],
           "default": null,
-          "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is  unique within a given system. The identified entity may have a different 'id' in a different  system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).",
-          "title": "Id"
+          "description": "A free-text description of the entity.",
+          "title": "Description"
         },
-        "label": {
+        "digest": {
           "anyOf": [
             {
+              "pattern": "^[0-9A-Za-z_\\-]{32}$",
               "type": "string"
             },
             {
@@ -1121,21 +1487,20 @@
             }
           ],
           "default": null,
-          "description": "A primary label for the entity.",
-          "title": "Label"
+          "description": "A sha512t24u digest created using the VRS Computed Identifier algorithm.",
+          "title": "Digest"
         },
-        "description": {
+        "end": {
           "anyOf": [
             {
-              "type": "string"
+              "$ref": "#/$defs/Range"
             },
             {
-              "type": "null"
+              "type": "integer"
             }
           ],
-          "default": null,
-          "description": "A free-text description of the entity.",
-          "title": "Description"
+          "description": "The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range greater than the value of `start`.",
+          "title": "End"
         },
         "extensions": {
           "anyOf": [
@@ -1152,20 +1517,22 @@
           "default": null,
           "title": "Extensions"
         },
-        "type": {
-          "const": "SequenceLocation",
-          "default": "SequenceLocation",
-          "description": "MUST be \"SequenceLocation\"",
-          "enum": [
-            "SequenceLocation"
+        "id": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
           ],
-          "title": "Type",
-          "type": "string"
+          "default": null,
+          "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is  unique within a given system. The identified entity may have a different 'id' in a different  system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).",
+          "title": "Id"
         },
-        "digest": {
+        "label": {
           "anyOf": [
             {
-              "pattern": "^[0-9A-Za-z_\\-]{32}$",
               "type": "string"
             },
             {
@@ -1173,8 +1540,8 @@
             }
           ],
           "default": null,
-          "description": "A sha512t24u digest created using the VRS Computed Identifier algorithm.",
-          "title": "Digest"
+          "description": "A primary label for the entity.",
+          "title": "Label"
         },
         "sequenceReference": {
           "anyOf": [
@@ -1204,17 +1571,12 @@
           "description": "The start coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range less than the value of `end`.",
           "title": "Start"
         },
-        "end": {
-          "anyOf": [
-            {
-              "$ref": "#/$defs/Range"
-            },
-            {
-              "type": "integer"
-            }
-          ],
-          "description": "The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range greater than the value of `start`.",
-          "title": "End"
+        "type": {
+          "const": "SequenceLocation",
+          "default": "SequenceLocation",
+          "description": "MUST be \"SequenceLocation\"",
+          "title": "Type",
+          "type": "string"
         }
       },
       "required": [
@@ -1227,28 +1589,82 @@
   },
   "description": "Provide all mapped scores for a scoreset.",
   "properties": {
+    "error_message": {
+      "anyOf": [
+        {
+          "type": "string"
+        },
+        {
+          "type": "null"
+        }
+      ],
+      "default": null,
+      "title": "Error Message"
+    },
+    "mapped_date": {
+      "anyOf": [
+        {
+          "format": "date-time",
+          "type": "string"
+        },
+        {
+          "type": "null"
+        }
+      ],
+      "default": null,
+      "title": "Mapped Date"
+    },
+    "mapped_scores": {
+      "anyOf": [
+        {
+          "items": {
+            "$ref": "#/$defs/ScoreAnnotation"
+          },
+          "type": "array"
+        },
+        {
+          "type": "null"
+        }
+      ],
+      "default": null,
+      "title": "Mapped Scores"
+    },
     "metadata": {
       "title": "Metadata"
     },
-    "computed_reference_sequence": {
-      "$ref": "#/$defs/ComputedReferenceSequence"
-    },
-    "mapped_reference_sequence": {
-      "$ref": "#/$defs/MappedReferenceSequence"
+    "reference_sequences": {
+      "anyOf": [
+        {
+          "additionalProperties": {
+            "$ref": "#/$defs/TargetAnnotation"
+          },
+          "type": "object"
+        },
+        {
+          "type": "null"
+        }
+      ],
+      "default": null,
+      "title": "Reference Sequences"
     },
-    "mapped_scores": {
-      "items": {
-        "$ref": "#/$defs/ScoreAnnotation"
-      },
-      "title": "Mapped Scores",
-      "type": "array"
+    "target_mappings": {
+      "anyOf": [
+        {
+          "items": {
+            "$ref": "#/$defs/TargetMapping"
+          },
+          "type": "array"
+        },
+        {
+          "type": "null"
+        }
+      ],
+      "default": null,
+      "title": "Target Mappings"
     }
   },
   "required": [
-    "metadata",
-    "computed_reference_sequence",
-    "mapped_reference_sequence",
-    "mapped_scores"
+    "metadata"
   ],
   "title": "ScoresetMapping",
   "type": "object"
diff --git a/scripts/generate_schema.py b/scripts/generate_schema.py
new file mode 100644
index 0000000..f25f25f
--- /dev/null
+++ b/scripts/generate_schema.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+"""Regenerate schema.json from the current Pydantic models.
+
+Run from the repository root after any change to src/dcd_mapping/schemas.py
+that alters the public output contract:
+
+    python scripts/generate_schema.py
+
+The script writes the JSON Schema for :class:`~dcd_mapping.schemas.ScoresetMapping`
+to ``schema.json`` at the repository root, overwriting the previous file.
+
+If you change schemas.py, re-run this script and commit the result.
+"""
+import json
+import pathlib
+
+from dcd_mapping.schemas import ScoresetMapping
+
+REPO_ROOT = pathlib.Path(__file__).parent.parent
+SCHEMA_PATH = REPO_ROOT / "schema.json"
+
+
+def main() -> None:
+    """Generate JSON Schema for ScoresetMapping and write to file."""
+    schema = ScoresetMapping.model_json_schema()
+    SCHEMA_PATH.write_text(json.dumps(schema, indent=2, sort_keys=True) + "\n")
+    print(f"Written: {SCHEMA_PATH}")  # noqa: T201
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/api/routers/map.py b/src/api/routers/map.py
index 9235061..cc0149b 100644
--- a/src/api/routers/map.py
+++ b/src/api/routers/map.py
@@ -2,17 +2,14 @@
 import logging
 from pathlib import Path
 
-from cool_seq_tool.schemas import AnnotationLayer
 from fastapi import APIRouter, HTTPException
 from fastapi.responses import JSONResponse
 from httpx import HTTPStatusError
 
 from dcd_mapping.align import build_alignment_result
 from dcd_mapping.annotate import (
-    _get_computed_reference_sequence,
-    _get_mapped_reference_sequence,
-    _set_scoreset_layer,
     annotate,
+    build_scoreset_mapping,
     compute_target_gene_info,
 )
 from dcd_mapping.exceptions import (
@@ -34,11 +31,8 @@
     with_mavedb_score_set,
 )
 from dcd_mapping.schemas import (
-    ScoreAnnotation,
+    AlignmentResult,
     ScoresetMapping,
-    TargetAnnotation,
-    TargetType,
-    TxSelectResult,
     VrsVersion,
 )
 from dcd_mapping.transcripts import select_transcripts
@@ -60,6 +54,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
     :param store_path: optional path to save output at
     """
     try:
+        raw_metadata = get_raw_scoreset_metadata(urn, store_path)
         metadata = get_scoreset_metadata(urn, store_path)
         records = get_scoreset_records(metadata, True, store_path)
         metadata = patch_target_sequence_type(metadata, records, force=False)
@@ -79,7 +74,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
     if not records:
         return JSONResponse(
             content=ScoresetMapping(
-                metadata=metadata,
+                metadata=raw_metadata,
                 error_message="Score set contains no variants to map",
             ).model_dump(exclude_none=True)
         )
@@ -99,14 +94,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
         _logger.error("Alignment error for %s: %s", urn, e)
         return JSONResponse(
             content=ScoresetMapping(
-                metadata=metadata, error_message=str(e).strip("'")
+                metadata=raw_metadata, error_message=str(e).strip("'")
             ).model_dump(exclude_none=True)
         )
     except ScoresetNotSupportedError as e:
         _logger.error("Scoreset not supported during alignment for %s: %s", urn, e)
         return JSONResponse(
             content=ScoresetMapping(
-                metadata=metadata, error_message=str(e).strip("'")
+                metadata=raw_metadata, error_message=str(e).strip("'")
             ).model_dump(exclude_none=True)
         )
 
@@ -127,15 +122,18 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
         raise HTTPException(status_code=500, detail=msg) from e
 
     vrs_results = {}
+    protein_align_results: dict[str, AlignmentResult | None] = {}
     try:
         for target_gene in metadata.target_genes:
-            vrs_results[target_gene] = vrs_map(
+            vrs_map_result = vrs_map(
                 metadata=metadata.target_genes[target_gene],
                 align_result=alignment_results[target_gene],
                 records=records[target_gene],
                 transcript=transcripts[target_gene],
                 silent=True,
             )
+            vrs_results[target_gene] = vrs_map_result.mappings
+            protein_align_results[target_gene] = vrs_map_result.protein_align_result
     except (
         UnsupportedReferenceSequenceNameSpaceError,
         VrsMapError,
@@ -145,7 +143,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
         _logger.error("VRS mapping error for %s: %s", urn, e)
         return JSONResponse(
             content=ScoresetMapping(
-                metadata=metadata, error_message=str(e).strip("'")
+                metadata=raw_metadata, error_message=str(e).strip("'")
             ).model_dump(exclude_none=True)
         )
 
@@ -158,14 +156,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
     if not vrs_results or all(nonetype_vrs_results):
         return JSONResponse(
             content=ScoresetMapping(
-                metadata=metadata,
+                metadata=raw_metadata,
                 error_message="No variant mappings available for this score set",
             ).model_dump(exclude_none=True)
         )
     if any(nonetype_vrs_results):
         return JSONResponse(
             content=ScoresetMapping(
-                metadata=metadata,
+                metadata=raw_metadata,
                 error_message="Some variants generated vrs results, but not all. If any variants were mapped, all should have been.",
             ).model_dump(exclude_none=True)
         )
@@ -184,7 +182,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
         _logger.error("Unexpected error during annotation for %s: %s", urn, e)
         return JSONResponse(
             content=ScoresetMapping(
-                metadata=metadata, error_message=str(e).strip("'")
+                metadata=raw_metadata, error_message=str(e).strip("'")
             ).model_dump(exclude_none=True)
         )
 
@@ -197,125 +195,57 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
     if not annotated_vrs_results or all(nonetype_annotated_vrs_results):
         return JSONResponse(
             content=ScoresetMapping(
-                metadata=metadata,
+                metadata=raw_metadata,
                 error_message="No annotated variant mappings available for this score set",
             ).model_dump(exclude_none=True)
         )
     if any(nonetype_annotated_vrs_results):
         return JSONResponse(
             content=ScoresetMapping(
-                metadata=metadata,
+                metadata=raw_metadata,
                 error_message="Some variants generated annotated vrs results, but not all. If any variants were annotated, all should have been.",
             ).model_dump(exclude_none=True)
         )
 
     try:
-        raw_metadata = get_raw_scoreset_metadata(urn, store_path)
-        reference_sequences: dict[str, TargetAnnotation] = {}
-        mapped_scores: list[ScoreAnnotation] = []
+        gene_info = {}
         for target_gene in annotated_vrs_results:
-            preferred_layers = {
-                _set_scoreset_layer(urn, annotated_vrs_results[target_gene]),
-            }
-            target_gene_name = metadata.target_genes[target_gene].target_gene_name
-            reference_sequences[target_gene_name] = TargetAnnotation()
-            reference_sequences[target_gene_name].layers = {
-                layer: {
-                    "computed_reference_sequence": None,
-                    "mapped_reference_sequence": None,
-                }
-                for layer in preferred_layers
-            }
-
-            # sometimes Nonetype layers show up in preferred layers dict; remove these
-            preferred_layers.discard(None)
-
-            # Determine one gene symbol per target and its selection method
-            gene_info = await compute_target_gene_info(
+            gene_info[target_gene] = await compute_target_gene_info(
                 target_key=target_gene,
                 transcripts=transcripts,
                 alignment_results=alignment_results,
                 metadata=metadata,
                 mapped_scores=annotated_vrs_results[target_gene],
             )
-
-            for layer in preferred_layers:
-                reference_sequences[target_gene_name].layers[layer][
-                    "computed_reference_sequence"
-                ] = _get_computed_reference_sequence(
-                    metadata.target_genes[target_gene], layer, transcripts[target_gene]
-                )
-                reference_sequences[target_gene_name].layers[layer][
-                    "mapped_reference_sequence"
-                ] = _get_mapped_reference_sequence(
-                    metadata.target_genes[target_gene],
-                    layer,
-                    transcripts[target_gene],
-                    alignment_results[target_gene],
-                )
-
-            if gene_info is not None:
-                reference_sequences[target_gene_name].gene_info = gene_info
-
-            for m in annotated_vrs_results[target_gene]:
-                if m.pre_mapped is None:
-                    mapped_scores.append(ScoreAnnotation(**m.model_dump()))
-                elif m.annotation_layer in preferred_layers:
-                    # drop annotation layer from mapping object
-                    mapped_scores.append(ScoreAnnotation(**m.model_dump()))
-
-            # if genomic layer, not accession-based, and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict
-            if (
-                AnnotationLayer.GENOMIC in reference_sequences[target_gene_name].layers
-                and metadata.target_genes[target_gene].target_gene_category
-                == TargetType.PROTEIN_CODING
-                and metadata.target_genes[target_gene].target_accession_id is None
-                and transcripts[target_gene] is not None
-                and isinstance(transcripts[target_gene], TxSelectResult)
-                and transcripts[target_gene].nm is not None
-            ):
-                reference_sequences[target_gene_name].layers[AnnotationLayer.CDNA] = {
-                    "computed_reference_sequence": None,
-                    "mapped_reference_sequence": {
-                        "sequence_accessions": [transcripts[target_gene].nm]
-                    },
-                }
-
-            # drop Nonetype reference sequences
-            for target_gene in reference_sequences:
-                for layer in list(reference_sequences[target_gene].layers.keys()):
-                    if (
-                        reference_sequences[target_gene].layers[layer][
-                            "mapped_reference_sequence"
-                        ]
-                        is None
-                        and reference_sequences[target_gene].layers[layer][
-                            "computed_reference_sequence"
-                        ]
-                        is None
-                    ) or layer is None:
-                        del reference_sequences[target_gene].layers[layer]
-
+        output = build_scoreset_mapping(
+            metadata=metadata,
+            mappings=annotated_vrs_results,
+            align_results=alignment_results,
+            tx_output=transcripts,
+            gene_info=gene_info,
+            preferred_layer_only=True,
+            vrs_version=VrsVersion.V_2,
+            raw_metadata=raw_metadata,
+            protein_align_results=protein_align_results,
+        )
     except Exception as e:
         _logger.error("Unexpected error during result assembly for %s: %s", urn, e)
         return JSONResponse(
             content=ScoresetMapping(
-                metadata=metadata, error_message=str(e).strip("'")
+                metadata=raw_metadata, error_message=str(e).strip("'")
             ).model_dump(exclude_none=True)
         )
 
-    if len(mapped_scores) != total_score_records:
+    # With preferred_layer_only=True, build_scoreset_mapping emits exactly one
+    # mapped_score per input variant: preferred-layer successes/failures plus
+    # completely-failed variants (annotation_layer=None) re-attributed to the
+    # preferred layer.  The count must equal total_score_records.
+    if len(output.mapped_scores or []) != total_score_records:
         return JSONResponse(
             content=ScoresetMapping(
-                metadata=metadata,
-                error_message=f"Mismatch between number of mapped scores ({len(mapped_scores)}) and total score records ({total_score_records}). This is unexpected and indicates an issue with the mapping process.",
+                metadata=raw_metadata,
+                error_message=f"Mismatch between number of mapped scores ({len(output.mapped_scores or [])}) and total score records ({total_score_records}). This is unexpected and indicates an issue with the mapping process.",
             ).model_dump(exclude_none=True)
         )
 
-    return JSONResponse(
-        content=ScoresetMapping(
-            metadata=raw_metadata,
-            reference_sequences=reference_sequences,
-            mapped_scores=mapped_scores,
-        ).model_dump(exclude_none=True)
-    )
+    return JSONResponse(content=output.model_dump(mode="json", exclude_none=True))
diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py
index 6002dea..de547af 100644
--- a/src/dcd_mapping/align.py
+++ b/src/dcd_mapping/align.py
@@ -1,6 +1,8 @@
 """Align MaveDB target sequences to a human reference genome."""
+import functools
 import logging
 import os
+import shlex
 import subprocess
 import tempfile
 from collections.abc import Mapping
@@ -8,10 +10,8 @@
 from urllib.parse import urlparse
 
 import httpx
-from Bio.SearchIO import HSP
-from Bio.SearchIO import parse as parse_blat
-from Bio.SearchIO import read as read_blat
-from Bio.SearchIO._model import Hit, QueryResult
+from Bio import Align as BioAlign
+from Bio.Seq import Seq, UndefinedSequenceError
 from cool_seq_tool.schemas import Strand
 
 from dcd_mapping.exceptions import (
@@ -24,6 +24,7 @@
 from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, patch_target_sequence_type
 from dcd_mapping.resource_utils import CDOT_URL, http_download
 from dcd_mapping.schemas import (
+    AlignmentQc,
     AlignmentResult,
     GeneLocation,
     ScoresetMetadata,
@@ -34,8 +35,47 @@
 
 __all__ = ["align"]
 
+
 _logger = logging.getLogger(__name__)
 
+# The assembly name is a derived property of the 2bit file built into the Docker image—
+# if the URL is ever changed to another build, update REFERENCE_GENOME_ASSEMBLY here as
+# well so the audit field in AlignmentResult.reference_assembly stays accurate.
+REFERENCE_GENOME_URL = (
+    "https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.2bit"
+)
+REFERENCE_GENOME_ASSEMBLY = "GRCh38"
+
+# BLAT invocation parameters
+BLAT_MIN_SCORE = 20
+BLAT_OUT_FORMAT = "pslx"
+
+
+@functools.lru_cache
+def _get_blat_version(bin_name: str) -> str:
+    """Return BLAT's self-reported version line by invoking the binary with no arguments.
+
+    BLAT prints usage text to stdout when called without args; the first line
+    typically reads: ``blat - Standalone BLAT v. 36 fast sequence search command line tool``
+    We return that line verbatim rather than trying to parse a version number out of it,
+    so the record is robust to BLAT changing its version string format.
+    Result is cached per binary path so we only probe once per process.
+    Returns ``"unknown"`` if the binary cannot be invoked or produces no output,
+    so the audit field is always a string rather than null.
+    """
+    try:
+        #
+        result = subprocess.run([bin_name], capture_output=True, timeout=5)  # noqa: S603
+        output = result.stdout.decode("utf-8", errors="replace")
+        first_line = output.splitlines()[0].strip() if output.strip() else None
+        if first_line:
+            return first_line
+    except (OSError, IndexError):
+        _logger.debug(
+            "Could not determine BLAT version for %s", bin_name, exc_info=True
+        )
+    return "unknown"
+
 
 def _write_query_file(file: Path, lines: list[str]) -> None:
     """Write lines to query file. This method is broken out to enable easy mocking while
@@ -61,6 +101,7 @@ def _build_query_file(scoreset_metadata: ScoresetMetadata, query_file: Path) ->
     for target_gene in scoreset_metadata.target_genes:
         lines.append(f">{target_gene}")
         lines.append(scoreset_metadata.target_genes[target_gene].target_sequence)
+
     _write_query_file(query_file, lines)
     return query_file
 
@@ -76,12 +117,13 @@ def get_ref_genome_file(
     :return: path to acquired file
     :raise ResourceAcquisitionError: if unable to acquire file.
     """
-    url = "https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.2bit"
+    url = REFERENCE_GENOME_URL
     parsed_url = urlparse(url)
     if not dcd_mapping_dir:
         dcd_mapping_dir = LOCAL_STORE_PATH
-    genome_file = dcd_mapping_dir / Path(parsed_url.path).name
+
     # this file shouldn't change, so no need to think about more advanced caching
+    genome_file = dcd_mapping_dir / Path(parsed_url.path).name
     if not genome_file.exists():
         try:
             http_download(url, genome_file, silent)
@@ -89,6 +131,7 @@ def get_ref_genome_file(
             msg = f"HTTPError when fetching reference genome file from {url}"
             _logger.error(msg)
             raise ResourceAcquisitionError(msg) from e
+
     return genome_file
 
 
@@ -98,8 +141,13 @@ def _run_blat(
     reference_file: Path,
     out_file: str,
     silent: bool,
-) -> subprocess.CompletedProcess:
-    """Execute BLAT binary with relevant params.
+    min_score: int = BLAT_MIN_SCORE,
+    out_format: str = BLAT_OUT_FORMAT,
+) -> tuple[subprocess.CompletedProcess, dict]:
+    """Execute BLAT binary with the given parameters.
+
+    Returns both the process result and a parameters dict built from the exact
+    values passed in — so the QC record is inseparably paired with what ran.
 
     Currently, we rely on a system-installed BLAT binary accessible in the containing
     environment's PATH, or under env var ``BLAT_BIN_PATH``. This is sort of awkward and
@@ -110,31 +158,64 @@ def _run_blat(
 
     :param target_args: target params eg ``"-q=prot -t=dnax"`` (can be empty)
     :param query_file: path to query FASTA file
-    :param out_file: path-like string to output fill (could be "/dev/stdout")
+    :param out_file: path-like string to output file (could be "/dev/stdout")
     :param silent: if True, suppress all console output
-    :return: process result
+    :param min_score: value passed as -minScore to BLAT
+    :param out_format: value passed as -out to BLAT
+    :return: (process result, aligner_parameters dict)
     """
-    bin_name = os.environ["BLAT_BIN_PATH"] if "BLAT_BIN_PATH" in os.environ else "blat"  # noqa: SIM401
-    command = f"{bin_name} {reference_file} {target_args} -minScore=20 {query_file} {out_file}"
-    _logger.debug("Running BLAT command: %s", command)
-    result = subprocess.run(  # noqa: UP022
-        command,
-        shell=True,  # noqa: S602
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+    bin_name = os.environ.get("BLAT_BIN_PATH", "blat")
+
+    # -out=pslx emits PSL plus per-block aligned bases (tseqs/qseqs). Biopython's
+    # psl parser surfaces those as aln.target.seq / aln.query.seq, which is
+    # what we need to report real per-base mismatches and gaps on the output.
+    # target_args is a space-separated string of BLAT flags (e.g. "-q=prot -t=dnax");
+    # all values originate from internal constants, so shlex.split is safe here.
+    cmd = [bin_name, str(reference_file)]
+    if target_args:
+        cmd.extend(shlex.split(target_args))
+
+    cmd.extend(
+        [f"-minScore={min_score}", f"-out={out_format}", str(query_file), out_file]
     )
+    _logger.debug("Running BLAT command: %s", " ".join(cmd))
+
+    try:
+        result = subprocess.run(
+            cmd,
+            shell=False,  # noqa: S603 -- BLAT args are all internally derived constants, not user input, so shell=False is safe here
+            capture_output=True,
+            timeout=600,
+        )
+    except subprocess.TimeoutExpired as e:
+        msg = f"BLAT timed out after 600 s: {target_args} {query_file} {out_file}"
+        raise AlignmentError(msg) from e
+    except FileNotFoundError as e:
+        raise BlatNotFoundError from e
     _logger.debug("BLAT command finished with result %s", result.returncode)
+
     if result.returncode == 127:
         raise BlatNotFoundError
     if result.returncode != 0:
         msg = f"BLAT process returned error code {result.returncode}: {target_args} {query_file} {out_file}"
         raise AlignmentError(msg)
-    return result
+
+    params = {
+        "aligner": "blat",
+        "aligner_version": _get_blat_version(bin_name),
+        "min_score": min_score,
+        "out_format": out_format,
+        "target_args": target_args,
+    }
+    return result, params
 
 
 def _write_blat_output_tempfile(result: subprocess.CompletedProcess) -> str:
-    """Create temp BLAT output file. Not immediately deleted, but should eventually
-    be cleared by the OS.
+    """Write BLAT pslx stdout to a tempfile and return its path.
+
+    BLAT is invoked with ``-out=pslx``, so the file contains the standard 21
+    PSL columns plus 2 extra columns (tseqs, qseqs) of aligned bases.
+    ``Bio.Align.parse(..., "psl")`` reads this directly.
 
     :param result: BLAT process result object
     :return: path-like string representing file location
@@ -145,6 +226,379 @@ def _write_blat_output_tempfile(result: subprocess.CompletedProcess) -> str:
     return tmp.name
 
 
+def _count_query_insert_blocks(aln: "BioAlign.Alignment") -> int:
+    """Count gap-open events where extra bases appear in the query (query inserts).
+
+    Corresponds to PSL ``qNumInsert`` — the **number of gap-opening blocks**
+    in the query, not the total number of inserted bases (that would be
+    ``qBaseInsert``).  A single contiguous run of inserted query bases counts
+    as one, regardless of how many bases it spans.
+    (PSL field reference: https://genome.ucsc.edu/FAQ/FAQformat.html#format2)
+
+    Used for BLAT-style identity and score, which penalize query inserts but
+    ignore target inserts (introns).
+    """
+    coords = aln.coordinates
+    n = 0
+    for i in range(coords.shape[1] - 1):
+        ts, te = int(coords[0][i]), int(coords[0][i + 1])
+        qs, qe = int(coords[1][i]), int(coords[1][i + 1])
+        if ts == te and qs != qe:
+            n += 1
+
+    return n
+
+
+def _count_target_insert_blocks(aln: "BioAlign.Alignment") -> int:
+    """Count gap-open events where the target advances but the query does not.
+
+    Corresponds to PSL ``tNumInsert`` — the **number of gap-opening blocks**
+    in the target, not the total bases skipped (that would be
+    ``tBaseInsert``).  A single contiguous deleted region in the query counts
+    as one block, regardless of its length.
+    (PSL field reference: https://genome.ucsc.edu/FAQ/FAQformat.html#format2)
+
+    Together with ``qNumInsert`` these make up the insert penalty in the
+    canonical BLAT PSL score formula.
+    """
+    coords = aln.coordinates
+    n = 0
+    for i in range(coords.shape[1] - 1):
+        ts, te = int(coords[0][i]), int(coords[0][i + 1])
+        qs, qe = int(coords[1][i]), int(coords[1][i + 1])
+        if qs == qe and ts != te:
+            n += 1
+
+    return n
+
+
+def _blat_score(aln: "BioAlign.Alignment") -> int:
+    """Compute the BLAT PSL score for a single HSP.
+
+    Equivalent to the ``score`` column that BLAT writes to PSL output::
+
+        score = matches - misMatches - qNumInsert - tNumInsert
+
+    This formula implements the canonical PSL scoring described in the UCSC
+    BLAT documentation (https://genome.ucsc.edu/FAQ/FAQblat.html#blat4) and
+    matches the ``score`` field definition in the PSL format spec
+    (https://genome.ucsc.edu/FAQ/FAQformat.html#format2):
+    ``matches - misMatches - qNumInsert - tNumInsert``.
+
+    **Block count, not base count:** ``qNumInsert`` and ``tNumInsert`` are
+    gap-open event counts (one per contiguous inserted run), not total inserted
+    bases (those are ``qBaseInsert`` / ``tBaseInsert`` in PSL).  The helpers
+    :func:`_count_query_insert_blocks` and :func:`_count_target_insert_blocks`
+    implement this correctly by counting coordinate transitions where one
+    dimension is zero-width — each such transition is exactly one gap-open
+    block in Biopython's alignment representation.
+
+    This was the original basis for best-HSP selection before the BioPython
+    port and correctly penalises both mismatches and gap-open events without
+    double-counting individual gap bases.  Penalising target inserts is safe
+    here because ``_get_best_hsp`` operates on a chromosome-filtered list
+    where target gaps do not represent tolerated intron splices.
+
+    :param aln: Bio.Align.Alignment representing a single BLAT HSP
+    :return: integer BLAT score (can be negative for very noisy alignments)
+    """
+    counts = aln.counts()
+    q_inserts = _count_query_insert_blocks(aln)
+    t_inserts = _count_target_insert_blocks(aln)
+    return int(counts.identities) - int(counts.mismatches) - q_inserts - t_inserts
+
+
+def _blat_style_identity(
+    identities: int, mismatches: int, query_insert_blocks: int
+) -> float | None:
+    """Compute BLAT-style percent identity: ``matches / (matches + mismatches + qNumInsert)``.
+
+    Penalizes real query indels but ignores target gaps (intron-style splits),
+    matching what BLAT itself reports for spliced cDNA-to-genome alignments.
+    """
+    denom = identities + mismatches + query_insert_blocks
+    if denom == 0:
+        _logger.warning(
+            "BLAT identity denominator is zero — alignment has no matches, mismatches, or query inserts."
+        )
+        return None
+
+    return 100.0 * identities / denom
+
+
+def _compact_alignment_string(aln_str: str) -> str:
+    """Collapse consecutive all-gap display blocks in a Biopython alignment string.
+
+    Biopython renders every 60-bp genomic window as a three-line display group
+    (target / match / query).  For cDNA→genome alignments the intronic regions
+    produce long runs of ``?`` characters in the target row and ``-`` in the
+    match/query rows, ballooning the string to tens of thousands of characters.
+
+    This function replaces each run of consecutive all-gap groups (where the
+    target sequence is entirely ``?``) with a single summary line::
+
+        ... [4,560 bp gap: chr14:90400123-90404683] ...
+
+    Groups that contain any real sequence characters are left untouched, so
+    exon-intron boundaries and the flanking partial-gap rows remain visible.
+
+    The function is a no-op on protein or purely genomic alignments that
+    contain no ``?`` characters.
+    """
+    groups = aln_str.split("\n\n")
+    output: list[str] = []
+
+    # State for the currently-accumulating gap run.
+    gap_chrom: str | None = None
+    gap_start: int | None = None
+    gap_end: int | None = None
+
+    def _flush_gap() -> None:
+        nonlocal gap_chrom, gap_start, gap_end
+        if gap_start is not None:
+            span = gap_end - gap_start  # type: ignore[operator]
+            output.append(
+                f"... [{span:,} bp gap: {gap_chrom}:{gap_start}-{gap_end}] ..."
+            )
+        gap_chrom = gap_start = gap_end = None
+
+    for group in groups:
+        lines = group.strip("\n").splitlines()
+        if len(lines) != 3:
+            _flush_gap()
+            if group.strip():
+                output.append(group)
+            continue
+
+        target_parts = lines[0].split()
+        # Expect at least: <chrom> <start> <sequence>
+        if len(target_parts) < 3:
+            _flush_gap()
+            output.append(group)
+            continue
+
+        try:
+            t_start = int(target_parts[1])
+        except ValueError:
+            _flush_gap()
+            output.append(group)
+            continue
+
+        seq = target_parts[2]
+
+        if all(c == "?" for c in seq):
+            # Pure-gap block — accumulate into the current run.
+            t_end = t_start + len(seq)
+            if gap_start is None:
+                gap_chrom = target_parts[0]
+                gap_start = t_start
+                gap_end = t_end
+            else:
+                gap_end = t_end
+        else:
+            _flush_gap()
+            output.append(group)
+
+    _flush_gap()
+    return "\n\n".join(output)
+
+
+def _log_alignment_summary(label: str, result: "AlignmentResult") -> None:
+    """Emit a single human-readable INFO log summarizing a finished alignment.
+
+    ``label`` identifies the alignment context (target gene name, or
+    ``"<gene> (protein)"`` for protein-to-protein BLAT). The pairwise
+    visualization is logged at DEBUG to keep INFO compact.
+    """
+    qc = result.alignment_qc
+    score = result.score
+    next_best = result.next_best_score
+    pct = result.percent_identity
+    chrom = result.chrom or "N/A"
+    span = (
+        f"{result.hit_range.start}-{result.hit_range.end}"
+        if result.hit_range is not None
+        else "N/A"
+    )
+    mismatches = qc.mismatch_count if qc is not None else "N/A"
+    gaps = qc.gap_count if qc is not None else "N/A"
+    aln_len = qc.alignment_length if qc is not None else "N/A"
+    cigar = qc.cigar if qc is not None else None
+
+    _logger.info(
+        "Alignment[%s]: chrom=%s span=%s strand=%s "
+        "score=%s next_best=%s identity=%s length=%s mismatches=%s gaps=%s%s",
+        label,
+        chrom,
+        span,
+        result.strand.value if result.strand is not None else "N/A",
+        f"{score:g}" if score is not None else "N/A",
+        f"{next_best:g}" if next_best is not None else "N/A",
+        f"{pct:.2f}%" if pct is not None else "N/A",
+        aln_len,
+        mismatches,
+        gaps,
+        f" cigar={cigar}" if cigar else "",
+    )
+    if qc is not None and qc.alignment_string:
+        _logger.debug("Alignment[%s] visualization:\n%s", label, qc.alignment_string)
+    else:
+        _logger.debug(
+            "Alignment[%s] visualization unavailable (BLAT output may have lacked aligned sequence content).",
+            label,
+        )
+
+
+def _cigar_from_coords(aln: "BioAlign.Alignment") -> str | None:
+    """Build a CIGAR string directly from alignment coordinates.
+
+    Biopython's SAM formatter raises ``ValueError: Unequal step sizes`` for
+    minus-strand PSL alignments because query coordinates are stored in
+    descending order (step = -1) while target steps are +1.  Building the
+    CIGAR ourselves from absolute step sizes avoids this entirely.
+    """
+    coords = aln.coordinates
+    assert coords.shape[0] == 2, (  # noqa: S101
+        f"_cigar_from_coords expects a pairwise alignment (shape[0]==2), got {coords.shape[0]}"
+    )
+    if coords.shape[1] < 2:
+        return None
+
+    parts: list[str] = []
+
+    # Leading soft clip: query bases before the first aligned block.
+    # min() handles both strand orientations (descending coords for minus-strand).
+    q_aligned_min = int(min(coords[1][0], coords[1][-1]))
+    if q_aligned_min > 0:
+        parts.append(f"{q_aligned_min}S")
+
+    for i in range(coords.shape[1] - 1):
+        t_step = abs(int(coords[0][i + 1]) - int(coords[0][i]))
+        q_step = abs(int(coords[1][i + 1]) - int(coords[1][i]))
+        # query insert: query advances, target does not
+        if t_step == 0 and q_step > 0:
+            parts.append(f"{q_step}I")
+        # deletion: target advances, query does not
+        elif t_step > 0 and q_step == 0:
+            parts.append(f"{t_step}D")
+        # aligned block: both advance (match or mismatch)
+        elif t_step > 0 and q_step > 0:
+            parts.append(f"{t_step}M")
+        # t_step == 0 and q_step == 0: degenerate zero-width column; skip
+
+    # Trailing soft clip.
+    q_aligned_max = int(max(coords[1][0], coords[1][-1]))
+    q_len = len(aln.query.seq)  # works even when .seq is UndefinedSequenceData
+    if q_len > q_aligned_max:
+        parts.append(f"{q_len - q_aligned_max}S")
+
+    return "".join(parts) if parts else None
+
+
+def _build_alignment_qc(aln: "BioAlign.Alignment") -> "AlignmentQc":
+    """Derive the per-(target, alignment_level) QC block from a Bio.Align
+    ``Alignment`` parsed out of BLAT's pslx output.
+
+    Aggregate stats come from ``Alignment.counts()``; CIGAR from
+    ``_cigar_from_coords()``; the pairwise visualization from ``str(aln)``.  A single
+    walk over ``.coordinates`` collects mismatch positions and gap intervals on
+    the target (genome) for downstream per-variant flagging; those lists live
+    in memory only and are not serialized.
+
+    When per-base sequence content is unavailable (e.g. BLAT pslx omitted
+    tseqs/qseqs for cDNA→genome runs), ``mismatch_count`` from
+    ``Alignment.counts()`` remains accurate but ``mismatch_positions`` will be
+    empty and ``mismatch_positions_unavailable`` will be ``True``.  Callers
+    must treat ``at_mismatched_locus`` as ``None`` (not evaluated) in that
+    situation to avoid a silent disagreement where ``mismatch_count > 0`` but
+    no variant is ever flagged.
+    """
+    counts = aln.counts()
+    coords = aln.coordinates
+    mismatch_positions: list[int] = []
+    gap_intervals: list[tuple[int, int]] = []
+    query_insert_blocks = 0
+
+    # Per-base mismatch detection requires concrete sequence content for both
+    # target and query. Biopython's pslx parser sometimes leaves bases outside
+    # aligned blocks (or even within, depending on BLAT output) as undefined
+    # placeholders. The aggregate mismatch *count* still comes from
+    # ``aln.counts()`` below; we treat per-position recording as best-effort
+    # and set mismatch_positions_unavailable=True if the sequence content isn't
+    # materialized so callers know at_mismatched_locus is unreliable.
+    mismatch_positions_unavailable = False
+    sequences_available = True
+    for i in range(coords.shape[1] - 1):
+        ts, te = int(coords[0][i]), int(coords[0][i + 1])
+        qs, qe = int(coords[1][i]), int(coords[1][i + 1])
+
+        # gap in reference (extra bases in query): zero-width interval
+        # at ts so variants exactly at ts are "near" it within any
+        # positive window. This is a query insert in PSL terms.
+        if ts == te and qs != qe:
+            gap_intervals.append((ts, ts))
+            query_insert_blocks += 1
+
+        # gap in query (target advances, query does not): deletion in the
+        # query relative to the reference.  This is a target insert in PSL
+        # terms (tNumInsert).  Record the full genomic span so variants
+        # overlapping or adjacent to the deleted region can be flagged as
+        # near_gap.
+        elif qs == qe and ts != te:
+            gap_intervals.append((min(ts, te), max(ts, te)))
+
+        # Aligned block -- target coords are always ascending (ts < te).
+        # For minus-strand query blocks (qe < qs), Biopython stores the
+        # query SeqRecord in plus-strand orientation; we need the reverse
+        # complement of the [qe:qs] slice to match the target bases.
+        elif ts != te and qs != qe and sequences_available:
+            try:
+                t_bases = str(aln.target.seq[ts:te])
+                if qe < qs:
+                    q_bases = str(aln.query.seq[qe:qs].reverse_complement())
+                else:
+                    q_bases = str(aln.query.seq[qs:qe])
+            except UndefinedSequenceError:
+                _logger.debug(
+                    "Skipping per-base mismatch positions: aligned-block "
+                    "sequence content unavailable in BLAT output. "
+                    "mismatch_count from counts() is still accurate; "
+                    "mismatch_positions_unavailable will be set to True."
+                )
+                mismatch_positions = []
+                mismatch_positions_unavailable = True
+                sequences_available = False
+                continue
+
+            for j, (tb, qb) in enumerate(zip(t_bases, q_bases, strict=True)):
+                if qb.upper() != tb.upper():
+                    mismatch_positions.append(ts + j)
+
+    cigar = _cigar_from_coords(aln)
+    percent_identity = _blat_style_identity(
+        counts.identities, counts.mismatches, query_insert_blocks
+    )
+
+    # PSL target coordinates are always on the + strand (ascending), so the
+    # target span equals the number of reference bases consumed by M+D ops —
+    # which is what alignment_length represents.  Using aln.length directly
+    # raises "Unequal step sizes" for minus-strand alignments because Biopython
+    # derives length from coordinate steps and rejects descending query strides.
+    alignment_length = int(coords[0][-1]) - int(coords[0][0])
+
+    return AlignmentQc(
+        percent_identity=percent_identity,
+        alignment_length=alignment_length,
+        mismatch_count=int(counts.mismatches),
+        gap_count=len(gap_intervals),
+        cigar=cigar,
+        alignment_string=_compact_alignment_string(str(aln)),
+        mismatch_positions_unavailable=mismatch_positions_unavailable,
+        mismatch_positions=mismatch_positions,
+        gap_intervals=gap_intervals,
+    )
+
+
 def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType | str:
     """Get overall target sequence type for a score set's target genes.
     Protein if all target sequences are protein sequences, nucleotide if all target
@@ -157,6 +611,7 @@ def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType
         target_sequence_types.add(
             metadata.target_genes[target_gene].target_sequence_type
         )
+
     if len(target_sequence_types) > 1:
         return "mixed"
     elif len(target_sequence_types) == 1:  # noqa: RET505
@@ -166,38 +621,72 @@ def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType
         raise ValueError(msg)
 
 
+def _group_alignments_by_query(
+    pslx_path: str,
+) -> dict[str, list["BioAlign.Alignment"]]:
+    """Parse a pslx file and group Bio.Align Alignments by query ID.
+
+    :param pslx_path: path to a pslx-format file produced by BLAT
+    :return: mapping of query ID to the list of all alignments for that query
+    :raise ValueError: propagated from Bio.Align.parse on malformed input
+    """
+    grouped: dict[str, list[BioAlign.Alignment]] = {}
+    for aln in BioAlign.parse(pslx_path, "psl"):
+        grouped.setdefault(str(aln.query.id), []).append(aln)
+
+    return grouped
+
+
 def _get_blat_output(
     metadata: ScoresetMetadata, silent: bool
-) -> dict[str, QueryResult]:
-    """Run a BLAT query and returns a path to the output object.
+) -> tuple[dict[str, list["BioAlign.Alignment"]], dict]:
+    """Run a BLAT query and return a per-gene grouping of Bio.Align Alignments.
 
-    If unable to produce a valid query the first time, then try a query using ``dnax``
-    bases.
+    If unable to produce a valid query the first time, retries with
+    ``-q=dnax -t=dnax`` flags.
 
-    :param scoreset_metadata: object containing scoreset attributes
+    :param metadata: object containing scoreset attributes
     :param silent: suppress BLAT command output
-    :return: dict where keys are target gene identifiers and values are BLAT query result objects
-    :raise AlignmentError: if BLAT subprocess returns error code
+    :return: ``(grouped, blat_params)`` — ``grouped`` maps each query gene
+        label to its alignments; ``blat_params`` is the parameters dict
+        returned directly by ``_run_blat`` from the successful invocation.
+    :raise AlignmentError: if BLAT subprocess returns error code or both
+        parse attempts fail.
     """
     with tempfile.NamedTemporaryFile() as tmp_file:
         query_file = _build_query_file(metadata, Path(tmp_file.name))
         target_sequence_type = _get_target_sequence_type(metadata)
+
         if target_sequence_type == TargetSequenceType.PROTEIN:
             target_args = "-q=prot -t=dnax"
         elif target_sequence_type == TargetSequenceType.DNA:
             target_args = ""
         else:
-            # TODO consider implementing support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments.
+            # TODO consider implementing support for mixed types
             msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported."
             raise NotImplementedError(msg)
+
+        _logger.info(
+            "Running BLAT for %s with mode: %s",
+            metadata.urn,
+            target_args if target_args else "default (nucleotide DNA)",
+        )
+
         reference_genome_file = get_ref_genome_file(silent=silent)
-        process_result = _run_blat(
-            target_args, query_file, reference_genome_file, "/dev/stdout", silent
+        process_result, blat_params = _run_blat(
+            target_args,
+            query_file,
+            reference_genome_file,
+            "/dev/stdout",
+            silent,
         )
-        out_file = _write_blat_output_tempfile(process_result)
+        pslx_path = _write_blat_output_tempfile(process_result)
 
         try:
-            output = parse_blat(out_file, "blat-psl")
+            grouped = _group_alignments_by_query(pslx_path)
+            if not grouped:
+                msg = f"BLAT produced no alignments for {metadata.urn} with mode: {target_args if target_args else 'default (nucleotide DNA)'}"
+                raise ValueError(msg)
 
         except ValueError:
             _logger.debug(
@@ -205,129 +694,234 @@ def _get_blat_output(
                 metadata.urn,
                 exc_info=True,
             )
-            target_args = "-q=dnax -t=dnax"
-            process_result = _run_blat(
-                target_args, query_file, reference_genome_file, "/dev/stdout", silent
+
+            process_result, blat_params = _run_blat(
+                "-q=dnax -t=dnax",
+                query_file,
+                reference_genome_file,
+                "/dev/stdout",
+                silent,
             )
-            out_file = _write_blat_output_tempfile(process_result)
+            pslx_path = _write_blat_output_tempfile(process_result)
+
             try:
-                output = parse_blat(out_file, "blat-psl")
+                grouped = _group_alignments_by_query(pslx_path)
+                if not grouped:
+                    msg = f"BLAT produced no alignments for {metadata.urn} after retry with -q=dnax -t=dnax"
+                    raise ValueError(msg)
+
             except ValueError as e:
                 msg = f"Unable to run successful BLAT on {metadata.urn}"
                 _logger.exception(msg=msg, exc_info=e)
                 raise AlignmentError(msg) from e
 
-    return output
+            _logger.info(
+                "BLAT -q=dnax -t=dnax retry succeeded for %s (%d query gene(s)).",
+                metadata.urn,
+                len(grouped),
+            )
+
+    return grouped, blat_params
 
 
-def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit:
-    """Get best hit from BLAT output.
+def _get_best_hit(
+    alignments: "list[BioAlign.Alignment]",
+    chromosome: str | None,
+    scores: dict[int, int] | None = None,
+) -> "list[BioAlign.Alignment]":
+    """Return the subset of alignments on the best-matching chromosome.
 
-    First, try to return hit corresponding to expected chromosome taken from scoreset
-    metadata. If chromosome doesn't match any of the outputs or is unavailable, take
-    the hit with the single highest-scoring HSP.
+    First, try to find alignments on the expected chromosome taken from scoreset
+    metadata.  If no chromosome matches or none is provided, fall back to the
+    chromosome whose best-scoring alignment has the highest BLAT score.
 
-    :param output: BLAT output
-    :param chromosome: refseq chromosome ID, e.g. ``"NC_000001.11"``
-    :return: best Hit
-    :raise AlignmentError: if unable to get hits from output
+    :param alignments: all alignments for one query gene
+    :param chromosome: RefSeq chromosome accession, e.g. ``"NC_000001.11"``
+    :param scores: optional pre-computed ``{id(aln): score}`` cache; when
+        provided avoids redundant ``aln.counts()`` walks inside the fallback.
+    :return: alignments filtered to the selected chromosome
+    :raise AlignmentError: if the alignment list is empty
     """
+    if not alignments:
+        msg = f"Empty alignment list passed to _get_best_hit for chromosome {chromosome or 'N/A'}."
+        raise AlignmentError(msg)
+
     if chromosome:
-        if chromosome.startswith("refseq"):
+        if chromosome.startswith("refseq:"):
             chromosome = chromosome[7:]
 
-        for hit in output:
-            hit_chr = hit.id
+        for aln in alignments:
+            hit_chr = str(aln.target.id)
             if hit_chr.startswith("chr"):
                 hit_chr = hit_chr[3:]
+
             hit_chr_ac = get_chromosome_identifier(hit_chr)
             if hit_chr_ac == chromosome:
-                return hit
-        else:
-            if list(output):
-                hit_chrs = [h.id for h in output]
-                # TODO should this be an error rather than a warning? it seems like a problem if we can't find a hit on the expected chromosome
-                _logger.warning(
-                    "Failed to match hit chromosomes during alignment for target %s. Expected chromosome: %s, hit chromosomes: %s",
-                    output.id,
-                    chromosome,
-                    hit_chrs,
-                )
-
-    best_score = 0
-    best_score_hit = None
-    for hit in output:
-        best_local_score = max(hit, key=lambda i: i.score).score
-        if best_local_score > best_score:
-            best_score = best_local_score
-            best_score_hit = hit
+                matched_chrom = str(aln.target.id)
+                return [a for a in alignments if str(a.target.id) == matched_chrom]
+
+        # No hit matched the expected chromosome; log and fall through.
+        hit_chrs = list({str(a.target.id) for a in alignments})
+        query_id = str(alignments[0].query.id)
+
+        _logger.warning(
+            "Failed to match hit chromosomes during alignment for target %s. "
+            "Expected chromosome: %s, hit chromosomes: %s",
+            query_id,
+            chromosome,
+            hit_chrs,
+        )
 
-    if best_score_hit is None:
-        msg = f"Couldn't get BLAT hits for target {output.id}."
+    # Fallback: find the chromosome with the highest-scoring alignment.
+    chrom_best: dict[str, float] = {}
+    for aln in alignments:
+        chrom = str(aln.target.id)
+        s = float(scores[id(aln)] if scores is not None else _blat_score(aln))
+        if chrom not in chrom_best or s > chrom_best[chrom]:
+            chrom_best[chrom] = s
+
+    if not chrom_best:
+        query_id = str(alignments[0].query.id) if alignments else "unknown"
+        msg = f"Couldn't determine best chromosome for target {query_id} — no valid alignments found."
         raise AlignmentError(msg)
 
-    return best_score_hit
+    best_chrom = max(chrom_best, key=chrom_best.__getitem__)
+    query_id = str(alignments[0].query.id)
 
+    _logger.info(
+        "Chromosome fallback selected %s for target %s (best identity score: %.1f, %d candidates).",
+        best_chrom,
+        query_id,
+        chrom_best[best_chrom],
+        len(chrom_best),
+    )
+    return [a for a in alignments if str(a.target.id) == best_chrom]
+
+
+def _get_best_hsp(
+    alignments: "list[BioAlign.Alignment]",
+    gene_location: GeneLocation | None = None,
+    scores: dict[int, int] | None = None,
+) -> "BioAlign.Alignment":
+    """Retrieve the preferred alignment from a list filtered to one chromosome.
+
+    If gene location data is available, prefer the alignment with the least
+    distance between its target start and the known gene start coordinate.
+    Otherwise, take the alignment with the highest BLAT score.
+
+    :param alignments: alignments on a single chromosome for one query gene
+    :param gene_location: location data acquired by normalising scoreset metadata
+    :param scores: optional pre-computed ``{id(aln): score}`` cache; when
+        provided avoids redundant ``aln.counts()`` walks.
+    :return: preferred alignment
+    :raise AlignmentError: if the list is empty
+    """
+    if not alignments:
+        msg = f"Empty alignment list passed to _get_best_hsp for gene location {gene_location or 'N/A'}."
+        raise AlignmentError(msg)
 
-def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None = None) -> HSP:
-    """Retrieve preferred HSP from BLAT Hit object.
+    if gene_location and gene_location.start is not None:
+        best = min(
+            alignments,
+            key=lambda a: abs(int(a.coordinates[0][0]) - gene_location.start),
+        )
 
-    If gene location data is available, prefer the HSP with the least distance
-    between the start of the hit and the start coordinate of the gene. Otherwise,
-    take the HSP with the highest score value.
+        _logger.info(
+            "Selected HSP by gene-location proximity (gene start: %d, hit start: %d, %d candidate(s)).",
+            gene_location.start,
+            int(best.coordinates[0][0]),
+            len(alignments),
+        )
+        return best
 
-    :param hit: hit object from BLAT result
-    :param gene_location: location data acquired by normalizing scoreset metadata
-    :return: Preferred HSP object
-    :raise AlignmentError: if hit object appears to be empty (should be impossible)
-    """
-    best_hsp = None
-    if gene_location and gene_location.start is not None:
-        best_hsp = min(hit, key=lambda hsp: abs(hsp.hit_start - gene_location.start))
-    else:
-        best_hsp = max(hit, key=lambda hsp: hsp.score)
-    if best_hsp is None:
-        msg = f"Unable to get best HSP from BLAT hit: {hit}"
-        raise AlignmentError(msg)
-    return best_hsp
+    best = max(
+        alignments,
+        key=lambda a: scores[id(a)] if scores is not None else _blat_score(a),
+    )
 
+    _logger.info(
+        "Selected HSP by best BLAT score (%d, %d candidate(s)).",
+        scores[id(best)] if scores is not None else _blat_score(best),
+        len(alignments),
+    )
+    return best
 
-def _get_best_match(output: QueryResult, target_gene: TargetGene) -> AlignmentResult:
-    """Obtain best high-scoring pairs (HSP) object for query sequence.
 
-    :param metadata: scoreset metadata
-    :param output: BLAT result object
-    :return: alignment result ??
+def _get_best_match(
+    alignments: "list[BioAlign.Alignment]",
+    target_gene: TargetGene,
+    seq_len: int,
+    blat_params: dict,
+) -> AlignmentResult:
+    """Obtain the best-matching alignment for one query gene.
+
+    :param alignments: all alignments for this gene across all chromosomes
+    :param target_gene: target gene metadata (used for gene-location lookup)
+    :param seq_len: query sequence length (for coverage calculation)
+    :param blat_params: parameters dict returned by ``_run_blat``
+    :return: AlignmentResult
     """
     location = get_gene_location(target_gene)
     chromosome = location.chromosome if location else None
-    best_hit = _get_best_hit(output, chromosome)
-    best_hsp = _get_best_hsp(best_hit, location)
 
-    strand = Strand.POSITIVE if best_hsp[0].query_strand == 1 else Strand.NEGATIVE
-    coverage = 100 * (best_hsp.query_end - best_hsp.query_start) / output.seq_len
-    identity = best_hsp.ident_pct
-    chrom = best_hsp.hit_id
+    # Pre-compute BLAT scores once per alignment — each call to _blat_score walks
+    # the alignment coordinates, so caching avoids O(n * alignment_length) work
+    # when the same alignments are ranked by multiple selection helpers.
+    _scores: dict[int, int] = {id(a): _blat_score(a) for a in alignments}
+    chrom_alns = _get_best_hit(alignments, chromosome, _scores)
+    best_aln = _get_best_hsp(chrom_alns, location, _scores)
+
+    best_counts = best_aln.counts()
+    others = [_scores[id(a)] for a in alignments if a is not best_aln]
+    next_best = float(max(others)) if others else None
+
+    coords = best_aln.coordinates
+    tcoords = coords[0]
+    qcoords = coords[1]
+
+    strand = Strand.POSITIVE if int(qcoords[0]) <= int(qcoords[-1]) else Strand.NEGATIVE
+    q_start = int(qcoords.min())
+    q_end = int(qcoords.max())
+
+    coverage = 100.0 * (q_end - q_start) / seq_len if seq_len else 0.0
+    identity = _blat_style_identity(
+        best_counts.identities,
+        best_counts.mismatches,
+        _count_query_insert_blocks(best_aln),
+    )
+    chrom = str(best_aln.target.id)
 
-    query_subranges = []
-    hit_subranges = []
-    for fragment in best_hsp:
-        query_subranges.append(
-            SequenceRange(start=fragment.query_start, end=fragment.query_end)
-        )
-        hit_subranges.append(
-            SequenceRange(start=fragment.hit_start, end=fragment.hit_end)
-        )
+    query_subranges: list[SequenceRange] = []
+    hit_subranges: list[SequenceRange] = []
+    for i in range(coords.shape[1] - 1):
+        ts, te = int(tcoords[i]), int(tcoords[i + 1])
+        qs, qe = int(qcoords[i]), int(qcoords[i + 1])
+
+        # skip gap-only segments (BLAT-style identity ignores target gaps
+        # and penalizes query gaps, so these don't represent real aligned
+        # sequence)
+        if ts == te or qs == qe:
+            continue
+
+        hit_subranges.append(SequenceRange(start=ts, end=te))
+        query_subranges.append(SequenceRange(start=min(qs, qe), end=max(qs, qe)))
+
+    alignment_qc = _build_alignment_qc(best_aln)
 
     return AlignmentResult(
+        aligner_parameters=blat_params,
+        reference_assembly=REFERENCE_GENOME_ASSEMBLY,
         chrom=chrom,
         strand=strand,
-        ident_pct=identity,
+        percent_identity=identity,
         coverage=coverage,
-        query_range=SequenceRange(start=best_hsp.query_start, end=best_hsp.query_end),
+        query_range=SequenceRange(start=q_start, end=q_end),
         query_subranges=query_subranges,
-        hit_range=SequenceRange(start=best_hsp.hit_start, end=best_hsp.hit_end),
+        hit_range=SequenceRange(start=int(tcoords[0]), end=int(tcoords[-1])),
         hit_subranges=hit_subranges,
+        score=float(_scores[id(best_aln)]),
+        next_best_score=next_best,
+        alignment_qc=alignment_qc,
     )
 
 
@@ -340,20 +934,21 @@ def align(
     :param silent: suppress BLAT process output if true
     :return: dictionary where keys are target gene identifiers and values are alignment result objects
     """
-    blat_output = _get_blat_output(scoreset_metadata, silent)
+    grouped, blat_params = _get_blat_output(scoreset_metadata, silent)
     alignment_results = {}
-    for blat_result in blat_output:
-        target_label = blat_result.id
-        # blat names the result id "query" if there is only one query; replace "query" with the target gene name for single-target score sets
+    for blat_id, aln_list in grouped.items():
+        target_label = blat_id
+
+        # BLAT names the result "query" when there is only one query sequence;
+        # replace with the actual target gene name for single-target score sets.
         if target_label == "query" and len(scoreset_metadata.target_genes) == 1:
             target_label = list(scoreset_metadata.target_genes.keys())[0]  # noqa: RUF015
-        # blat automatically reformats query names, so sometimes they don't match our metadata
+
+        # BLAT automatically reformats query names; resolve back to metadata key.
         if target_label not in scoreset_metadata.target_genes:
-            # if single-target score set, don't need to match by name
             if len(scoreset_metadata.target_genes) == 1:
                 target_label = list(scoreset_metadata.target_genes.keys())[0]  # noqa: RUF015
             else:
-                # try to match query name to a target gene in the metadata
                 matches = 0
                 for target_gene_name in scoreset_metadata.target_genes:
                     blat_target_gene_name = (
@@ -362,35 +957,53 @@ def align(
                         .replace(")", "")
                         .replace(",", "")
                     )
-                    if blat_target_gene_name == target_label:
+
+                    if blat_target_gene_name == blat_id:
                         target_label = target_gene_name
                         matches += 1
-                # we may be missing some blat reformatting rules here - if so, this error will be thrown
+
                 if matches == 0:
-                    msg = f"BLAT result {target_label} does not match any target gene names in scoreset {scoreset_metadata.urn}."
+                    msg = f"BLAT result {blat_id} does not match any target gene names in scoreset {scoreset_metadata.urn}."
                     raise AlignmentError(msg)
                 if matches > 1:
-                    # could happen if multiple target genes have the same first word in their label (unlikely)
-                    msg = f"BLAT result {target_label} matches multiple target gene names in scoreset {scoreset_metadata.urn}"
+                    msg = f"BLAT result {blat_id} matches multiple target gene names in scoreset {scoreset_metadata.urn}"
+                    raise AlignmentError(msg)
+
         target_gene = scoreset_metadata.target_genes[target_label]
-        alignment_results[target_label] = _get_best_match(blat_result, target_gene)
+        seq_len = len(target_gene.target_sequence or "")
+        alignment_results[target_label] = _get_best_match(
+            aln_list, target_gene, seq_len, blat_params
+        )
+        _log_alignment_summary(target_label, alignment_results[target_label])
+
     # confirm that there is an alignment result for each target gene
-    for target_gene in scoreset_metadata.target_genes:
-        if target_gene not in alignment_results:
-            msg = f"No BLAT result found for target gene {target_gene} in scoreset {scoreset_metadata.urn}"
+    for tgt_gene in scoreset_metadata.target_genes:
+        if tgt_gene not in alignment_results:
+            msg = f"No BLAT result found for target gene {tgt_gene} in scoreset {scoreset_metadata.urn}"
             raise AlignmentError(msg)
+
     return alignment_results
 
 
 def fetch_alignment(
     metadata: ScoresetMetadata, silent: bool
 ) -> dict[str, AlignmentResult | None]:
-    alignment_results = {}
+    alignment_results: dict[str, AlignmentResult | None] = {}
     for target_gene in metadata.target_genes:
         accession_id = metadata.target_genes[target_gene].target_accession_id
+        if not accession_id:
+            msg = f"Target gene {target_gene} in scoreset {metadata.urn} is missing an accession ID, which is required for fetching CDOT alignments."
+            raise AlignmentError(msg)
+
         # protein and contig/chromosome accession ids do not need to be aligned to the genome
         if accession_id.startswith(("NP", "ENSP", "NC_")):
+            _logger.info(
+                "Skipping BLAT for %s (%s): direct protein/contig accession — alignment not required.",
+                target_gene,
+                accession_id,
+            )
             alignment_results[accession_id] = None
+
         else:
             url = f"{CDOT_URL}/transcript/{accession_id}"
             r = httpx.get(url, timeout=30)
@@ -403,7 +1016,18 @@ def fetch_alignment(
                 raise ResourceAcquisitionError(msg) from e
 
             cdot_mapping = r.json()
-            alignment_results[accession_id] = parse_cdot_mapping(cdot_mapping, silent)
+            cdot_alignment = parse_cdot_mapping(cdot_mapping, silent)
+            alignment_results[accession_id] = cdot_alignment
+
+            _logger.info(
+                "CDOT alignment parsed for %s (%s): %s, strand=%s, %d exon(s)",
+                target_gene,
+                accession_id,
+                cdot_alignment.chrom,
+                cdot_alignment.strand,
+                len(cdot_alignment.hit_subranges),
+            )
+
     return alignment_results
 
 
@@ -445,6 +1069,12 @@ def parse_cdot_mapping(cdot_mapping: dict, silent: bool) -> AlignmentResult:
             start=hit_subranges[-1].start, end=hit_subranges[0].end
         )
 
+    reference_assembly = "GRCh38" if grch38 else "GRCh37"
+    cdot_data_version = cdot_mapping.get("cdot_data_version")
+    # Always emit the dict so consumers can distinguish
+    # "cdot run, version unknown" (cdot_data_version=None) from "not a cdot run" (None).
+    aligner_parameters: dict = {"cdot_data_version": cdot_data_version}
+
     return AlignmentResult(
         chrom=chrom,
         strand=strand,
@@ -452,6 +1082,8 @@ def parse_cdot_mapping(cdot_mapping: dict, silent: bool) -> AlignmentResult:
         query_subranges=query_subranges,
         hit_range=hit_range,
         hit_subranges=hit_subranges,
+        aligner_parameters=aligner_parameters,
+        reference_assembly=reference_assembly,
     )
 
 
@@ -477,6 +1109,13 @@ def build_alignment_result(
                 raise ScoresetNotSupportedError(msg)
             score_set_type = "sequence"
 
+    _logger.info(
+        "Building alignment result for %s (path: %s, targets: %s)",
+        metadata.urn,
+        score_set_type,
+        list(metadata.target_genes.keys()),
+    )
+
     if score_set_type == "sequence":
         try:
             alignment_result = align(metadata, silent)
@@ -497,6 +1136,12 @@ def build_alignment_result(
                 alignment_result = align(
                     patch_target_sequence_type(metadata, {}, force=True), silent
                 )
+                _logger.info(
+                    "Protein-level BLAT retry succeeded for %s. "
+                    "The nucleotide-level alignment result has been discarded and replaced with the protein-level result.",
+                    metadata.urn,
+                )
+
             except AlignmentError as e2:
                 msg = f"BLAT alignment failed for {metadata.urn} at the protein level after failing at the nucleotide level."
                 _logger.error(msg)
@@ -511,45 +1156,83 @@ def build_alignment_result(
 def align_target_to_protein(
     target_sequence: str, reference_sequence: str, silent: bool
 ) -> AlignmentResult:
+    """Align a protein target sequence against a reference protein using BLAT.
+
+    :param target_sequence: query protein sequence
+    :param reference_sequence: reference protein sequence
+    :param silent: suppress BLAT process output if true
+    :return: AlignmentResult for the best alignment
+    :raise AlignmentError: if BLAT produces no usable alignment
+    """
+    target_args = "-q=prot -t=prot"
     with tempfile.NamedTemporaryFile() as query_tmp_file, tempfile.NamedTemporaryFile() as reference_tmp_file:
         _write_query_file(Path(query_tmp_file.name), [">query", target_sequence])
         _write_query_file(
             Path(reference_tmp_file.name), [">reference", reference_sequence]
         )
-        target_args = "-q=prot -t=prot"
 
-        process_result = _run_blat(
+        process_result, blat_params = _run_blat(
             target_args,
             query_tmp_file.name,
             reference_tmp_file.name,
             "/dev/stdout",
             silent,
         )
-        out_file = _write_blat_output_tempfile(process_result)
+        pslx_path = _write_blat_output_tempfile(process_result)
 
         try:
-            blat_output = read_blat(out_file, "blat-psl")
+            alignments = list(BioAlign.parse(pslx_path, "psl"))
+            if not alignments:
+                raise ValueError
 
         except ValueError as e:
             msg = "Unable to run successful BLAT on target sequence against selected reference protein sequence."
+            _logger.error(msg)
             raise AlignmentError(msg) from e
 
-    best_hit = _get_best_hit(blat_output, None)
-    best_hsp = _get_best_hsp(best_hit)
-
-    query_subranges = []
-    hit_subranges = []
-    for fragment in best_hsp:
-        query_subranges.append(
-            SequenceRange(start=fragment.query_start, end=fragment.query_end)
-        )
-        hit_subranges.append(
-            SequenceRange(start=fragment.hit_start, end=fragment.hit_end)
-        )
-
-    return AlignmentResult(
-        query_range=SequenceRange(start=best_hsp.query_start, end=best_hsp.query_end),
+    _scores: dict[int, int] = {id(a): _blat_score(a) for a in alignments}
+    best_aln = max(alignments, key=lambda a: _scores[id(a)])
+    best_counts = best_aln.counts()
+    others = [_scores[id(a)] for a in alignments if a is not best_aln]
+    next_best = float(max(others)) if others else None
+
+    coords = best_aln.coordinates
+    tcoords = coords[0]
+    qcoords = coords[1]
+
+    query_subranges: list[SequenceRange] = []
+    hit_subranges: list[SequenceRange] = []
+    for i in range(coords.shape[1] - 1):
+        ts, te = int(tcoords[i]), int(tcoords[i + 1])
+        qs, qe = int(qcoords[i]), int(qcoords[i + 1])
+        if ts == te or qs == qe:
+            continue
+        hit_subranges.append(SequenceRange(start=ts, end=te))
+        query_subranges.append(SequenceRange(start=min(qs, qe), end=max(qs, qe)))
+
+    # Attach full sequences so _build_alignment_qc can do per-base mismatch
+    # detection. BLAT protein pslx does not populate tseqs/qseqs columns, so
+    # the parser leaves SeqRecords with undefined content; we supply the same
+    # strings we passed to BLAT, which share coordinate space with the PSL output.
+    best_aln.target.seq = Seq(reference_sequence)
+    best_aln.query.seq = Seq(target_sequence)
+    alignment_qc = _build_alignment_qc(best_aln)
+
+    result = AlignmentResult(
+        query_range=SequenceRange(start=int(qcoords.min()), end=int(qcoords.max())),
         query_subranges=query_subranges,
-        hit_range=SequenceRange(start=best_hsp.hit_start, end=best_hsp.hit_end),
+        hit_range=SequenceRange(start=int(tcoords[0]), end=int(tcoords[-1])),
         hit_subranges=hit_subranges,
+        percent_identity=_blat_style_identity(
+            best_counts.identities,
+            best_counts.mismatches,
+            _count_query_insert_blocks(best_aln),
+        ),
+        score=float(_scores[id(best_aln)]),
+        next_best_score=next_best,
+        alignment_qc=alignment_qc,
+        aligner_parameters=blat_params,
     )
+
+    _log_alignment_summary("protein->protein", result)
+    return result
diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py
index 941f45f..2f6c291 100644
--- a/src/dcd_mapping/annotate.py
+++ b/src/dcd_mapping/annotate.py
@@ -1,10 +1,13 @@
 """Annotate MaveDB score set metadata with mapped scores."""
 
+import bisect
 import datetime
 import json
 import logging
 from pathlib import Path
+from typing import Any
 
+import cdot
 import hgvs.edit
 import hgvs.location
 import hgvs.parser
@@ -34,7 +37,7 @@
     get_ucsc_chromosome_name,
     get_vrs_id_from_identifier,
 )
-from dcd_mapping.resource_utils import LOCAL_STORE_PATH
+from dcd_mapping.resource_utils import CDOT_URL, LOCAL_STORE_PATH
 from dcd_mapping.schemas import (
     AlignmentResult,
     ComputedReferenceSequence,
@@ -42,20 +45,175 @@
     MappedReferenceSequence,
     MappedScore,
     ScoreAnnotation,
-    ScoreAnnotationWithLayer,
     ScoresetMapping,
     ScoresetMetadata,
     TargetAnnotation,
     TargetGene,
+    TargetMapping,
     TargetSequenceType,
     TargetType,
     TxSelectResult,
     VrsVersion,
 )
 from dcd_mapping.transcripts import TxSelectError
+from dcd_mapping.version import dcd_mapping_version
 
 _logger = logging.getLogger(__name__)
 
+# How close (in reference bases) a variant must be to an alignment gap before
+# we flag it as ``near_gap``. This is somewhat arbitrary, but seems like a reasonable window to
+# capture potential edge effects without being so large that it flags a large
+# fraction of variants as near-gap. We might consider re-evaluating this window with more
+# rigor in the future, or making it configurable to support situations where callers
+# want to be more or less conservative in their definition of "near".
+NEAR_GAP_WINDOW = 5
+
+
+def _allele_ref_positions(post_mapped: object) -> list[tuple[int, int]]:
+    """Return the reference-frame half-open intervals covered by an Allele or
+    Haplotype-of-Alleles.
+
+    Returns an empty list for:
+    - Unrecognised shapes (e.g. pre-VRS-2.0 representations) — caller leaves
+      locus flags as None.
+    - ReferenceLengthExpression alleles (reference-identical variants) — their
+      span covers the entire reference allele unchanged, so per-variant locus
+      flags carry no meaningful audit value (and could span entire chromosomes).
+    """
+    members = (
+        post_mapped.members if isinstance(post_mapped, Haplotype) else [post_mapped]
+    )
+    intervals: list[tuple[int, int]] = []
+    for member in members:
+        if (
+            not isinstance(member, Allele)
+            or member.location is None
+            or isinstance(member.state, ReferenceLengthExpression)
+        ):
+            continue
+
+        try:
+            start = int(member.location.start)
+            end = int(member.location.end)
+        except (TypeError, AttributeError, ValueError):
+            continue
+
+        # SequenceLocation is half-open [start, end). Ensure we don't produce
+        # a zero-length interval when start == end.
+        intervals.append((start, max(start + 1, end)))
+
+    return intervals
+
+
+def _stamp_alignment_locus_flags(
+    annotations: list[ScoreAnnotation],
+    align_result: AlignmentResult | None,
+    target_layer: AnnotationLayer,
+    near_gap_window: int = NEAR_GAP_WINDOW,
+) -> None:
+    """Stamp ``at_mismatched_locus`` and ``near_gap`` on each annotation whose
+    ``annotation_layer`` matches ``target_layer``, using positions from the
+    given alignment's QC block.
+
+    The caller is responsible for pairing each layer with its own alignment:
+    GENOMIC variants get checked against the BLAT genomic alignment; PROTEIN
+    variants get checked against the target-protein-to-reference-protein
+    alignment. CDNA flags are not currently evaluated -- we'd need either a
+    dedicated cDNA alignment or to translate genomic mismatch/gap positions
+    through exon structure -- cdna annotations leave the flags as None to
+    signal "not evaluated".
+
+    When ``AlignmentQc.mismatch_positions_unavailable`` is ``True`` and
+    ``mismatch_count > 0``, the per-base positions are absent but the aggregate
+    count is non-zero.  In that case ``at_mismatched_locus`` is left as ``None``
+    (not evaluated) for all variants to prevent a silent disagreement between
+    ``mismatch_count`` and the per-variant flag.  ``near_gap`` is still evaluated
+    from gap intervals, which are always coordinate-derived.
+    """
+    if align_result is None or align_result.alignment_qc is None:
+        return
+
+    qc = align_result.alignment_qc
+    mismatches = sorted(qc.mismatch_positions)
+    gap_intervals = sorted(qc.gap_intervals)
+
+    # When sequence content was unavailable during QC computation, mismatch_count
+    # may be non-zero but mismatch_positions is empty. Marking at_mismatched_locus
+    # as False would silently disagree with mismatch_count; leave it as None instead.
+    positions_reliable = not (
+        qc.mismatch_positions_unavailable and qc.mismatch_count > 0
+    )
+
+    layer_anns = [
+        ann
+        for ann in annotations
+        if ann.alignment_level == target_layer and ann.post_mapped is not None
+    ]
+    if not layer_anns:
+        return
+
+    # Explicitly record that this target/layer had no mismatches or gaps. Use
+    # mismatch_count (not the positions list) as the source of truth: when
+    # mismatch_positions_unavailable is True the list is empty even though
+    # mismatches exist, so testing `not mismatches` would be a faulty premise.
+    # mismatch_count comes from aln.counts() which is purely coordinate-based
+    # and is always accurate regardless of sequence availability.
+    if qc.mismatch_count == 0 and not gap_intervals:
+        for ann in layer_anns:
+            ann.at_mismatched_locus = False
+            ann.near_gap = False
+        return
+
+    # Pre-compute sorted lists for O(log G) gap overlap queries.
+    gap_starts = [gs for gs, _ge in gap_intervals]
+    gap_ends = [ge for _gs, ge in gap_intervals]
+
+    for ann in layer_anns:
+        intervals = _allele_ref_positions(ann.post_mapped)
+        if not intervals:
+            # Empty means either RLE allele or unrecognised shape; leave flags None.
+            continue
+
+        at_mismatch = False
+        near_g = False
+        for int_start, int_end in intervals:
+            # -- Mismatch check: any mismatch position in [int_start, int_end)?  O(log M)
+            if mismatches and not at_mismatch:
+                idx = bisect.bisect_left(mismatches, int_start)
+                if idx < len(mismatches) and mismatches[idx] < int_end:
+                    at_mismatch = True
+
+            # -- Gap proximity check: any gap overlaps [int_start-W, int_end+W)?  O(log G)
+            # A gap (gs, ge) overlaps window [L, R) iff gs < R AND ge > L.
+            # With sorted (non-overlapping) gap intervals:
+            #   Case 1: gap start in [L, R) — definitively overlaps.
+            #   Case 2: gap start < L — overlaps only if its end > L.
+            if not near_g:
+                win_l = int_start - near_gap_window
+                win_r = int_end + near_gap_window
+                first_at_or_after_l = bisect.bisect_left(gap_starts, win_l)
+                first_at_or_after_r = bisect.bisect_left(gap_starts, win_r)
+                # At least one gap starts in [win_l, win_r) OR the last gap starting before win_l extends into [win_l, win_r).
+                if first_at_or_after_l < first_at_or_after_r or (
+                    first_at_or_after_l > 0
+                    and gap_ends[first_at_or_after_l - 1] > win_l
+                ):
+                    near_g = True
+
+            if at_mismatch and near_g:
+                break
+
+        # Stamp flags.
+        # Note: when positions_reliable is False (mismatch_positions_unavailable
+        # AND count>0), at_mismatch will always be False here because mismatches
+        # is an empty list — we cannot trust it. Leaving at_mismatched_locus as
+        # None (not evaluated) rather than stamping a misleading False. The
+        # near_gap branch is unaffected: gap_intervals are coordinate-derived and
+        # always reliable, so near_g is evaluated regardless.
+        if positions_reliable:
+            ann.at_mismatched_locus = at_mismatch
+        ann.near_gap = near_g
+
 
 async def compute_target_gene_info(
     target_key: str,
@@ -353,7 +511,7 @@ def _iter_genomic_spans_from_mapped_scores(
     """
     spans: list[tuple[str, int, int]] = []
     for ms in mapped_scores:
-        if ms.annotation_layer != AnnotationLayer.GENOMIC or ms.post_mapped is None:
+        if ms.alignment_level != AnnotationLayer.GENOMIC or ms.post_mapped is None:
             continue
         if isinstance(ms.post_mapped, Allele):
             loc = ms.post_mapped.location
@@ -559,7 +717,7 @@ def _get_hgvs_string(allele: Allele, accession: str) -> tuple[str, Syntax]:
     if alt == "":
         edit = "del"
 
-    if edit != "dup" or edit != "del" or edit != "=":
+    if edit not in ("dup", "del", "="):
         edit = (
             hgvs.edit.AARefAlt(ref=ref, alt=alt)
             if syntax == Syntax.HGVS_P
@@ -589,7 +747,7 @@ def _annotate_allele_mapping(
     metadata: TargetGene,
     urn: str,
     vrs_version: VrsVersion = VrsVersion.V_2,
-) -> ScoreAnnotationWithLayer:
+) -> ScoreAnnotation:
     """Perform annotations and, if necessary, create VRS 1.3 equivalents for allele mappings."""
     pre_mapped: Allele = mapped_score.pre_mapped
     post_mapped: Allele = mapped_score.post_mapped
@@ -607,7 +765,7 @@ def _annotate_allele_mapping(
 
     if post_mapped:
         # Determine reference sequence
-        if mapped_score.annotation_layer == AnnotationLayer.GENOMIC:
+        if mapped_score.alignment_level == AnnotationLayer.GENOMIC:
             sequence_id = f"ga4gh:{mapped_score.post_mapped.location.sequenceReference.refgetAccession}"
             accession = get_chromosome_identifier_from_vrs_id(sequence_id)
             if accession is None:
@@ -641,14 +799,14 @@ def _annotate_allele_mapping(
         pre_mapped = _allele_to_vod(pre_mapped)
         post_mapped = _allele_to_vod(post_mapped) if post_mapped else None
 
-    return ScoreAnnotationWithLayer(
+    return ScoreAnnotation(
         pre_mapped=pre_mapped,
         post_mapped=post_mapped,
         vrs_version=vrs_version,
         mavedb_id=mapped_score.accession_id,
         score=float(mapped_score.score) if mapped_score.score is not None else None,
-        annotation_layer=mapped_score.annotation_layer,
         error_message=mapped_score.error_message,
+        alignment_level=mapped_score.alignment_level,
     )
 
 
@@ -658,7 +816,7 @@ def _annotate_haplotype_mapping(
     metadata: TargetGene,
     urn: str,
     vrs_version: VrsVersion = VrsVersion.V_2,
-) -> ScoreAnnotationWithLayer:
+) -> ScoreAnnotation:
     """Perform annotations and, if necessary, create VRS 1.3 equivalents for haplotype mappings."""
     pre_mapped: Haplotype = mapped_score.pre_mapped  # type: ignore
     post_mapped: Haplotype = mapped_score.post_mapped  # type: ignore
@@ -674,7 +832,7 @@ def _annotate_haplotype_mapping(
 
     if post_mapped:
         # Determine reference sequence
-        if mapped_score.annotation_layer == AnnotationLayer.GENOMIC:
+        if mapped_score.alignment_level == AnnotationLayer.GENOMIC:
             sequence_id = f"ga4gh:{post_mapped.members[0].location.sequenceReference.refgetAccession}"
             accession = get_chromosome_identifier_from_vrs_id(sequence_id)
             if accession is None:
@@ -712,14 +870,14 @@ def _annotate_haplotype_mapping(
         pre_mapped = _haplotype_to_haplotype_1_3(pre_mapped)
         post_mapped = _haplotype_to_haplotype_1_3(post_mapped) if post_mapped else None
 
-    return ScoreAnnotationWithLayer(
+    return ScoreAnnotation(
         pre_mapped=pre_mapped,
         post_mapped=post_mapped,
         vrs_version=vrs_version,
         mavedb_id=mapped_score.accession_id,
         score=float(mapped_score.score) if mapped_score.score is not None else None,
-        annotation_layer=mapped_score.annotation_layer,
         error_message=mapped_score.error_message,
+        alignment_level=mapped_score.alignment_level,
     )
 
 
@@ -729,7 +887,7 @@ def annotate(
     metadata: TargetGene,
     urn: str,
     vrs_version: VrsVersion = VrsVersion.V_2,
-) -> list[ScoreAnnotationWithLayer]:
+) -> list[ScoreAnnotation]:
     """Given a list of mappings, add additional contextual data:
 
     1. ``vrs_ref_allele_seq``: The sequence between the start and end positions
@@ -751,13 +909,14 @@ def annotate(
     for mapped_score in mapped_scores:
         if mapped_score.pre_mapped is None:
             score_annotations.append(
-                ScoreAnnotationWithLayer(
+                ScoreAnnotation(
                     mavedb_id=mapped_score.accession_id,
                     score=float(mapped_score.score)
                     if mapped_score.score is not None
                     else None,
                     vrs_version=vrs_version,
                     error_message=mapped_score.error_message,
+                    alignment_level=mapped_score.alignment_level,
                 )
             )
         elif isinstance(mapped_score.pre_mapped, Haplotype) and (
@@ -780,7 +939,7 @@ def annotate(
             )
         else:
             score_annotations.append(
-                ScoreAnnotationWithLayer(
+                ScoreAnnotation(
                     pre_mapped=mapped_score.pre_mapped,
                     post_mapped=mapped_score.post_mapped,
                     vrs_version=vrs_version,
@@ -789,6 +948,7 @@ def annotate(
                     if mapped_score.score is not None
                     else None,
                     error_message=f"Multiple issues with annotation: Inconsistent variant structure (Allele and Haplotype mix).{' ' + mapped_score.error_message if mapped_score.error_message else ''}",
+                    alignment_level=mapped_score.alignment_level,
                 )
             )
 
@@ -891,16 +1051,44 @@ def _get_mapped_reference_sequence(
     )
 
 
-def _set_scoreset_layer(
-    urn: str, mappings: list[ScoreAnnotationWithLayer]
+def _align_result_for_target(
+    target_gene: str,
+    metadata: ScoresetMetadata,
+    align_results: dict[str, "AlignmentResult"],
+) -> "AlignmentResult | None":
+    """Return the AlignmentResult for a target, handling both sequence-based and
+    accession-based targets.
+
+    Sequence-based targets key ``align_results`` by target gene name.
+    Accession-based targets key it by ``target_accession_id`` (set by
+    ``parse_cdot_mapping`` in ``align.py``). This helper does the fallback so
+    callers don't have to repeat the lookup logic.
+    """
+    result = align_results.get(target_gene)
+    if result is None:
+        accession_id = metadata.target_genes[target_gene].target_accession_id
+        if accession_id:
+            result = align_results.get(accession_id)
+
+    return result
+
+
+def _pick_preferred_genomic_or_protein_layer(
+    mappings: list[ScoreAnnotation],
 ) -> AnnotationLayer:
-    """Many individual score results provide both genomic and protein variant
-    expressions. If genomic expressions are available, that's what we'd like to use.
-    This function tells us how to filter the final annotation objects.
+    """Return GENOMIC if any annotation has that layer, else PROTEIN.
+
+    Precondition: ``mappings`` must contain at least one annotation with
+    ``alignment_level`` in {GENOMIC, PROTEIN}. CDNA-only score sets are not
+    currently supported (``vrs_map`` does not emit CDNA-layer scores); this
+    function will silently return PROTEIN for them, which would then drop
+    every CDNA score from the output. Update this function if CDNA scores
+    become emittable.
     """
     for mapping in mappings:
-        if mapping.annotation_layer == AnnotationLayer.GENOMIC:
+        if mapping.alignment_level == AnnotationLayer.GENOMIC:
             return AnnotationLayer.GENOMIC
+
     return AnnotationLayer.PROTEIN
 
 
@@ -919,48 +1107,345 @@ def write_scoreset_mapping_to_json(
     _logger.info("Saving mapping output to %s", output_path)
     with output_path.open("w") as file:
         json.dump(
-            scoreset_mapping.model_dump(exclude_unset=False, exclude_none=True),
+            scoreset_mapping.model_dump(
+                mode="json", exclude_unset=False, exclude_none=True
+            ),
             file,
             indent=4,
         )
     return output_path
 
 
-def save_mapped_output_json(
+def _resolve_protein_accession(
+    target_metadata: TargetGene,
+    tx_result: TxSelectResult | TxSelectError | None,
+    _align_result: AlignmentResult
+    | None,  # protein accession comes from tx_result only
+) -> str | None:
+    if isinstance(tx_result, TxSelectResult) and tx_result.np:
+        return tx_result.np
+
+    accession_id = target_metadata.target_accession_id
+    if accession_id and accession_id.startswith(("NP", "ENSP")):
+        return accession_id
+
+    return None
+
+
+def _resolve_cdna_accession(
+    target_metadata: TargetGene,
+    tx_result: TxSelectResult | TxSelectError | None,
+    _align_result: AlignmentResult | None,  # cDNA accession comes from tx_result only
+) -> str | None:
+    if isinstance(tx_result, TxSelectResult) and tx_result.nm:
+        return tx_result.nm
+
+    accession_id = target_metadata.target_accession_id
+    if accession_id and accession_id.startswith(("NM", "ENST")):
+        return accession_id
+
+    return None
+
+
+def _resolve_genomic_accession(
+    target_metadata: TargetGene,
+    _tx_result: TxSelectResult
+    | TxSelectError
+    | None,  # genomic accession comes from align_result only
+    align_result: AlignmentResult | None,
+) -> str | None:
+    if align_result is not None and align_result.chrom:
+        try:
+            chrom_id = get_chromosome_identifier(align_result.chrom)
+        except Exception:
+            _logger.exception(
+                "Could not resolve chromosome identifier for %s", align_result.chrom
+            )
+            return None
+
+        if chrom_id and chrom_id.startswith("refseq:"):
+            return chrom_id[len("refseq:") :]
+
+        return chrom_id
+
+    accession_id = target_metadata.target_accession_id
+    if accession_id and accession_id.startswith("NC"):
+        return accession_id
+
+    return None
+
+
+_LAYER_ACCESSION_RESOLVERS = {
+    AnnotationLayer.PROTEIN: _resolve_protein_accession,
+    AnnotationLayer.CDNA: _resolve_cdna_accession,
+    AnnotationLayer.GENOMIC: _resolve_genomic_accession,
+}
+
+
+def _reference_accession_for_target_level(
+    alignment_level: AnnotationLayer,
+    target_metadata: TargetGene,
+    tx_result: TxSelectResult | TxSelectError | None,
+    align_result: AlignmentResult | None,
+) -> str | None:
+    """Pick the most informative RefSeq-style accession describing the reference
+    sequence that variants at ``alignment_level`` were mapped against.
+
+    Falls back to None when nothing sensible is available; the API column is nullable.
+    """
+    resolver = _LAYER_ACCESSION_RESOLVERS.get(alignment_level)
+    if resolver is None:
+        return None
+
+    return resolver(target_metadata, tx_result, align_result)
+
+
+def _build_target_mapping(
+    target_gene_identifier: str,
+    target_meta: TargetGene,
+    alignment_level: AnnotationLayer,
+    preferred: bool,
+    tx_result: TxSelectResult | TxSelectError | None,
+    align_result: AlignmentResult | None,
+    vrs_version: VrsVersion,
+    annotations: list[ScoreAnnotation],
+    protein_align_result: AlignmentResult | None = None,
+    null_failure_count: int = 0,
+    near_gap_window: int = NEAR_GAP_WINDOW,
+) -> TargetMapping:
+    """Assemble one target_mappings[] row for a (target, alignment_level) pair.
+
+    Provenance and aggregate counts are populated from the data the pipeline
+    already has. Alignment QC columns are populated from
+    ``align_result.alignment_qc`` for GENOMIC rows or
+    ``protein_align_result.alignment_qc`` for PROTEIN rows.
+
+    Note: CDNA ``TargetMapping`` rows are **not** emitted today — no
+    ``mapped_score`` carries ``alignment_level=CDNA`` (``vrs_map`` only
+    produces GENOMIC and PROTEIN scores).  The CDNA entry added to
+    ``reference_sequences`` in ``build_scoreset_mapping`` is purely a
+    reference-sequence accession record, not a scored layer.
+    """
+    reference_accession = _reference_accession_for_target_level(
+        alignment_level, target_meta, tx_result, align_result
+    )
+    reference_sequence_id: str | None = None
+    if reference_accession is not None:
+        try:
+            reference_sequence_id = get_vrs_id_from_identifier(reference_accession)
+        except Exception:
+            _logger.exception(
+                "Could not resolve VRS sequence id for %s", reference_accession
+            )
+
+    # Pick the alignment that lives in this row's coordinate frame:
+    # GENOMIC -> BLAT genomic alignment; PROTEIN -> target-protein-to-reference
+    # alignment from vrs_map.
+    # No CDNA branch: vrs_map never emits CDNA-layer scores, so this function
+    # is never called with alignment_level=CDNA in practice.
+    qc_source: AlignmentResult | None = None
+    if alignment_level == AnnotationLayer.GENOMIC:
+        qc_source = align_result
+    elif alignment_level == AnnotationLayer.PROTEIN:
+        qc_source = protein_align_result
+
+    tool_parameters: dict[str, Any] | None
+    # Accession-based targets split into two cases:
+    #   NC_ (chromosome/contig) — no alignment tool was run; no tool_parameters.
+    #   NM_/ENST (transcript)   — cdot was used for transcript placement; record its parameters.
+    # For non-accession-based targets we pull aligner parameters from the BLAT alignment as usual.
+    accession_id = target_meta.target_accession_id
+    if accession_id is not None and accession_id.startswith("NC"):
+        # Direct chromosome/contig accession: the coordinate frame is defined by
+        # the accession itself — no placement tool was invoked.
+        tool_parameters = {
+            "aligner": "direct_contig_accession",
+        }
+    elif accession_id is not None:
+        # Transcript accession (NM_/ENST): cdot was used for transcript placement.
+        # cdot_data_version is present in aligner_parameters (possibly None) for cdot
+        # paths after parse_cdot_mapping, so we can read it unconditionally.
+        cdot_data_version = (
+            align_result.aligner_parameters.get("cdot_data_version")
+            if align_result is not None and align_result.aligner_parameters is not None
+            else None
+        )
+        tool_parameters = {
+            "aligner": "cdot_transcript_placement",
+            "aligner_version": cdot.__version__,
+            "cdot_url": CDOT_URL,
+            # Always emit the key; None means "cdot run, version unknown".
+            "cdot_data_version": cdot_data_version,
+        }
+    else:
+        tool_parameters = (
+            qc_source.aligner_parameters if qc_source is not None else None
+        )
+
+    qc = qc_source.alignment_qc if qc_source is not None else None
+    percent_identity = qc.percent_identity if qc is not None else None
+    alignment_length = qc.alignment_length if qc is not None else None
+    mismatch_count = qc.mismatch_count if qc is not None else None
+    gap_count = qc.gap_count if qc is not None else None
+    alignment_string = qc.alignment_string if qc is not None else None
+
+    # Record the near-gap window alongside the CIGAR so consumers can interpret
+    # the per-variant ``near_gap`` flag. Stamped on every layer that had its
+    # flags evaluated against an alignment (any layer with a non-None qc).
+    alignment_metadata: dict[str, Any] | None = None
+    if qc is not None and qc.cigar:
+        alignment_metadata = {"cigar": qc.cigar}
+    if qc is not None:
+        alignment_metadata = alignment_metadata or {}
+        alignment_metadata["near_gap_window"] = near_gap_window
+        # Always emit this bit so consumers never have to infer it from absence.
+        alignment_metadata[
+            "at_mismatched_locus_evaluated"
+        ] = not qc.mismatch_positions_unavailable
+
+    alignment_score = qc_source.score if qc_source is not None else None
+    next_best_alignment_score = (
+        qc_source.next_best_score if qc_source is not None else None
+    )
+
+    total = len(annotations)
+    failed = sum(1 for a in annotations if a.post_mapped is None)
+    warnings = sum(
+        1
+        for a in annotations
+        if a.post_mapped is not None and a.error_message is not None
+    )
+    clean = total - failed - warnings
+
+    # near_gap is always evaluable (derived from gap positions in the alignment,
+    # not per-base sequence content), so count it unconditionally.
+    # at_mismatched_locus is only reliable when mismatch_positions_unavailable=False;
+    # when pslx tseq/qseq data was absent we leave it as None (falsy) and omit it
+    # from the tally so we don't silently under-count. The schema's
+    # at_mismatched_locus_evaluated flag in alignment_metadata lets consumers
+    # know they may be seeing only the near_gap portion of this count.
+    mismatch_unavailable = qc is not None and qc.mismatch_positions_unavailable
+    alignment_warnings = sum(
+        1
+        for a in annotations
+        if a.near_gap or (not mismatch_unavailable and a.at_mismatched_locus)
+    )
+
+    return TargetMapping(
+        target_gene_identifier=target_gene_identifier,
+        alignment_level=alignment_level,
+        preferred=preferred,
+        tool_name="dcd-mapping",
+        tool_version=dcd_mapping_version,
+        tool_parameters=tool_parameters,
+        reference_assembly=align_result.reference_assembly
+        if align_result is not None
+        else None,
+        reference_accession=reference_accession,
+        reference_sequence_id=reference_sequence_id,
+        vrs_version=vrs_version,
+        percent_identity=percent_identity,
+        alignment_score=alignment_score,
+        next_best_alignment_score=next_best_alignment_score,
+        alignment_length=alignment_length,
+        mismatch_count=mismatch_count,
+        gap_count=gap_count,
+        alignment_string=alignment_string,
+        alignment_metadata=alignment_metadata,
+        total_variants=total,
+        variants_mapped_cleanly=clean,
+        variants_with_mapping_warnings=warnings,
+        variants_with_alignment_warnings=alignment_warnings,
+        variants_failed=failed,
+        variants_failed_pre_layer_selection=null_failure_count
+        if null_failure_count
+        else None,
+    )
+
+
+def build_scoreset_mapping(
     metadata: ScoresetMetadata,
-    mappings: dict[str, list[ScoreAnnotationWithLayer]],
+    raw_metadata: dict,
+    mappings: dict[str, list[ScoreAnnotation]],
     align_results: dict[str, AlignmentResult],
     tx_output: dict[str, TxSelectResult | TxSelectError | None],
     gene_info: dict[str, GeneInfo | None],
     preferred_layer_only: bool = False,
-    output_path: Path | None = None,
-) -> Path:
-    """Save mapping output for a score set in a JSON file
-
-    :param urn: Score set accession
-    :param mappings: A dictionary of annotated VRS mappings for each target
-    :param align_result: A dictionary of alignment information for each target
-    :param tx_output: A dictionary of transcript output for each target
-    :param output_path: specific location to save output to. Default to
-        <dcd_mapping_data_dir>/urn:mavedb:00000XXX-X-X_mapping_<ISO8601 datetime>.json
-    :return: output location
+    vrs_version: VrsVersion = VrsVersion.V_2,
+    protein_align_results: dict[str, AlignmentResult | None] | None = None,
+    near_gap_window: int = NEAR_GAP_WINDOW,
+) -> ScoresetMapping:
+    """Assemble and return a :class:`ScoresetMapping` without writing to disk.
+
+    :param metadata: parsed scoreset metadata (used for pipeline logic)
+    :param raw_metadata: the full MaveDB API response dict, stored verbatim in
+        the output ``metadata`` field so no fields are lost
+    :param mappings: annotated VRS mappings per target. **Mutated in place** —
+        ``_stamp_alignment_locus_flags`` sets ``at_mismatched_locus`` and
+        ``near_gap`` on each ``ScoreAnnotation``. Do not pass the same dict to
+        two successive calls if you need reproducible per-call results; deep-copy
+        it first.
+    :param align_results: alignment results per target (genomic frame, from BLAT)
+    :param tx_output: transcript selection results per target
+    :param gene_info: gene info per target
+    :param preferred_layer_only: if True, only emit the preferred annotation layer
+    :param vrs_version: VRS schema version for this run
+    :param protein_align_results: per-target target-protein-to-reference-protein
+        alignments (from vrs_map). Used to flag protein-layer variants at
+        mismatched/near-gap loci and as the QC source for PROTEIN target_mappings
+        rows. Pass ``None`` if you don't have them.
+    :param near_gap_window: half-width (in reference bases) of the window used to
+        flag variants as ``near_gap``. Defaults to :data:`NEAR_GAP_WINDOW` (5).
+        Increase for a more conservative call set; decrease to restrict to
+        immediately adjacent variants.
+    :return: fully assembled ScoresetMapping
     """
+    run_mapped_date_utc = datetime.datetime.now(tz=datetime.UTC)
     # set preferred layers for each target, to allow a mix of coding and noncoding targets
     reference_sequences: dict[str, TargetAnnotation] = {}
     mapped_scores: list[ScoreAnnotation] = []
+    target_mappings: list[TargetMapping] = []
     for target_gene in mappings:
+        # Stamp per-variant locus flags first so the warning counts in
+        # TargetMapping and the emitted ScoreAnnotations are consistent.
+        protein_align_for_target = (protein_align_results or {}).get(target_gene)
+        genomic_align_for_target = _align_result_for_target(
+            target_gene, metadata, align_results
+        )
+        _stamp_alignment_locus_flags(
+            mappings[target_gene],
+            genomic_align_for_target,
+            AnnotationLayer.GENOMIC,
+            near_gap_window,
+        )
+        _stamp_alignment_locus_flags(
+            mappings[target_gene],
+            protein_align_for_target,
+            AnnotationLayer.PROTEIN,
+            near_gap_window,
+        )
+
+        # preferred_layer_for_target is the single "best" layer for this target
+        # (GENOMIC when available, else PROTEIN).  It drives both which
+        # TargetMapping row gets preferred=True and where completely-failed
+        # variants (annotation_layer=None) are attributed.
+        preferred_layer_for_target = _pick_preferred_genomic_or_protein_layer(
+            mappings[target_gene]
+        )
         if preferred_layer_only:
-            preferred_layers = {
-                _set_scoreset_layer(metadata.urn, mappings[target_gene]),
-            }
+            preferred_layers = {preferred_layer_for_target}
         else:
             preferred_layers = {
-                mapping.annotation_layer for mapping in mappings[target_gene]
+                mapping.alignment_level for mapping in mappings[target_gene]
             }
 
         # use target gene name in reference sequence dictionary, rather than the label, which differs between score sets
         target_gene_name = metadata.target_genes[target_gene].target_gene_name
 
+        # sometimes Nonetype layers show up in preferred layers dict; remove these
+        # before constructing TargetAnnotation (which requires valid AnnotationLayer keys).
+        preferred_layers.discard(None)
+
         reference_sequences[target_gene_name] = TargetAnnotation(
             gene_info=gene_info.get(target_gene),
             layers={
@@ -971,8 +1456,6 @@ def save_mapped_output_json(
                 for layer in preferred_layers
             },
         )
-        # sometimes Nonetype layers show up in preferred layers dict; remove these
-        preferred_layers.discard(None)
         for layer in preferred_layers:
             reference_sequences[target_gene_name].layers[layer][
                 "computed_reference_sequence"
@@ -985,7 +1468,7 @@ def save_mapped_output_json(
                 metadata.target_genes[target_gene],
                 layer,
                 tx_output[target_gene],
-                align_results[target_gene],
+                genomic_align_for_target,
             )
 
         # if genomic layer, not accession-based, and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict
@@ -1006,31 +1489,152 @@ def save_mapped_output_json(
             }
 
         for m in mappings[target_gene]:
-            if m.pre_mapped is None:
-                mapped_scores.append(ScoreAnnotation(**m.model_dump()))
-            elif m.annotation_layer in preferred_layers:
-                # drop annotation layer from mapping object
-                mapped_scores.append(ScoreAnnotation(**m.model_dump()))
-
-        # drop Nonetype reference sequences
-        for target_gene in reference_sequences:
-            for layer in list(reference_sequences[target_gene].layers.keys()):
-                if (
-                    reference_sequences[target_gene].layers[layer][
-                        "mapped_reference_sequence"
-                    ]
-                    is None
-                    and reference_sequences[target_gene].layers[layer][
-                        "computed_reference_sequence"
-                    ]
-                    is None
-                ) or layer is None:
-                    del reference_sequences[target_gene].layers[layer]
-
-    output = ScoresetMapping(
-        metadata=metadata.model_dump(),
+            if m.alignment_level is None and m.pre_mapped is None:
+                # Completely-failed variant — vrs_map could not determine a layer.
+                # Re-attribute to the preferred layer so every mapped_score has a
+                # parent TargetMapping row (the API joins on alignment_level).
+                score_dict = m.model_dump()
+                score_dict["alignment_level"] = preferred_layer_for_target
+                score_dict["target_gene_identifier"] = target_gene_name
+                mapped_scores.append(ScoreAnnotation(**score_dict))
+            elif m.alignment_level in preferred_layers:
+                score_dict = m.model_dump()
+                score_dict["target_gene_identifier"] = target_gene_name
+                mapped_scores.append(ScoreAnnotation(**score_dict))
+
+        # Provenance/QC: emit one TargetMapping per (target, alignment_level) that
+        # actually produced variants in this run. The (target_gene_identifier,
+        # alignment_level) pair must be unique per run; the API uses it to attribute
+        # each mapped_variant to its source row via target_gene_mapping_id.
+        #
+        # Completely-failed variants (annotation_layer=None) were re-attributed to
+        # preferred_layer_for_target in mapped_scores above; include them in that
+        # layer's TargetMapping count so total_variants is accurate.
+        null_failures = [
+            m
+            for m in mappings[target_gene]
+            if m.alignment_level is None and m.pre_mapped is None
+        ]
+        align_result_for_target = genomic_align_for_target
+        emitted_mappings = [
+            m for m in mappings[target_gene] if m.alignment_level in preferred_layers
+        ]
+
+        # Group by alignment_level.
+        layers_seen: dict = {}
+        for m in emitted_mappings:
+            layers_seen.setdefault(m.alignment_level, []).append(m)
+
+        for layer, layer_annotations in layers_seen.items():
+            if layer is None:
+                continue
+
+            # Defensive: coerce raw string values (e.g. "g") to AnnotationLayer.
+            if not isinstance(layer, AnnotationLayer):
+                try:
+                    layer = AnnotationLayer(layer)
+                except ValueError:
+                    _logger.warning(
+                        "Skipping target_mappings row for unknown annotation layer %r",
+                        layer,
+                    )
+                    continue
+
+            # Null-layer failures are attributed to the preferred layer; fold them
+            # into that layer's annotation list so variant counts are correct.
+            annotations_for_tm = (
+                list(layer_annotations) + null_failures
+                if layer == preferred_layer_for_target
+                else layer_annotations
+            )
+            target_mappings.append(
+                _build_target_mapping(
+                    target_gene_identifier=target_gene_name,
+                    target_meta=metadata.target_genes[target_gene],
+                    alignment_level=layer,
+                    preferred=(layer == preferred_layer_for_target),
+                    tx_result=tx_output.get(target_gene),
+                    align_result=align_result_for_target,
+                    vrs_version=vrs_version,
+                    annotations=annotations_for_tm,
+                    protein_align_result=protein_align_for_target,
+                    null_failure_count=len(null_failures)
+                    if layer == preferred_layer_for_target
+                    else 0,
+                    near_gap_window=near_gap_window,
+                )
+            )
+
+    # Drop layers where both reference sequence entries are None, and any None-keyed
+    # layers. Moved outside the per-target loop to avoid O(n²) scans and eliminate
+    # the variable-shadowing hazard (the inner loop previously reused `target_gene`).
+    for tgt_name in reference_sequences:
+        for layer in list(reference_sequences[tgt_name].layers.keys()):
+            if (
+                reference_sequences[tgt_name].layers[layer]["mapped_reference_sequence"]
+                is None
+                and reference_sequences[tgt_name].layers[layer][
+                    "computed_reference_sequence"
+                ]
+                is None
+            ) or layer is None:
+                del reference_sequences[tgt_name].layers[layer]
+
+    # Runtime invariant: every mapped_score must be attributable to a TargetMapping
+    # row via (target_gene_identifier, alignment_level). If any orphaned keys exist
+    # the API join would silently drop those scores, undermining the audit guarantee.
+    # Using an explicit RuntimeError rather than `assert` so this is never compiled
+    # away with `-O` optimizations in production deployments.
+    tm_keys = {
+        (tm.target_gene_identifier, tm.alignment_level) for tm in target_mappings
+    }
+    orphaned = {
+        (ms.target_gene_identifier, ms.alignment_level)
+        for ms in mapped_scores
+        if ms.alignment_level is not None
+    } - tm_keys
+    if orphaned:
+        raise RuntimeError(
+            f"build_scoreset_mapping produced {len(orphaned)} orphaned mapped_score "  # noqa: EM102
+            f"key(s) with no corresponding TargetMapping row: {orphaned!r}. "
+            "This is a pipeline invariant violation — every mapped_score must have "
+            "a parent TargetMapping so the API join is unambiguous."
+        )
+
+    return ScoresetMapping(
+        metadata=raw_metadata,
+        mapped_date=run_mapped_date_utc,
         reference_sequences=reference_sequences,
         mapped_scores=mapped_scores,
+        target_mappings=target_mappings,
     )
 
+
+def save_mapped_output_json(
+    metadata: ScoresetMetadata,
+    raw_metadata: dict,
+    mappings: dict[str, list[ScoreAnnotation]],
+    align_results: dict[str, AlignmentResult],
+    tx_output: dict[str, TxSelectResult | TxSelectError | None],
+    gene_info: dict[str, GeneInfo | None],
+    preferred_layer_only: bool = False,
+    output_path: Path | None = None,
+    vrs_version: VrsVersion = VrsVersion.V_2,
+    protein_align_results: dict[str, AlignmentResult | None] | None = None,
+) -> Path:
+    """Build and write mapping output for a score set to a JSON file.
+
+    :return: output file path
+    """
+    output = build_scoreset_mapping(
+        metadata=metadata,
+        raw_metadata=raw_metadata,
+        mappings=mappings,
+        align_results=align_results,
+        tx_output=tx_output,
+        gene_info=gene_info,
+        preferred_layer_only=preferred_layer_only,
+        vrs_version=vrs_version,
+        protein_align_results=protein_align_results,
+    )
     return write_scoreset_mapping_to_json(metadata.urn, output, output_path)
diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py
index 8216bf2..b389538 100644
--- a/src/dcd_mapping/main.py
+++ b/src/dcd_mapping/main.py
@@ -32,12 +32,14 @@
     check_uta,
 )
 from dcd_mapping.mavedb_data import (
+    get_raw_scoreset_metadata,
     get_scoreset_metadata,
     get_scoreset_records,
     patch_target_sequence_type,
     with_mavedb_score_set,
 )
 from dcd_mapping.schemas import (
+    AlignmentResult,
     ScoreRow,
     ScoresetMapping,
     ScoresetMetadata,
@@ -151,6 +153,7 @@ async def _check_data_prereqs(silent: bool) -> None:
 
 async def map_scoreset(
     metadata: ScoresetMetadata,
+    raw_metadata: dict,
     records: dict[str, list[ScoreRow]],
     output_path: Path | None = None,
     vrs_version: VrsVersion = VrsVersion.V_2,
@@ -229,17 +232,20 @@ async def map_scoreset(
     _emit_info("Reference selection complete.", silent)
 
     _emit_info("Mapping to VRS...", silent)
-    vrs_results = {}
     gene_info = {}
+    vrs_results = {}
+    protein_align_results: dict[str, AlignmentResult | None] = {}
     try:
         for target_gene in metadata.target_genes:
-            vrs_results[target_gene] = vrs_map(
+            vrs_map_result = vrs_map(
                 metadata=metadata.target_genes[target_gene],
                 align_result=alignment_results[target_gene],
                 records=records[target_gene],
                 transcript=transcripts[target_gene],
                 silent=silent,
             )
+            vrs_results[target_gene] = vrs_map_result.mappings
+            protein_align_results[target_gene] = vrs_map_result.protein_align_result
 
             gene_info[target_gene] = await compute_target_gene_info(
                 target_gene,
@@ -327,12 +333,15 @@ async def map_scoreset(
     try:
         final_output = save_mapped_output_json(
             metadata,
+            raw_metadata,
             annotated_vrs_results,
             alignment_results,
             transcripts,
             gene_info,
             prefer_genomic,
             output_path,
+            vrs_version=vrs_version,
+            protein_align_results=protein_align_results,
         )
     except Exception as e:
         _logger.exception(
@@ -372,6 +381,7 @@ async def map_scoreset_urn(
     :param silent: if True, suppress console information output
     """
     try:
+        raw_metadata = get_raw_scoreset_metadata(urn, store_path)
         metadata = get_scoreset_metadata(urn, store_path)
         records = get_scoreset_records(metadata, silent, store_path)
         metadata = patch_target_sequence_type(metadata, records, force=False)
@@ -407,5 +417,11 @@ async def map_scoreset_urn(
         return
 
     await map_scoreset(
-        metadata, records, output_path, vrs_version, prefer_genomic, silent
+        metadata,
+        raw_metadata,
+        records,
+        output_path,
+        vrs_version,
+        prefer_genomic,
+        silent,
     )
diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py
index 75f0be6..b0e6889 100644
--- a/src/dcd_mapping/schemas.py
+++ b/src/dcd_mapping/schemas.py
@@ -1,11 +1,19 @@
 """Provide class definitions for commonly-used information objects."""
 import datetime
 from enum import Enum
-from typing import Any, Literal
+from typing import Any, Literal, NamedTuple
 
 from cool_seq_tool.schemas import AnnotationLayer, Strand, TranscriptPriority
 from ga4gh.vrs._internal.models import Allele, Haplotype
-from pydantic import BaseModel, ConfigDict, Field, StrictBool, StrictInt, StrictStr
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    StrictBool,
+    StrictInt,
+    StrictStr,
+    field_serializer,
+)
 
 from dcd_mapping import vrs_v1_schemas
 from dcd_mapping.version import dcd_mapping_version
@@ -76,6 +84,61 @@ class SequenceRange(BaseModel):
     end: int
 
 
+class AlignmentQc(BaseModel):
+    """Aggregate QC for a BLAT pslx alignment.
+
+    Mismatch positions and gap intervals are kept on the in-memory model so
+    downstream code can flag individual variants that fall on a mismatched
+    base or near a gap, but they are excluded from JSON serialization to keep
+    payloads compact -- consumers reconstruct per-base detail from the CIGAR
+    or use the per-variant flags emitted on each :class:`ScoreAnnotation`.
+
+    **cDNA→genome BLAT limitation:** BLAT's pslx format is supposed to include
+    per-block target and query sequences (tseqs/qseqs), but in practice
+    Biopython's PSL parser may leave aligned-block bases as undefined when
+    the pslx output is incomplete or when blocks fall outside the portion of
+    the database sequence that BLAT includes in its output.  When this occurs,
+    ``mismatch_count`` (derived from ``Alignment.counts()``, which is purely
+    coordinate-based) remains accurate, but per-position detail cannot be
+    extracted.  The ``mismatch_positions_unavailable`` flag is set to ``True``
+    in that case; consumers should treat ``at_mismatched_locus`` as ``None``
+    (not evaluated) for affected rows rather than trusting a ``False`` value.
+
+    A future improvement would be to fetch target slices directly via SeqRepo
+    when per-base detail is required, decoupling per-position accuracy from
+    what pslx happens to include.
+
+    **Downstream effect** — ``annotate._stamp_alignment_locus_flags`` is the
+    sole consumer of ``mismatch_positions`` and ``gap_intervals``.  When
+    ``mismatch_positions_unavailable`` is ``True`` it skips the
+    ``at_mismatched_locus`` assignment entirely (leaving it ``None``) so
+    that a ``False`` value is never incorrectly written for a variant whose
+    position could not actually be checked against the alignment.
+    """
+
+    percent_identity: float | None = None
+    alignment_length: int
+    mismatch_count: int
+    gap_count: int
+    cigar: StrictStr | None = None
+    # Compact Biopython alignment visualization. Consecutive all-gap display
+    # blocks (intronic regions rendered as '?' characters in cDNA→genome
+    # alignments) are collapsed to a single summary line of the form
+    # ``... [N bp gap: chrom:start-end] ...`` by ``_compact_alignment_string``
+    # in align.py. This keeps the string proportional to exon count rather
+    # than genomic span.
+    alignment_string: StrictStr | None = None
+    # True when per-base sequence content was unavailable during QC computation
+    # (e.g. BLAT pslx did not populate tseqs/qseqs). mismatch_count is still
+    # correct in that case, but mismatch_positions is empty and at_mismatched_locus
+    # should be treated as None (not evaluated) for all variants in this target/level.
+    mismatch_positions_unavailable: bool = False
+    # In-memory only -- not serialized. Used to compute per-variant flags for variants that
+    # fall on a mismatched base or near a gap.
+    mismatch_positions: list[int] = Field(default_factory=list, exclude=True)
+    gap_intervals: list[tuple[int, int]] = Field(default_factory=list, exclude=True)
+
+
 class GeneLocation(BaseModel):
     """Gene location info, gathered from normalizer result. Likely to be incomplete."""
 
@@ -109,11 +172,28 @@ class AlignmentResult(BaseModel):
     chrom: str | None = None
     strand: Strand | None = None
     coverage: float | None = None
-    ident_pct: float | None = None
+    percent_identity: float | None = None
     query_range: SequenceRange
     query_subranges: list[SequenceRange]
     hit_range: SequenceRange
     hit_subranges: list[SequenceRange]
+    # BLAT PSL score (matches - misMatches - qNumInsert - tNumInsert) for the
+    # winning HSP. Only set for BLAT-derived results.
+    score: float | None = None
+    # BLAT PSL score (same units as ``score``) of the next-best HSP considered
+    # during selection. Presence signals that an ambiguity check was performed;
+    # None means no ambiguity check was run (not that there was no ambiguity).
+    # For instance, when returning results for protein->protein alignments, BLAT
+    # returns only the highest scoring HSP.
+    next_best_score: float | None = None
+    # Structured per-base QC derived from the pslx alignment (populated for
+    # BLAT-sourced alignments; None for cdot-fetched results).
+    alignment_qc: AlignmentQc | None = None
+    # Aligner invocation parameters recorded for reproducibility.
+    aligner_parameters: dict[str, Any] | None = None
+    # Human reference genome assembly used for this alignment (e.g. "GRCh38").
+    # None for alignments that have no genomic coordinate frame (protein-vs-protein).
+    reference_assembly: StrictStr | None = None
 
 
 class TranscriptDescription(BaseModel):
@@ -165,7 +245,7 @@ class MappedScore(BaseModel):
 
     accession_id: StrictStr
     score: str | None
-    annotation_layer: AnnotationLayer | None = None
+    alignment_level: AnnotationLayer | None = None
     pre_mapped: Allele | Haplotype | None = None
     post_mapped: Allele | Haplotype | None = None
     error_message: str | None = None
@@ -181,11 +261,22 @@ class ScoreAnnotation(BaseModel):
     relation: Literal[
         "SO:is_homologous_to"
     ] = "SO:is_homologous_to"  # TODO this should probably be None if pre_mapped is false?
+    # Identifies which target gene this score belongs to. Required for the API
+    # to join mapped_scores → target_mappings unambiguously when a score set has
+    # more than one target gene (joining on alignment_level alone is ambiguous in
+    # that case). Nullable for backwards compatibility with older pipeline outputs.
+    target_gene_identifier: StrictStr | None = None
     pre_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype | None = None
     post_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype | None = None
     vrs_version: VrsVersion | None = None
     score: float | None = None
     error_message: str | None = None
+    alignment_level: AnnotationLayer | None = None
+    # Per-variant alignment-locus flags. None means "not evaluated" (e.g. no
+    # genomic alignment available, or non-genomic annotation layer); True/False
+    # mean the flag was computed against this run's alignment.
+    at_mismatched_locus: bool | None = None
+    near_gap: bool | None = None
 
 
 class GeneInfo(BaseModel):
@@ -231,24 +322,146 @@ class TargetAnnotation(BaseModel):
     ] = {}
 
 
-class ScoreAnnotationWithLayer(ScoreAnnotation):
-    """Couple annotations with an easily-computable definition of the annotation layer
-    from which they originate.
-
-    Used for filtering individual annotations just before saving the final JSON product.
-    """
-
-    annotation_layer: AnnotationLayer | None = None
-
-
 class ScoresetMapping(BaseModel):
     """Provide all mapped scores for a scoreset."""
 
     metadata: Any  # TODO get exact MaveDB metadata structure?
-    dcd_mapping_version: str = Field(default=dcd_mapping_version)
-    mapped_date_utc: str = Field(
-        default=datetime.datetime.now(tz=datetime.UTC).isoformat()
-    )
+    mapped_date: datetime.datetime | None = None
     reference_sequences: dict[str, TargetAnnotation] | None = None
     mapped_scores: list[ScoreAnnotation] | None = None
+    target_mappings: list["TargetMapping"] | None = None
     error_message: str | None = None
+
+    @field_serializer("mapped_date")
+    def _serialize_mapped_date(self, value: datetime.datetime | None) -> str | None:
+        """Serialize mapped_date as an ISO 8601 string so it is always JSON-safe."""
+        return value.isoformat() if value is not None else None
+
+
+class TargetMapping(BaseModel):
+    """Per-(target, alignment_level) provenance and QC block.
+
+    Field names mirror the corresponding columns on `target_gene_mappings` in the
+    MaveDB API so the API worker can deserialize directly with minimal transformation.
+    Any aligner-specific structured details go in ``tool_parameters`` /
+    ``alignment_metadata`` (JSONB on the API side).
+
+    ``reference_assembly`` is a top-level column (not nested in ``tool_parameters``)
+    because it describes the coordinate frame of the mapping result, not aligner
+    configuration.  It is ``None`` for alignments with no genomic frame (e.g.
+    protein-vs-protein).
+
+    ``tool_parameters`` shape per aligner
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    BLAT genomic (sequence-based, nucleotide or protein-vs-genome)::
+
+        {
+          "aligner":         "blat",
+          "aligner_version": "<BLAT version string | null>",
+          "min_score":       <int>,
+          "out_format":      "<pslx|...>",
+          "target_args":     "<e.g. '-q=prot -t=dnax' | ''>",
+        }
+
+    BLAT protein-vs-protein (sequence-based, protein annotation layer)::
+
+        {
+          "aligner":         "blat",
+          "aligner_version": "<BLAT version string | null>",
+          "min_score":       <int>,
+          "out_format":      "<pslx|...>",
+          "target_args":     "-q=prot -t=prot",
+        }
+
+    Direct contig accession (accession-based)::
+
+        {
+          "aligner":         "direct_contig_accession",
+        }
+
+    cdot transcript placement (accession-based)::
+
+        {
+          "aligner":           "cdot_transcript_placement",
+          "aligner_version":   "<cdot library semver, e.g. '0.2.26'>",
+          "cdot_url":          "<CDOT REST endpoint>",
+          "cdot_data_version": "<data version string | null>",
+        }
+
+    ``cdot_data_version`` is ``null`` when the cdot REST response did not
+    include a ``cdot_data_version`` field; its presence (even as ``null``)
+    distinguishes "cdot run, version unknown" from "not a cdot run".
+    """
+
+    # Identity
+    target_gene_identifier: StrictStr
+    # Single-char cool-seq-tool AnnotationLayer value ("p", "c", "g").
+    alignment_level: AnnotationLayer
+    preferred: StrictBool = False
+
+    # Provenance (required on the API side: tool_name, tool_version)
+    tool_name: StrictStr = "dcd-mapping"
+    tool_version: StrictStr = dcd_mapping_version
+    tool_parameters: dict[str, Any] | None = None
+    # Human reference genome assembly (e.g. "GRCh38"). None for protein-vs-protein alignments.
+    reference_assembly: StrictStr | None = None
+    reference_accession: StrictStr | None = None
+    reference_sequence_id: StrictStr | None = None
+    vrs_version: VrsVersion | None = None
+
+    # Alignment QC -- all optional; omit (leave None) fields the aligner doesn't produce.
+    percent_identity: float | None = None
+    alignment_score: float | None = None
+    next_best_alignment_score: float | None = None
+    alignment_length: int | None = None
+    mismatch_count: int | None = None
+    gap_count: int | None = None
+    # Compact alignment visualization (see AlignmentQc.alignment_string).
+    # Intronic gap runs are collapsed to ``... [N bp gap: chrom:start-end] ...``
+    # summaries so the string length is proportional to exon count, not
+    # genomic span. None for protein-layer or cdot-derived alignments.
+    alignment_string: StrictStr | None = None
+    # Structured per-alignment metadata payload (JSONB on the API side).
+    # Keys present when available:
+    #   "cigar"                     — CIGAR string for the winning HSP.
+    #   "near_gap_window"           — half-width (in ref bases) of the window
+    #                                 used to flag ``near_gap`` on each variant.
+    #   "at_mismatched_locus_evaluated" — False when per-base sequence content
+    #                                 was unavailable (BLAT pslx omitted
+    #                                 tseqs/qseqs); treat ``at_mismatched_locus``
+    #                                 as None (not evaluated) for all variants in
+    #                                 this target/level.  True otherwise.
+    alignment_metadata: dict[str, Any] | None = None
+
+    # Annotation QC -- totals for this target x level. The two warning columns
+    # count disjoint things:
+    #   * variants_with_mapping_warnings: the mapper itself attached an
+    #     ``error_message`` despite producing a ``post_mapped`` value (e.g.
+    #     fell back to RLE, ambiguous reference allele).
+    #   * variants_with_alignment_warnings: the variant's reference position
+    #     fell on a mismatched locus or near a gap in the underlying alignment
+    #     (``at_mismatched_locus`` or ``near_gap`` was True). Mapping itself
+    #     succeeded cleanly; this is purely about alignment context.
+    # A single variant can be in neither, either, or both.
+    total_variants: int | None = None
+    # Count of variants where post_mapped is not None AND error_message is None.
+    # Alignment warnings (at_mismatched_locus, near_gap) are counted separately in
+    # variants_with_alignment_warnings and do NOT reduce this count; a variant can
+    # be "cleanly mapped" and still sit at a mismatched locus or near a gap.
+    variants_mapped_cleanly: int | None = None
+    variants_with_mapping_warnings: int | None = None
+    variants_with_alignment_warnings: int | None = None
+    variants_failed: int | None = None
+    # Count of variants that failed before any annotation layer was determined
+    # (annotation_layer=None, pre_mapped=None in the VRS output). These are
+    # re-attributed to the preferred layer's variants_failed total so the join
+    # invariant holds, but keeping them separate lets consumers see a clean
+    # per-layer failure count on all other (non-preferred) rows.
+    variants_failed_pre_layer_selection: int | None = None
+
+
+class VrsMapResult(NamedTuple):
+    """Provide mappings for an individual experiment score."""
+
+    mappings: list["MappedScore"]
+    protein_align_result: "AlignmentResult | None"
diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py
index af17f8d..eb7364d 100644
--- a/src/dcd_mapping/vrs_map.py
+++ b/src/dcd_mapping/vrs_map.py
@@ -44,6 +44,7 @@
     TargetSequenceType,
     TargetType,
     TxSelectResult,
+    VrsMapResult,
 )
 from dcd_mapping.transcripts import TxSelectError
 
@@ -472,7 +473,7 @@ def _map_protein_coding_pro(
         return MappedScore(
             accession_id=row.accession,
             score=row.score,
-            annotation_layer=AnnotationLayer.PROTEIN,
+            alignment_level=AnnotationLayer.PROTEIN,
             pre_mapped=vrs_variation,
             post_mapped=vrs_variation,
         )
@@ -527,7 +528,7 @@ def _map_protein_coding_pro(
         return MappedScore(
             accession_id=row.accession,
             score=row.score,
-            annotation_layer=AnnotationLayer.PROTEIN,
+            alignment_level=AnnotationLayer.PROTEIN,
             pre_mapped=pre_mapped_protein,
             error_message=f"{type(e).__name__}: {e}",
         )
@@ -535,7 +536,7 @@ def _map_protein_coding_pro(
     return MappedScore(
         accession_id=row.accession,
         score=row.score,
-        annotation_layer=AnnotationLayer.PROTEIN,
+        alignment_level=AnnotationLayer.PROTEIN,
         pre_mapped=pre_mapped_protein,
         post_mapped=post_mapped_protein,
     )
@@ -670,7 +671,7 @@ def _map_genomic(
             return MappedScore(
                 accession_id=row.accession,
                 score=row.score,
-                annotation_layer=AnnotationLayer.GENOMIC,
+                alignment_level=AnnotationLayer.GENOMIC,
                 pre_mapped=pre_mapped_genomic,
                 error_message=f"{type(e).__name__}: {e}",
             )
@@ -725,7 +726,7 @@ def _map_genomic(
             return MappedScore(
                 accession_id=row.accession,
                 score=row.score,
-                annotation_layer=AnnotationLayer.GENOMIC,
+                alignment_level=AnnotationLayer.GENOMIC,
                 pre_mapped=pre_mapped_genomic,
                 error_message=f"{type(e).__name__}: {e}",
             )
@@ -736,7 +737,7 @@ def _map_genomic(
     return MappedScore(
         accession_id=row.accession,
         score=row.score,
-        annotation_layer=AnnotationLayer.GENOMIC,
+        alignment_level=AnnotationLayer.GENOMIC,
         pre_mapped=pre_mapped_genomic,
         post_mapped=post_mapped_genomic,
     )
@@ -801,14 +802,17 @@ def _map_protein_coding(
     transcript: TxSelectResult | TxSelectError,
     align_result: AlignmentResult,
     silent: bool,
-) -> list[MappedScore]:
-    """Perform mapping on protein coding experiment results
+) -> tuple[list[MappedScore], AlignmentResult | None]:
+    """Perform mapping on protein coding experiment results.
 
     :param metadata: Target gene metadata from MaveDB API
     :param records: The list of MAVE variants in a given score set
     :param transcript: The transcript data for a score set, or an error message describing why an expected transcript is missing
-    :param align_results: The alignment data for a score set
-    :return: A list of mappings
+    :param align_result: The alignment data for a score set
+    :return: ``(mappings, protein_align_result)``. The target-protein-to-reference-protein
+        alignment is surfaced so downstream annotation can flag protein-layer
+        variants at mismatched/near-gap loci using its QC block; pipelines that
+        don't need it can ignore the second element.
     """
     if metadata.target_sequence_type == TargetSequenceType.DNA:
         sequence = str(
@@ -821,11 +825,24 @@ def _map_protein_coding(
         psequence_id = gsequence_id = store_sequence(sequence)
 
     # align protein sequence to selected reference protein sequence to get offsets and gaps
-    protein_align_result = None
+    protein_align_result: AlignmentResult | None = None
     if isinstance(transcript, TxSelectResult):
         protein_align_result = align_target_to_protein(
             sequence, transcript.sequence, silent
         )
+        _logger.info(
+            "Protein-to-protein alignment produced for %s (ref protein: %s). "
+            "This alignment will be used for pro-layer variants in place of the genomic alignment.",
+            metadata.target_gene_name,
+            transcript.np,
+        )
+    else:
+        _logger.warning(
+            "Skipping protein-to-protein alignment for %s: no valid transcript selected (%s). "
+            "Pro-layer variants will not be mapped.",
+            metadata.target_gene_name,
+            transcript,
+        )
 
     variations: list[MappedScore] = []
     for row in records:
@@ -872,7 +889,8 @@ def _map_protein_coding(
             variations.append(hgvs_pro_mappings)
         if hgvs_nt_mappings:
             variations.append(hgvs_nt_mappings)
-    return variations
+
+    return variations, protein_align_result
 
 
 def _map_regulatory_noncoding(
@@ -1042,7 +1060,7 @@ def vrs_map(
     records: list[ScoreRow],
     transcript: TxSelectResult | TxSelectError | None = None,
     silent: bool = True,
-) -> list[MappedScore]:
+) -> VrsMapResult:
     """Given a description of a MAVE scoreset and an aligned transcript, generate
     the corresponding VRS objects.
 
@@ -1051,20 +1069,32 @@ def vrs_map(
     :param records: scoreset records
     :param transcript: output of transcript selection process
     :param silent: If true, suppress console output
-    :return: A list of mapping results
+    :return: :class:`~dcd_mapping.schemas.VrsMapResult` with ``mappings`` and
+        ``protein_align_result``.  ``protein_align_result`` is the
+        target-protein-to-reference-protein alignment used for protein-level
+        mapping, surfaced so downstream annotation can flag protein-layer
+        variants at mismatched/near-gap loci. It is ``None`` for accession-based
+        and regulatory/non-coding targets.
     """
     if metadata.target_accession_id:
-        return _map_accession(metadata, records, align_result, transcript)
+        return VrsMapResult(
+            mappings=_map_accession(metadata, records, align_result, transcript),
+            protein_align_result=None,
+        )
+
     if metadata.target_gene_category == TargetType.PROTEIN_CODING and transcript:
-        return _map_protein_coding(
+        mappings, protein_align_result = _map_protein_coding(
             metadata,
             records,
             transcript,
             align_result,
             silent,
         )
-    return _map_regulatory_noncoding(
-        metadata,
-        records,
-        align_result,
+        return VrsMapResult(
+            mappings=mappings, protein_align_result=protein_align_result
+        )
+
+    return VrsMapResult(
+        mappings=_map_regulatory_noncoding(metadata, records, align_result),
+        protein_align_result=None,
     )
diff --git a/tests/fixtures/align_result.json b/tests/fixtures/align_result.json
index 6b2d086..b570171 100644
--- a/tests/fixtures/align_result.json
+++ b/tests/fixtures/align_result.json
@@ -4,7 +4,7 @@
       "chrom": "chr11",
       "strand": 1,
       "coverage": 100.0,
-      "ident_pct": 77.45098039215686,
+      "percent_identity": 77.45098039215686,
       "query_range": {
         "start": 0,
         "end": 102
@@ -40,7 +40,7 @@
       "chrom": "chr3",
       "strand": 1,
       "coverage": 100.0,
-      "ident_pct": 100.0,
+      "percent_identity": 100.0,
       "query_range": {
         "start": 0,
         "end": 1047
@@ -100,7 +100,7 @@
       "chrom": "chr22",
       "strand": -1,
       "coverage": 100.0,
-      "ident_pct": 99.9071494893222,
+      "percent_identity": 99.9071494893222,
       "query_range": {
         "start": 0,
         "end": 360
@@ -184,7 +184,7 @@
       "chrom": "chr20",
       "strand": 1,
       "coverage": 100.0,
-      "ident_pct": 99.86666666666666,
+      "percent_identity": 99.86666666666666,
       "query_range": {
         "start": 0,
         "end": 750
@@ -252,7 +252,7 @@
       "chrom": "chr11",
       "strand": 1,
       "coverage": 100.0,
-      "ident_pct": 100.0,
+      "percent_identity": 100.0,
       "query_range": {
         "start": 0,
         "end": 187
@@ -280,7 +280,7 @@
       "chrom": "chr16",
       "strand": 1,
       "coverage": 100.0,
-      "ident_pct": 100.0,
+      "percent_identity": 100.0,
       "query_range": {
         "start": 0,
         "end": 477
@@ -348,7 +348,7 @@
       "chrom": "chr3",
       "strand": -1,
       "coverage": 100.0,
-      "ident_pct": 100.0,
+      "percent_identity": 100.0,
       "query_range": {
         "start": 0,
         "end": 12
@@ -376,7 +376,7 @@
       "chrom": "chr3",
       "strand": -1,
       "coverage": 100.0,
-      "ident_pct": 100.0,
+      "percent_identity": 100.0,
       "query_range": {
         "start": 0,
         "end": 117
@@ -412,7 +412,7 @@
       "chrom": "chr17",
       "strand": -1,
       "coverage": 99.83079526226734,
-      "ident_pct": 99.91525423728814,
+      "percent_identity": 99.91525423728814,
       "query_range": {
         "start": 0,
         "end": 1180
@@ -512,7 +512,7 @@
       "chrom": "chr17",
       "strand": -1,
       "coverage": 100.0,
-      "ident_pct": 100.0,
+      "percent_identity": 100.0,
       "query_range": {
         "start": 0,
         "end": 300
@@ -564,7 +564,7 @@
       "chrom": "chr2",
       "strand": -1,
       "coverage": 97.05882352941177,
-      "ident_pct": 100.0,
+      "percent_identity": 100.0,
       "query_range": {
         "start": 9,
         "end": 306
diff --git a/tests/test_align.py b/tests/test_align.py
index 468d719..0dda37c 100644
--- a/tests/test_align.py
+++ b/tests/test_align.py
@@ -6,18 +6,332 @@
 
 
 """
+from unittest.mock import patch
+
+import numpy as np
 import pytest
+from Bio import Align as BioAlign
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from dcd_mapping.align import (
+    _blat_score,
+    _build_alignment_qc,
+    _compact_alignment_string,
+    _get_best_hsp,
+    align,
+)
+from dcd_mapping.exceptions import AlignmentError
+from dcd_mapping.schemas import (
+    AlignmentQc,
+    AlignmentResult,
+    ScoresetMetadata,
+    TargetGene,
+    TargetType,
+)
+
+
+def _make_aln(
+    query_seq: str,
+    target_seq: str,
+    q_start: int = 0,
+    t_start: int = 0,
+    coords: "np.ndarray | None" = None,
+) -> BioAlign.Alignment:
+    """Build a minimal Bio.Align.Alignment for unit-testing _build_alignment_qc."""
+    n = len(query_seq)
+    t_len = max(t_start + n + 10, len(target_seq) + 10)
+    # Pad target so sequence slicing is always in bounds
+    padded_target = target_seq.ljust(t_len, "N")
+    target = SeqRecord(Seq(padded_target), id="chr1")
+    query = SeqRecord(Seq(query_seq), id="query")
+    if coords is None:
+        coords = np.array([[t_start, t_start + n], [q_start, q_start + n]])
+    return BioAlign.Alignment(sequences=[target, query], coordinates=coords)
+
+
+class TestBuildAlignmentQc:
+    def test_exact_match(self):
+        seq = "ACGTACGTAC"
+        aln = _make_aln(seq, seq)
+        qc = _build_alignment_qc(aln)
+        assert isinstance(qc, AlignmentQc)
+        assert qc.mismatch_count == 0
+        assert qc.gap_count == 0
+        assert qc.mismatch_positions == []
+        assert qc.mismatch_positions_unavailable is False
+        assert qc.gap_intervals == []
+        assert qc.cigar == "10M"
+        assert qc.percent_identity == pytest.approx(100.0)
+
+    def test_single_mismatch(self):
+        query = "ACGTACGTAC"
+        # position 3: T->G mismatch
+        target = "ACGGACGTAC"
+        aln = _make_aln(query, target)
+        qc = _build_alignment_qc(aln)
+        assert qc.mismatch_count == 1
+        assert qc.mismatch_positions == [3]
+
+    def test_gap_in_ref(self):
+        """A gap appears when target has fewer bases than query (insertion in query)."""
+        target_seq = "ACGTAACGTA"  # 10 bases
+        query_seq = "ACGTAGGACGTA"  # 12 bases (2 extra at position 5-7)
+        target = SeqRecord(Seq(target_seq + "N" * 10), id="chr1")
+        query = SeqRecord(Seq(query_seq), id="query")
+        coords = np.array([[0, 5, 5, 10], [0, 5, 7, 12]])
+        aln = BioAlign.Alignment(sequences=[target, query], coordinates=coords)
+        qc = _build_alignment_qc(aln)
+        assert qc.gap_count == 1
+        assert len(qc.gap_intervals) == 1
+
+    def test_gap_in_query(self):
+        """A gap appears when query has fewer bases than target (deletion in query)."""
+        target_seq = "ACGTANNNACGTA"  # 13 bases
+        query_seq = "ACGTAACGTA"  # 10 bases
+        target = SeqRecord(Seq(target_seq + "N" * 10), id="chr1")
+        query = SeqRecord(Seq(query_seq), id="query")
+        coords = np.array([[0, 5, 8, 13], [0, 5, 5, 10]])
+        aln = BioAlign.Alignment(sequences=[target, query], coordinates=coords)
+        qc = _build_alignment_qc(aln)
+        assert qc.gap_count == 1
+        assert len(qc.gap_intervals) == 1
+        gs, ge = qc.gap_intervals[0]
+        assert ge - gs == 3
+
+    def test_multi_block_alignment(self):
+        """Two aligned blocks with a gap between them."""
+        seq = "ACGTAACGTA"  # 10-base query
+        target_seq = seq[:5] + "NNN" + seq[5:]  # 13-base target with 3-base insertion
+        target = SeqRecord(Seq(target_seq + "N" * 10), id="chr1")
+        query = SeqRecord(Seq(seq), id="query")
+        coords = np.array([[0, 5, 8, 13], [0, 5, 5, 10]])
+        aln = BioAlign.Alignment(sequences=[target, query], coordinates=coords)
+        qc = _build_alignment_qc(aln)
+        # alignment_length = target span = coords[0][-1] - coords[0][0] = 13
+        assert qc.alignment_length == 13
+        assert qc.gap_count == 1
+        assert qc.mismatch_count == 0
+
+    def test_undefined_sequence_fallback(self):
+        """_build_alignment_qc falls back gracefully when sequence content is unavailable.
+
+        Biopython's PSL parser leaves SeqRecords with undefined content when BLAT's
+        pslx output omits per-block sequences (common in protein-vs-protein mode).
+        Slicing raises UndefinedSequenceError; the function should fall back to an
+        empty mismatch_positions list rather than crashing, and must set
+        mismatch_positions_unavailable=True so callers know at_mismatched_locus is
+        unreliable when mismatch_count > 0.
+        """
+        target = SeqRecord(Seq(None, length=20), id="reference")
+        query = SeqRecord(Seq(None, length=10), id="query")
+        coords = np.array([[0, 10], [0, 10]])
+        aln = BioAlign.Alignment(sequences=[target, query], coordinates=coords)
+
+        qc = _build_alignment_qc(aln)
+
+        assert qc.mismatch_positions == []
+        assert qc.mismatch_positions_unavailable is True
+        # gap_intervals and gap_count are coordinate-driven (no sequence needed)
+        assert qc.gap_count == 0
+
+    def test_mismatch_positions_after_sequence_attachment(self):
+        """Attaching sequences to an undefined alignment enables per-base mismatch detection.
+
+        This mirrors what align_target_to_protein does before calling _build_alignment_qc:
+        the PSL gives coordinates; we supply the original sequences so that slicing works.
+        """
+        reference = "MKALVDQSLR"
+        query_seq = "MKALXDQZLR"  # mismatches at positions 4 ('X') and 7 ('Z')
+
+        target = SeqRecord(Seq(None, length=len(reference)), id="reference")
+        query = SeqRecord(Seq(None, length=len(query_seq)), id="query")
+        coords = np.array([[0, len(reference)], [0, len(query_seq)]])
+        aln = BioAlign.Alignment(sequences=[target, query], coordinates=coords)
+
+        # Mimic align_target_to_protein: attach sequences before QC
+        aln.target.seq = Seq(reference)
+        aln.query.seq = Seq(query_seq)
+
+        qc = _build_alignment_qc(aln)
+
+        assert 4 in qc.mismatch_positions
+        assert 7 in qc.mismatch_positions
+        assert len(qc.mismatch_positions) == 2
+        assert qc.mismatch_count == 2
+
+    def test_minus_strand_alignment(self):
+        """_build_alignment_qc works for minus-strand PSL alignments.
+
+        Biopython's SAM formatter raises 'Unequal step sizes' for PSL
+        alignments where the query is on the minus strand, because query
+        coordinates are stored in descending order.  We build CIGAR from
+        coordinates directly so this is a non-issue.
+        """
+        # 8-base sequences; minus-strand: query[8:0] aligns to target[10:18].
+        # Biopython stores minus-strand query coords as descending.
+        seq = "ACGTACGT"
+        target = SeqRecord(Seq(seq * 20), id="chr1")
+        query = SeqRecord(Seq(seq), id="query")
+        coords = np.array([[10, 18], [8, 0]])
+        aln = BioAlign.Alignment(sequences=[target, query], coordinates=coords)
+
+        qc = _build_alignment_qc(aln)
+
+        assert qc.cigar == "8M"
+        assert qc.gap_count == 0
+
+
+class TestCompactAlignmentString:
+    """Unit tests for _compact_alignment_string.
 
-from dcd_mapping.align import align
-from dcd_mapping.schemas import AlignmentResult
+    The function collapses consecutive all-'?' target-sequence blocks (intronic
+    regions in cDNA→genome BLAT output) into summary lines of the form
+    ``... [N bp gap: chrom:start-end] ...``.
+    """
+
+    def _make_group(
+        self, chrom: str, start: int, seq: str, match_row: str, query_seq: str
+    ) -> str:
+        """Build a single Biopython-style 3-line display group."""
+        return f"{chrom} {start} {seq}\n           {match_row}\n query 0    {query_seq}"
+
+    def test_noop_on_no_gap_blocks(self):
+        """A string with no all-'?' target rows is returned unchanged."""
+        group = self._make_group("chr1", 100, "ACGT", "||||", "ACGT")
+        assert _compact_alignment_string(group) == group
+
+    def test_single_gap_block_collapsed(self):
+        """One all-'?' block is replaced with a summary line."""
+        exon1 = self._make_group("chr1", 100, "ACGT", "||||", "ACGT")
+        intron = self._make_group("chr1", 104, "????", "    ", "----")
+        exon2 = self._make_group("chr1", 108, "TTTT", "||||", "TTTT")
+        inp = "\n\n".join([exon1, intron, exon2])
+        out = _compact_alignment_string(inp)
+        # Intron block replaced; exon blocks preserved
+        assert "... [4 bp gap: chr1:104-108] ..." in out
+        assert "ACGT" in out
+        assert "TTTT" in out
+        # No raw '????' remaining
+        assert "????" not in out
+
+    def test_consecutive_gap_blocks_merged(self):
+        """Multiple consecutive all-'?' blocks are merged into a single summary."""
+        exon = self._make_group("chr1", 0, "ACGT", "||||", "ACGT")
+        intron1 = self._make_group("chr1", 4, "?" * 60, " " * 60, "-" * 60)
+        intron2 = self._make_group("chr1", 64, "?" * 60, " " * 60, "-" * 60)
+        inp = "\n\n".join([exon, intron1, intron2])
+        out = _compact_alignment_string(inp)
+        # Both intron blocks merged into one summary
+        assert "... [120 bp gap: chr1:4-124] ..." in out
+        assert out.count("... [") == 1
+
+    def test_mixed_sequence_block_not_collapsed(self):
+        """A block with mixed '?' and real bases is left as-is."""
+        mixed = self._make_group("chr1", 0, "AC?T", "|| |", "ACGT")
+        out = _compact_alignment_string(mixed)
+        assert "AC?T" in out
+        assert "... [" not in out
+
+    def test_protein_alignment_noop(self):
+        """Protein alignments have no '?' characters; function is a no-op."""
+        protein_group = "NP_001234.1 0 MAAAA\n          |||||\n query 0   MAAAA"
+        assert _compact_alignment_string(protein_group) == protein_group
+
+    def test_comma_formatting_in_large_gap(self):
+        """Gap length ≥ 1000 uses comma-formatted number."""
+        exon = self._make_group("chr1", 0, "ACGT", "||||", "ACGT")
+        # Build consecutive 60-bp intron blocks totalling > 1000 bp
+        introns = [
+            self._make_group("chr1", 4 + i * 60, "?" * 60, " " * 60, "-" * 60)
+            for i in range(17)  # 17 * 60 = 1020 bp
+        ]
+        inp = "\n\n".join([exon, *introns])
+        out = _compact_alignment_string(inp)
+        assert "1,020 bp gap" in out
+
+
+class TestBlatScore:
+    """_blat_score and _get_best_hsp regression tests.
+
+    The key invariant: _get_best_hsp must use BLAT PSL score
+    (identities - mismatches - qNumInsert - tNumInsert), NOT raw identity
+    count.  These tests construct two alignments where the two metrics
+    disagree and verify that BLAT-score-based selection wins.
+    """
+
+    def _make_noisy_aln(self) -> BioAlign.Alignment:
+        """Alignment A: 10 identity matches + 8 mismatches in one block.
+
+        BLAT score  = 10 - 8 = 2
+        identity count = 10
+        """
+        # First 10 chars identical, last 8 mismatched (N vs T)
+        target_seq = "ACGTACGTAC" + "TTTTTTTT"
+        query_seq = "ACGTACGTAC" + "NNNNNNNN"
+        padded = target_seq.ljust(len(target_seq) + 10, "A")
+        target = SeqRecord(Seq(padded), id="chr1")
+        query = SeqRecord(Seq(query_seq), id="query_noisy")
+        coords = np.array([[0, len(query_seq)], [0, len(query_seq)]])
+        return BioAlign.Alignment(sequences=[target, query], coordinates=coords)
+
+    def _make_clean_aln(self) -> BioAlign.Alignment:
+        """Alignment B: 7 perfect matches, no mismatches.
+
+        BLAT score  = 7 - 0 = 7
+        identity count = 7
+
+        Shorter but cleaner: should be preferred by _get_best_hsp.
+        """
+        seq = "ACGTACG"
+        padded = seq.ljust(len(seq) + 10, "A")
+        target = SeqRecord(Seq(padded), id="chr1")
+        query = SeqRecord(Seq(seq), id="query_clean")
+        coords = np.array([[0, len(seq)], [0, len(seq)]])
+        return BioAlign.Alignment(sequences=[target, query], coordinates=coords)
+
+    def test_blat_score_noisy_lower_than_clean(self):
+        """Noisy alignment has more identities but a lower BLAT score."""
+        noisy = self._make_noisy_aln()
+        clean = self._make_clean_aln()
+
+        noisy_counts = noisy.counts()
+        clean_counts = clean.counts()
+
+        # Identity count: noisy wins
+        assert (
+            noisy_counts.identities > clean_counts.identities
+        ), "Noisy alignment must have more raw identities for the test to be meaningful"
+        # BLAT score: clean wins
+        assert (
+            _blat_score(noisy) < _blat_score(clean)
+        ), "Clean alignment must have the higher BLAT score for the test to be meaningful"
+
+    def test_get_best_hsp_prefers_blat_score_over_identity_count(self):
+        """_get_best_hsp selects the alignment with the higher BLAT score, not
+        the higher raw identity count.
+
+        Regression guard: before the fix, the selector used
+        ``a.counts().identities``, which would have returned the noisy
+        alignment.  After the fix it must return the clean one.
+        """
+        noisy = self._make_noisy_aln()
+        clean = self._make_clean_aln()
+
+        best = _get_best_hsp([noisy, clean])
+
+        assert best is clean, (
+            "_get_best_hsp chose the noisy alignment (higher identity count) "
+            "instead of the clean one (higher BLAT score). "
+            "The ranking criterion has regressed to identity-count-only."
+        )
 
 
 def check_alignment_result_equality(actual: AlignmentResult, expected: AlignmentResult):
-    """Check correctness of alignment result against fixture"""
     assert actual.chrom == expected.chrom
     assert actual.strand == expected.strand
     assert actual.coverage == pytest.approx(expected.coverage)
-    assert actual.ident_pct == pytest.approx(expected.ident_pct)
+    assert actual.percent_identity == pytest.approx(expected.percent_identity)
     assert actual.query_range.start == expected.query_range.start
     assert actual.query_range.end == expected.query_range.end
     for a, e in zip(actual.query_subranges, expected.query_subranges, strict=False):
@@ -90,3 +404,47 @@ def test_align_tp53(scoreset_metadata_fixture, align_result_fixture):
     expected = AlignmentResult(**align_result_fixture[urn])
     assert align_result
     check_alignment_result_equality(align_result, expected)
+
+
+def test_align_raises_on_ambiguous_blat_id():
+    """align() must raise AlignmentError when a BLAT result ID matches more than one
+    target gene name after stripping whitespace and punctuation.
+
+    Regression guard for the silent-pass bug where ``matches > 1`` constructed
+    an error message but never raised it.
+    """
+    # Two gene names that both normalise to the same BLAT identifier "GENE".
+    # "GENE (isoform1)" -> split()[0].replace("(","").replace(")","") -> "GENE"
+    # "GENE (isoform2)" -> same result -> ambiguous.
+    metadata = ScoresetMetadata(
+        urn="urn:mavedb:00000001-a-1",
+        target_genes={
+            "GENE (isoform1)": TargetGene(
+                target_gene_name="GENE (isoform1)",
+                target_gene_category=TargetType.PROTEIN_CODING,
+                target_sequence="ACGT",
+                target_sequence_type="dna",
+                target_accession_id=None,
+            ),
+            "GENE (isoform2)": TargetGene(
+                target_gene_name="GENE (isoform2)",
+                target_gene_category=TargetType.PROTEIN_CODING,
+                target_sequence="ACGT",
+                target_sequence_type="dna",
+                target_accession_id=None,
+            ),
+        },
+    )
+    # _get_blat_output returns a grouped dict keyed by the shortened BLAT id.
+    # We fake a single hit for the shared prefix so the disambiguation loop runs.
+    fake_grouped = {"GENE": []}
+    fake_params = {"aligner": "blat"}
+
+    with (
+        patch(
+            "dcd_mapping.align._get_blat_output",
+            return_value=(fake_grouped, fake_params),
+        ),
+        pytest.raises(AlignmentError, match="matches multiple target gene names"),
+    ):
+        align(metadata)
diff --git a/tests/test_annotate.py b/tests/test_annotate.py
index 6a7e18e..b7af03b 100644
--- a/tests/test_annotate.py
+++ b/tests/test_annotate.py
@@ -9,17 +9,21 @@
     SequenceReference,
 )
 
+from dcd_mapping import vrs_v1_schemas
 from dcd_mapping.annotate import (
     _compute_target_gene_info_from_alignment,
     _compute_target_gene_info_from_mapped_variant_spans,
     _covered_bases_from_overlapping_genes_of_chromosomal_intervals,
+    _stamp_alignment_locus_flags,
     compute_target_gene_info,
 )
 from dcd_mapping.schemas import (
+    AlignmentQc,
     AlignmentResult,
     AnnotationLayer,
     GeneInfo,
     MappedScore,
+    ScoreAnnotation,
     ScoresetMetadata,
     SequenceRange,
     TargetGene,
@@ -56,7 +60,7 @@ def make_align(hit_intervals):
         chrom="NC_000001.11",
         strand=1,
         coverage=None,
-        ident_pct=None,
+        percent_identity=None,
         query_range=SequenceRange(start=1, end=10),
         query_subranges=[SequenceRange(start=1, end=10)],
         hit_range=SequenceRange(start=1, end=10),
@@ -161,7 +165,7 @@ def test_compute_target_gene_info_mapped_variants_path(scoreset_metadata):
         accession_id="id",
         pre_mapped=None,
         post_mapped=allele,
-        annotation_layer=AnnotationLayer.GENOMIC,
+        alignment_level=AnnotationLayer.GENOMIC,
         score=None,
         error_message=None,
     )
@@ -246,7 +250,7 @@ def make_ms(start, end):
             accession_id=f"id_{start}",
             pre_mapped=None,
             post_mapped=allele,
-            annotation_layer=AnnotationLayer.GENOMIC,
+            alignment_level=AnnotationLayer.GENOMIC,
             score=None,
             error_message=None,
         )
@@ -281,3 +285,131 @@ def fake_overlap(chrom, s, e, features=None):
         assert isinstance(res, GeneInfo)
         assert res.hgnc_symbol == "GENEZ"
         assert res.selection_method == "variants_max_covered_bases"
+
+
+# ---------------------------------------------------------------------------
+# Tests for _apply_alignment_locus_flags / mismatch_positions_unavailable
+# ---------------------------------------------------------------------------
+
+
+def _make_genomic_annotation(start: int, end: int) -> "ScoreAnnotation":
+    """Return a ScoreAnnotation with a simple genomic post_mapped Allele."""
+    allele = Allele(
+        location=SequenceLocation(
+            sequenceReference=SequenceReference(
+                refgetAccession="SQ.1234567890abcdef1234567890abcdef"
+            ),
+            start=start,
+            end=end,
+        ),
+        state=LiteralSequenceExpression(sequence="A"),
+        expressions=[],
+    )
+    return ScoreAnnotation(
+        mavedb_id=f"id_{start}",
+        pre_mapped=None,
+        post_mapped=allele,
+        alignment_level=AnnotationLayer.GENOMIC,
+        score=None,
+    )
+
+
+def _make_align_result_with_qc(**qc_kwargs) -> "AlignmentResult":
+    """Build a minimal AlignmentResult with an AlignmentQc built from qc_kwargs."""
+    qc = AlignmentQc(
+        alignment_length=100,
+        mismatch_count=qc_kwargs.pop("mismatch_count", 0),
+        gap_count=qc_kwargs.pop("gap_count", 0),
+        **qc_kwargs,
+    )
+    return AlignmentResult(
+        chrom="NC_000001.11",
+        strand=1,
+        coverage=None,
+        percent_identity=None,
+        query_range=SequenceRange(start=0, end=100),
+        query_subranges=[SequenceRange(start=0, end=100)],
+        hit_range=SequenceRange(start=0, end=100),
+        hit_subranges=[SequenceRange(start=0, end=100)],
+        alignment_qc=qc,
+    )
+
+
+def test_apply_alignment_locus_flags_mismatch_positions_unavailable_leaves_at_mismatched_locus_none():
+    """When mismatch_positions_unavailable=True and mismatch_count>0, at_mismatched_locus
+    must stay None (not evaluated) to prevent silent disagreement with mismatch_count.
+    """
+    ann = _make_genomic_annotation(10, 11)
+    assert ann.at_mismatched_locus is None  # pre-condition
+
+    align_result = _make_align_result_with_qc(
+        mismatch_count=5,
+        mismatch_positions_unavailable=True,
+    )
+    _stamp_alignment_locus_flags([ann], align_result, AnnotationLayer.GENOMIC)
+
+    assert ann.at_mismatched_locus is None, (
+        "at_mismatched_locus must remain None when mismatch_positions_unavailable=True "
+        "and mismatch_count>0 -- it cannot claim False when positions are unknown"
+    )
+    assert ann.near_gap is False  # gap_intervals empty → near_gap still evaluated
+
+
+def test_apply_alignment_locus_flags_mismatch_positions_unavailable_zero_count_stamps_false():
+    """When mismatch_positions_unavailable=True but mismatch_count==0, there are no
+    mismatches to miss, so at_mismatched_locus=False is safe.
+    """
+    ann = _make_genomic_annotation(10, 11)
+    align_result = _make_align_result_with_qc(
+        mismatch_count=0,
+        mismatch_positions_unavailable=True,
+    )
+    _stamp_alignment_locus_flags([ann], align_result, AnnotationLayer.GENOMIC)
+
+    assert ann.at_mismatched_locus is False
+    assert ann.near_gap is False
+
+
+def test_apply_alignment_locus_flags_early_exit_does_not_fire_when_count_nonzero():
+    """The early-exit 'no mismatches and no gaps → stamp False' must NOT fire when
+    mismatch_positions_unavailable=True and mismatch_count>0, even if gap_intervals
+    is also empty.  Previously the check was ``not mismatch_positions``, which is
+    always True under unavailability, causing the early-exit to stamp near_gap=False
+    on annotations whose positions could not be extracted (and thus were never evaluated).
+    The fix uses ``mismatch_count == 0`` as the ground-truth condition.
+    """
+    # A VRS v1 Allele is not an instance of ga4gh.vrs.Allele (v2), so
+    # _allele_ref_positions returns [] and the main-loop body is skipped via `continue`.
+    v1_allele = vrs_v1_schemas.Allele(
+        id="ga4gh:VA.test",
+        location=vrs_v1_schemas.SequenceLocation(
+            id="loc1",
+            sequence_id="ga4gh:SQ.test",
+            interval=vrs_v1_schemas.SequenceInterval(
+                start=vrs_v1_schemas.Number(value=10),
+                end=vrs_v1_schemas.Number(value=11),
+            ),
+        ),
+        state=vrs_v1_schemas.LiteralSequenceExpression(sequence="A"),
+    )
+    ann = ScoreAnnotation.model_construct(
+        mavedb_id="id_v1",
+        post_mapped=v1_allele,
+        alignment_level=AnnotationLayer.GENOMIC,
+        at_mismatched_locus=None,
+        near_gap=None,
+    )
+
+    align_result = _make_align_result_with_qc(
+        mismatch_count=3,
+        mismatch_positions_unavailable=True,
+    )
+    _stamp_alignment_locus_flags([ann], align_result, AnnotationLayer.GENOMIC)
+
+    # Both flags must remain None: positions could not be extracted, and the early-exit
+    # must not have fired (it would have incorrectly stamped near_gap=False).
+    assert ann.at_mismatched_locus is None
+    assert ann.near_gap is None, (
+        "near_gap must be None when positions are unextractable and mismatch_count>0 "
+        "prevents the early-exit from firing -- old code stamped False via faulty early-exit"
+    )
diff --git a/tests/test_annotate_target_mapping.py b/tests/test_annotate_target_mapping.py
new file mode 100644
index 0000000..744b225
--- /dev/null
+++ b/tests/test_annotate_target_mapping.py
@@ -0,0 +1,855 @@
+"""Tests for annotate._reference_accession_for_target_level and build_scoreset_mapping."""
+from unittest.mock import patch
+
+from cool_seq_tool.schemas import AnnotationLayer
+
+from dcd_mapping.annotate import (
+    _align_result_for_target,
+    _reference_accession_for_target_level,
+    _stamp_alignment_locus_flags,
+    build_scoreset_mapping,
+)
+from dcd_mapping.schemas import (
+    AlignmentQc,
+    AlignmentResult,
+    GeneInfo,
+    ScoreAnnotation,
+    ScoresetMapping,
+    ScoresetMetadata,
+    SequenceRange,
+    TargetGene,
+    TargetSequenceType,
+    TargetType,
+    TxSelectResult,
+    VrsVersion,
+)
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+def _make_seq_target(name: str = "GENE1") -> TargetGene:
+    return TargetGene(
+        target_gene_name=name,
+        target_gene_category=TargetType.PROTEIN_CODING,
+        target_sequence="ATGCATGC",
+        target_sequence_type=TargetSequenceType.DNA,
+        target_accession_id=None,
+    )
+
+
+def _make_acc_target(accession_id: str, name: str = "GENE1") -> TargetGene:
+    return TargetGene(
+        target_gene_name=name,
+        target_gene_category=TargetType.PROTEIN_CODING,
+        target_sequence=None,
+        target_sequence_type=None,
+        target_accession_id=accession_id,
+    )
+
+
+def _make_tx(nm: str = "NM_000001.1", np: str = "NP_000001.1") -> TxSelectResult:
+    return TxSelectResult(
+        nm=nm,
+        np=np,
+        start=0,
+        is_full_match=True,
+        sequence="MAAAA",
+        hgnc_symbol="GENE1",
+    )
+
+
+def _make_align(chrom: str = "chr1") -> AlignmentResult:
+    return AlignmentResult(
+        chrom=chrom,
+        query_range=SequenceRange(start=0, end=100),
+        query_subranges=[SequenceRange(start=0, end=100)],
+        hit_range=SequenceRange(start=1000, end=1100),
+        hit_subranges=[SequenceRange(start=1000, end=1100)],
+        aligner_parameters={"aligner": "blat", "min_score": 20, "out_format": "pslx"},
+    )
+
+
+# ---------------------------------------------------------------------------
+# _reference_accession_for_target_level
+# ---------------------------------------------------------------------------
+
+
+class TestReferenceAccessionForTargetLevel:
+    def test_protein_layer_from_tx(self):
+        target = _make_seq_target()
+        tx = _make_tx(np="NP_001234.1")
+        result = _reference_accession_for_target_level(
+            AnnotationLayer.PROTEIN, target, tx, None
+        )
+        assert result == "NP_001234.1"
+
+    def test_protein_layer_from_accession(self):
+        target = _make_acc_target("NP_001234.1")
+        result = _reference_accession_for_target_level(
+            AnnotationLayer.PROTEIN, target, None, None
+        )
+        assert result == "NP_001234.1"
+
+    def test_protein_layer_ensp_accession(self):
+        target = _make_acc_target("ENSP00000001.1")
+        result = _reference_accession_for_target_level(
+            AnnotationLayer.PROTEIN, target, None, None
+        )
+        assert result == "ENSP00000001.1"
+
+    def test_protein_layer_no_info(self):
+        target = _make_seq_target()
+        result = _reference_accession_for_target_level(
+            AnnotationLayer.PROTEIN, target, None, None
+        )
+        assert result is None
+
+    def test_cdna_layer_from_tx(self):
+        target = _make_seq_target()
+        tx = _make_tx(nm="NM_000001.1")
+        result = _reference_accession_for_target_level(
+            AnnotationLayer.CDNA, target, tx, None
+        )
+        assert result == "NM_000001.1"
+
+    def test_cdna_layer_from_accession(self):
+        target = _make_acc_target("NM_000001.1")
+        result = _reference_accession_for_target_level(
+            AnnotationLayer.CDNA, target, None, None
+        )
+        assert result == "NM_000001.1"
+
+    def test_cdna_layer_no_info(self):
+        target = _make_seq_target()
+        result = _reference_accession_for_target_level(
+            AnnotationLayer.CDNA, target, None, None
+        )
+        assert result is None
+
+    def test_genomic_layer_from_alignment(self):
+        target = _make_seq_target()
+        align = _make_align(chrom="chr1")
+        with patch(
+            "dcd_mapping.annotate.get_chromosome_identifier",
+            return_value="refseq:NC_000001.11",
+        ):
+            result = _reference_accession_for_target_level(
+                AnnotationLayer.GENOMIC, target, None, align
+            )
+        assert result == "NC_000001.11"
+
+    def test_genomic_layer_nc_accession(self):
+        target = _make_acc_target("NC_000001.11")
+        result = _reference_accession_for_target_level(
+            AnnotationLayer.GENOMIC, target, None, None
+        )
+        assert result == "NC_000001.11"
+
+    def test_genomic_layer_no_info(self):
+        target = _make_seq_target()
+        result = _reference_accession_for_target_level(
+            AnnotationLayer.GENOMIC, target, None, None
+        )
+        assert result is None
+
+
+# ---------------------------------------------------------------------------
+# build_scoreset_mapping golden test
+# ---------------------------------------------------------------------------
+
+
+def _make_annotation(
+    layer: AnnotationLayer | None,
+    post_mapped=None,
+    error_message: str | None = None,
+) -> ScoreAnnotation:
+    """Minimal ScoreAnnotation for golden test."""
+    return ScoreAnnotation(
+        mavedb_id="urn:mavedb:00000001-a-1#1",
+        alignment_level=layer,
+        pre_mapped=None,
+        post_mapped=post_mapped,
+        error_message=error_message,
+        score=None,
+    )
+
+
+class TestBuildScoresetMapping:
+    def test_emits_one_target_mapping_per_layer(self):
+        """Each (target, layer) pair appearing in mappings generates exactly one TargetMapping."""
+        metadata = ScoresetMetadata(
+            urn="urn:mavedb:00000001-a-1",
+            target_genes={
+                "GENE1": _make_seq_target("GENE1"),
+            },
+        )
+        # Two annotations: one genomic, one protein
+        g_ann = _make_annotation(AnnotationLayer.GENOMIC)
+        p_ann = _make_annotation(AnnotationLayer.PROTEIN)
+        mappings = {"GENE1": [g_ann, p_ann]}
+
+        with (
+            patch(
+                "dcd_mapping.annotate.get_vrs_id_from_identifier",
+                return_value="ga4gh:SQ.test",
+            ),
+            patch(
+                "dcd_mapping.annotate.get_chromosome_identifier",
+                return_value="NC_000001.11",
+            ),
+            patch(
+                "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer",
+                return_value=AnnotationLayer.GENOMIC,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_computed_reference_sequence",
+                return_value=None,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None
+            ),
+        ):
+            result = build_scoreset_mapping(
+                metadata=metadata,
+                raw_metadata={},
+                mappings=mappings,
+                align_results={"GENE1": _make_align()},
+                tx_output={"GENE1": _make_tx()},
+                gene_info={"GENE1": GeneInfo(hgnc_symbol="GENE1")},
+                preferred_layer_only=False,
+                vrs_version=VrsVersion.V_2,
+            )
+
+        assert isinstance(result, ScoresetMapping)
+        assert result.target_mappings is not None
+        layers_seen = {tm.alignment_level for tm in result.target_mappings}
+        assert AnnotationLayer.GENOMIC in layers_seen
+        assert AnnotationLayer.PROTEIN in layers_seen
+        # Exactly one mapping per layer
+        assert len(result.target_mappings) == 2
+
+    def test_preferred_flag_on_exactly_one_row_per_target(self):
+        """preferred=True must appear on exactly one TargetMapping per target."""
+        metadata = ScoresetMetadata(
+            urn="urn:mavedb:00000001-a-1",
+            target_genes={"GENE1": _make_seq_target("GENE1")},
+        )
+        g_ann = _make_annotation(AnnotationLayer.GENOMIC)
+        p_ann = _make_annotation(AnnotationLayer.PROTEIN)
+
+        with (
+            patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None),
+            patch(
+                "dcd_mapping.annotate.get_chromosome_identifier",
+                return_value="NC_000001.11",
+            ),
+            patch(
+                "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer",
+                return_value=AnnotationLayer.GENOMIC,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_computed_reference_sequence",
+                return_value=None,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None
+            ),
+        ):
+            result = build_scoreset_mapping(
+                metadata=metadata,
+                raw_metadata={},
+                mappings={"GENE1": [g_ann, p_ann]},
+                align_results={"GENE1": _make_align()},
+                tx_output={"GENE1": _make_tx()},
+                gene_info={"GENE1": None},
+                preferred_layer_only=False,
+                vrs_version=VrsVersion.V_2,
+            )
+
+        preferred = [tm for tm in result.target_mappings if tm.preferred]
+        assert len(preferred) == 1
+        assert preferred[0].alignment_level == AnnotationLayer.GENOMIC
+
+    def test_annotation_qc_counts_sum_correctly(self):
+        """total_variants == clean + warnings + failed."""
+        from ga4gh.vrs._internal.models import (
+            Allele,
+            LiteralSequenceExpression,
+            SequenceLocation,
+            SequenceReference,
+        )
+
+        metadata = ScoresetMetadata(
+            urn="urn:mavedb:00000001-a-1",
+            target_genes={"GENE1": _make_seq_target("GENE1")},
+        )
+
+        allele = Allele(
+            location=SequenceLocation(
+                sequenceReference=SequenceReference(refgetAccession="SQ." + "A" * 32),
+                start=0,
+                end=1,
+            ),
+            state=LiteralSequenceExpression(sequence="A"),
+        )
+
+        annotations = [
+            _make_annotation(AnnotationLayer.GENOMIC, post_mapped=allele),  # clean
+            _make_annotation(
+                AnnotationLayer.GENOMIC, post_mapped=allele, error_message="warn"
+            ),  # warning
+            _make_annotation(AnnotationLayer.GENOMIC),  # failed
+        ]
+
+        with (
+            patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None),
+            patch(
+                "dcd_mapping.annotate.get_chromosome_identifier",
+                return_value="NC_000001.11",
+            ),
+            patch(
+                "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer",
+                return_value=AnnotationLayer.GENOMIC,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_computed_reference_sequence",
+                return_value=None,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None
+            ),
+        ):
+            result = build_scoreset_mapping(
+                metadata=metadata,
+                raw_metadata={},
+                mappings={"GENE1": annotations},
+                align_results={"GENE1": _make_align()},
+                tx_output={"GENE1": _make_tx()},
+                gene_info={"GENE1": None},
+                preferred_layer_only=False,
+                vrs_version=VrsVersion.V_2,
+            )
+
+        assert result.target_mappings
+        tm = result.target_mappings[0]
+        assert tm.total_variants == 3
+        assert tm.variants_mapped_cleanly == 1
+        assert tm.variants_with_mapping_warnings == 1
+        assert tm.variants_failed == 1
+        assert (
+            (tm.variants_mapped_cleanly or 0)
+            + (tm.variants_with_mapping_warnings or 0)
+            + (tm.variants_failed or 0)
+        ) == tm.total_variants
+
+    def test_tool_parameters_for_sequence_based_target(self):
+        """tool_parameters should contain BLAT aligner key for sequence-based targets."""
+        metadata = ScoresetMetadata(
+            urn="urn:mavedb:00000001-a-1",
+            target_genes={"GENE1": _make_seq_target("GENE1")},
+        )
+        ann = _make_annotation(AnnotationLayer.GENOMIC)
+
+        with (
+            patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None),
+            patch(
+                "dcd_mapping.annotate.get_chromosome_identifier",
+                return_value="NC_000001.11",
+            ),
+            patch(
+                "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer",
+                return_value=AnnotationLayer.GENOMIC,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_computed_reference_sequence",
+                return_value=None,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None
+            ),
+        ):
+            result = build_scoreset_mapping(
+                metadata=metadata,
+                raw_metadata={},
+                mappings={"GENE1": [ann]},
+                align_results={"GENE1": _make_align()},
+                tx_output={"GENE1": _make_tx()},
+                gene_info={"GENE1": None},
+                preferred_layer_only=False,
+                vrs_version=VrsVersion.V_2,
+            )
+
+        tm = result.target_mappings[0]
+        assert tm.tool_parameters is not None
+        assert tm.tool_parameters.get("aligner") == "blat"
+        assert "min_score" in tm.tool_parameters
+
+    def test_tool_parameters_for_accession_based_target(self):
+        """tool_parameters should reference cdot for accession-based targets."""
+        metadata = ScoresetMetadata(
+            urn="urn:mavedb:00000001-a-1",
+            target_genes={"GENE1": _make_acc_target("NM_000001.1", "GENE1")},
+        )
+        ann = _make_annotation(AnnotationLayer.CDNA)
+
+        with (
+            patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None),
+            patch(
+                "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer",
+                return_value=AnnotationLayer.CDNA,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_computed_reference_sequence",
+                return_value=None,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None
+            ),
+        ):
+            result = build_scoreset_mapping(
+                metadata=metadata,
+                raw_metadata={},
+                mappings={"GENE1": [ann]},
+                align_results={"GENE1": _make_align()},
+                tx_output={"GENE1": _make_tx()},
+                gene_info={"GENE1": None},
+                preferred_layer_only=False,
+                vrs_version=VrsVersion.V_2,
+            )
+
+        tm = result.target_mappings[0]
+        assert tm.tool_parameters is not None
+        assert tm.tool_parameters.get("aligner") == "cdot_transcript_placement"
+        assert "cdot_url" in tm.tool_parameters
+
+    def test_tool_parameters_for_nc_accession_target(self):
+        """NC_ (chromosome/contig) targets should use 'direct_contig_accession', not cdot."""
+        metadata = ScoresetMetadata(
+            urn="urn:mavedb:00000001-a-1",
+            target_genes={"GENE1": _make_acc_target("NC_000001.11", "GENE1")},
+        )
+        ann = _make_annotation(AnnotationLayer.GENOMIC)
+
+        with (
+            patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None),
+            patch(
+                "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer",
+                return_value=AnnotationLayer.GENOMIC,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_computed_reference_sequence",
+                return_value=None,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None
+            ),
+        ):
+            result = build_scoreset_mapping(
+                metadata=metadata,
+                raw_metadata={},
+                mappings={"GENE1": [ann]},
+                # NC_ targets produce align_result=None in fetch_alignment
+                align_results={"NC_000001.11": None},
+                tx_output={"GENE1": _make_tx()},
+                gene_info={"GENE1": None},
+                preferred_layer_only=False,
+                vrs_version=VrsVersion.V_2,
+            )
+
+        tm = result.target_mappings[0]
+        assert tm.tool_parameters is not None
+        assert tm.tool_parameters.get("aligner") == "direct_contig_accession"
+        # Must not bleed cdot fields into a non-cdot path
+        assert "cdot_data_version" not in tm.tool_parameters
+        assert "cdot_url" not in tm.tool_parameters
+        # Accession is already in reference_accession; must not duplicate here
+        assert "accession" not in tm.tool_parameters
+
+    def test_mapped_scores_alignment_levels_subset_of_target_mappings(self):
+        """Invariant: every alignment_level in mapped_scores must have a parent TargetMapping row.
+
+        No mapped_score should be orphaned (i.e. have an alignment_level that
+        does not correspond to any TargetMapping.alignment_level for this run).
+        This mirrors the join the MaveDB API performs via target_gene_mapping_id.
+        """
+        metadata = ScoresetMetadata(
+            urn="urn:mavedb:00000001-a-1",
+            target_genes={"GENE1": _make_seq_target("GENE1")},
+        )
+        # Genomic success, protein success, and a completely-failed (NULL-layer) variant
+        g_ann = _make_annotation(AnnotationLayer.GENOMIC)
+        p_ann = _make_annotation(AnnotationLayer.PROTEIN)
+        null_ann = _make_annotation(None)  # completely failed - no layer attribution
+
+        with (
+            patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None),
+            patch(
+                "dcd_mapping.annotate.get_chromosome_identifier",
+                return_value="NC_000001.11",
+            ),
+            patch(
+                "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer",
+                return_value=AnnotationLayer.GENOMIC,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_computed_reference_sequence",
+                return_value=None,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None
+            ),
+        ):
+            result = build_scoreset_mapping(
+                metadata=metadata,
+                raw_metadata={},
+                mappings={"GENE1": [g_ann, p_ann, null_ann]},
+                align_results={"GENE1": _make_align()},
+                tx_output={"GENE1": _make_tx()},
+                gene_info={"GENE1": None},
+                preferred_layer_only=False,
+                vrs_version=VrsVersion.V_2,
+            )
+
+        assert result.mapped_scores is not None
+        assert result.target_mappings is not None
+
+        tm_levels = {tm.alignment_level for tm in result.target_mappings}
+        ms_levels = {ms.alignment_level for ms in result.mapped_scores}
+
+        # Core invariant: every alignment_level in mapped_scores has a parent row
+        assert (
+            ms_levels <= tm_levels
+        ), f"Orphaned alignment levels in mapped_scores: {ms_levels - tm_levels}"
+
+    def test_null_layer_failures_attributed_to_preferred_layer(self):
+        """NULL-layer (completely-failed) variants must be re-attributed to the preferred
+        layer so the MaveDB API can join them via alignment_level.
+        """
+        metadata = ScoresetMetadata(
+            urn="urn:mavedb:00000001-a-1",
+            target_genes={"GENE1": _make_seq_target("GENE1")},
+        )
+        g_ann = _make_annotation(AnnotationLayer.GENOMIC)
+        null_ann = _make_annotation(None)  # completely failed
+
+        with (
+            patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None),
+            patch(
+                "dcd_mapping.annotate.get_chromosome_identifier",
+                return_value="NC_000001.11",
+            ),
+            patch(
+                "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer",
+                return_value=AnnotationLayer.GENOMIC,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_computed_reference_sequence",
+                return_value=None,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None
+            ),
+        ):
+            result = build_scoreset_mapping(
+                metadata=metadata,
+                raw_metadata={},
+                mappings={"GENE1": [g_ann, null_ann]},
+                align_results={"GENE1": _make_align()},
+                tx_output={"GENE1": _make_tx()},
+                gene_info={"GENE1": None},
+                preferred_layer_only=True,
+                vrs_version=VrsVersion.V_2,
+            )
+
+        assert result.mapped_scores is not None
+        assert result.target_mappings is not None
+
+        # NULL-layer failure should appear in mapped_scores as the preferred layer
+        for ms in result.mapped_scores:
+            assert (
+                ms.alignment_level == AnnotationLayer.GENOMIC
+            ), f"Expected alignment_level=GENOMIC for all mapped_scores; got {ms.alignment_level}"
+
+        # The preferred layer's TargetMapping must count the null failure
+        tm = result.target_mappings[0]
+        assert tm.alignment_level == AnnotationLayer.GENOMIC
+        assert tm.total_variants == 2  # one genomic + one null failure
+
+    def test_preferred_layer_only_total_variants_includes_null_failures(self):
+        """With preferred_layer_only=True, total_variants in TargetMapping must
+        account for null-layer failures so len(mapped_scores) == total_variants.
+        """
+        metadata = ScoresetMetadata(
+            urn="urn:mavedb:00000001-a-1",
+            target_genes={"GENE1": _make_seq_target("GENE1")},
+        )
+        # 3 genomic successes + 2 completely-failed variants
+        g_anns = [_make_annotation(AnnotationLayer.GENOMIC) for _ in range(3)]
+        null_anns = [_make_annotation(None) for _ in range(2)]
+
+        with (
+            patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None),
+            patch(
+                "dcd_mapping.annotate.get_chromosome_identifier",
+                return_value="NC_000001.11",
+            ),
+            patch(
+                "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer",
+                return_value=AnnotationLayer.GENOMIC,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_computed_reference_sequence",
+                return_value=None,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None
+            ),
+        ):
+            result = build_scoreset_mapping(
+                metadata=metadata,
+                raw_metadata={},
+                mappings={"GENE1": g_anns + null_anns},
+                align_results={"GENE1": _make_align()},
+                tx_output={"GENE1": _make_tx()},
+                gene_info={"GENE1": None},
+                preferred_layer_only=True,
+                vrs_version=VrsVersion.V_2,
+            )
+
+        assert result.mapped_scores is not None
+        assert result.target_mappings is not None
+
+        tm = result.target_mappings[0]
+        assert len(result.mapped_scores) == 5
+        assert tm.total_variants == 5
+        assert tm.variants_failed == 5  # all have pre_mapped=None (failed)
+
+
+# ---------------------------------------------------------------------------
+# Locus flag stamping
+# ---------------------------------------------------------------------------
+
+
+def _make_allele_annotation(
+    start: int, end: int, layer: AnnotationLayer
+) -> ScoreAnnotation:
+    """Build a ScoreAnnotation with a minimal VRS 2 Allele post_mapped."""
+    from ga4gh.vrs._internal.models import (
+        Allele,
+        LiteralSequenceExpression,
+        SequenceLocation,
+        SequenceReference,
+    )
+
+    allele = Allele(
+        location=SequenceLocation(
+            sequenceReference=SequenceReference(refgetAccession="SQ." + "A" * 32),
+            start=start,
+            end=end,
+        ),
+        state=LiteralSequenceExpression(sequence="T"),
+    )
+    return ScoreAnnotation(
+        mavedb_id=f"urn:mavedb:00000001-a-1#{start}",
+        alignment_level=layer,
+        post_mapped=allele,
+        score=None,
+    )
+
+
+def _make_rle_annotation(start: int, end: int) -> ScoreAnnotation:
+    """Build a ScoreAnnotation with a ReferenceLengthExpression (reference-identical)."""
+    from ga4gh.vrs._internal.models import (
+        Allele,
+        ReferenceLengthExpression,
+        SequenceLocation,
+        SequenceReference,
+    )
+
+    allele = Allele(
+        location=SequenceLocation(
+            sequenceReference=SequenceReference(refgetAccession="SQ." + "A" * 32),
+            start=start,
+            end=end,
+        ),
+        state=ReferenceLengthExpression(length=end - start, sequence=None),
+    )
+    return ScoreAnnotation(
+        mavedb_id=f"urn:mavedb:00000001-a-1#rle{start}",
+        alignment_level=AnnotationLayer.GENOMIC,
+        post_mapped=allele,
+        score=None,
+    )
+
+
+def _make_align_result_with_qc(
+    mismatches: list[int],
+    gap_intervals: list[tuple[int, int]],
+    mismatch_count: int | None = None,
+) -> AlignmentResult:
+    qc = AlignmentQc(
+        alignment_length=1000,
+        mismatch_count=mismatch_count
+        if mismatch_count is not None
+        else len(mismatches),
+        gap_count=len(gap_intervals),
+        mismatch_positions=mismatches,
+        gap_intervals=gap_intervals,
+    )
+    return AlignmentResult(
+        chrom="chr1",
+        query_range=SequenceRange(start=0, end=100),
+        query_subranges=[SequenceRange(start=0, end=100)],
+        hit_range=SequenceRange(start=0, end=1000),
+        hit_subranges=[SequenceRange(start=0, end=1000)],
+        alignment_qc=qc,
+    )
+
+
+class TestStampAlignmentLocusFlags:
+    def test_variant_at_mismatch(self):
+        ann = _make_allele_annotation(10, 11, AnnotationLayer.GENOMIC)
+        align = _make_align_result_with_qc(mismatches=[10], gap_intervals=[])
+        _stamp_alignment_locus_flags([ann], align, AnnotationLayer.GENOMIC)
+        assert ann.at_mismatched_locus is True
+        assert ann.near_gap is False
+
+    def test_variant_near_gap(self):
+        ann = _make_allele_annotation(10, 11, AnnotationLayer.GENOMIC)
+        # Gap at [20, 30); within 5-base window of position 10+5=15 < 20, so NOT near
+        # Place gap closer: [13, 20)
+        align = _make_align_result_with_qc(mismatches=[], gap_intervals=[(13, 20)])
+        _stamp_alignment_locus_flags(
+            [ann], align, AnnotationLayer.GENOMIC, near_gap_window=5
+        )
+        assert ann.at_mismatched_locus is False
+        assert ann.near_gap is True  # pos 10, window end 15, gap start 13 < 15
+
+    def test_variant_not_near_gap(self):
+        ann = _make_allele_annotation(0, 1, AnnotationLayer.GENOMIC)
+        # Gap far away
+        align = _make_align_result_with_qc(mismatches=[], gap_intervals=[(1000, 1100)])
+        _stamp_alignment_locus_flags([ann], align, AnnotationLayer.GENOMIC)
+        assert ann.near_gap is False
+
+    def test_rle_allele_flags_left_as_none(self):
+        """RLE alleles (reference-identical) must leave both flags as None."""
+        ann = _make_rle_annotation(0, 198295559)  # whole-chromosome span
+        # Use a real mismatch/gap so the function would flag a normal allele
+        align = _make_align_result_with_qc(
+            mismatches=list(range(100)),
+            gap_intervals=[(200, 300)],
+        )
+        _stamp_alignment_locus_flags([ann], align, AnnotationLayer.GENOMIC)
+        assert ann.at_mismatched_locus is None
+        assert ann.near_gap is None
+
+    def test_idempotent_double_call(self):
+        """Calling _stamp_alignment_locus_flags twice must not change the result."""
+        ann = _make_allele_annotation(10, 11, AnnotationLayer.GENOMIC)
+        align = _make_align_result_with_qc(mismatches=[10], gap_intervals=[(15, 25)])
+        _stamp_alignment_locus_flags([ann], align, AnnotationLayer.GENOMIC)
+        first_at = ann.at_mismatched_locus
+        first_gap = ann.near_gap
+        _stamp_alignment_locus_flags([ann], align, AnnotationLayer.GENOMIC)
+        assert ann.at_mismatched_locus == first_at
+        assert ann.near_gap == first_gap
+
+    def test_no_align_result_leaves_flags_none(self):
+        ann = _make_allele_annotation(10, 11, AnnotationLayer.GENOMIC)
+        _stamp_alignment_locus_flags([ann], None, AnnotationLayer.GENOMIC)
+        assert ann.at_mismatched_locus is None
+        assert ann.near_gap is None
+
+
+# ---------------------------------------------------------------------------
+# _align_result_for_target
+# ---------------------------------------------------------------------------
+
+
+class TestAlignResultForTarget:
+    def test_sequence_based_found_by_gene_name(self):
+        metadata = ScoresetMetadata(
+            urn="urn:mavedb:00000001-a-1",
+            target_genes={"GENE1": _make_seq_target("GENE1")},
+        )
+        align = _make_align()
+        result = _align_result_for_target("GENE1", metadata, {"GENE1": align})
+        assert result is align
+
+    def test_accession_based_found_by_accession(self):
+        metadata = ScoresetMetadata(
+            urn="urn:mavedb:00000001-a-1",
+            target_genes={"GENE1": _make_acc_target("NM_000001.1", "GENE1")},
+        )
+        align = _make_align()
+        # align_results keyed by accession (as produced by parse_cdot_mapping)
+        result = _align_result_for_target("GENE1", metadata, {"NM_000001.1": align})
+        assert result is align
+
+    def test_returns_none_when_not_found(self):
+        metadata = ScoresetMetadata(
+            urn="urn:mavedb:00000001-a-1",
+            target_genes={"GENE1": _make_seq_target("GENE1")},
+        )
+        result = _align_result_for_target("GENE1", metadata, {})
+        assert result is None
+
+
+# ---------------------------------------------------------------------------
+# accession-based tool_parameters: cdot_data_version propagation
+# ---------------------------------------------------------------------------
+
+
+class TestCdotDataVersionPropagation:
+    def test_cdot_data_version_propagates_when_present(self):
+        """tool_parameters.cdot_data_version is emitted even when explicitly None."""
+        metadata = ScoresetMetadata(
+            urn="urn:mavedb:00000001-a-1",
+            target_genes={"GENE1": _make_acc_target("NM_000001.1", "GENE1")},
+        )
+        ann = _make_annotation(AnnotationLayer.CDNA)
+
+        align_with_version = AlignmentResult(
+            chrom="chr1",
+            query_range=SequenceRange(start=0, end=100),
+            query_subranges=[SequenceRange(start=0, end=100)],
+            hit_range=SequenceRange(start=1000, end=1100),
+            hit_subranges=[SequenceRange(start=1000, end=1100)],
+            aligner_parameters={
+                "aligner": "cdot_transcript_placement",
+                "cdot_data_version": "0.2.26",
+            },
+        )
+
+        with (
+            patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None),
+            patch(
+                "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer",
+                return_value=AnnotationLayer.CDNA,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_computed_reference_sequence",
+                return_value=None,
+            ),
+            patch(
+                "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None
+            ),
+        ):
+            result = build_scoreset_mapping(
+                metadata=metadata,
+                raw_metadata={},
+                mappings={"GENE1": [ann]},
+                align_results={"NM_000001.1": align_with_version},
+                tx_output={"GENE1": _make_tx()},
+                gene_info={"GENE1": None},
+                preferred_layer_only=False,
+                vrs_version=VrsVersion.V_2,
+            )
+
+        tm = result.target_mappings[0]
+        assert tm.tool_parameters is not None
+        assert "cdot_data_version" in tm.tool_parameters
+        assert tm.tool_parameters["cdot_data_version"] == "0.2.26"
diff --git a/tests/test_transcript.py b/tests/test_transcript.py
index 9219e39..bd8548a 100644
--- a/tests/test_transcript.py
+++ b/tests/test_transcript.py
@@ -309,7 +309,7 @@ def fake_get_sequence(ac):
         chrom="chr1",
         strand=1,
         coverage=None,
-        ident_pct=None,
+        percent_identity=None,
         query_range=SequenceRange(start=1, end=6),
         query_subranges=[SequenceRange(start=1, end=6)],
         hit_range=SequenceRange(start=1, end=6),
diff --git a/tests/test_vrs_map.py b/tests/test_vrs_map.py
index 20b0244..a52b5ca 100644
--- a/tests/test_vrs_map.py
+++ b/tests/test_vrs_map.py
@@ -29,7 +29,7 @@ def _assert_correct_vrs_map(
     expected_mappings_data: dict[tuple[str, AnnotationLayer], dict],
 ):
     """Note that we're testing against VRS 1.3 VA IDs (temporary?)."""
-    key = (mapping.accession_id, mapping.annotation_layer)
+    key = (mapping.accession_id, mapping.alignment_level)
     assert (
         key in expected_mappings_data
     ), "Score row/layer combination is not in expected mappings"
@@ -116,7 +116,7 @@ def test_2_a_2(
                 align_result=align_result[target_gene],
                 records=records[target_gene],
                 transcript=tx_result[target_gene],
-            )
+            ).mappings
     assert mappings["hYAP65 WW domain"] is not None
     assert len(mappings["hYAP65 WW domain"]) == 1
 
@@ -178,7 +178,7 @@ def test_41_a_1(
                 align_result=align_result[target_gene],
                 records=records[target_gene],
                 transcript=tx_result[target_gene],
-            )
+            ).mappings
     assert mappings["Src catalytic domain"] is not None
     assert len(mappings["Src catalytic domain"]) == 5
 
@@ -251,7 +251,7 @@ def test_99_a_1(
                 align_result=align_result[target_gene],
                 records=records[target_gene],
                 transcript=tx_result[target_gene],
-            )
+            ).mappings
     assert mappings["RHO"] is not None
     assert len(mappings["RHO"]) == 8  # includes protein and genomic for all 4 rows
 
@@ -309,7 +309,7 @@ def test_103_c_1(
                 align_result=align_result[target_gene],
                 records=records[target_gene],
                 transcript=tx_result[target_gene],
-            )
+            ).mappings
     assert mappings["MAPK1"] is not None
     assert len(mappings["MAPK1"]) == 4
     for m in mappings["MAPK1"]:
@@ -378,7 +378,7 @@ def test_1_b_2(
                 align_result=align_result[target_gene],
                 records=records[target_gene],
                 transcript=tx_result[target_gene],
-            )
+            ).mappings
     assert mappings["SUMO1"] is not None
     assert len(mappings["SUMO1"]) == 8
     for m in mappings["SUMO1"]:

From e017c701f18375ca2eab4f8373f61fd577b092f6 Mon Sep 17 00:00:00 2001
From: Benjamin Capodanno <bencap@outlook.com>
Date: Tue, 28 Apr 2026 16:33:06 -0700
Subject: [PATCH 2/6] fix(align): handle protein-vs-DNA and malformed PSL
 blocks in _build_alignment_qc

- protein-vs-DNA (-q=prot -t=dnax) BLAT runs store target coords in
  nucleotide space and query coords in amino-acid space (3:1 ratio);
  minus-strand target blocks have ts > te, making seq[ts:te] return "".
  Comparison was crashing with ValueError from zip(strict=True); the
  per-base mismatch loop is now skipped entirely for this mode, setting
  mismatch_positions_unavailable=True so at_mismatched_locus is
  correctly left as None (not evaluated) rather than a false False.
  The preferred layer for protein scoresets is PROTEIN, flagged from
  the downstream protein-to-protein alignment, so no signal is lost.

- For nucleotide-vs-nucleotide runs, replace the bare zip(strict=True)
  with an explicit length-mismatch guard that logs a WARNING and falls
  through to zip(strict=False), preserving all mismatches in the
  overlapping prefix rather than crashing or discarding the block.
---
 src/dcd_mapping/align.py | 59 ++++++++++++++++++++++++++++++++--------
 1 file changed, 47 insertions(+), 12 deletions(-)

diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py
index de547af..14dd060 100644
--- a/src/dcd_mapping/align.py
+++ b/src/dcd_mapping/align.py
@@ -495,7 +495,9 @@ def _cigar_from_coords(aln: "BioAlign.Alignment") -> str | None:
     return "".join(parts) if parts else None
 
 
-def _build_alignment_qc(aln: "BioAlign.Alignment") -> "AlignmentQc":
+def _build_alignment_qc(
+    aln: "BioAlign.Alignment", protein_vs_dna: bool = False
+) -> "AlignmentQc":
     """Derive the per-(target, alignment_level) QC block from a Bio.Align
     ``Alignment`` parsed out of BLAT's pslx output.
 
@@ -512,6 +514,10 @@ def _build_alignment_qc(aln: "BioAlign.Alignment") -> "AlignmentQc":
     must treat ``at_mismatched_locus`` as ``None`` (not evaluated) in that
     situation to avoid a silent disagreement where ``mismatch_count > 0`` but
     no variant is ever flagged.
+
+    :param protein_vs_dna: set ``True`` for ``-q=prot -t=dnax`` BLAT runs.
+        Skips per-base mismatch detection (target/query coordinates are in
+        incompatible spaces) and sets ``mismatch_positions_unavailable=True``.
     """
     counts = aln.counts()
     coords = aln.coordinates
@@ -519,15 +525,23 @@ def _build_alignment_qc(aln: "BioAlign.Alignment") -> "AlignmentQc":
     gap_intervals: list[tuple[int, int]] = []
     query_insert_blocks = 0
 
-    # Per-base mismatch detection requires concrete sequence content for both
-    # target and query. Biopython's pslx parser sometimes leaves bases outside
-    # aligned blocks (or even within, depending on BLAT output) as undefined
-    # placeholders. The aggregate mismatch *count* still comes from
-    # ``aln.counts()`` below; we treat per-position recording as best-effort
-    # and set mismatch_positions_unavailable=True if the sequence content isn't
-    # materialized so callers know at_mismatched_locus is unreliable.
-    mismatch_positions_unavailable = False
-    sequences_available = True
+    # Per-base mismatch detection requires sequence content from pslx tseqs/qseqs.
+    # BioPython sometimes leaves these as undefined placeholders; aggregate
+    # mismatch *count* from counts() is still accurate in that case.  We treat
+    # per-position recording as best-effort and set mismatch_positions_unavailable
+    # so callers leave at_mismatched_locus as None rather than a misleading False.
+    #
+    # For -q=prot -t=dnax runs, target coordinates are in nucleotide space and
+    # query coordinates are in amino-acid space (3:1 ratio).  Minus-strand target
+    # blocks have ts > te, making aln.target.seq[ts:te] return "".  Per-base
+    # comparison is impossible, so we skip it entirely (sequences_available=False).
+    # This is fine: for protein scoresets the preferred layer is PROTEIN, flagged
+    # from the protein-to-protein alignment in vrs_map.py — not this genomic one.
+    # The gap_intervals walk below still runs; its output is stored in AlignmentQc
+    # but not consumed for near_gap in the typical protein-only case (no GENOMIC
+    # layer variants are emitted when hgvs_nt is absent).
+    mismatch_positions_unavailable = protein_vs_dna
+    sequences_available = not protein_vs_dna
     for i in range(coords.shape[1] - 1):
         ts, te = int(coords[0][i]), int(coords[0][i + 1])
         qs, qe = int(coords[1][i]), int(coords[1][i + 1])
@@ -570,7 +584,27 @@ def _build_alignment_qc(aln: "BioAlign.Alignment") -> "AlignmentQc":
                 sequences_available = False
                 continue
 
-            for j, (tb, qb) in enumerate(zip(t_bases, q_bases, strict=True)):
+            # PSL should guarantee equal-length aligned blocks for
+            # nucleotide-vs-nucleotide runs (protein-vs-DNA is excluded
+            # above via sequences_available=False).  This branch is a
+            # defensive fallback for malformed BLAT output.  Compare the
+            # overlapping prefix rather than discarding the entire block.
+            if len(t_bases) != len(q_bases):
+                _logger.warning(
+                    "Aligned-block sequence slices have unequal lengths "
+                    "(%d vs %d) for target block [%d:%d] query block [%d:%d]. "
+                    "Comparing the overlapping prefix; %d base(s) at the tail "
+                    "of the longer slice will not be assessed for mismatches.",
+                    len(t_bases),
+                    len(q_bases),
+                    ts,
+                    te,
+                    qs,
+                    qe,
+                    abs(len(t_bases) - len(q_bases)),
+                )
+
+            for j, (tb, qb) in enumerate(zip(t_bases, q_bases, strict=False)):
                 if qb.upper() != tb.upper():
                     mismatch_positions.append(ts + j)
 
@@ -906,7 +940,8 @@ def _get_best_match(
         hit_subranges.append(SequenceRange(start=ts, end=te))
         query_subranges.append(SequenceRange(start=min(qs, qe), end=max(qs, qe)))
 
-    alignment_qc = _build_alignment_qc(best_aln)
+    protein_vs_dna = "-q=prot" in blat_params.get("target_args", "")
+    alignment_qc = _build_alignment_qc(best_aln, protein_vs_dna=protein_vs_dna)
 
     return AlignmentResult(
         aligner_parameters=blat_params,

From b449001a90b70db3f56a0c4a8b027d80684acb60 Mon Sep 17 00:00:00 2001
From: Benjamin Capodanno <bencap@outlook.com>
Date: Wed, 29 Apr 2026 10:43:14 -0700
Subject: [PATCH 3/6] fix(annotate): ensure JSON output is expanded by
 serializing without JSON mode

---
 src/api/routers/map.py      | 2 +-
 src/dcd_mapping/annotate.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/api/routers/map.py b/src/api/routers/map.py
index cc0149b..495babd 100644
--- a/src/api/routers/map.py
+++ b/src/api/routers/map.py
@@ -248,4 +248,4 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
             ).model_dump(exclude_none=True)
         )
 
-    return JSONResponse(content=output.model_dump(mode="json", exclude_none=True))
+    return JSONResponse(content=output.model_dump(exclude_none=True))
diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py
index 2f6c291..72296d9 100644
--- a/src/dcd_mapping/annotate.py
+++ b/src/dcd_mapping/annotate.py
@@ -1107,9 +1107,7 @@ def write_scoreset_mapping_to_json(
     _logger.info("Saving mapping output to %s", output_path)
     with output_path.open("w") as file:
         json.dump(
-            scoreset_mapping.model_dump(
-                mode="json", exclude_unset=False, exclude_none=True
-            ),
+            scoreset_mapping.model_dump(exclude_none=True),
             file,
             indent=4,
         )

From 736db82223ac80d57842a072a74837953dc7a167 Mon Sep 17 00:00:00 2001
From: Benjamin Capodanno <bencap@outlook.com>
Date: Wed, 29 Apr 2026 11:54:03 -0700
Subject: [PATCH 4/6] feat(lookup): enhance gene symbol retrieval by tokenizing
 target names for better matching

This is mostly useful in multi-word target names where gene information is available but not in the first word of the target name.
---
 src/dcd_mapping/lookup.py | 46 ++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py
index f763b16..6b60cc6 100644
--- a/src/dcd_mapping/lookup.py
+++ b/src/dcd_mapping/lookup.py
@@ -368,10 +368,11 @@ def _get_hgnc_symbol(term: str) -> str | None:
 def get_gene_symbol(target_gene: TargetGene) -> str | None:
     """Acquire HGNC gene symbol given provided target gene metadata from MaveDB.
 
-    Right now, we use two sources for normalizing:
-    1. UniProt ID, if available
-    2. Target name: specifically, we try the first word in the name (this could
-    cause some problems and we should double-check it)
+    Tokenizes the target name on whitespace and tries each token against the
+    gene normalizer until one matches (gene symbols are not always the first
+    token, e.g. ``"Wildtype G6PD"``).  Silently returns ``None`` if no token
+    resolves — see ``_get_normalized_gene_response`` for the full description
+    of this limitation.
 
     :param target_gene: target gene metadata given by MaveDB API
     :return: gene symbol if available
@@ -381,10 +382,12 @@ def get_gene_symbol(target_gene: TargetGene) -> str | None:
         if result:
             return result
 
-    # try taking the first word in the target name
     if target_gene.target_gene_name:
-        parsed_name = target_gene.target_gene_name.split(" ")[0]
-        return _get_hgnc_symbol(parsed_name)
+        for word in target_gene.target_gene_name.split(" "):
+            result = _get_hgnc_symbol(word)
+            if result:
+                return result
+
     return None
 
 
@@ -406,7 +409,17 @@ def _get_normalized_gene_response(
 ) -> Gene | None:
     """Fetch best normalized concept given available scoreset metadata.
 
-    :param metadata: salient scoreset metadata items
+    **Limitation — heuristic name parsing**: when the target name is not itself
+    a valid HGNC symbol this function tokenizes it on whitespace and tries each
+    token in order (e.g. ``"Wildtype G6PD"`` resolves because ``"G6PD"`` is the
+    second token).  This will silently fail for names whose tokens are all
+    non-HGNC strings (e.g. ``"my favourite protein"``).  When it fails,
+    downstream chromosome selection has no anchor and BLAT's chromosome fallback
+    may land on the wrong chromosome, causing transcript selection to return no
+    results.  The only reliable fix is to ensure the target name contains the
+    HGNC gene symbol as one of its whitespace-delimited tokens.
+
+    :param target_gene: salient scoreset metadata items
     :return: Normalized gene if available
     """
     if target_gene.target_uniprot_ref:
@@ -414,12 +427,13 @@ def _get_normalized_gene_response(
         if gene_descriptor:
             return gene_descriptor
 
-    # try taking the first word in the target name
+    # Try each whitespace-delimited token from the target name. Gene symbols
+    # are not always the first word (e.g. "Wildtype G6PD").
     if target_gene.target_gene_name:
-        parsed_name = target_gene.target_gene_name.split(" ")[0]
-        gene_descriptor = _normalize_gene(parsed_name)
-        if gene_descriptor:
-            return gene_descriptor
+        for word in target_gene.target_gene_name.split(" "):
+            gene_descriptor = _normalize_gene(word)
+            if gene_descriptor:
+                return gene_descriptor
 
     return None
 
@@ -453,10 +467,8 @@ def get_gene_location(target_gene: TargetGene) -> GeneLocation | None:
     """Acquire gene location data from gene normalizer using metadata provided by
     scoreset.
 
-    As with ``get_gene_symbol()``, we try to normalize from the following:
-    1. UniProt ID, if available
-    2. Target name: specifically, we try the first word in the name (this could
-    cause some problems and we should double-check it)
+    Delegates to ``_get_normalized_gene_response`` — see that function for the
+    full description of the heuristic and its limitations.
 
     :param target_gene: data given by MaveDB API
     :return: gene location data if available

From ef26cbd6eb1c3478637f87b727b060b677b47f5c Mon Sep 17 00:00:00 2001
From: Benjamin Capodanno <bencap@outlook.com>
Date: Wed, 29 Apr 2026 15:25:29 -0700
Subject: [PATCH 5/6] fix(mapping): update aligner type to
 reference_accession_passthrough and adjust related annotations

Co-authored-by: Copilot <copilot@github.com>
---
 src/dcd_mapping/annotate.py           | 17 ++--------------
 src/dcd_mapping/schemas.py            | 28 ++++++++-------------------
 tests/test_annotate_target_mapping.py |  4 ++--
 3 files changed, 12 insertions(+), 37 deletions(-)

diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py
index 72296d9..38e2bd5 100644
--- a/src/dcd_mapping/annotate.py
+++ b/src/dcd_mapping/annotate.py
@@ -1208,7 +1208,6 @@ def _build_target_mapping(
     vrs_version: VrsVersion,
     annotations: list[ScoreAnnotation],
     protein_align_result: AlignmentResult | None = None,
-    null_failure_count: int = 0,
     near_gap_window: int = NEAR_GAP_WINDOW,
 ) -> TargetMapping:
     """Assemble one target_mappings[] row for a (target, alignment_level) pair.
@@ -1257,7 +1256,7 @@ def _build_target_mapping(
         # Direct chromosome/contig accession: the coordinate frame is defined by
         # the accession itself — no placement tool was invoked.
         tool_parameters = {
-            "aligner": "direct_contig_accession",
+            "aligner": "reference_accession_passthrough",
         }
     elif accession_id is not None:
         # Transcript accession (NM_/ENST): cdot was used for transcript placement.
@@ -1308,12 +1307,7 @@ def _build_target_mapping(
 
     total = len(annotations)
     failed = sum(1 for a in annotations if a.post_mapped is None)
-    warnings = sum(
-        1
-        for a in annotations
-        if a.post_mapped is not None and a.error_message is not None
-    )
-    clean = total - failed - warnings
+    clean = total - failed
 
     # near_gap is always evaluable (derived from gap positions in the alignment,
     # not per-base sequence content), so count it unconditionally.
@@ -1352,12 +1346,8 @@ def _build_target_mapping(
         alignment_metadata=alignment_metadata,
         total_variants=total,
         variants_mapped_cleanly=clean,
-        variants_with_mapping_warnings=warnings,
         variants_with_alignment_warnings=alignment_warnings,
         variants_failed=failed,
-        variants_failed_pre_layer_selection=null_failure_count
-        if null_failure_count
-        else None,
     )
 
 
@@ -1556,9 +1546,6 @@ def build_scoreset_mapping(
                     vrs_version=vrs_version,
                     annotations=annotations_for_tm,
                     protein_align_result=protein_align_for_target,
-                    null_failure_count=len(null_failures)
-                    if layer == preferred_layer_for_target
-                    else 0,
                     near_gap_window=near_gap_window,
                 )
             )
diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py
index b0e6889..fd92aed 100644
--- a/src/dcd_mapping/schemas.py
+++ b/src/dcd_mapping/schemas.py
@@ -373,10 +373,10 @@ class TargetMapping(BaseModel):
           "target_args":     "-q=prot -t=prot",
         }
 
-    Direct contig accession (accession-based)::
+    Passthrough for fully qualified reference accessions (accession-based)::
 
         {
-          "aligner":         "direct_contig_accession",
+          "aligner":         "reference_accession_passthrough",
         }
 
     cdot transcript placement (accession-based)::
@@ -433,31 +433,19 @@ class TargetMapping(BaseModel):
     #                                 this target/level.  True otherwise.
     alignment_metadata: dict[str, Any] | None = None
 
-    # Annotation QC -- totals for this target x level. The two warning columns
-    # count disjoint things:
-    #   * variants_with_mapping_warnings: the mapper itself attached an
-    #     ``error_message`` despite producing a ``post_mapped`` value (e.g.
-    #     fell back to RLE, ambiguous reference allele).
-    #   * variants_with_alignment_warnings: the variant's reference position
-    #     fell on a mismatched locus or near a gap in the underlying alignment
-    #     (``at_mismatched_locus`` or ``near_gap`` was True). Mapping itself
-    #     succeeded cleanly; this is purely about alignment context.
-    # A single variant can be in neither, either, or both.
+    # Annotation QC -- totals for this target x level.
+    # variants_with_alignment_warnings: the variant's reference position fell on
+    # a mismatched locus or near a gap in the underlying alignment
+    # (``at_mismatched_locus`` or ``near_gap`` was True). Mapping itself
+    # succeeded cleanly; this is purely about alignment context.
     total_variants: int | None = None
-    # Count of variants where post_mapped is not None AND error_message is None.
+    # Count of variants where post_mapped is not None.
     # Alignment warnings (at_mismatched_locus, near_gap) are counted separately in
     # variants_with_alignment_warnings and do NOT reduce this count; a variant can
     # be "cleanly mapped" and still sit at a mismatched locus or near a gap.
     variants_mapped_cleanly: int | None = None
-    variants_with_mapping_warnings: int | None = None
     variants_with_alignment_warnings: int | None = None
     variants_failed: int | None = None
-    # Count of variants that failed before any annotation layer was determined
-    # (annotation_layer=None, pre_mapped=None in the VRS output). These are
-    # re-attributed to the preferred layer's variants_failed total so the join
-    # invariant holds, but keeping them separate lets consumers see a clean
-    # per-layer failure count on all other (non-preferred) rows.
-    variants_failed_pre_layer_selection: int | None = None
 
 
 class VrsMapResult(NamedTuple):
diff --git a/tests/test_annotate_target_mapping.py b/tests/test_annotate_target_mapping.py
index 744b225..2f66f13 100644
--- a/tests/test_annotate_target_mapping.py
+++ b/tests/test_annotate_target_mapping.py
@@ -425,7 +425,7 @@ def test_tool_parameters_for_accession_based_target(self):
         assert "cdot_url" in tm.tool_parameters
 
     def test_tool_parameters_for_nc_accession_target(self):
-        """NC_ (chromosome/contig) targets should use 'direct_contig_accession', not cdot."""
+        """NC_ (chromosome/contig) targets should use 'reference_accession_passthrough', not cdot."""
         metadata = ScoresetMetadata(
             urn="urn:mavedb:00000001-a-1",
             target_genes={"GENE1": _make_acc_target("NC_000001.11", "GENE1")},
@@ -460,7 +460,7 @@ def test_tool_parameters_for_nc_accession_target(self):
 
         tm = result.target_mappings[0]
         assert tm.tool_parameters is not None
-        assert tm.tool_parameters.get("aligner") == "direct_contig_accession"
+        assert tm.tool_parameters.get("aligner") == "reference_accession_passthrough"
         # Must not bleed cdot fields into a non-cdot path
         assert "cdot_data_version" not in tm.tool_parameters
         assert "cdot_url" not in tm.tool_parameters

From 6cc9b664ca89bf3c04cc6425e8d26e60348db89f Mon Sep 17 00:00:00 2001
From: Benjamin Capodanno <bencap@outlook.com>
Date: Thu, 30 Apr 2026 10:34:10 -0700
Subject: [PATCH 6/6] chore(schema): Update schema to reflect updated output

---
 schema.json | 26 +-------------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

diff --git a/schema.json b/schema.json
index 708e4d1..32a9bc0 100644
--- a/schema.json
+++ b/schema.json
@@ -564,7 +564,7 @@
       "type": "object"
     },
     "TargetMapping": {
-      "description": "Per-(target, alignment_level) provenance and QC block.\n\nField names mirror the corresponding columns on `target_gene_mappings` in the\nMaveDB API so the API worker can deserialize directly with minimal transformation.\nAny aligner-specific structured details go in ``tool_parameters`` /\n``alignment_metadata`` (JSONB on the API side).\n\n``reference_assembly`` is a top-level column (not nested in ``tool_parameters``)\nbecause it describes the coordinate frame of the mapping result, not aligner\nconfiguration.  It is ``None`` for alignments with no genomic frame (e.g.\nprotein-vs-protein).\n\n``tool_parameters`` shape per aligner\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nBLAT genomic (sequence-based, nucleotide or protein-vs-genome)::\n\n    {\n      \"aligner\":         \"blat\",\n      \"aligner_version\": \"<BLAT version string | null>\",\n      \"min_score\":       <int>,\n      \"out_format\":      \"<pslx|...>\",\n      \"target_args\":     \"<e.g. '-q=prot -t=dnax' | ''>\",\n    }\n\nBLAT protein-vs-protein (sequence-based, protein annotation layer)::\n\n    {\n      \"aligner\":         \"blat\",\n      \"aligner_version\": \"<BLAT version string | null>\",\n      \"min_score\":       <int>,\n      \"out_format\":      \"<pslx|...>\",\n      \"target_args\":     \"-q=prot -t=prot\",\n    }\n\ncdot transcript placement (accession-based)::\n\n    {\n      \"aligner\":           \"cdot_transcript_placement\",\n      \"aligner_version\":   \"<cdot library semver, e.g. '0.2.26'>\",\n      \"cdot_url\":          \"<CDOT REST endpoint>\",\n      \"cdot_data_version\": \"<data version string | null>\",\n    }\n\n``cdot_data_version`` is ``null`` when the cdot REST response did not\ninclude a ``cdot_data_version`` field; its presence (even as ``null``)\ndistinguishes \"cdot run, version unknown\" from \"not a cdot run\".",
+      "description": "Per-(target, alignment_level) provenance and QC block.\n\nField names mirror the corresponding columns on `target_gene_mappings` in the\nMaveDB API so the API worker can deserialize directly with minimal transformation.\nAny aligner-specific structured details go in ``tool_parameters`` /\n``alignment_metadata`` (JSONB on the API side).\n\n``reference_assembly`` is a top-level column (not nested in ``tool_parameters``)\nbecause it describes the coordinate frame of the mapping result, not aligner\nconfiguration.  It is ``None`` for alignments with no genomic frame (e.g.\nprotein-vs-protein).\n\n``tool_parameters`` shape per aligner\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nBLAT genomic (sequence-based, nucleotide or protein-vs-genome)::\n\n    {\n      \"aligner\":         \"blat\",\n      \"aligner_version\": \"<BLAT version string | null>\",\n      \"min_score\":       <int>,\n      \"out_format\":      \"<pslx|...>\",\n      \"target_args\":     \"<e.g. '-q=prot -t=dnax' | ''>\",\n    }\n\nBLAT protein-vs-protein (sequence-based, protein annotation layer)::\n\n    {\n      \"aligner\":         \"blat\",\n      \"aligner_version\": \"<BLAT version string | null>\",\n      \"min_score\":       <int>,\n      \"out_format\":      \"<pslx|...>\",\n      \"target_args\":     \"-q=prot -t=prot\",\n    }\n\nPassthrough for fully qualified reference accessions (accession-based)::\n\n    {\n      \"aligner\":         \"reference_accession_passthrough\",\n    }\n\ncdot transcript placement (accession-based)::\n\n    {\n      \"aligner\":           \"cdot_transcript_placement\",\n      \"aligner_version\":   \"<cdot library semver, e.g. '0.2.26'>\",\n      \"cdot_url\":          \"<CDOT REST endpoint>\",\n      \"cdot_data_version\": \"<data version string | null>\",\n    }\n\n``cdot_data_version`` is ``null`` when the cdot REST response did not\ninclude a ``cdot_data_version`` field; its presence (even as ``null``)\ndistinguishes \"cdot run, version unknown\" from \"not a cdot run\".",
       "properties": {
         "alignment_length": {
           "anyOf": [
@@ -758,18 +758,6 @@
           "default": null,
           "title": "Variants Failed"
         },
-        "variants_failed_pre_layer_selection": {
-          "anyOf": [
-            {
-              "type": "integer"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "default": null,
-          "title": "Variants Failed Pre Layer Selection"
-        },
         "variants_mapped_cleanly": {
           "anyOf": [
             {
@@ -794,18 +782,6 @@
           "default": null,
           "title": "Variants With Alignment Warnings"
         },
-        "variants_with_mapping_warnings": {
-          "anyOf": [
-            {
-              "type": "integer"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "default": null,
-          "title": "Variants With Mapping Warnings"
-        },
         "vrs_version": {
           "anyOf": [
             {