From 0c5a3019372d7756e9707e02174d497c2b3793ee Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Tue, 28 Apr 2026 15:41:47 -0700 Subject: [PATCH 1/6] feat(mapping): add target-level mapping metadata to scoreset output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Introduce `AlignmentQc` schema capturing per-alignment BLAT quality metrics (identity, CIGAR, mismatch positions, gap intervals) with in-memory-only position lists excluded from serialization - Add `TargetMapping` schema representing a per-(target, layer) mapping row with variant counts, tool parameters, and preferred-layer flag - Add `VrsMapResult` NamedTuple to pair VRS mappings with their `TargetMapping` rows from `vrs_map` - Rename `annotation_layer` → `alignment_level` on `MappedScore` / `ScoreAnnotation` to align with new terminology - Rename `ident_pct` → `percent_identity` on `AlignmentResult`; add `score`, `next_best_score`, `alignment_qc`, `aligner_parameters`, and `reference_assembly` fields - Implement `build_scoreset_mapping` in `annotate.py` to assemble `ScoresetMapping` with populated `target_mappings` list, per-variant locus-quality flags (`at_mismatched_locus`, `near_gap`), and reference sequence metadata - Restore canonical BLAT PSL scoring (`matches - misMatches - qNumInsert - tNumInsert`) in `_get_best_hsp`; previous BioPython port used raw identity count, causing noisy alignments to outrank clean ones - Update JSON schema, API router, CI workflow, and README to reflect new output shape - Add `test_annotate_target_mapping.py` and expand `test_align.py` / `test_annotate.py` with unit tests for new logic Co-authored-by: Copilot --- .github/workflows/checks.yaml | 5 + .gitignore | 5 + README.md | 82 +- pyproject.toml | 2 +- schema.json | 1076 +++++++++++++++++-------- scripts/generate_schema.py | 31 + src/api/routers/map.py | 146 +--- src/dcd_mapping/align.py | 989 +++++++++++++++++++---- src/dcd_mapping/annotate.py | 734 +++++++++++++++-- src/dcd_mapping/main.py | 22 +- src/dcd_mapping/schemas.py | 249 +++++- src/dcd_mapping/vrs_map.py | 70 +- tests/fixtures/align_result.json | 22 +- tests/test_align.py | 366 ++++++++- tests/test_annotate.py | 138 +++- tests/test_annotate_target_mapping.py | 855 ++++++++++++++++++++ tests/test_transcript.py | 2 +- tests/test_vrs_map.py | 12 +- 18 files changed, 4082 insertions(+), 724 deletions(-) create mode 100644 scripts/generate_schema.py create mode 100644 tests/test_annotate_target_mapping.py diff --git a/.github/workflows/checks.yaml b/.github/workflows/checks.yaml index 9bfd1f8..a55e921 100644 --- a/.github/workflows/checks.yaml +++ b/.github/workflows/checks.yaml @@ -39,3 +39,8 @@ jobs: - name: Check style run: python3 -m ruff check . && ruff format --check . + + - name: Verify schema.json is up to date + run: | + python scripts/generate_schema.py + git diff --exit-code schema.json diff --git a/.gitignore b/.gitignore index fd06d06..0d25f59 100644 --- a/.gitignore +++ b/.gitignore @@ -166,3 +166,8 @@ cython_debug/ # mapping data/output notebooks/analysis/analysis_files notebooks/analysis/mavedb_files + +# debug / ad-hoc mapping output files — prevent accidental commits +urn:*.json +tmp:*.json +*_mapping_*.json diff --git a/README.md b/README.md index da3cb1e..3c7ed4a 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ This library implements a novel method for mapping [MaveDB scoreset data](https: * Universal Transcript Archive (UTA): see [README](https://github.com/biocommons/uta?tab=readme-ov-file#installing-uta-locally) for setup instructions. Users with access to Docker on their local devices can use the available Docker image; otherwise, start a relatively recent (version 14+) PostgreSQL instance and add data from the available database dump. * SeqRepo: see [README](https://github.com/biocommons/biocommons.seqrepo?tab=readme-ov-file#requirements) for setup instructions. The SeqRepo data directory must be writeable; see specific instructions [here](https://github.com/biocommons/biocommons.seqrepo/blob/main/docs/store.rst) for more. * Gene Normalizer: see [documentation](https://gene-normalizer.readthedocs.io/0.3.0-dev1/install.html) for data setup instructions. -* blat: Must be available on the local PATH and executable by the user. Otherwise, its location can be set manually with the `BLAT_BIN_PATH` env var. See the [UCSC Genome Browser FAQ](https://genome.ucsc.edu/FAQ/FAQblat.html#blat3) for download instructions. +* blat: Must be available on the local PATH and executable by the user. Otherwise, its location can be set manually with the `BLAT_BIN_PATH` env var. See the [UCSC Genome Browser FAQ](https://genome.ucsc.edu/FAQ/FAQblat.html#blat3) for download instructions. ## Installation @@ -40,6 +40,86 @@ Output is saved in the format `_mapping_results_.json` in the Use `dcd-map --help` to see other available options. +## Mapping output + +Each mapping run produces a single JSON document conforming to [`schema.json`](schema.json) (the JSON Schema serialization of `ScoresetMapping`). Top-level keys: + +* `metadata` — the verbatim MaveDB API scoreset response, stored unchanged so no upstream fields are lost. +* `mapped_date` — ISO 8601 UTC timestamp of when this run completed. +* `reference_sequences` — per-target reference sequence info per annotation layer. +* `mapped_scores` — flat list of per-variant `ScoreAnnotation` records (see below). +* `target_mappings` — per-`(target, alignment_level)` provenance and alignment QC rows. The MaveDB API consumes these as `target_gene_mappings` and uses them to attribute every `mapped_score` back to the alignment that produced it. +* `error_message` — populated only when the run failed before producing scores. + +### `metadata` + +The verbatim MaveDB API scoreset response. Stored unchanged so downstream consumers retain access to every upstream field (URN, title, description, target gene definitions, score-column metadata, etc.) without having to query MaveDB again. + +### `reference_sequences` + +A `dict[target_gene_name, TargetAnnotation]` describing the reference sequences each target was mapped against, organized by annotation layer. Each `TargetAnnotation` carries: + +* `gene_info` — `hgnc_symbol` plus the `selection_method` that picked it (transcript-derived, alignment-overlap-derived, variant-overlap-derived, or metadata fallback). +* `layers` — a `dict[AnnotationLayer, {computed_reference_sequence, mapped_reference_sequence}]` populated only for layers that actually produced mappings. `computed_reference_sequence` is the in-pipeline sequence (e.g. translated protein); `mapped_reference_sequence` lists the canonical accession(s) the variants were ultimately grounded in. Layers with no usable reference are pruned, not emitted as `null`. + +This block is the human-readable "what was used as reference" view; programmatic auditing should use `target_mappings` instead. + +### `mapped_scores` + +A flat list of per-variant `ScoreAnnotation` records. One entry per `(score_record, emitted annotation_layer)` pair. Key fields: + +* `mavedb_id`, `score` — identifier and numeric score copied from the MaveDB record. +* `relation` — fixed at `"SO:is_homologous_to"` while `pre_mapped` is populated. +* `target_gene_identifier`, `alignment_level` — composite key linking back to a `target_mappings` row (see below). +* `pre_mapped`, `post_mapped` — VRS variant objects in the target's coordinate frame and in the reference frame, respectively. Either may be `null` for failed mappings. +* `vrs_version` — VRS schema version used for this record. +* `error_message` — populated when `post_mapped` is `null` *or* when mapping succeeded with a caveat (e.g. RLE fallback, ambiguous reference allele). +* `at_mismatched_locus`, `near_gap` — per-variant audit flags, described below. + +### `target_mappings` + +Per-`(target, alignment_level)` provenance and alignment QC rows. The MaveDB API consumes these as `target_gene_mappings` and uses them to attribute every `mapped_score` back to the alignment that produced it. (See [`schema.json`](schema.json) `TargetMapping` for the wire format.) + +### `error_message` + +Populated only when the run failed before producing any scores; otherwise omitted. Per-variant errors live on `mapped_scores[].error_message`, not here. + +## Audit and provenance details + +### `target_mappings` fields + +Each row describes the alignment that one set of mapped variants is grounded in: + +| Field | Notes | +|---|---| +| `target_gene_identifier`, `alignment_level`, `preferred` | Composite key. `(target_gene_identifier, alignment_level)` is unique per run. Exactly one row per target has `preferred=True`. | +| `tool_name`, `tool_version`, `tool_parameters` | Aligner provenance. `tool_parameters.aligner` is `"blat"` for sequence-based targets and `"cdot_transcript_placement"` for accession-based targets. | +| `reference_accession`, `reference_sequence_id`, `vrs_version` | Coordinate-frame and run provenance. | +| `percent_identity`, `alignment_score`, `next_best_alignment_score`, `alignment_length`, `mismatch_count`, `gap_count` | Aggregate QC for the winning HSP. `alignment_score` is the canonical PSL score (`identities − mismatches − qNumInsert − tNumInsert`). | +| `alignment_string`, `alignment_metadata` | Pairwise visualization plus a small structured payload (CIGAR, `near_gap_window`, `at_mismatched_locus_evaluated`). | +| `total_variants`, `variants_mapped_cleanly`, `variants_with_mapping_warnings`, `variants_with_alignment_warnings`, `variants_failed` | Per-row variant counts. `variants_with_alignment_warnings` counts variants whose reference position fell on a mismatched base or near a gap. | + +### Per-variant audit flags + +Each `ScoreAnnotation` is attributable to exactly one `target_mappings` row via the composite key `(target_gene_identifier, alignment_level)`. The pipeline enforces this as a runtime invariant — orphaned scores raise `RuntimeError` rather than silently corrupting downstream joins. + +Per-variant locus flags: + +* `at_mismatched_locus` — `True` when any base in the variant's reference span mismatches between the target sequence and the reference; `False` when evaluated and no mismatch was found; `None` when per-base sequence content was unavailable for that layer (see `alignment_metadata.at_mismatched_locus_evaluated`), or when the variant is a `ReferenceLengthExpression` allele (large deletions/duplications, always `None`/`None`). +* `near_gap` — `True` when the variant lies within `alignment_metadata.near_gap_window` reference bases of any alignment gap; `None` for layers without an alignment (e.g. `cdna`). + +Completely-failed variants (`pre_mapped is None` and no annotation layer was determined) are attributed to the target's preferred layer so the join invariant holds. + +### Regenerating `schema.json` + +`schema.json` is checked in and consumed by downstream services (notably the MaveDB API). After any change to `src/dcd_mapping/schemas.py` that alters the public output contract, regenerate it: + +```shell +python scripts/generate_schema.py +``` + +Commit the regenerated `schema.json` in the same change. + ## Notebooks Notebooks for manuscript data analysis and figure generation are provided within `notebooks/analysis`. See [`notebooks/analysis/README.md`](notebooks/analysis/README.md) for more information. diff --git a/pyproject.toml b/pyproject.toml index 3ae7444..c4436a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -169,4 +169,4 @@ ignore = [ # ANN102 - missing-type-cls # S101 - assert # B011 - assert-false -"tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011", "D103"] +"tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011", "D101", "D102", "D103"] diff --git a/schema.json b/schema.json index 977a590..708e4d1 100644 --- a/schema.json +++ b/schema.json @@ -1,18 +1,28 @@ { "$defs": { + "AnnotationLayer": { + "description": "Create enum for supported annotation layers", + "enum": [ + "p", + "c", + "g" + ], + "title": "AnnotationLayer", + "type": "string" + }, "ComputedReferenceSequence": { "description": "Define metadata describing a computed reference sequence", "properties": { - "sequence_type": { - "$ref": "#/$defs/TargetSequenceType" + "sequence": { + "title": "Sequence", + "type": "string" }, "sequence_id": { "title": "Sequence Id", "type": "string" }, - "sequence": { - "title": "Sequence", - "type": "string" + "sequence_type": { + "$ref": "#/$defs/TargetSequenceType" } }, "required": [ @@ -23,6 +33,37 @@ "title": "ComputedReferenceSequence", "type": "object" }, + "GeneInfo": { + "description": "Basic gene metadata for a target, including symbol and selection method.", + "properties": { + "hgnc_symbol": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Hgnc Symbol" + }, + "selection_method": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Selection Method" + } + }, + "title": "GeneInfo", + "type": "object" + }, "IRI": { "description": "An IRI Reference (either an IRI or a relative-reference), according to `RFC3986 section 4.1 ` and `RFC3987 section 2.1 `. MAY be a JSON Pointer as an IRI fragment, as described by `RFC6901 section 6 `.", "title": "IRI", @@ -31,19 +72,19 @@ "MappedReferenceSequence": { "description": "Define metadata describing a mapped, human reference sequence", "properties": { - "sequence_type": { - "$ref": "#/$defs/TargetSequenceType" - }, - "sequence_id": { - "title": "Sequence Id", - "type": "string" - }, "sequence_accessions": { "items": { "type": "string" }, "title": "Sequence Accessions", "type": "array" + }, + "sequence_id": { + "title": "Sequence Id", + "type": "string" + }, + "sequence_type": { + "$ref": "#/$defs/TargetSequenceType" } }, "required": [ @@ -60,9 +101,6 @@ "type": { "const": "Number", "default": "Number", - "enum": [ - "Number" - ], "title": "Type", "type": "string" }, @@ -97,7 +135,7 @@ "ReferenceLengthExpression": { "description": "An expression sequence derived from a reference.", "properties": { - "id": { + "description": { "anyOf": [ { "type": "string" @@ -107,23 +145,25 @@ } ], "default": null, - "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).", - "title": "Id" + "description": "A free-text description of the entity.", + "title": "Description" }, - "label": { + "extensions": { "anyOf": [ { - "type": "string" + "items": { + "$ref": "#/$defs/ga4gh__core___internal__models__Extension" + }, + "type": "array" }, { "type": "null" } ], "default": null, - "description": "A primary label for the entity.", - "title": "Label" + "title": "Extensions" }, - "description": { + "id": { "anyOf": [ { "type": "string" @@ -133,33 +173,21 @@ } ], "default": null, - "description": "A free-text description of the entity.", - "title": "Description" + "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).", + "title": "Id" }, - "extensions": { + "label": { "anyOf": [ { - "items": { - "$ref": "#/$defs/ga4gh__core___internal__models__Extension" - }, - "type": "array" + "type": "string" }, { "type": "null" } ], "default": null, - "title": "Extensions" - }, - "type": { - "const": "ReferenceLengthExpression", - "default": "ReferenceLengthExpression", - "description": "MUST be \"ReferenceLengthExpression\"", - "enum": [ - "ReferenceLengthExpression" - ], - "title": "Type", - "type": "string" + "description": "A primary label for the entity.", + "title": "Label" }, "length": { "anyOf": [ @@ -173,6 +201,12 @@ "description": "The number of residues of the expressed sequence.", "title": "Length" }, + "repeatSubunitLength": { + "default": null, + "description": "The number of residues of the repeat subunit.", + "title": "Repeatsubunitlength", + "type": "integer" + }, "sequence": { "anyOf": [ { @@ -185,11 +219,12 @@ "default": null, "description": "the Sequence encoded by the Reference Length Expression." }, - "repeatSubunitLength": { - "default": null, - "description": "The number of residues of the repeat subunit.", - "title": "Repeatsubunitlength", - "type": "integer" + "type": { + "const": "ReferenceLengthExpression", + "default": "ReferenceLengthExpression", + "description": "MUST be \"ReferenceLengthExpression\"", + "title": "Type", + "type": "string" } }, "required": [ @@ -209,30 +244,65 @@ "ScoreAnnotation": { "description": "Provide extra annotations on top of mappings for an individual experiment score.\n\nThis model defines what an individual mapping instance looks like in the final JSON.", "properties": { - "pre_mapped": { + "alignment_level": { "anyOf": [ { - "$ref": "#/$defs/VariationDescriptor" + "$ref": "#/$defs/AnnotationLayer" }, { - "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Haplotype" + "type": "null" } ], - "title": "Pre Mapped" + "default": null }, - "post_mapped": { + "at_mismatched_locus": { "anyOf": [ { - "$ref": "#/$defs/VariationDescriptor" + "type": "boolean" }, { - "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Haplotype" + "type": "null" } ], - "title": "Post Mapped" + "default": null, + "title": "At Mismatched Locus" + }, + "error_message": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Error Message" + }, + "mavedb_id": { + "title": "Mavedb Id", + "type": "string" + }, + "near_gap": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Near Gap" }, - "pre_mapped_2_0": { + "post_mapped": { "anyOf": [ + { + "$ref": "#/$defs/VariationDescriptor" + }, + { + "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Haplotype" + }, { "$ref": "#/$defs/ga4gh__vrs___internal__models__Allele" }, @@ -244,10 +314,16 @@ } ], "default": null, - "title": "Pre Mapped 2 0" + "title": "Post Mapped" }, - "post_mapped_2_0": { + "pre_mapped": { "anyOf": [ + { + "$ref": "#/$defs/VariationDescriptor" + }, + { + "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Haplotype" + }, { "$ref": "#/$defs/ga4gh__vrs___internal__models__Allele" }, @@ -259,18 +335,11 @@ } ], "default": null, - "title": "Post Mapped 2 0" - }, - "mavedb_id": { - "title": "Mavedb Id", - "type": "string" + "title": "Pre Mapped" }, "relation": { "const": "SO:is_homologous_to", "default": "SO:is_homologous_to", - "enum": [ - "SO:is_homologous_to" - ], "title": "Relation", "type": "string" }, @@ -283,14 +352,35 @@ "type": "null" } ], + "default": null, "title": "Score" + }, + "target_gene_identifier": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Target Gene Identifier" + }, + "vrs_version": { + "anyOf": [ + { + "$ref": "#/$defs/VrsVersion" + }, + { + "type": "null" + } + ], + "default": null } }, "required": [ - "pre_mapped", - "post_mapped", - "mavedb_id", - "score" + "mavedb_id" ], "title": "ScoreAnnotation", "type": "object" @@ -298,20 +388,17 @@ "SequenceInterval": { "description": "Define VRS 1.3 SequenceInterval.", "properties": { + "end": { + "$ref": "#/$defs/Number" + }, + "start": { + "$ref": "#/$defs/Number" + }, "type": { "const": "SequenceInterval", "default": "SequenceInterval", - "enum": [ - "SequenceInterval" - ], "title": "Type", "type": "string" - }, - "start": { - "$ref": "#/$defs/Number" - }, - "end": { - "$ref": "#/$defs/Number" } }, "required": [ @@ -323,7 +410,7 @@ }, "SequenceReference": { "properties": { - "id": { + "description": { "anyOf": [ { "type": "string" @@ -333,23 +420,25 @@ } ], "default": null, - "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).", - "title": "Id" + "description": "A free-text description of the entity.", + "title": "Description" }, - "label": { + "extensions": { "anyOf": [ { - "type": "string" + "items": { + "$ref": "#/$defs/ga4gh__core___internal__models__Extension" + }, + "type": "array" }, { "type": "null" } ], "default": null, - "description": "A primary label for the entity.", - "title": "Label" + "title": "Extensions" }, - "description": { + "id": { "anyOf": [ { "type": "string" @@ -359,33 +448,21 @@ } ], "default": null, - "description": "A free-text description of the entity.", - "title": "Description" + "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).", + "title": "Id" }, - "extensions": { + "label": { "anyOf": [ { - "items": { - "$ref": "#/$defs/ga4gh__core___internal__models__Extension" - }, - "type": "array" + "type": "string" }, { "type": "null" } ], "default": null, - "title": "Extensions" - }, - "type": { - "const": "SequenceReference", - "default": "SequenceReference", - "description": "MUST be \"SequenceReference\"", - "enum": [ - "SequenceReference" - ], - "title": "Type", - "type": "string" + "description": "A primary label for the entity.", + "title": "Label" }, "refgetAccession": { "description": "A `GA4GH RefGet ` identifier for the referenced sequence, using the sha512t24u digest.", @@ -403,6 +480,13 @@ } ], "default": null + }, + "type": { + "const": "SequenceReference", + "default": "SequenceReference", + "description": "MUST be \"SequenceReference\"", + "title": "Type", + "type": "string" } }, "required": [ @@ -433,34 +517,326 @@ "title": "Syntax", "type": "string" }, - "TargetSequenceType": { - "description": "Define target sequence type. Add more definitions as needed.", - "enum": [ - "protein", - "dna" - ], - "title": "TargetSequenceType", - "type": "string" + "TargetAnnotation": { + "description": "Represents annotations associated with a biological target, including optional gene metadata\nand structured annotation layers.\n\nAttributes\n----------\ngene_info : GeneInfo | None\n Optional metadata describing the gene associated with the target,\n including identifiers and descriptive information where available.\n\nlayers : dict[AnnotationLayer, dict[str, ComputedReferenceSequence | MappedReferenceSequence | dict | None]]\n A mapping of annotation layers to keyed layer data. Each layer is identified by an\n AnnotationLayer key and contains a dictionary where:\n - keys are string identifiers for items within the layer (e.g., feature names),\n - values are one of:\n - ComputedReferenceSequence: a computed sequence representation for the item,\n - MappedReferenceSequence: a sequence mapped to a reference coordinate system,\n - dict: a generic dictionary for custom layer-specific payloads,\n - None: indicating missing or intentionally omitted data.\n\nNotes\n-----\n- The default value for 'layers' is an empty dictionary.\n- This model is intended to standardize layer-based annotations for downstream processing\n and validation, allowing both computed and mapped sequence data to coexist within the same\n structure.", + "properties": { + "gene_info": { + "anyOf": [ + { + "$ref": "#/$defs/GeneInfo" + }, + { + "type": "null" + } + ], + "default": null + }, + "layers": { + "additionalProperties": { + "additionalProperties": { + "anyOf": [ + { + "$ref": "#/$defs/ComputedReferenceSequence" + }, + { + "$ref": "#/$defs/MappedReferenceSequence" + }, + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ] + }, + "type": "object" + }, + "default": {}, + "propertyNames": { + "$ref": "#/$defs/AnnotationLayer" + }, + "title": "Layers", + "type": "object" + } + }, + "title": "TargetAnnotation", + "type": "object" }, - "VariationDescriptor": { - "description": "Define VRSATILE VariationDescriptor.", + "TargetMapping": { + "description": "Per-(target, alignment_level) provenance and QC block.\n\nField names mirror the corresponding columns on `target_gene_mappings` in the\nMaveDB API so the API worker can deserialize directly with minimal transformation.\nAny aligner-specific structured details go in ``tool_parameters`` /\n``alignment_metadata`` (JSONB on the API side).\n\n``reference_assembly`` is a top-level column (not nested in ``tool_parameters``)\nbecause it describes the coordinate frame of the mapping result, not aligner\nconfiguration. It is ``None`` for alignments with no genomic frame (e.g.\nprotein-vs-protein).\n\n``tool_parameters`` shape per aligner\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nBLAT genomic (sequence-based, nucleotide or protein-vs-genome)::\n\n {\n \"aligner\": \"blat\",\n \"aligner_version\": \"\",\n \"min_score\": ,\n \"out_format\": \"\",\n \"target_args\": \"\",\n }\n\nBLAT protein-vs-protein (sequence-based, protein annotation layer)::\n\n {\n \"aligner\": \"blat\",\n \"aligner_version\": \"\",\n \"min_score\": ,\n \"out_format\": \"\",\n \"target_args\": \"-q=prot -t=prot\",\n }\n\ncdot transcript placement (accession-based)::\n\n {\n \"aligner\": \"cdot_transcript_placement\",\n \"aligner_version\": \"\",\n \"cdot_url\": \"\",\n \"cdot_data_version\": \"\",\n }\n\n``cdot_data_version`` is ``null`` when the cdot REST response did not\ninclude a ``cdot_data_version`` field; its presence (even as ``null``)\ndistinguishes \"cdot run, version unknown\" from \"not a cdot run\".", "properties": { - "id": { - "title": "Id", + "alignment_length": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Alignment Length" + }, + "alignment_level": { + "$ref": "#/$defs/AnnotationLayer" + }, + "alignment_metadata": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Alignment Metadata" + }, + "alignment_score": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Alignment Score" + }, + "alignment_string": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Alignment String" + }, + "gap_count": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Gap Count" + }, + "mismatch_count": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Mismatch Count" + }, + "next_best_alignment_score": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Next Best Alignment Score" + }, + "percent_identity": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Percent Identity" + }, + "preferred": { + "default": false, + "title": "Preferred", + "type": "boolean" + }, + "reference_accession": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Reference Accession" + }, + "reference_assembly": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Reference Assembly" + }, + "reference_sequence_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Reference Sequence Id" + }, + "target_gene_identifier": { + "title": "Target Gene Identifier", "type": "string" }, - "type": { - "const": "VariationDescriptor", - "default": "VariationDescriptor", - "enum": [ - "VariationDescriptor" + "tool_name": { + "default": "dcd-mapping", + "title": "Tool Name", + "type": "string" + }, + "tool_parameters": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } ], - "title": "Type", + "default": null, + "title": "Tool Parameters" + }, + "tool_version": { + "default": "2026.1.2", + "title": "Tool Version", "type": "string" }, - "variation": { - "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Allele" + "total_variants": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Total Variants" }, + "variants_failed": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Variants Failed" + }, + "variants_failed_pre_layer_selection": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Variants Failed Pre Layer Selection" + }, + "variants_mapped_cleanly": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Variants Mapped Cleanly" + }, + "variants_with_alignment_warnings": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Variants With Alignment Warnings" + }, + "variants_with_mapping_warnings": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Variants With Mapping Warnings" + }, + "vrs_version": { + "anyOf": [ + { + "$ref": "#/$defs/VrsVersion" + }, + { + "type": "null" + } + ], + "default": null + } + }, + "required": [ + "target_gene_identifier", + "alignment_level" + ], + "title": "TargetMapping", + "type": "object" + }, + "TargetSequenceType": { + "description": "Define target sequence type. Add more definitions as needed.", + "enum": [ + "protein", + "dna" + ], + "title": "TargetSequenceType", + "type": "string" + }, + "VariationDescriptor": { + "description": "Define VRSATILE VariationDescriptor.", + "properties": { "expressions": { "anyOf": [ { @@ -476,16 +852,29 @@ "default": null, "title": "Expressions" }, - "vrs_ref_allele_seq": { - "title": "Vrs Ref Allele Seq", - "type": "string" - }, "extensions": { "items": { "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Extension" }, "title": "Extensions", "type": "array" + }, + "id": { + "title": "Id", + "type": "string" + }, + "type": { + "const": "VariationDescriptor", + "default": "VariationDescriptor", + "title": "Type", + "type": "string" + }, + "variation": { + "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__Allele" + }, + "vrs_ref_allele_seq": { + "title": "Vrs Ref Allele Seq", + "type": "string" } }, "required": [ @@ -497,18 +886,18 @@ "title": "VariationDescriptor", "type": "object" }, + "VrsVersion": { + "description": "Define VRS versions", + "enum": [ + "1.3", + "2" + ], + "title": "VrsVersion", + "type": "string" + }, "dcd_mapping__vrs_v1_schemas__Allele": { "description": "Define VRS 1.3 Allele.", "properties": { - "type": { - "const": "Allele", - "default": "Allele", - "enum": [ - "Allele" - ], - "title": "Type", - "type": "string" - }, "id": { "title": "Id", "type": "string" @@ -518,6 +907,12 @@ }, "state": { "$ref": "#/$defs/dcd_mapping__vrs_v1_schemas__LiteralSequenceExpression" + }, + "type": { + "const": "Allele", + "default": "Allele", + "title": "Type", + "type": "string" } }, "required": [ @@ -531,25 +926,22 @@ "dcd_mapping__vrs_v1_schemas__Expression": { "description": "Define VRS 1.3 Expression.", "properties": { + "syntax": { + "title": "Syntax", + "type": "string" + }, + "syntax_version": { + "title": "Syntax Version" + }, "type": { "const": "Expression", "default": "Expression", - "enum": [ - "Expression" - ], "title": "Type", "type": "string" }, - "syntax": { - "title": "Syntax", - "type": "string" - }, "value": { "title": "Value", "type": "string" - }, - "syntax_version": { - "title": "Syntax Version" } }, "required": [ @@ -563,19 +955,16 @@ "dcd_mapping__vrs_v1_schemas__Extension": { "description": "Define VRS 1.3 Extension.", "properties": { + "name": { + "title": "Name", + "type": "string" + }, "type": { "const": "Extension", "default": "Extension", - "enum": [ - "Extension" - ], "title": "Type", "type": "string" }, - "name": { - "title": "Name", - "type": "string" - }, "value": { "title": "Value" } @@ -590,15 +979,6 @@ "dcd_mapping__vrs_v1_schemas__Haplotype": { "description": "Define VRS 1.3 Haplotype.", "properties": { - "type": { - "const": "Haplotype", - "default": "Haplotype", - "enum": [ - "Haplotype" - ], - "title": "Type", - "type": "string" - }, "members": { "anyOf": [ { @@ -615,6 +995,12 @@ } ], "title": "Members" + }, + "type": { + "const": "Haplotype", + "default": "Haplotype", + "title": "Type", + "type": "string" } }, "required": [ @@ -626,18 +1012,15 @@ "dcd_mapping__vrs_v1_schemas__LiteralSequenceExpression": { "description": "Define VRS 1.3 LiteralSequenceExpression.", "properties": { + "sequence": { + "title": "Sequence", + "type": "string" + }, "type": { "const": "LiteralSequenceExpression", "default": "LiteralSequenceExpression", - "enum": [ - "LiteralSequenceExpression" - ], "title": "Type", "type": "string" - }, - "sequence": { - "title": "Sequence", - "type": "string" } }, "required": [ @@ -660,21 +1043,18 @@ ], "title": "Id" }, - "type": { - "const": "SequenceLocation", - "default": "SequenceLocation", - "enum": [ - "SequenceLocation" - ], - "title": "Type", - "type": "string" + "interval": { + "$ref": "#/$defs/SequenceInterval" }, "sequence_id": { "title": "Sequence Id", "type": "string" }, - "interval": { - "$ref": "#/$defs/SequenceInterval" + "type": { + "const": "SequenceLocation", + "default": "SequenceLocation", + "title": "Type", + "type": "string" } }, "required": [ @@ -689,21 +1069,18 @@ "additionalProperties": true, "description": "The Extension class provides VODs with a means to extend descriptions with other\nattributes unique to a content provider. These extensions are not expected to be\nnatively understood under VRSATILE, but may be used for pre-negotiated exchange of\nmessage attributes when needed.", "properties": { + "name": { + "description": "A name for the Extension", + "title": "Name", + "type": "string" + }, "type": { "const": "Extension", "default": "Extension", "description": "MUST be \"Extension\".", - "enum": [ - "Extension" - ], "title": "Type", "type": "string" }, - "name": { - "description": "A name for the Extension", - "title": "Name", - "type": "string" - }, "value": { "anyOf": [ { @@ -716,6 +1093,7 @@ "type": "boolean" }, { + "additionalProperties": true, "type": "object" }, { @@ -739,7 +1117,7 @@ }, "ga4gh__vrs___internal__models__Allele": { "properties": { - "id": { + "description": { "anyOf": [ { "type": "string" @@ -748,13 +1126,14 @@ "type": "null" } ], - "default": null, - "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).", - "title": "Id" + "default": null, + "description": "A free-text description of the entity.", + "title": "Description" }, - "label": { + "digest": { "anyOf": [ { + "pattern": "^[0-9A-Za-z_\\-]{32}$", "type": "string" }, { @@ -762,21 +1141,23 @@ } ], "default": null, - "description": "A primary label for the entity.", - "title": "Label" + "description": "A sha512t24u digest created using the VRS Computed Identifier algorithm.", + "title": "Digest" }, - "description": { + "expressions": { "anyOf": [ { - "type": "string" + "items": { + "$ref": "#/$defs/ga4gh__vrs___internal__models__Expression" + }, + "type": "array" }, { "type": "null" } ], "default": null, - "description": "A free-text description of the entity.", - "title": "Description" + "title": "Expressions" }, "extensions": { "anyOf": [ @@ -793,20 +1174,9 @@ "default": null, "title": "Extensions" }, - "type": { - "const": "Allele", - "default": "Allele", - "description": "MUST be \"Allele\"", - "enum": [ - "Allele" - ], - "title": "Type", - "type": "string" - }, - "digest": { + "id": { "anyOf": [ { - "pattern": "^[0-9A-Za-z_\\-]{32}$", "type": "string" }, { @@ -814,23 +1184,21 @@ } ], "default": null, - "description": "A sha512t24u digest created using the VRS Computed Identifier algorithm.", - "title": "Digest" + "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).", + "title": "Id" }, - "expressions": { + "label": { "anyOf": [ { - "items": { - "$ref": "#/$defs/ga4gh__vrs___internal__models__Expression" - }, - "type": "array" + "type": "string" }, { "type": "null" } ], "default": null, - "title": "Expressions" + "description": "A primary label for the entity.", + "title": "Label" }, "location": { "anyOf": [ @@ -855,6 +1223,13 @@ ], "description": "An expression of the sequence state", "title": "State" + }, + "type": { + "const": "Allele", + "default": "Allele", + "description": "MUST be \"Allele\"", + "title": "Type", + "type": "string" } }, "required": [ @@ -870,10 +1245,6 @@ "syntax": { "$ref": "#/$defs/Syntax" }, - "value": { - "title": "Value", - "type": "string" - }, "syntax_version": { "anyOf": [ { @@ -885,6 +1256,10 @@ ], "default": null, "title": "Syntax Version" + }, + "value": { + "title": "Value", + "type": "string" } }, "required": [ @@ -897,7 +1272,7 @@ "ga4gh__vrs___internal__models__Haplotype": { "description": "A set of non-overlapping Allele members that co-occur on the same molecule.", "properties": { - "id": { + "description": { "anyOf": [ { "type": "string" @@ -907,12 +1282,13 @@ } ], "default": null, - "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).", - "title": "Id" + "description": "A free-text description of the entity.", + "title": "Description" }, - "label": { + "digest": { "anyOf": [ { + "pattern": "^[0-9A-Za-z_\\-]{32}$", "type": "string" }, { @@ -920,21 +1296,23 @@ } ], "default": null, - "description": "A primary label for the entity.", - "title": "Label" + "description": "A sha512t24u digest created using the VRS Computed Identifier algorithm.", + "title": "Digest" }, - "description": { + "expressions": { "anyOf": [ { - "type": "string" + "items": { + "$ref": "#/$defs/ga4gh__vrs___internal__models__Expression" + }, + "type": "array" }, { "type": "null" } ], "default": null, - "description": "A free-text description of the entity.", - "title": "Description" + "title": "Expressions" }, "extensions": { "anyOf": [ @@ -951,20 +1329,9 @@ "default": null, "title": "Extensions" }, - "type": { - "const": "Haplotype", - "default": "Haplotype", - "description": "MUST be \"Haplotype\"", - "enum": [ - "Haplotype" - ], - "title": "Type", - "type": "string" - }, - "digest": { + "id": { "anyOf": [ { - "pattern": "^[0-9A-Za-z_\\-]{32}$", "type": "string" }, { @@ -972,23 +1339,21 @@ } ], "default": null, - "description": "A sha512t24u digest created using the VRS Computed Identifier algorithm.", - "title": "Digest" + "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).", + "title": "Id" }, - "expressions": { + "label": { "anyOf": [ { - "items": { - "$ref": "#/$defs/ga4gh__vrs___internal__models__Expression" - }, - "type": "array" + "type": "string" }, { "type": "null" } ], "default": null, - "title": "Expressions" + "description": "A primary label for the entity.", + "title": "Label" }, "members": { "description": "A list of Alleles (or IRI references to `Alleles`) that comprise a Haplotype. Since each `Haplotype` member MUST be an `Allele`, and all members MUST share a common `SequenceReference`, implementations MAY use a compact representation of Haplotype that omits type and `SequenceReference` information in individual Haplotype members. Implementations MUST transform compact `Allele` representations into an `Allele` when computing GA4GH identifiers.", @@ -1005,6 +1370,13 @@ "minItems": 2, "title": "Members", "type": "array" + }, + "type": { + "const": "Haplotype", + "default": "Haplotype", + "description": "MUST be \"Haplotype\"", + "title": "Type", + "type": "string" } }, "required": [ @@ -1016,7 +1388,7 @@ "ga4gh__vrs___internal__models__LiteralSequenceExpression": { "description": "An explicit expression of a Sequence.", "properties": { - "id": { + "description": { "anyOf": [ { "type": "string" @@ -1026,23 +1398,25 @@ } ], "default": null, - "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).", - "title": "Id" + "description": "A free-text description of the entity.", + "title": "Description" }, - "label": { + "extensions": { "anyOf": [ { - "type": "string" + "items": { + "$ref": "#/$defs/ga4gh__core___internal__models__Extension" + }, + "type": "array" }, { "type": "null" } ], "default": null, - "description": "A primary label for the entity.", - "title": "Label" + "title": "Extensions" }, - "description": { + "id": { "anyOf": [ { "type": "string" @@ -1052,41 +1426,32 @@ } ], "default": null, - "description": "A free-text description of the entity.", - "title": "Description" + "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).", + "title": "Id" }, - "extensions": { + "label": { "anyOf": [ { - "items": { - "$ref": "#/$defs/ga4gh__core___internal__models__Extension" - }, - "type": "array" + "type": "string" }, { "type": "null" } ], "default": null, - "title": "Extensions" + "description": "A primary label for the entity.", + "title": "Label" + }, + "sequence": { + "$ref": "#/$defs/SequenceString", + "description": "the literal sequence" }, "type": { "const": "LiteralSequenceExpression", "default": "LiteralSequenceExpression", "description": "MUST be \"LiteralSequenceExpression\"", - "enum": [ - "LiteralSequenceExpression" - ], "title": "Type", "type": "string" - }, - "sequence": { - "allOf": [ - { - "$ref": "#/$defs/SequenceString" - } - ], - "description": "the literal sequence" } }, "required": [ @@ -1098,7 +1463,7 @@ "ga4gh__vrs___internal__models__SequenceLocation": { "description": "A `Location` defined by an interval on a referenced `Sequence`.", "properties": { - "id": { + "description": { "anyOf": [ { "type": "string" @@ -1108,12 +1473,13 @@ } ], "default": null, - "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).", - "title": "Id" + "description": "A free-text description of the entity.", + "title": "Description" }, - "label": { + "digest": { "anyOf": [ { + "pattern": "^[0-9A-Za-z_\\-]{32}$", "type": "string" }, { @@ -1121,21 +1487,20 @@ } ], "default": null, - "description": "A primary label for the entity.", - "title": "Label" + "description": "A sha512t24u digest created using the VRS Computed Identifier algorithm.", + "title": "Digest" }, - "description": { + "end": { "anyOf": [ { - "type": "string" + "$ref": "#/$defs/Range" }, { - "type": "null" + "type": "integer" } ], - "default": null, - "description": "A free-text description of the entity.", - "title": "Description" + "description": "The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range greater than the value of `start`.", + "title": "End" }, "extensions": { "anyOf": [ @@ -1152,20 +1517,22 @@ "default": null, "title": "Extensions" }, - "type": { - "const": "SequenceLocation", - "default": "SequenceLocation", - "description": "MUST be \"SequenceLocation\"", - "enum": [ - "SequenceLocation" + "id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } ], - "title": "Type", - "type": "string" + "default": null, + "description": "The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE).", + "title": "Id" }, - "digest": { + "label": { "anyOf": [ { - "pattern": "^[0-9A-Za-z_\\-]{32}$", "type": "string" }, { @@ -1173,8 +1540,8 @@ } ], "default": null, - "description": "A sha512t24u digest created using the VRS Computed Identifier algorithm.", - "title": "Digest" + "description": "A primary label for the entity.", + "title": "Label" }, "sequenceReference": { "anyOf": [ @@ -1204,17 +1571,12 @@ "description": "The start coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range less than the value of `end`.", "title": "Start" }, - "end": { - "anyOf": [ - { - "$ref": "#/$defs/Range" - }, - { - "type": "integer" - } - ], - "description": "The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range greater than the value of `start`.", - "title": "End" + "type": { + "const": "SequenceLocation", + "default": "SequenceLocation", + "description": "MUST be \"SequenceLocation\"", + "title": "Type", + "type": "string" } }, "required": [ @@ -1227,28 +1589,82 @@ }, "description": "Provide all mapped scores for a scoreset.", "properties": { + "error_message": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Error Message" + }, + "mapped_date": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Mapped Date" + }, + "mapped_scores": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/ScoreAnnotation" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Mapped Scores" + }, "metadata": { "title": "Metadata" }, - "computed_reference_sequence": { - "$ref": "#/$defs/ComputedReferenceSequence" - }, - "mapped_reference_sequence": { - "$ref": "#/$defs/MappedReferenceSequence" + "reference_sequences": { + "anyOf": [ + { + "additionalProperties": { + "$ref": "#/$defs/TargetAnnotation" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Reference Sequences" }, - "mapped_scores": { - "items": { - "$ref": "#/$defs/ScoreAnnotation" - }, - "title": "Mapped Scores", - "type": "array" + "target_mappings": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/TargetMapping" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Target Mappings" } }, "required": [ - "metadata", - "computed_reference_sequence", - "mapped_reference_sequence", - "mapped_scores" + "metadata" ], "title": "ScoresetMapping", "type": "object" diff --git a/scripts/generate_schema.py b/scripts/generate_schema.py new file mode 100644 index 0000000..f25f25f --- /dev/null +++ b/scripts/generate_schema.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +"""Regenerate schema.json from the current Pydantic models. + +Run from the repository root after any change to src/dcd_mapping/schemas.py +that alters the public output contract: + + python scripts/generate_schema.py + +The script writes the JSON Schema for :class:`~dcd_mapping.schemas.ScoresetMapping` +to ``schema.json`` at the repository root, overwriting the previous file. + +If you change schemas.py, re-run this script and commit the result. +""" +import json +import pathlib + +from dcd_mapping.schemas import ScoresetMapping + +REPO_ROOT = pathlib.Path(__file__).parent.parent +SCHEMA_PATH = REPO_ROOT / "schema.json" + + +def main() -> None: + """Generate JSON Schema for ScoresetMapping and write to file.""" + schema = ScoresetMapping.model_json_schema() + SCHEMA_PATH.write_text(json.dumps(schema, indent=2, sort_keys=True) + "\n") + print(f"Written: {SCHEMA_PATH}") # noqa: T201 + + +if __name__ == "__main__": + main() diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 9235061..cc0149b 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -2,17 +2,14 @@ import logging from pathlib import Path -from cool_seq_tool.schemas import AnnotationLayer from fastapi import APIRouter, HTTPException from fastapi.responses import JSONResponse from httpx import HTTPStatusError from dcd_mapping.align import build_alignment_result from dcd_mapping.annotate import ( - _get_computed_reference_sequence, - _get_mapped_reference_sequence, - _set_scoreset_layer, annotate, + build_scoreset_mapping, compute_target_gene_info, ) from dcd_mapping.exceptions import ( @@ -34,11 +31,8 @@ with_mavedb_score_set, ) from dcd_mapping.schemas import ( - ScoreAnnotation, + AlignmentResult, ScoresetMapping, - TargetAnnotation, - TargetType, - TxSelectResult, VrsVersion, ) from dcd_mapping.transcripts import select_transcripts @@ -60,6 +54,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse :param store_path: optional path to save output at """ try: + raw_metadata = get_raw_scoreset_metadata(urn, store_path) metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, True, store_path) metadata = patch_target_sequence_type(metadata, records, force=False) @@ -79,7 +74,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse if not records: return JSONResponse( content=ScoresetMapping( - metadata=metadata, + metadata=raw_metadata, error_message="Score set contains no variants to map", ).model_dump(exclude_none=True) ) @@ -99,14 +94,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse _logger.error("Alignment error for %s: %s", urn, e) return JSONResponse( content=ScoresetMapping( - metadata=metadata, error_message=str(e).strip("'") + metadata=raw_metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) except ScoresetNotSupportedError as e: _logger.error("Scoreset not supported during alignment for %s: %s", urn, e) return JSONResponse( content=ScoresetMapping( - metadata=metadata, error_message=str(e).strip("'") + metadata=raw_metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) @@ -127,15 +122,18 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse raise HTTPException(status_code=500, detail=msg) from e vrs_results = {} + protein_align_results: dict[str, AlignmentResult | None] = {} try: for target_gene in metadata.target_genes: - vrs_results[target_gene] = vrs_map( + vrs_map_result = vrs_map( metadata=metadata.target_genes[target_gene], align_result=alignment_results[target_gene], records=records[target_gene], transcript=transcripts[target_gene], silent=True, ) + vrs_results[target_gene] = vrs_map_result.mappings + protein_align_results[target_gene] = vrs_map_result.protein_align_result except ( UnsupportedReferenceSequenceNameSpaceError, VrsMapError, @@ -145,7 +143,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse _logger.error("VRS mapping error for %s: %s", urn, e) return JSONResponse( content=ScoresetMapping( - metadata=metadata, error_message=str(e).strip("'") + metadata=raw_metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) @@ -158,14 +156,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse if not vrs_results or all(nonetype_vrs_results): return JSONResponse( content=ScoresetMapping( - metadata=metadata, + metadata=raw_metadata, error_message="No variant mappings available for this score set", ).model_dump(exclude_none=True) ) if any(nonetype_vrs_results): return JSONResponse( content=ScoresetMapping( - metadata=metadata, + metadata=raw_metadata, error_message="Some variants generated vrs results, but not all. If any variants were mapped, all should have been.", ).model_dump(exclude_none=True) ) @@ -184,7 +182,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse _logger.error("Unexpected error during annotation for %s: %s", urn, e) return JSONResponse( content=ScoresetMapping( - metadata=metadata, error_message=str(e).strip("'") + metadata=raw_metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) @@ -197,125 +195,57 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse if not annotated_vrs_results or all(nonetype_annotated_vrs_results): return JSONResponse( content=ScoresetMapping( - metadata=metadata, + metadata=raw_metadata, error_message="No annotated variant mappings available for this score set", ).model_dump(exclude_none=True) ) if any(nonetype_annotated_vrs_results): return JSONResponse( content=ScoresetMapping( - metadata=metadata, + metadata=raw_metadata, error_message="Some variants generated annotated vrs results, but not all. If any variants were annotated, all should have been.", ).model_dump(exclude_none=True) ) try: - raw_metadata = get_raw_scoreset_metadata(urn, store_path) - reference_sequences: dict[str, TargetAnnotation] = {} - mapped_scores: list[ScoreAnnotation] = [] + gene_info = {} for target_gene in annotated_vrs_results: - preferred_layers = { - _set_scoreset_layer(urn, annotated_vrs_results[target_gene]), - } - target_gene_name = metadata.target_genes[target_gene].target_gene_name - reference_sequences[target_gene_name] = TargetAnnotation() - reference_sequences[target_gene_name].layers = { - layer: { - "computed_reference_sequence": None, - "mapped_reference_sequence": None, - } - for layer in preferred_layers - } - - # sometimes Nonetype layers show up in preferred layers dict; remove these - preferred_layers.discard(None) - - # Determine one gene symbol per target and its selection method - gene_info = await compute_target_gene_info( + gene_info[target_gene] = await compute_target_gene_info( target_key=target_gene, transcripts=transcripts, alignment_results=alignment_results, metadata=metadata, mapped_scores=annotated_vrs_results[target_gene], ) - - for layer in preferred_layers: - reference_sequences[target_gene_name].layers[layer][ - "computed_reference_sequence" - ] = _get_computed_reference_sequence( - metadata.target_genes[target_gene], layer, transcripts[target_gene] - ) - reference_sequences[target_gene_name].layers[layer][ - "mapped_reference_sequence" - ] = _get_mapped_reference_sequence( - metadata.target_genes[target_gene], - layer, - transcripts[target_gene], - alignment_results[target_gene], - ) - - if gene_info is not None: - reference_sequences[target_gene_name].gene_info = gene_info - - for m in annotated_vrs_results[target_gene]: - if m.pre_mapped is None: - mapped_scores.append(ScoreAnnotation(**m.model_dump())) - elif m.annotation_layer in preferred_layers: - # drop annotation layer from mapping object - mapped_scores.append(ScoreAnnotation(**m.model_dump())) - - # if genomic layer, not accession-based, and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict - if ( - AnnotationLayer.GENOMIC in reference_sequences[target_gene_name].layers - and metadata.target_genes[target_gene].target_gene_category - == TargetType.PROTEIN_CODING - and metadata.target_genes[target_gene].target_accession_id is None - and transcripts[target_gene] is not None - and isinstance(transcripts[target_gene], TxSelectResult) - and transcripts[target_gene].nm is not None - ): - reference_sequences[target_gene_name].layers[AnnotationLayer.CDNA] = { - "computed_reference_sequence": None, - "mapped_reference_sequence": { - "sequence_accessions": [transcripts[target_gene].nm] - }, - } - - # drop Nonetype reference sequences - for target_gene in reference_sequences: - for layer in list(reference_sequences[target_gene].layers.keys()): - if ( - reference_sequences[target_gene].layers[layer][ - "mapped_reference_sequence" - ] - is None - and reference_sequences[target_gene].layers[layer][ - "computed_reference_sequence" - ] - is None - ) or layer is None: - del reference_sequences[target_gene].layers[layer] - + output = build_scoreset_mapping( + metadata=metadata, + mappings=annotated_vrs_results, + align_results=alignment_results, + tx_output=transcripts, + gene_info=gene_info, + preferred_layer_only=True, + vrs_version=VrsVersion.V_2, + raw_metadata=raw_metadata, + protein_align_results=protein_align_results, + ) except Exception as e: _logger.error("Unexpected error during result assembly for %s: %s", urn, e) return JSONResponse( content=ScoresetMapping( - metadata=metadata, error_message=str(e).strip("'") + metadata=raw_metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) - if len(mapped_scores) != total_score_records: + # With preferred_layer_only=True, build_scoreset_mapping emits exactly one + # mapped_score per input variant: preferred-layer successes/failures plus + # completely-failed variants (annotation_layer=None) re-attributed to the + # preferred layer. The count must equal total_score_records. + if len(output.mapped_scores or []) != total_score_records: return JSONResponse( content=ScoresetMapping( - metadata=metadata, - error_message=f"Mismatch between number of mapped scores ({len(mapped_scores)}) and total score records ({total_score_records}). This is unexpected and indicates an issue with the mapping process.", + metadata=raw_metadata, + error_message=f"Mismatch between number of mapped scores ({len(output.mapped_scores or [])}) and total score records ({total_score_records}). This is unexpected and indicates an issue with the mapping process.", ).model_dump(exclude_none=True) ) - return JSONResponse( - content=ScoresetMapping( - metadata=raw_metadata, - reference_sequences=reference_sequences, - mapped_scores=mapped_scores, - ).model_dump(exclude_none=True) - ) + return JSONResponse(content=output.model_dump(mode="json", exclude_none=True)) diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index 6002dea..de547af 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -1,6 +1,8 @@ """Align MaveDB target sequences to a human reference genome.""" +import functools import logging import os +import shlex import subprocess import tempfile from collections.abc import Mapping @@ -8,10 +10,8 @@ from urllib.parse import urlparse import httpx -from Bio.SearchIO import HSP -from Bio.SearchIO import parse as parse_blat -from Bio.SearchIO import read as read_blat -from Bio.SearchIO._model import Hit, QueryResult +from Bio import Align as BioAlign +from Bio.Seq import Seq, UndefinedSequenceError from cool_seq_tool.schemas import Strand from dcd_mapping.exceptions import ( @@ -24,6 +24,7 @@ from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, patch_target_sequence_type from dcd_mapping.resource_utils import CDOT_URL, http_download from dcd_mapping.schemas import ( + AlignmentQc, AlignmentResult, GeneLocation, ScoresetMetadata, @@ -34,8 +35,47 @@ __all__ = ["align"] + _logger = logging.getLogger(__name__) +# The assembly name is a derived property of the 2bit file built into the Docker image— +# if the URL is ever changed to another build, update REFERENCE_GENOME_ASSEMBLY here as +# well so the audit field in AlignmentResult.reference_assembly stays accurate. +REFERENCE_GENOME_URL = ( + "https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.2bit" +) +REFERENCE_GENOME_ASSEMBLY = "GRCh38" + +# BLAT invocation parameters +BLAT_MIN_SCORE = 20 +BLAT_OUT_FORMAT = "pslx" + + +@functools.lru_cache +def _get_blat_version(bin_name: str) -> str: + """Return BLAT's self-reported version line by invoking the binary with no arguments. + + BLAT prints usage text to stdout when called without args; the first line + typically reads: ``blat - Standalone BLAT v. 36 fast sequence search command line tool`` + We return that line verbatim rather than trying to parse a version number out of it, + so the record is robust to BLAT changing its version string format. + Result is cached per binary path so we only probe once per process. + Returns ``"unknown"`` if the binary cannot be invoked or produces no output, + so the audit field is always a string rather than null. + """ + try: + # + result = subprocess.run([bin_name], capture_output=True, timeout=5) # noqa: S603 + output = result.stdout.decode("utf-8", errors="replace") + first_line = output.splitlines()[0].strip() if output.strip() else None + if first_line: + return first_line + except (OSError, IndexError): + _logger.debug( + "Could not determine BLAT version for %s", bin_name, exc_info=True + ) + return "unknown" + def _write_query_file(file: Path, lines: list[str]) -> None: """Write lines to query file. This method is broken out to enable easy mocking while @@ -61,6 +101,7 @@ def _build_query_file(scoreset_metadata: ScoresetMetadata, query_file: Path) -> for target_gene in scoreset_metadata.target_genes: lines.append(f">{target_gene}") lines.append(scoreset_metadata.target_genes[target_gene].target_sequence) + _write_query_file(query_file, lines) return query_file @@ -76,12 +117,13 @@ def get_ref_genome_file( :return: path to acquired file :raise ResourceAcquisitionError: if unable to acquire file. """ - url = "https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.2bit" + url = REFERENCE_GENOME_URL parsed_url = urlparse(url) if not dcd_mapping_dir: dcd_mapping_dir = LOCAL_STORE_PATH - genome_file = dcd_mapping_dir / Path(parsed_url.path).name + # this file shouldn't change, so no need to think about more advanced caching + genome_file = dcd_mapping_dir / Path(parsed_url.path).name if not genome_file.exists(): try: http_download(url, genome_file, silent) @@ -89,6 +131,7 @@ def get_ref_genome_file( msg = f"HTTPError when fetching reference genome file from {url}" _logger.error(msg) raise ResourceAcquisitionError(msg) from e + return genome_file @@ -98,8 +141,13 @@ def _run_blat( reference_file: Path, out_file: str, silent: bool, -) -> subprocess.CompletedProcess: - """Execute BLAT binary with relevant params. + min_score: int = BLAT_MIN_SCORE, + out_format: str = BLAT_OUT_FORMAT, +) -> tuple[subprocess.CompletedProcess, dict]: + """Execute BLAT binary with the given parameters. + + Returns both the process result and a parameters dict built from the exact + values passed in — so the QC record is inseparably paired with what ran. Currently, we rely on a system-installed BLAT binary accessible in the containing environment's PATH, or under env var ``BLAT_BIN_PATH``. This is sort of awkward and @@ -110,31 +158,64 @@ def _run_blat( :param target_args: target params eg ``"-q=prot -t=dnax"`` (can be empty) :param query_file: path to query FASTA file - :param out_file: path-like string to output fill (could be "/dev/stdout") + :param out_file: path-like string to output file (could be "/dev/stdout") :param silent: if True, suppress all console output - :return: process result + :param min_score: value passed as -minScore to BLAT + :param out_format: value passed as -out to BLAT + :return: (process result, aligner_parameters dict) """ - bin_name = os.environ["BLAT_BIN_PATH"] if "BLAT_BIN_PATH" in os.environ else "blat" # noqa: SIM401 - command = f"{bin_name} {reference_file} {target_args} -minScore=20 {query_file} {out_file}" - _logger.debug("Running BLAT command: %s", command) - result = subprocess.run( # noqa: UP022 - command, - shell=True, # noqa: S602 - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + bin_name = os.environ.get("BLAT_BIN_PATH", "blat") + + # -out=pslx emits PSL plus per-block aligned bases (tseqs/qseqs). Biopython's + # psl parser surfaces those as aln.target.seq / aln.query.seq, which is + # what we need to report real per-base mismatches and gaps on the output. + # target_args is a space-separated string of BLAT flags (e.g. "-q=prot -t=dnax"); + # all values originate from internal constants, so shlex.split is safe here. + cmd = [bin_name, str(reference_file)] + if target_args: + cmd.extend(shlex.split(target_args)) + + cmd.extend( + [f"-minScore={min_score}", f"-out={out_format}", str(query_file), out_file] ) + _logger.debug("Running BLAT command: %s", " ".join(cmd)) + + try: + result = subprocess.run( + cmd, + shell=False, # noqa: S603 -- BLAT args are all internally derived constants, not user input, so shell=False is safe here + capture_output=True, + timeout=600, + ) + except subprocess.TimeoutExpired as e: + msg = f"BLAT timed out after 600 s: {target_args} {query_file} {out_file}" + raise AlignmentError(msg) from e + except FileNotFoundError as e: + raise BlatNotFoundError from e _logger.debug("BLAT command finished with result %s", result.returncode) + if result.returncode == 127: raise BlatNotFoundError if result.returncode != 0: msg = f"BLAT process returned error code {result.returncode}: {target_args} {query_file} {out_file}" raise AlignmentError(msg) - return result + + params = { + "aligner": "blat", + "aligner_version": _get_blat_version(bin_name), + "min_score": min_score, + "out_format": out_format, + "target_args": target_args, + } + return result, params def _write_blat_output_tempfile(result: subprocess.CompletedProcess) -> str: - """Create temp BLAT output file. Not immediately deleted, but should eventually - be cleared by the OS. + """Write BLAT pslx stdout to a tempfile and return its path. + + BLAT is invoked with ``-out=pslx``, so the file contains the standard 21 + PSL columns plus 2 extra columns (tseqs, qseqs) of aligned bases. + ``Bio.Align.parse(..., "psl")`` reads this directly. :param result: BLAT process result object :return: path-like string representing file location @@ -145,6 +226,379 @@ def _write_blat_output_tempfile(result: subprocess.CompletedProcess) -> str: return tmp.name +def _count_query_insert_blocks(aln: "BioAlign.Alignment") -> int: + """Count gap-open events where extra bases appear in the query (query inserts). + + Corresponds to PSL ``qNumInsert`` — the **number of gap-opening blocks** + in the query, not the total number of inserted bases (that would be + ``qBaseInsert``). A single contiguous run of inserted query bases counts + as one, regardless of how many bases it spans. + (PSL field reference: https://genome.ucsc.edu/FAQ/FAQformat.html#format2) + + Used for BLAT-style identity and score, which penalize query inserts but + ignore target inserts (introns). + """ + coords = aln.coordinates + n = 0 + for i in range(coords.shape[1] - 1): + ts, te = int(coords[0][i]), int(coords[0][i + 1]) + qs, qe = int(coords[1][i]), int(coords[1][i + 1]) + if ts == te and qs != qe: + n += 1 + + return n + + +def _count_target_insert_blocks(aln: "BioAlign.Alignment") -> int: + """Count gap-open events where the target advances but the query does not. + + Corresponds to PSL ``tNumInsert`` — the **number of gap-opening blocks** + in the target, not the total bases skipped (that would be + ``tBaseInsert``). A single contiguous deleted region in the query counts + as one block, regardless of its length. + (PSL field reference: https://genome.ucsc.edu/FAQ/FAQformat.html#format2) + + Together with ``qNumInsert`` these make up the insert penalty in the + canonical BLAT PSL score formula. + """ + coords = aln.coordinates + n = 0 + for i in range(coords.shape[1] - 1): + ts, te = int(coords[0][i]), int(coords[0][i + 1]) + qs, qe = int(coords[1][i]), int(coords[1][i + 1]) + if qs == qe and ts != te: + n += 1 + + return n + + +def _blat_score(aln: "BioAlign.Alignment") -> int: + """Compute the BLAT PSL score for a single HSP. + + Equivalent to the ``score`` column that BLAT writes to PSL output:: + + score = matches - misMatches - qNumInsert - tNumInsert + + This formula implements the canonical PSL scoring described in the UCSC + BLAT documentation (https://genome.ucsc.edu/FAQ/FAQblat.html#blat4) and + matches the ``score`` field definition in the PSL format spec + (https://genome.ucsc.edu/FAQ/FAQformat.html#format2): + ``matches - misMatches - qNumInsert - tNumInsert``. + + **Block count, not base count:** ``qNumInsert`` and ``tNumInsert`` are + gap-open event counts (one per contiguous inserted run), not total inserted + bases (those are ``qBaseInsert`` / ``tBaseInsert`` in PSL). The helpers + :func:`_count_query_insert_blocks` and :func:`_count_target_insert_blocks` + implement this correctly by counting coordinate transitions where one + dimension is zero-width — each such transition is exactly one gap-open + block in Biopython's alignment representation. + + This was the original basis for best-HSP selection before the BioPython + port and correctly penalises both mismatches and gap-open events without + double-counting individual gap bases. Penalising target inserts is safe + here because ``_get_best_hsp`` operates on a chromosome-filtered list + where target gaps do not represent tolerated intron splices. + + :param aln: Bio.Align.Alignment representing a single BLAT HSP + :return: integer BLAT score (can be negative for very noisy alignments) + """ + counts = aln.counts() + q_inserts = _count_query_insert_blocks(aln) + t_inserts = _count_target_insert_blocks(aln) + return int(counts.identities) - int(counts.mismatches) - q_inserts - t_inserts + + +def _blat_style_identity( + identities: int, mismatches: int, query_insert_blocks: int +) -> float | None: + """Compute BLAT-style percent identity: ``matches / (matches + mismatches + qNumInsert)``. + + Penalizes real query indels but ignores target gaps (intron-style splits), + matching what BLAT itself reports for spliced cDNA-to-genome alignments. + """ + denom = identities + mismatches + query_insert_blocks + if denom == 0: + _logger.warning( + "BLAT identity denominator is zero — alignment has no matches, mismatches, or query inserts." + ) + return None + + return 100.0 * identities / denom + + +def _compact_alignment_string(aln_str: str) -> str: + """Collapse consecutive all-gap display blocks in a Biopython alignment string. + + Biopython renders every 60-bp genomic window as a three-line display group + (target / match / query). For cDNA→genome alignments the intronic regions + produce long runs of ``?`` characters in the target row and ``-`` in the + match/query rows, ballooning the string to tens of thousands of characters. + + This function replaces each run of consecutive all-gap groups (where the + target sequence is entirely ``?``) with a single summary line:: + + ... [4,560 bp gap: chr14:90400123-90404683] ... + + Groups that contain any real sequence characters are left untouched, so + exon-intron boundaries and the flanking partial-gap rows remain visible. + + The function is a no-op on protein or purely genomic alignments that + contain no ``?`` characters. + """ + groups = aln_str.split("\n\n") + output: list[str] = [] + + # State for the currently-accumulating gap run. + gap_chrom: str | None = None + gap_start: int | None = None + gap_end: int | None = None + + def _flush_gap() -> None: + nonlocal gap_chrom, gap_start, gap_end + if gap_start is not None: + span = gap_end - gap_start # type: ignore[operator] + output.append( + f"... [{span:,} bp gap: {gap_chrom}:{gap_start}-{gap_end}] ..." + ) + gap_chrom = gap_start = gap_end = None + + for group in groups: + lines = group.strip("\n").splitlines() + if len(lines) != 3: + _flush_gap() + if group.strip(): + output.append(group) + continue + + target_parts = lines[0].split() + # Expect at least: + if len(target_parts) < 3: + _flush_gap() + output.append(group) + continue + + try: + t_start = int(target_parts[1]) + except ValueError: + _flush_gap() + output.append(group) + continue + + seq = target_parts[2] + + if all(c == "?" for c in seq): + # Pure-gap block — accumulate into the current run. + t_end = t_start + len(seq) + if gap_start is None: + gap_chrom = target_parts[0] + gap_start = t_start + gap_end = t_end + else: + gap_end = t_end + else: + _flush_gap() + output.append(group) + + _flush_gap() + return "\n\n".join(output) + + +def _log_alignment_summary(label: str, result: "AlignmentResult") -> None: + """Emit a single human-readable INFO log summarizing a finished alignment. + + ``label`` identifies the alignment context (target gene name, or + ``" (protein)"`` for protein-to-protein BLAT). The pairwise + visualization is logged at DEBUG to keep INFO compact. + """ + qc = result.alignment_qc + score = result.score + next_best = result.next_best_score + pct = result.percent_identity + chrom = result.chrom or "N/A" + span = ( + f"{result.hit_range.start}-{result.hit_range.end}" + if result.hit_range is not None + else "N/A" + ) + mismatches = qc.mismatch_count if qc is not None else "N/A" + gaps = qc.gap_count if qc is not None else "N/A" + aln_len = qc.alignment_length if qc is not None else "N/A" + cigar = qc.cigar if qc is not None else None + + _logger.info( + "Alignment[%s]: chrom=%s span=%s strand=%s " + "score=%s next_best=%s identity=%s length=%s mismatches=%s gaps=%s%s", + label, + chrom, + span, + result.strand.value if result.strand is not None else "N/A", + f"{score:g}" if score is not None else "N/A", + f"{next_best:g}" if next_best is not None else "N/A", + f"{pct:.2f}%" if pct is not None else "N/A", + aln_len, + mismatches, + gaps, + f" cigar={cigar}" if cigar else "", + ) + if qc is not None and qc.alignment_string: + _logger.debug("Alignment[%s] visualization:\n%s", label, qc.alignment_string) + else: + _logger.debug( + "Alignment[%s] visualization unavailable (BLAT output may have lacked aligned sequence content).", + label, + ) + + +def _cigar_from_coords(aln: "BioAlign.Alignment") -> str | None: + """Build a CIGAR string directly from alignment coordinates. + + Biopython's SAM formatter raises ``ValueError: Unequal step sizes`` for + minus-strand PSL alignments because query coordinates are stored in + descending order (step = -1) while target steps are +1. Building the + CIGAR ourselves from absolute step sizes avoids this entirely. + """ + coords = aln.coordinates + assert coords.shape[0] == 2, ( # noqa: S101 + f"_cigar_from_coords expects a pairwise alignment (shape[0]==2), got {coords.shape[0]}" + ) + if coords.shape[1] < 2: + return None + + parts: list[str] = [] + + # Leading soft clip: query bases before the first aligned block. + # min() handles both strand orientations (descending coords for minus-strand). + q_aligned_min = int(min(coords[1][0], coords[1][-1])) + if q_aligned_min > 0: + parts.append(f"{q_aligned_min}S") + + for i in range(coords.shape[1] - 1): + t_step = abs(int(coords[0][i + 1]) - int(coords[0][i])) + q_step = abs(int(coords[1][i + 1]) - int(coords[1][i])) + # query insert: query advances, target does not + if t_step == 0 and q_step > 0: + parts.append(f"{q_step}I") + # deletion: target advances, query does not + elif t_step > 0 and q_step == 0: + parts.append(f"{t_step}D") + # aligned block: both advance (match or mismatch) + elif t_step > 0 and q_step > 0: + parts.append(f"{t_step}M") + # t_step == 0 and q_step == 0: degenerate zero-width column; skip + + # Trailing soft clip. + q_aligned_max = int(max(coords[1][0], coords[1][-1])) + q_len = len(aln.query.seq) # works even when .seq is UndefinedSequenceData + if q_len > q_aligned_max: + parts.append(f"{q_len - q_aligned_max}S") + + return "".join(parts) if parts else None + + +def _build_alignment_qc(aln: "BioAlign.Alignment") -> "AlignmentQc": + """Derive the per-(target, alignment_level) QC block from a Bio.Align + ``Alignment`` parsed out of BLAT's pslx output. + + Aggregate stats come from ``Alignment.counts()``; CIGAR from + ``_cigar_from_coords()``; the pairwise visualization from ``str(aln)``. A single + walk over ``.coordinates`` collects mismatch positions and gap intervals on + the target (genome) for downstream per-variant flagging; those lists live + in memory only and are not serialized. + + When per-base sequence content is unavailable (e.g. BLAT pslx omitted + tseqs/qseqs for cDNA→genome runs), ``mismatch_count`` from + ``Alignment.counts()`` remains accurate but ``mismatch_positions`` will be + empty and ``mismatch_positions_unavailable`` will be ``True``. Callers + must treat ``at_mismatched_locus`` as ``None`` (not evaluated) in that + situation to avoid a silent disagreement where ``mismatch_count > 0`` but + no variant is ever flagged. + """ + counts = aln.counts() + coords = aln.coordinates + mismatch_positions: list[int] = [] + gap_intervals: list[tuple[int, int]] = [] + query_insert_blocks = 0 + + # Per-base mismatch detection requires concrete sequence content for both + # target and query. Biopython's pslx parser sometimes leaves bases outside + # aligned blocks (or even within, depending on BLAT output) as undefined + # placeholders. The aggregate mismatch *count* still comes from + # ``aln.counts()`` below; we treat per-position recording as best-effort + # and set mismatch_positions_unavailable=True if the sequence content isn't + # materialized so callers know at_mismatched_locus is unreliable. + mismatch_positions_unavailable = False + sequences_available = True + for i in range(coords.shape[1] - 1): + ts, te = int(coords[0][i]), int(coords[0][i + 1]) + qs, qe = int(coords[1][i]), int(coords[1][i + 1]) + + # gap in reference (extra bases in query): zero-width interval + # at ts so variants exactly at ts are "near" it within any + # positive window. This is a query insert in PSL terms. + if ts == te and qs != qe: + gap_intervals.append((ts, ts)) + query_insert_blocks += 1 + + # gap in query (target advances, query does not): deletion in the + # query relative to the reference. This is a target insert in PSL + # terms (tNumInsert). Record the full genomic span so variants + # overlapping or adjacent to the deleted region can be flagged as + # near_gap. + elif qs == qe and ts != te: + gap_intervals.append((min(ts, te), max(ts, te))) + + # Aligned block -- target coords are always ascending (ts < te). + # For minus-strand query blocks (qe < qs), Biopython stores the + # query SeqRecord in plus-strand orientation; we need the reverse + # complement of the [qe:qs] slice to match the target bases. + elif ts != te and qs != qe and sequences_available: + try: + t_bases = str(aln.target.seq[ts:te]) + if qe < qs: + q_bases = str(aln.query.seq[qe:qs].reverse_complement()) + else: + q_bases = str(aln.query.seq[qs:qe]) + except UndefinedSequenceError: + _logger.debug( + "Skipping per-base mismatch positions: aligned-block " + "sequence content unavailable in BLAT output. " + "mismatch_count from counts() is still accurate; " + "mismatch_positions_unavailable will be set to True." + ) + mismatch_positions = [] + mismatch_positions_unavailable = True + sequences_available = False + continue + + for j, (tb, qb) in enumerate(zip(t_bases, q_bases, strict=True)): + if qb.upper() != tb.upper(): + mismatch_positions.append(ts + j) + + cigar = _cigar_from_coords(aln) + percent_identity = _blat_style_identity( + counts.identities, counts.mismatches, query_insert_blocks + ) + + # PSL target coordinates are always on the + strand (ascending), so the + # target span equals the number of reference bases consumed by M+D ops — + # which is what alignment_length represents. Using aln.length directly + # raises "Unequal step sizes" for minus-strand alignments because Biopython + # derives length from coordinate steps and rejects descending query strides. + alignment_length = int(coords[0][-1]) - int(coords[0][0]) + + return AlignmentQc( + percent_identity=percent_identity, + alignment_length=alignment_length, + mismatch_count=int(counts.mismatches), + gap_count=len(gap_intervals), + cigar=cigar, + alignment_string=_compact_alignment_string(str(aln)), + mismatch_positions_unavailable=mismatch_positions_unavailable, + mismatch_positions=mismatch_positions, + gap_intervals=gap_intervals, + ) + + def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType | str: """Get overall target sequence type for a score set's target genes. Protein if all target sequences are protein sequences, nucleotide if all target @@ -157,6 +611,7 @@ def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType target_sequence_types.add( metadata.target_genes[target_gene].target_sequence_type ) + if len(target_sequence_types) > 1: return "mixed" elif len(target_sequence_types) == 1: # noqa: RET505 @@ -166,38 +621,72 @@ def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType raise ValueError(msg) +def _group_alignments_by_query( + pslx_path: str, +) -> dict[str, list["BioAlign.Alignment"]]: + """Parse a pslx file and group Bio.Align Alignments by query ID. + + :param pslx_path: path to a pslx-format file produced by BLAT + :return: mapping of query ID to the list of all alignments for that query + :raise ValueError: propagated from Bio.Align.parse on malformed input + """ + grouped: dict[str, list[BioAlign.Alignment]] = {} + for aln in BioAlign.parse(pslx_path, "psl"): + grouped.setdefault(str(aln.query.id), []).append(aln) + + return grouped + + def _get_blat_output( metadata: ScoresetMetadata, silent: bool -) -> dict[str, QueryResult]: - """Run a BLAT query and returns a path to the output object. +) -> tuple[dict[str, list["BioAlign.Alignment"]], dict]: + """Run a BLAT query and return a per-gene grouping of Bio.Align Alignments. - If unable to produce a valid query the first time, then try a query using ``dnax`` - bases. + If unable to produce a valid query the first time, retries with + ``-q=dnax -t=dnax`` flags. - :param scoreset_metadata: object containing scoreset attributes + :param metadata: object containing scoreset attributes :param silent: suppress BLAT command output - :return: dict where keys are target gene identifiers and values are BLAT query result objects - :raise AlignmentError: if BLAT subprocess returns error code + :return: ``(grouped, blat_params)`` — ``grouped`` maps each query gene + label to its alignments; ``blat_params`` is the parameters dict + returned directly by ``_run_blat`` from the successful invocation. + :raise AlignmentError: if BLAT subprocess returns error code or both + parse attempts fail. """ with tempfile.NamedTemporaryFile() as tmp_file: query_file = _build_query_file(metadata, Path(tmp_file.name)) target_sequence_type = _get_target_sequence_type(metadata) + if target_sequence_type == TargetSequenceType.PROTEIN: target_args = "-q=prot -t=dnax" elif target_sequence_type == TargetSequenceType.DNA: target_args = "" else: - # TODO consider implementing support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments. + # TODO consider implementing support for mixed types msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported." raise NotImplementedError(msg) + + _logger.info( + "Running BLAT for %s with mode: %s", + metadata.urn, + target_args if target_args else "default (nucleotide DNA)", + ) + reference_genome_file = get_ref_genome_file(silent=silent) - process_result = _run_blat( - target_args, query_file, reference_genome_file, "/dev/stdout", silent + process_result, blat_params = _run_blat( + target_args, + query_file, + reference_genome_file, + "/dev/stdout", + silent, ) - out_file = _write_blat_output_tempfile(process_result) + pslx_path = _write_blat_output_tempfile(process_result) try: - output = parse_blat(out_file, "blat-psl") + grouped = _group_alignments_by_query(pslx_path) + if not grouped: + msg = f"BLAT produced no alignments for {metadata.urn} with mode: {target_args if target_args else 'default (nucleotide DNA)'}" + raise ValueError(msg) except ValueError: _logger.debug( @@ -205,129 +694,234 @@ def _get_blat_output( metadata.urn, exc_info=True, ) - target_args = "-q=dnax -t=dnax" - process_result = _run_blat( - target_args, query_file, reference_genome_file, "/dev/stdout", silent + + process_result, blat_params = _run_blat( + "-q=dnax -t=dnax", + query_file, + reference_genome_file, + "/dev/stdout", + silent, ) - out_file = _write_blat_output_tempfile(process_result) + pslx_path = _write_blat_output_tempfile(process_result) + try: - output = parse_blat(out_file, "blat-psl") + grouped = _group_alignments_by_query(pslx_path) + if not grouped: + msg = f"BLAT produced no alignments for {metadata.urn} after retry with -q=dnax -t=dnax" + raise ValueError(msg) + except ValueError as e: msg = f"Unable to run successful BLAT on {metadata.urn}" _logger.exception(msg=msg, exc_info=e) raise AlignmentError(msg) from e - return output + _logger.info( + "BLAT -q=dnax -t=dnax retry succeeded for %s (%d query gene(s)).", + metadata.urn, + len(grouped), + ) + + return grouped, blat_params -def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit: - """Get best hit from BLAT output. +def _get_best_hit( + alignments: "list[BioAlign.Alignment]", + chromosome: str | None, + scores: dict[int, int] | None = None, +) -> "list[BioAlign.Alignment]": + """Return the subset of alignments on the best-matching chromosome. - First, try to return hit corresponding to expected chromosome taken from scoreset - metadata. If chromosome doesn't match any of the outputs or is unavailable, take - the hit with the single highest-scoring HSP. + First, try to find alignments on the expected chromosome taken from scoreset + metadata. If no chromosome matches or none is provided, fall back to the + chromosome whose best-scoring alignment has the highest BLAT score. - :param output: BLAT output - :param chromosome: refseq chromosome ID, e.g. ``"NC_000001.11"`` - :return: best Hit - :raise AlignmentError: if unable to get hits from output + :param alignments: all alignments for one query gene + :param chromosome: RefSeq chromosome accession, e.g. ``"NC_000001.11"`` + :param scores: optional pre-computed ``{id(aln): score}`` cache; when + provided avoids redundant ``aln.counts()`` walks inside the fallback. + :return: alignments filtered to the selected chromosome + :raise AlignmentError: if the alignment list is empty """ + if not alignments: + msg = f"Empty alignment list passed to _get_best_hit for chromosome {chromosome or 'N/A'}." + raise AlignmentError(msg) + if chromosome: - if chromosome.startswith("refseq"): + if chromosome.startswith("refseq:"): chromosome = chromosome[7:] - for hit in output: - hit_chr = hit.id + for aln in alignments: + hit_chr = str(aln.target.id) if hit_chr.startswith("chr"): hit_chr = hit_chr[3:] + hit_chr_ac = get_chromosome_identifier(hit_chr) if hit_chr_ac == chromosome: - return hit - else: - if list(output): - hit_chrs = [h.id for h in output] - # TODO should this be an error rather than a warning? it seems like a problem if we can't find a hit on the expected chromosome - _logger.warning( - "Failed to match hit chromosomes during alignment for target %s. Expected chromosome: %s, hit chromosomes: %s", - output.id, - chromosome, - hit_chrs, - ) - - best_score = 0 - best_score_hit = None - for hit in output: - best_local_score = max(hit, key=lambda i: i.score).score - if best_local_score > best_score: - best_score = best_local_score - best_score_hit = hit + matched_chrom = str(aln.target.id) + return [a for a in alignments if str(a.target.id) == matched_chrom] + + # No hit matched the expected chromosome; log and fall through. + hit_chrs = list({str(a.target.id) for a in alignments}) + query_id = str(alignments[0].query.id) + + _logger.warning( + "Failed to match hit chromosomes during alignment for target %s. " + "Expected chromosome: %s, hit chromosomes: %s", + query_id, + chromosome, + hit_chrs, + ) - if best_score_hit is None: - msg = f"Couldn't get BLAT hits for target {output.id}." + # Fallback: find the chromosome with the highest-scoring alignment. + chrom_best: dict[str, float] = {} + for aln in alignments: + chrom = str(aln.target.id) + s = float(scores[id(aln)] if scores is not None else _blat_score(aln)) + if chrom not in chrom_best or s > chrom_best[chrom]: + chrom_best[chrom] = s + + if not chrom_best: + query_id = str(alignments[0].query.id) if alignments else "unknown" + msg = f"Couldn't determine best chromosome for target {query_id} — no valid alignments found." raise AlignmentError(msg) - return best_score_hit + best_chrom = max(chrom_best, key=chrom_best.__getitem__) + query_id = str(alignments[0].query.id) + _logger.info( + "Chromosome fallback selected %s for target %s (best identity score: %.1f, %d candidates).", + best_chrom, + query_id, + chrom_best[best_chrom], + len(chrom_best), + ) + return [a for a in alignments if str(a.target.id) == best_chrom] + + +def _get_best_hsp( + alignments: "list[BioAlign.Alignment]", + gene_location: GeneLocation | None = None, + scores: dict[int, int] | None = None, +) -> "BioAlign.Alignment": + """Retrieve the preferred alignment from a list filtered to one chromosome. + + If gene location data is available, prefer the alignment with the least + distance between its target start and the known gene start coordinate. + Otherwise, take the alignment with the highest BLAT score. + + :param alignments: alignments on a single chromosome for one query gene + :param gene_location: location data acquired by normalising scoreset metadata + :param scores: optional pre-computed ``{id(aln): score}`` cache; when + provided avoids redundant ``aln.counts()`` walks. + :return: preferred alignment + :raise AlignmentError: if the list is empty + """ + if not alignments: + msg = f"Empty alignment list passed to _get_best_hsp for gene location {gene_location or 'N/A'}." + raise AlignmentError(msg) -def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None = None) -> HSP: - """Retrieve preferred HSP from BLAT Hit object. + if gene_location and gene_location.start is not None: + best = min( + alignments, + key=lambda a: abs(int(a.coordinates[0][0]) - gene_location.start), + ) - If gene location data is available, prefer the HSP with the least distance - between the start of the hit and the start coordinate of the gene. Otherwise, - take the HSP with the highest score value. + _logger.info( + "Selected HSP by gene-location proximity (gene start: %d, hit start: %d, %d candidate(s)).", + gene_location.start, + int(best.coordinates[0][0]), + len(alignments), + ) + return best - :param hit: hit object from BLAT result - :param gene_location: location data acquired by normalizing scoreset metadata - :return: Preferred HSP object - :raise AlignmentError: if hit object appears to be empty (should be impossible) - """ - best_hsp = None - if gene_location and gene_location.start is not None: - best_hsp = min(hit, key=lambda hsp: abs(hsp.hit_start - gene_location.start)) - else: - best_hsp = max(hit, key=lambda hsp: hsp.score) - if best_hsp is None: - msg = f"Unable to get best HSP from BLAT hit: {hit}" - raise AlignmentError(msg) - return best_hsp + best = max( + alignments, + key=lambda a: scores[id(a)] if scores is not None else _blat_score(a), + ) + _logger.info( + "Selected HSP by best BLAT score (%d, %d candidate(s)).", + scores[id(best)] if scores is not None else _blat_score(best), + len(alignments), + ) + return best -def _get_best_match(output: QueryResult, target_gene: TargetGene) -> AlignmentResult: - """Obtain best high-scoring pairs (HSP) object for query sequence. - :param metadata: scoreset metadata - :param output: BLAT result object - :return: alignment result ?? +def _get_best_match( + alignments: "list[BioAlign.Alignment]", + target_gene: TargetGene, + seq_len: int, + blat_params: dict, +) -> AlignmentResult: + """Obtain the best-matching alignment for one query gene. + + :param alignments: all alignments for this gene across all chromosomes + :param target_gene: target gene metadata (used for gene-location lookup) + :param seq_len: query sequence length (for coverage calculation) + :param blat_params: parameters dict returned by ``_run_blat`` + :return: AlignmentResult """ location = get_gene_location(target_gene) chromosome = location.chromosome if location else None - best_hit = _get_best_hit(output, chromosome) - best_hsp = _get_best_hsp(best_hit, location) - strand = Strand.POSITIVE if best_hsp[0].query_strand == 1 else Strand.NEGATIVE - coverage = 100 * (best_hsp.query_end - best_hsp.query_start) / output.seq_len - identity = best_hsp.ident_pct - chrom = best_hsp.hit_id + # Pre-compute BLAT scores once per alignment — each call to _blat_score walks + # the alignment coordinates, so caching avoids O(n * alignment_length) work + # when the same alignments are ranked by multiple selection helpers. + _scores: dict[int, int] = {id(a): _blat_score(a) for a in alignments} + chrom_alns = _get_best_hit(alignments, chromosome, _scores) + best_aln = _get_best_hsp(chrom_alns, location, _scores) + + best_counts = best_aln.counts() + others = [_scores[id(a)] for a in alignments if a is not best_aln] + next_best = float(max(others)) if others else None + + coords = best_aln.coordinates + tcoords = coords[0] + qcoords = coords[1] + + strand = Strand.POSITIVE if int(qcoords[0]) <= int(qcoords[-1]) else Strand.NEGATIVE + q_start = int(qcoords.min()) + q_end = int(qcoords.max()) + + coverage = 100.0 * (q_end - q_start) / seq_len if seq_len else 0.0 + identity = _blat_style_identity( + best_counts.identities, + best_counts.mismatches, + _count_query_insert_blocks(best_aln), + ) + chrom = str(best_aln.target.id) - query_subranges = [] - hit_subranges = [] - for fragment in best_hsp: - query_subranges.append( - SequenceRange(start=fragment.query_start, end=fragment.query_end) - ) - hit_subranges.append( - SequenceRange(start=fragment.hit_start, end=fragment.hit_end) - ) + query_subranges: list[SequenceRange] = [] + hit_subranges: list[SequenceRange] = [] + for i in range(coords.shape[1] - 1): + ts, te = int(tcoords[i]), int(tcoords[i + 1]) + qs, qe = int(qcoords[i]), int(qcoords[i + 1]) + + # skip gap-only segments (BLAT-style identity ignores target gaps + # and penalizes query gaps, so these don't represent real aligned + # sequence) + if ts == te or qs == qe: + continue + + hit_subranges.append(SequenceRange(start=ts, end=te)) + query_subranges.append(SequenceRange(start=min(qs, qe), end=max(qs, qe))) + + alignment_qc = _build_alignment_qc(best_aln) return AlignmentResult( + aligner_parameters=blat_params, + reference_assembly=REFERENCE_GENOME_ASSEMBLY, chrom=chrom, strand=strand, - ident_pct=identity, + percent_identity=identity, coverage=coverage, - query_range=SequenceRange(start=best_hsp.query_start, end=best_hsp.query_end), + query_range=SequenceRange(start=q_start, end=q_end), query_subranges=query_subranges, - hit_range=SequenceRange(start=best_hsp.hit_start, end=best_hsp.hit_end), + hit_range=SequenceRange(start=int(tcoords[0]), end=int(tcoords[-1])), hit_subranges=hit_subranges, + score=float(_scores[id(best_aln)]), + next_best_score=next_best, + alignment_qc=alignment_qc, ) @@ -340,20 +934,21 @@ def align( :param silent: suppress BLAT process output if true :return: dictionary where keys are target gene identifiers and values are alignment result objects """ - blat_output = _get_blat_output(scoreset_metadata, silent) + grouped, blat_params = _get_blat_output(scoreset_metadata, silent) alignment_results = {} - for blat_result in blat_output: - target_label = blat_result.id - # blat names the result id "query" if there is only one query; replace "query" with the target gene name for single-target score sets + for blat_id, aln_list in grouped.items(): + target_label = blat_id + + # BLAT names the result "query" when there is only one query sequence; + # replace with the actual target gene name for single-target score sets. if target_label == "query" and len(scoreset_metadata.target_genes) == 1: target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015 - # blat automatically reformats query names, so sometimes they don't match our metadata + + # BLAT automatically reformats query names; resolve back to metadata key. if target_label not in scoreset_metadata.target_genes: - # if single-target score set, don't need to match by name if len(scoreset_metadata.target_genes) == 1: target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015 else: - # try to match query name to a target gene in the metadata matches = 0 for target_gene_name in scoreset_metadata.target_genes: blat_target_gene_name = ( @@ -362,35 +957,53 @@ def align( .replace(")", "") .replace(",", "") ) - if blat_target_gene_name == target_label: + + if blat_target_gene_name == blat_id: target_label = target_gene_name matches += 1 - # we may be missing some blat reformatting rules here - if so, this error will be thrown + if matches == 0: - msg = f"BLAT result {target_label} does not match any target gene names in scoreset {scoreset_metadata.urn}." + msg = f"BLAT result {blat_id} does not match any target gene names in scoreset {scoreset_metadata.urn}." raise AlignmentError(msg) if matches > 1: - # could happen if multiple target genes have the same first word in their label (unlikely) - msg = f"BLAT result {target_label} matches multiple target gene names in scoreset {scoreset_metadata.urn}" + msg = f"BLAT result {blat_id} matches multiple target gene names in scoreset {scoreset_metadata.urn}" + raise AlignmentError(msg) + target_gene = scoreset_metadata.target_genes[target_label] - alignment_results[target_label] = _get_best_match(blat_result, target_gene) + seq_len = len(target_gene.target_sequence or "") + alignment_results[target_label] = _get_best_match( + aln_list, target_gene, seq_len, blat_params + ) + _log_alignment_summary(target_label, alignment_results[target_label]) + # confirm that there is an alignment result for each target gene - for target_gene in scoreset_metadata.target_genes: - if target_gene not in alignment_results: - msg = f"No BLAT result found for target gene {target_gene} in scoreset {scoreset_metadata.urn}" + for tgt_gene in scoreset_metadata.target_genes: + if tgt_gene not in alignment_results: + msg = f"No BLAT result found for target gene {tgt_gene} in scoreset {scoreset_metadata.urn}" raise AlignmentError(msg) + return alignment_results def fetch_alignment( metadata: ScoresetMetadata, silent: bool ) -> dict[str, AlignmentResult | None]: - alignment_results = {} + alignment_results: dict[str, AlignmentResult | None] = {} for target_gene in metadata.target_genes: accession_id = metadata.target_genes[target_gene].target_accession_id + if not accession_id: + msg = f"Target gene {target_gene} in scoreset {metadata.urn} is missing an accession ID, which is required for fetching CDOT alignments." + raise AlignmentError(msg) + # protein and contig/chromosome accession ids do not need to be aligned to the genome if accession_id.startswith(("NP", "ENSP", "NC_")): + _logger.info( + "Skipping BLAT for %s (%s): direct protein/contig accession — alignment not required.", + target_gene, + accession_id, + ) alignment_results[accession_id] = None + else: url = f"{CDOT_URL}/transcript/{accession_id}" r = httpx.get(url, timeout=30) @@ -403,7 +1016,18 @@ def fetch_alignment( raise ResourceAcquisitionError(msg) from e cdot_mapping = r.json() - alignment_results[accession_id] = parse_cdot_mapping(cdot_mapping, silent) + cdot_alignment = parse_cdot_mapping(cdot_mapping, silent) + alignment_results[accession_id] = cdot_alignment + + _logger.info( + "CDOT alignment parsed for %s (%s): %s, strand=%s, %d exon(s)", + target_gene, + accession_id, + cdot_alignment.chrom, + cdot_alignment.strand, + len(cdot_alignment.hit_subranges), + ) + return alignment_results @@ -445,6 +1069,12 @@ def parse_cdot_mapping(cdot_mapping: dict, silent: bool) -> AlignmentResult: start=hit_subranges[-1].start, end=hit_subranges[0].end ) + reference_assembly = "GRCh38" if grch38 else "GRCh37" + cdot_data_version = cdot_mapping.get("cdot_data_version") + # Always emit the dict so consumers can distinguish + # "cdot run, version unknown" (cdot_data_version=None) from "not a cdot run" (None). + aligner_parameters: dict = {"cdot_data_version": cdot_data_version} + return AlignmentResult( chrom=chrom, strand=strand, @@ -452,6 +1082,8 @@ def parse_cdot_mapping(cdot_mapping: dict, silent: bool) -> AlignmentResult: query_subranges=query_subranges, hit_range=hit_range, hit_subranges=hit_subranges, + aligner_parameters=aligner_parameters, + reference_assembly=reference_assembly, ) @@ -477,6 +1109,13 @@ def build_alignment_result( raise ScoresetNotSupportedError(msg) score_set_type = "sequence" + _logger.info( + "Building alignment result for %s (path: %s, targets: %s)", + metadata.urn, + score_set_type, + list(metadata.target_genes.keys()), + ) + if score_set_type == "sequence": try: alignment_result = align(metadata, silent) @@ -497,6 +1136,12 @@ def build_alignment_result( alignment_result = align( patch_target_sequence_type(metadata, {}, force=True), silent ) + _logger.info( + "Protein-level BLAT retry succeeded for %s. " + "The nucleotide-level alignment result has been discarded and replaced with the protein-level result.", + metadata.urn, + ) + except AlignmentError as e2: msg = f"BLAT alignment failed for {metadata.urn} at the protein level after failing at the nucleotide level." _logger.error(msg) @@ -511,45 +1156,83 @@ def build_alignment_result( def align_target_to_protein( target_sequence: str, reference_sequence: str, silent: bool ) -> AlignmentResult: + """Align a protein target sequence against a reference protein using BLAT. + + :param target_sequence: query protein sequence + :param reference_sequence: reference protein sequence + :param silent: suppress BLAT process output if true + :return: AlignmentResult for the best alignment + :raise AlignmentError: if BLAT produces no usable alignment + """ + target_args = "-q=prot -t=prot" with tempfile.NamedTemporaryFile() as query_tmp_file, tempfile.NamedTemporaryFile() as reference_tmp_file: _write_query_file(Path(query_tmp_file.name), [">query", target_sequence]) _write_query_file( Path(reference_tmp_file.name), [">reference", reference_sequence] ) - target_args = "-q=prot -t=prot" - process_result = _run_blat( + process_result, blat_params = _run_blat( target_args, query_tmp_file.name, reference_tmp_file.name, "/dev/stdout", silent, ) - out_file = _write_blat_output_tempfile(process_result) + pslx_path = _write_blat_output_tempfile(process_result) try: - blat_output = read_blat(out_file, "blat-psl") + alignments = list(BioAlign.parse(pslx_path, "psl")) + if not alignments: + raise ValueError except ValueError as e: msg = "Unable to run successful BLAT on target sequence against selected reference protein sequence." + _logger.error(msg) raise AlignmentError(msg) from e - best_hit = _get_best_hit(blat_output, None) - best_hsp = _get_best_hsp(best_hit) - - query_subranges = [] - hit_subranges = [] - for fragment in best_hsp: - query_subranges.append( - SequenceRange(start=fragment.query_start, end=fragment.query_end) - ) - hit_subranges.append( - SequenceRange(start=fragment.hit_start, end=fragment.hit_end) - ) - - return AlignmentResult( - query_range=SequenceRange(start=best_hsp.query_start, end=best_hsp.query_end), + _scores: dict[int, int] = {id(a): _blat_score(a) for a in alignments} + best_aln = max(alignments, key=lambda a: _scores[id(a)]) + best_counts = best_aln.counts() + others = [_scores[id(a)] for a in alignments if a is not best_aln] + next_best = float(max(others)) if others else None + + coords = best_aln.coordinates + tcoords = coords[0] + qcoords = coords[1] + + query_subranges: list[SequenceRange] = [] + hit_subranges: list[SequenceRange] = [] + for i in range(coords.shape[1] - 1): + ts, te = int(tcoords[i]), int(tcoords[i + 1]) + qs, qe = int(qcoords[i]), int(qcoords[i + 1]) + if ts == te or qs == qe: + continue + hit_subranges.append(SequenceRange(start=ts, end=te)) + query_subranges.append(SequenceRange(start=min(qs, qe), end=max(qs, qe))) + + # Attach full sequences so _build_alignment_qc can do per-base mismatch + # detection. BLAT protein pslx does not populate tseqs/qseqs columns, so + # the parser leaves SeqRecords with undefined content; we supply the same + # strings we passed to BLAT, which share coordinate space with the PSL output. + best_aln.target.seq = Seq(reference_sequence) + best_aln.query.seq = Seq(target_sequence) + alignment_qc = _build_alignment_qc(best_aln) + + result = AlignmentResult( + query_range=SequenceRange(start=int(qcoords.min()), end=int(qcoords.max())), query_subranges=query_subranges, - hit_range=SequenceRange(start=best_hsp.hit_start, end=best_hsp.hit_end), + hit_range=SequenceRange(start=int(tcoords[0]), end=int(tcoords[-1])), hit_subranges=hit_subranges, + percent_identity=_blat_style_identity( + best_counts.identities, + best_counts.mismatches, + _count_query_insert_blocks(best_aln), + ), + score=float(_scores[id(best_aln)]), + next_best_score=next_best, + alignment_qc=alignment_qc, + aligner_parameters=blat_params, ) + + _log_alignment_summary("protein->protein", result) + return result diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 941f45f..2f6c291 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -1,10 +1,13 @@ """Annotate MaveDB score set metadata with mapped scores.""" +import bisect import datetime import json import logging from pathlib import Path +from typing import Any +import cdot import hgvs.edit import hgvs.location import hgvs.parser @@ -34,7 +37,7 @@ get_ucsc_chromosome_name, get_vrs_id_from_identifier, ) -from dcd_mapping.resource_utils import LOCAL_STORE_PATH +from dcd_mapping.resource_utils import CDOT_URL, LOCAL_STORE_PATH from dcd_mapping.schemas import ( AlignmentResult, ComputedReferenceSequence, @@ -42,20 +45,175 @@ MappedReferenceSequence, MappedScore, ScoreAnnotation, - ScoreAnnotationWithLayer, ScoresetMapping, ScoresetMetadata, TargetAnnotation, TargetGene, + TargetMapping, TargetSequenceType, TargetType, TxSelectResult, VrsVersion, ) from dcd_mapping.transcripts import TxSelectError +from dcd_mapping.version import dcd_mapping_version _logger = logging.getLogger(__name__) +# How close (in reference bases) a variant must be to an alignment gap before +# we flag it as ``near_gap``. This is somewhat arbitrary, but seems like a reasonable window to +# capture potential edge effects without being so large that it flags a large +# fraction of variants as near-gap. We might consider re-evaluating this window with more +# rigor in the future, or making it configurable to support situations where callers +# want to be more or less conservative in their definition of "near". +NEAR_GAP_WINDOW = 5 + + +def _allele_ref_positions(post_mapped: object) -> list[tuple[int, int]]: + """Return the reference-frame half-open intervals covered by an Allele or + Haplotype-of-Alleles. + + Returns an empty list for: + - Unrecognised shapes (e.g. pre-VRS-2.0 representations) — caller leaves + locus flags as None. + - ReferenceLengthExpression alleles (reference-identical variants) — their + span covers the entire reference allele unchanged, so per-variant locus + flags carry no meaningful audit value (and could span entire chromosomes). + """ + members = ( + post_mapped.members if isinstance(post_mapped, Haplotype) else [post_mapped] + ) + intervals: list[tuple[int, int]] = [] + for member in members: + if ( + not isinstance(member, Allele) + or member.location is None + or isinstance(member.state, ReferenceLengthExpression) + ): + continue + + try: + start = int(member.location.start) + end = int(member.location.end) + except (TypeError, AttributeError, ValueError): + continue + + # SequenceLocation is half-open [start, end). Ensure we don't produce + # a zero-length interval when start == end. + intervals.append((start, max(start + 1, end))) + + return intervals + + +def _stamp_alignment_locus_flags( + annotations: list[ScoreAnnotation], + align_result: AlignmentResult | None, + target_layer: AnnotationLayer, + near_gap_window: int = NEAR_GAP_WINDOW, +) -> None: + """Stamp ``at_mismatched_locus`` and ``near_gap`` on each annotation whose + ``annotation_layer`` matches ``target_layer``, using positions from the + given alignment's QC block. + + The caller is responsible for pairing each layer with its own alignment: + GENOMIC variants get checked against the BLAT genomic alignment; PROTEIN + variants get checked against the target-protein-to-reference-protein + alignment. CDNA flags are not currently evaluated -- we'd need either a + dedicated cDNA alignment or to translate genomic mismatch/gap positions + through exon structure -- cdna annotations leave the flags as None to + signal "not evaluated". + + When ``AlignmentQc.mismatch_positions_unavailable`` is ``True`` and + ``mismatch_count > 0``, the per-base positions are absent but the aggregate + count is non-zero. In that case ``at_mismatched_locus`` is left as ``None`` + (not evaluated) for all variants to prevent a silent disagreement between + ``mismatch_count`` and the per-variant flag. ``near_gap`` is still evaluated + from gap intervals, which are always coordinate-derived. + """ + if align_result is None or align_result.alignment_qc is None: + return + + qc = align_result.alignment_qc + mismatches = sorted(qc.mismatch_positions) + gap_intervals = sorted(qc.gap_intervals) + + # When sequence content was unavailable during QC computation, mismatch_count + # may be non-zero but mismatch_positions is empty. Marking at_mismatched_locus + # as False would silently disagree with mismatch_count; leave it as None instead. + positions_reliable = not ( + qc.mismatch_positions_unavailable and qc.mismatch_count > 0 + ) + + layer_anns = [ + ann + for ann in annotations + if ann.alignment_level == target_layer and ann.post_mapped is not None + ] + if not layer_anns: + return + + # Explicitly record that this target/layer had no mismatches or gaps. Use + # mismatch_count (not the positions list) as the source of truth: when + # mismatch_positions_unavailable is True the list is empty even though + # mismatches exist, so testing `not mismatches` would be a faulty premise. + # mismatch_count comes from aln.counts() which is purely coordinate-based + # and is always accurate regardless of sequence availability. + if qc.mismatch_count == 0 and not gap_intervals: + for ann in layer_anns: + ann.at_mismatched_locus = False + ann.near_gap = False + return + + # Pre-compute sorted lists for O(log G) gap overlap queries. + gap_starts = [gs for gs, _ge in gap_intervals] + gap_ends = [ge for _gs, ge in gap_intervals] + + for ann in layer_anns: + intervals = _allele_ref_positions(ann.post_mapped) + if not intervals: + # Empty means either RLE allele or unrecognised shape; leave flags None. + continue + + at_mismatch = False + near_g = False + for int_start, int_end in intervals: + # -- Mismatch check: any mismatch position in [int_start, int_end)? O(log M) + if mismatches and not at_mismatch: + idx = bisect.bisect_left(mismatches, int_start) + if idx < len(mismatches) and mismatches[idx] < int_end: + at_mismatch = True + + # -- Gap proximity check: any gap overlaps [int_start-W, int_end+W)? O(log G) + # A gap (gs, ge) overlaps window [L, R) iff gs < R AND ge > L. + # With sorted (non-overlapping) gap intervals: + # Case 1: gap start in [L, R) — definitively overlaps. + # Case 2: gap start < L — overlaps only if its end > L. + if not near_g: + win_l = int_start - near_gap_window + win_r = int_end + near_gap_window + first_at_or_after_l = bisect.bisect_left(gap_starts, win_l) + first_at_or_after_r = bisect.bisect_left(gap_starts, win_r) + # At least one gap starts in [win_l, win_r) OR the last gap starting before win_l extends into [win_l, win_r). + if first_at_or_after_l < first_at_or_after_r or ( + first_at_or_after_l > 0 + and gap_ends[first_at_or_after_l - 1] > win_l + ): + near_g = True + + if at_mismatch and near_g: + break + + # Stamp flags. + # Note: when positions_reliable is False (mismatch_positions_unavailable + # AND count>0), at_mismatch will always be False here because mismatches + # is an empty list — we cannot trust it. Leaving at_mismatched_locus as + # None (not evaluated) rather than stamping a misleading False. The + # near_gap branch is unaffected: gap_intervals are coordinate-derived and + # always reliable, so near_g is evaluated regardless. + if positions_reliable: + ann.at_mismatched_locus = at_mismatch + ann.near_gap = near_g + async def compute_target_gene_info( target_key: str, @@ -353,7 +511,7 @@ def _iter_genomic_spans_from_mapped_scores( """ spans: list[tuple[str, int, int]] = [] for ms in mapped_scores: - if ms.annotation_layer != AnnotationLayer.GENOMIC or ms.post_mapped is None: + if ms.alignment_level != AnnotationLayer.GENOMIC or ms.post_mapped is None: continue if isinstance(ms.post_mapped, Allele): loc = ms.post_mapped.location @@ -559,7 +717,7 @@ def _get_hgvs_string(allele: Allele, accession: str) -> tuple[str, Syntax]: if alt == "": edit = "del" - if edit != "dup" or edit != "del" or edit != "=": + if edit not in ("dup", "del", "="): edit = ( hgvs.edit.AARefAlt(ref=ref, alt=alt) if syntax == Syntax.HGVS_P @@ -589,7 +747,7 @@ def _annotate_allele_mapping( metadata: TargetGene, urn: str, vrs_version: VrsVersion = VrsVersion.V_2, -) -> ScoreAnnotationWithLayer: +) -> ScoreAnnotation: """Perform annotations and, if necessary, create VRS 1.3 equivalents for allele mappings.""" pre_mapped: Allele = mapped_score.pre_mapped post_mapped: Allele = mapped_score.post_mapped @@ -607,7 +765,7 @@ def _annotate_allele_mapping( if post_mapped: # Determine reference sequence - if mapped_score.annotation_layer == AnnotationLayer.GENOMIC: + if mapped_score.alignment_level == AnnotationLayer.GENOMIC: sequence_id = f"ga4gh:{mapped_score.post_mapped.location.sequenceReference.refgetAccession}" accession = get_chromosome_identifier_from_vrs_id(sequence_id) if accession is None: @@ -641,14 +799,14 @@ def _annotate_allele_mapping( pre_mapped = _allele_to_vod(pre_mapped) post_mapped = _allele_to_vod(post_mapped) if post_mapped else None - return ScoreAnnotationWithLayer( + return ScoreAnnotation( pre_mapped=pre_mapped, post_mapped=post_mapped, vrs_version=vrs_version, mavedb_id=mapped_score.accession_id, score=float(mapped_score.score) if mapped_score.score is not None else None, - annotation_layer=mapped_score.annotation_layer, error_message=mapped_score.error_message, + alignment_level=mapped_score.alignment_level, ) @@ -658,7 +816,7 @@ def _annotate_haplotype_mapping( metadata: TargetGene, urn: str, vrs_version: VrsVersion = VrsVersion.V_2, -) -> ScoreAnnotationWithLayer: +) -> ScoreAnnotation: """Perform annotations and, if necessary, create VRS 1.3 equivalents for haplotype mappings.""" pre_mapped: Haplotype = mapped_score.pre_mapped # type: ignore post_mapped: Haplotype = mapped_score.post_mapped # type: ignore @@ -674,7 +832,7 @@ def _annotate_haplotype_mapping( if post_mapped: # Determine reference sequence - if mapped_score.annotation_layer == AnnotationLayer.GENOMIC: + if mapped_score.alignment_level == AnnotationLayer.GENOMIC: sequence_id = f"ga4gh:{post_mapped.members[0].location.sequenceReference.refgetAccession}" accession = get_chromosome_identifier_from_vrs_id(sequence_id) if accession is None: @@ -712,14 +870,14 @@ def _annotate_haplotype_mapping( pre_mapped = _haplotype_to_haplotype_1_3(pre_mapped) post_mapped = _haplotype_to_haplotype_1_3(post_mapped) if post_mapped else None - return ScoreAnnotationWithLayer( + return ScoreAnnotation( pre_mapped=pre_mapped, post_mapped=post_mapped, vrs_version=vrs_version, mavedb_id=mapped_score.accession_id, score=float(mapped_score.score) if mapped_score.score is not None else None, - annotation_layer=mapped_score.annotation_layer, error_message=mapped_score.error_message, + alignment_level=mapped_score.alignment_level, ) @@ -729,7 +887,7 @@ def annotate( metadata: TargetGene, urn: str, vrs_version: VrsVersion = VrsVersion.V_2, -) -> list[ScoreAnnotationWithLayer]: +) -> list[ScoreAnnotation]: """Given a list of mappings, add additional contextual data: 1. ``vrs_ref_allele_seq``: The sequence between the start and end positions @@ -751,13 +909,14 @@ def annotate( for mapped_score in mapped_scores: if mapped_score.pre_mapped is None: score_annotations.append( - ScoreAnnotationWithLayer( + ScoreAnnotation( mavedb_id=mapped_score.accession_id, score=float(mapped_score.score) if mapped_score.score is not None else None, vrs_version=vrs_version, error_message=mapped_score.error_message, + alignment_level=mapped_score.alignment_level, ) ) elif isinstance(mapped_score.pre_mapped, Haplotype) and ( @@ -780,7 +939,7 @@ def annotate( ) else: score_annotations.append( - ScoreAnnotationWithLayer( + ScoreAnnotation( pre_mapped=mapped_score.pre_mapped, post_mapped=mapped_score.post_mapped, vrs_version=vrs_version, @@ -789,6 +948,7 @@ def annotate( if mapped_score.score is not None else None, error_message=f"Multiple issues with annotation: Inconsistent variant structure (Allele and Haplotype mix).{' ' + mapped_score.error_message if mapped_score.error_message else ''}", + alignment_level=mapped_score.alignment_level, ) ) @@ -891,16 +1051,44 @@ def _get_mapped_reference_sequence( ) -def _set_scoreset_layer( - urn: str, mappings: list[ScoreAnnotationWithLayer] +def _align_result_for_target( + target_gene: str, + metadata: ScoresetMetadata, + align_results: dict[str, "AlignmentResult"], +) -> "AlignmentResult | None": + """Return the AlignmentResult for a target, handling both sequence-based and + accession-based targets. + + Sequence-based targets key ``align_results`` by target gene name. + Accession-based targets key it by ``target_accession_id`` (set by + ``parse_cdot_mapping`` in ``align.py``). This helper does the fallback so + callers don't have to repeat the lookup logic. + """ + result = align_results.get(target_gene) + if result is None: + accession_id = metadata.target_genes[target_gene].target_accession_id + if accession_id: + result = align_results.get(accession_id) + + return result + + +def _pick_preferred_genomic_or_protein_layer( + mappings: list[ScoreAnnotation], ) -> AnnotationLayer: - """Many individual score results provide both genomic and protein variant - expressions. If genomic expressions are available, that's what we'd like to use. - This function tells us how to filter the final annotation objects. + """Return GENOMIC if any annotation has that layer, else PROTEIN. + + Precondition: ``mappings`` must contain at least one annotation with + ``alignment_level`` in {GENOMIC, PROTEIN}. CDNA-only score sets are not + currently supported (``vrs_map`` does not emit CDNA-layer scores); this + function will silently return PROTEIN for them, which would then drop + every CDNA score from the output. Update this function if CDNA scores + become emittable. """ for mapping in mappings: - if mapping.annotation_layer == AnnotationLayer.GENOMIC: + if mapping.alignment_level == AnnotationLayer.GENOMIC: return AnnotationLayer.GENOMIC + return AnnotationLayer.PROTEIN @@ -919,48 +1107,345 @@ def write_scoreset_mapping_to_json( _logger.info("Saving mapping output to %s", output_path) with output_path.open("w") as file: json.dump( - scoreset_mapping.model_dump(exclude_unset=False, exclude_none=True), + scoreset_mapping.model_dump( + mode="json", exclude_unset=False, exclude_none=True + ), file, indent=4, ) return output_path -def save_mapped_output_json( +def _resolve_protein_accession( + target_metadata: TargetGene, + tx_result: TxSelectResult | TxSelectError | None, + _align_result: AlignmentResult + | None, # protein accession comes from tx_result only +) -> str | None: + if isinstance(tx_result, TxSelectResult) and tx_result.np: + return tx_result.np + + accession_id = target_metadata.target_accession_id + if accession_id and accession_id.startswith(("NP", "ENSP")): + return accession_id + + return None + + +def _resolve_cdna_accession( + target_metadata: TargetGene, + tx_result: TxSelectResult | TxSelectError | None, + _align_result: AlignmentResult | None, # cDNA accession comes from tx_result only +) -> str | None: + if isinstance(tx_result, TxSelectResult) and tx_result.nm: + return tx_result.nm + + accession_id = target_metadata.target_accession_id + if accession_id and accession_id.startswith(("NM", "ENST")): + return accession_id + + return None + + +def _resolve_genomic_accession( + target_metadata: TargetGene, + _tx_result: TxSelectResult + | TxSelectError + | None, # genomic accession comes from align_result only + align_result: AlignmentResult | None, +) -> str | None: + if align_result is not None and align_result.chrom: + try: + chrom_id = get_chromosome_identifier(align_result.chrom) + except Exception: + _logger.exception( + "Could not resolve chromosome identifier for %s", align_result.chrom + ) + return None + + if chrom_id and chrom_id.startswith("refseq:"): + return chrom_id[len("refseq:") :] + + return chrom_id + + accession_id = target_metadata.target_accession_id + if accession_id and accession_id.startswith("NC"): + return accession_id + + return None + + +_LAYER_ACCESSION_RESOLVERS = { + AnnotationLayer.PROTEIN: _resolve_protein_accession, + AnnotationLayer.CDNA: _resolve_cdna_accession, + AnnotationLayer.GENOMIC: _resolve_genomic_accession, +} + + +def _reference_accession_for_target_level( + alignment_level: AnnotationLayer, + target_metadata: TargetGene, + tx_result: TxSelectResult | TxSelectError | None, + align_result: AlignmentResult | None, +) -> str | None: + """Pick the most informative RefSeq-style accession describing the reference + sequence that variants at ``alignment_level`` were mapped against. + + Falls back to None when nothing sensible is available; the API column is nullable. + """ + resolver = _LAYER_ACCESSION_RESOLVERS.get(alignment_level) + if resolver is None: + return None + + return resolver(target_metadata, tx_result, align_result) + + +def _build_target_mapping( + target_gene_identifier: str, + target_meta: TargetGene, + alignment_level: AnnotationLayer, + preferred: bool, + tx_result: TxSelectResult | TxSelectError | None, + align_result: AlignmentResult | None, + vrs_version: VrsVersion, + annotations: list[ScoreAnnotation], + protein_align_result: AlignmentResult | None = None, + null_failure_count: int = 0, + near_gap_window: int = NEAR_GAP_WINDOW, +) -> TargetMapping: + """Assemble one target_mappings[] row for a (target, alignment_level) pair. + + Provenance and aggregate counts are populated from the data the pipeline + already has. Alignment QC columns are populated from + ``align_result.alignment_qc`` for GENOMIC rows or + ``protein_align_result.alignment_qc`` for PROTEIN rows. + + Note: CDNA ``TargetMapping`` rows are **not** emitted today — no + ``mapped_score`` carries ``alignment_level=CDNA`` (``vrs_map`` only + produces GENOMIC and PROTEIN scores). The CDNA entry added to + ``reference_sequences`` in ``build_scoreset_mapping`` is purely a + reference-sequence accession record, not a scored layer. + """ + reference_accession = _reference_accession_for_target_level( + alignment_level, target_meta, tx_result, align_result + ) + reference_sequence_id: str | None = None + if reference_accession is not None: + try: + reference_sequence_id = get_vrs_id_from_identifier(reference_accession) + except Exception: + _logger.exception( + "Could not resolve VRS sequence id for %s", reference_accession + ) + + # Pick the alignment that lives in this row's coordinate frame: + # GENOMIC -> BLAT genomic alignment; PROTEIN -> target-protein-to-reference + # alignment from vrs_map. + # No CDNA branch: vrs_map never emits CDNA-layer scores, so this function + # is never called with alignment_level=CDNA in practice. + qc_source: AlignmentResult | None = None + if alignment_level == AnnotationLayer.GENOMIC: + qc_source = align_result + elif alignment_level == AnnotationLayer.PROTEIN: + qc_source = protein_align_result + + tool_parameters: dict[str, Any] | None + # Accession-based targets split into two cases: + # NC_ (chromosome/contig) — no alignment tool was run; no tool_parameters. + # NM_/ENST (transcript) — cdot was used for transcript placement; record its parameters. + # For non-accession-based targets we pull aligner parameters from the BLAT alignment as usual. + accession_id = target_meta.target_accession_id + if accession_id is not None and accession_id.startswith("NC"): + # Direct chromosome/contig accession: the coordinate frame is defined by + # the accession itself — no placement tool was invoked. + tool_parameters = { + "aligner": "direct_contig_accession", + } + elif accession_id is not None: + # Transcript accession (NM_/ENST): cdot was used for transcript placement. + # cdot_data_version is present in aligner_parameters (possibly None) for cdot + # paths after parse_cdot_mapping, so we can read it unconditionally. + cdot_data_version = ( + align_result.aligner_parameters.get("cdot_data_version") + if align_result is not None and align_result.aligner_parameters is not None + else None + ) + tool_parameters = { + "aligner": "cdot_transcript_placement", + "aligner_version": cdot.__version__, + "cdot_url": CDOT_URL, + # Always emit the key; None means "cdot run, version unknown". + "cdot_data_version": cdot_data_version, + } + else: + tool_parameters = ( + qc_source.aligner_parameters if qc_source is not None else None + ) + + qc = qc_source.alignment_qc if qc_source is not None else None + percent_identity = qc.percent_identity if qc is not None else None + alignment_length = qc.alignment_length if qc is not None else None + mismatch_count = qc.mismatch_count if qc is not None else None + gap_count = qc.gap_count if qc is not None else None + alignment_string = qc.alignment_string if qc is not None else None + + # Record the near-gap window alongside the CIGAR so consumers can interpret + # the per-variant ``near_gap`` flag. Stamped on every layer that had its + # flags evaluated against an alignment (any layer with a non-None qc). + alignment_metadata: dict[str, Any] | None = None + if qc is not None and qc.cigar: + alignment_metadata = {"cigar": qc.cigar} + if qc is not None: + alignment_metadata = alignment_metadata or {} + alignment_metadata["near_gap_window"] = near_gap_window + # Always emit this bit so consumers never have to infer it from absence. + alignment_metadata[ + "at_mismatched_locus_evaluated" + ] = not qc.mismatch_positions_unavailable + + alignment_score = qc_source.score if qc_source is not None else None + next_best_alignment_score = ( + qc_source.next_best_score if qc_source is not None else None + ) + + total = len(annotations) + failed = sum(1 for a in annotations if a.post_mapped is None) + warnings = sum( + 1 + for a in annotations + if a.post_mapped is not None and a.error_message is not None + ) + clean = total - failed - warnings + + # near_gap is always evaluable (derived from gap positions in the alignment, + # not per-base sequence content), so count it unconditionally. + # at_mismatched_locus is only reliable when mismatch_positions_unavailable=False; + # when pslx tseq/qseq data was absent we leave it as None (falsy) and omit it + # from the tally so we don't silently under-count. The schema's + # at_mismatched_locus_evaluated flag in alignment_metadata lets consumers + # know they may be seeing only the near_gap portion of this count. + mismatch_unavailable = qc is not None and qc.mismatch_positions_unavailable + alignment_warnings = sum( + 1 + for a in annotations + if a.near_gap or (not mismatch_unavailable and a.at_mismatched_locus) + ) + + return TargetMapping( + target_gene_identifier=target_gene_identifier, + alignment_level=alignment_level, + preferred=preferred, + tool_name="dcd-mapping", + tool_version=dcd_mapping_version, + tool_parameters=tool_parameters, + reference_assembly=align_result.reference_assembly + if align_result is not None + else None, + reference_accession=reference_accession, + reference_sequence_id=reference_sequence_id, + vrs_version=vrs_version, + percent_identity=percent_identity, + alignment_score=alignment_score, + next_best_alignment_score=next_best_alignment_score, + alignment_length=alignment_length, + mismatch_count=mismatch_count, + gap_count=gap_count, + alignment_string=alignment_string, + alignment_metadata=alignment_metadata, + total_variants=total, + variants_mapped_cleanly=clean, + variants_with_mapping_warnings=warnings, + variants_with_alignment_warnings=alignment_warnings, + variants_failed=failed, + variants_failed_pre_layer_selection=null_failure_count + if null_failure_count + else None, + ) + + +def build_scoreset_mapping( metadata: ScoresetMetadata, - mappings: dict[str, list[ScoreAnnotationWithLayer]], + raw_metadata: dict, + mappings: dict[str, list[ScoreAnnotation]], align_results: dict[str, AlignmentResult], tx_output: dict[str, TxSelectResult | TxSelectError | None], gene_info: dict[str, GeneInfo | None], preferred_layer_only: bool = False, - output_path: Path | None = None, -) -> Path: - """Save mapping output for a score set in a JSON file - - :param urn: Score set accession - :param mappings: A dictionary of annotated VRS mappings for each target - :param align_result: A dictionary of alignment information for each target - :param tx_output: A dictionary of transcript output for each target - :param output_path: specific location to save output to. Default to - /urn:mavedb:00000XXX-X-X_mapping_.json - :return: output location + vrs_version: VrsVersion = VrsVersion.V_2, + protein_align_results: dict[str, AlignmentResult | None] | None = None, + near_gap_window: int = NEAR_GAP_WINDOW, +) -> ScoresetMapping: + """Assemble and return a :class:`ScoresetMapping` without writing to disk. + + :param metadata: parsed scoreset metadata (used for pipeline logic) + :param raw_metadata: the full MaveDB API response dict, stored verbatim in + the output ``metadata`` field so no fields are lost + :param mappings: annotated VRS mappings per target. **Mutated in place** — + ``_stamp_alignment_locus_flags`` sets ``at_mismatched_locus`` and + ``near_gap`` on each ``ScoreAnnotation``. Do not pass the same dict to + two successive calls if you need reproducible per-call results; deep-copy + it first. + :param align_results: alignment results per target (genomic frame, from BLAT) + :param tx_output: transcript selection results per target + :param gene_info: gene info per target + :param preferred_layer_only: if True, only emit the preferred annotation layer + :param vrs_version: VRS schema version for this run + :param protein_align_results: per-target target-protein-to-reference-protein + alignments (from vrs_map). Used to flag protein-layer variants at + mismatched/near-gap loci and as the QC source for PROTEIN target_mappings + rows. Pass ``None`` if you don't have them. + :param near_gap_window: half-width (in reference bases) of the window used to + flag variants as ``near_gap``. Defaults to :data:`NEAR_GAP_WINDOW` (5). + Increase for a more conservative call set; decrease to restrict to + immediately adjacent variants. + :return: fully assembled ScoresetMapping """ + run_mapped_date_utc = datetime.datetime.now(tz=datetime.UTC) # set preferred layers for each target, to allow a mix of coding and noncoding targets reference_sequences: dict[str, TargetAnnotation] = {} mapped_scores: list[ScoreAnnotation] = [] + target_mappings: list[TargetMapping] = [] for target_gene in mappings: + # Stamp per-variant locus flags first so the warning counts in + # TargetMapping and the emitted ScoreAnnotations are consistent. + protein_align_for_target = (protein_align_results or {}).get(target_gene) + genomic_align_for_target = _align_result_for_target( + target_gene, metadata, align_results + ) + _stamp_alignment_locus_flags( + mappings[target_gene], + genomic_align_for_target, + AnnotationLayer.GENOMIC, + near_gap_window, + ) + _stamp_alignment_locus_flags( + mappings[target_gene], + protein_align_for_target, + AnnotationLayer.PROTEIN, + near_gap_window, + ) + + # preferred_layer_for_target is the single "best" layer for this target + # (GENOMIC when available, else PROTEIN). It drives both which + # TargetMapping row gets preferred=True and where completely-failed + # variants (annotation_layer=None) are attributed. + preferred_layer_for_target = _pick_preferred_genomic_or_protein_layer( + mappings[target_gene] + ) if preferred_layer_only: - preferred_layers = { - _set_scoreset_layer(metadata.urn, mappings[target_gene]), - } + preferred_layers = {preferred_layer_for_target} else: preferred_layers = { - mapping.annotation_layer for mapping in mappings[target_gene] + mapping.alignment_level for mapping in mappings[target_gene] } # use target gene name in reference sequence dictionary, rather than the label, which differs between score sets target_gene_name = metadata.target_genes[target_gene].target_gene_name + # sometimes Nonetype layers show up in preferred layers dict; remove these + # before constructing TargetAnnotation (which requires valid AnnotationLayer keys). + preferred_layers.discard(None) + reference_sequences[target_gene_name] = TargetAnnotation( gene_info=gene_info.get(target_gene), layers={ @@ -971,8 +1456,6 @@ def save_mapped_output_json( for layer in preferred_layers }, ) - # sometimes Nonetype layers show up in preferred layers dict; remove these - preferred_layers.discard(None) for layer in preferred_layers: reference_sequences[target_gene_name].layers[layer][ "computed_reference_sequence" @@ -985,7 +1468,7 @@ def save_mapped_output_json( metadata.target_genes[target_gene], layer, tx_output[target_gene], - align_results[target_gene], + genomic_align_for_target, ) # if genomic layer, not accession-based, and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict @@ -1006,31 +1489,152 @@ def save_mapped_output_json( } for m in mappings[target_gene]: - if m.pre_mapped is None: - mapped_scores.append(ScoreAnnotation(**m.model_dump())) - elif m.annotation_layer in preferred_layers: - # drop annotation layer from mapping object - mapped_scores.append(ScoreAnnotation(**m.model_dump())) - - # drop Nonetype reference sequences - for target_gene in reference_sequences: - for layer in list(reference_sequences[target_gene].layers.keys()): - if ( - reference_sequences[target_gene].layers[layer][ - "mapped_reference_sequence" - ] - is None - and reference_sequences[target_gene].layers[layer][ - "computed_reference_sequence" - ] - is None - ) or layer is None: - del reference_sequences[target_gene].layers[layer] - - output = ScoresetMapping( - metadata=metadata.model_dump(), + if m.alignment_level is None and m.pre_mapped is None: + # Completely-failed variant — vrs_map could not determine a layer. + # Re-attribute to the preferred layer so every mapped_score has a + # parent TargetMapping row (the API joins on alignment_level). + score_dict = m.model_dump() + score_dict["alignment_level"] = preferred_layer_for_target + score_dict["target_gene_identifier"] = target_gene_name + mapped_scores.append(ScoreAnnotation(**score_dict)) + elif m.alignment_level in preferred_layers: + score_dict = m.model_dump() + score_dict["target_gene_identifier"] = target_gene_name + mapped_scores.append(ScoreAnnotation(**score_dict)) + + # Provenance/QC: emit one TargetMapping per (target, alignment_level) that + # actually produced variants in this run. The (target_gene_identifier, + # alignment_level) pair must be unique per run; the API uses it to attribute + # each mapped_variant to its source row via target_gene_mapping_id. + # + # Completely-failed variants (annotation_layer=None) were re-attributed to + # preferred_layer_for_target in mapped_scores above; include them in that + # layer's TargetMapping count so total_variants is accurate. + null_failures = [ + m + for m in mappings[target_gene] + if m.alignment_level is None and m.pre_mapped is None + ] + align_result_for_target = genomic_align_for_target + emitted_mappings = [ + m for m in mappings[target_gene] if m.alignment_level in preferred_layers + ] + + # Group by alignment_level. + layers_seen: dict = {} + for m in emitted_mappings: + layers_seen.setdefault(m.alignment_level, []).append(m) + + for layer, layer_annotations in layers_seen.items(): + if layer is None: + continue + + # Defensive: coerce raw string values (e.g. "g") to AnnotationLayer. + if not isinstance(layer, AnnotationLayer): + try: + layer = AnnotationLayer(layer) + except ValueError: + _logger.warning( + "Skipping target_mappings row for unknown annotation layer %r", + layer, + ) + continue + + # Null-layer failures are attributed to the preferred layer; fold them + # into that layer's annotation list so variant counts are correct. + annotations_for_tm = ( + list(layer_annotations) + null_failures + if layer == preferred_layer_for_target + else layer_annotations + ) + target_mappings.append( + _build_target_mapping( + target_gene_identifier=target_gene_name, + target_meta=metadata.target_genes[target_gene], + alignment_level=layer, + preferred=(layer == preferred_layer_for_target), + tx_result=tx_output.get(target_gene), + align_result=align_result_for_target, + vrs_version=vrs_version, + annotations=annotations_for_tm, + protein_align_result=protein_align_for_target, + null_failure_count=len(null_failures) + if layer == preferred_layer_for_target + else 0, + near_gap_window=near_gap_window, + ) + ) + + # Drop layers where both reference sequence entries are None, and any None-keyed + # layers. Moved outside the per-target loop to avoid O(n²) scans and eliminate + # the variable-shadowing hazard (the inner loop previously reused `target_gene`). + for tgt_name in reference_sequences: + for layer in list(reference_sequences[tgt_name].layers.keys()): + if ( + reference_sequences[tgt_name].layers[layer]["mapped_reference_sequence"] + is None + and reference_sequences[tgt_name].layers[layer][ + "computed_reference_sequence" + ] + is None + ) or layer is None: + del reference_sequences[tgt_name].layers[layer] + + # Runtime invariant: every mapped_score must be attributable to a TargetMapping + # row via (target_gene_identifier, alignment_level). If any orphaned keys exist + # the API join would silently drop those scores, undermining the audit guarantee. + # Using an explicit RuntimeError rather than `assert` so this is never compiled + # away with `-O` optimizations in production deployments. + tm_keys = { + (tm.target_gene_identifier, tm.alignment_level) for tm in target_mappings + } + orphaned = { + (ms.target_gene_identifier, ms.alignment_level) + for ms in mapped_scores + if ms.alignment_level is not None + } - tm_keys + if orphaned: + raise RuntimeError( + f"build_scoreset_mapping produced {len(orphaned)} orphaned mapped_score " # noqa: EM102 + f"key(s) with no corresponding TargetMapping row: {orphaned!r}. " + "This is a pipeline invariant violation — every mapped_score must have " + "a parent TargetMapping so the API join is unambiguous." + ) + + return ScoresetMapping( + metadata=raw_metadata, + mapped_date=run_mapped_date_utc, reference_sequences=reference_sequences, mapped_scores=mapped_scores, + target_mappings=target_mappings, ) + +def save_mapped_output_json( + metadata: ScoresetMetadata, + raw_metadata: dict, + mappings: dict[str, list[ScoreAnnotation]], + align_results: dict[str, AlignmentResult], + tx_output: dict[str, TxSelectResult | TxSelectError | None], + gene_info: dict[str, GeneInfo | None], + preferred_layer_only: bool = False, + output_path: Path | None = None, + vrs_version: VrsVersion = VrsVersion.V_2, + protein_align_results: dict[str, AlignmentResult | None] | None = None, +) -> Path: + """Build and write mapping output for a score set to a JSON file. + + :return: output file path + """ + output = build_scoreset_mapping( + metadata=metadata, + raw_metadata=raw_metadata, + mappings=mappings, + align_results=align_results, + tx_output=tx_output, + gene_info=gene_info, + preferred_layer_only=preferred_layer_only, + vrs_version=vrs_version, + protein_align_results=protein_align_results, + ) return write_scoreset_mapping_to_json(metadata.urn, output, output_path) diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 8216bf2..b389538 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -32,12 +32,14 @@ check_uta, ) from dcd_mapping.mavedb_data import ( + get_raw_scoreset_metadata, get_scoreset_metadata, get_scoreset_records, patch_target_sequence_type, with_mavedb_score_set, ) from dcd_mapping.schemas import ( + AlignmentResult, ScoreRow, ScoresetMapping, ScoresetMetadata, @@ -151,6 +153,7 @@ async def _check_data_prereqs(silent: bool) -> None: async def map_scoreset( metadata: ScoresetMetadata, + raw_metadata: dict, records: dict[str, list[ScoreRow]], output_path: Path | None = None, vrs_version: VrsVersion = VrsVersion.V_2, @@ -229,17 +232,20 @@ async def map_scoreset( _emit_info("Reference selection complete.", silent) _emit_info("Mapping to VRS...", silent) - vrs_results = {} gene_info = {} + vrs_results = {} + protein_align_results: dict[str, AlignmentResult | None] = {} try: for target_gene in metadata.target_genes: - vrs_results[target_gene] = vrs_map( + vrs_map_result = vrs_map( metadata=metadata.target_genes[target_gene], align_result=alignment_results[target_gene], records=records[target_gene], transcript=transcripts[target_gene], silent=silent, ) + vrs_results[target_gene] = vrs_map_result.mappings + protein_align_results[target_gene] = vrs_map_result.protein_align_result gene_info[target_gene] = await compute_target_gene_info( target_gene, @@ -327,12 +333,15 @@ async def map_scoreset( try: final_output = save_mapped_output_json( metadata, + raw_metadata, annotated_vrs_results, alignment_results, transcripts, gene_info, prefer_genomic, output_path, + vrs_version=vrs_version, + protein_align_results=protein_align_results, ) except Exception as e: _logger.exception( @@ -372,6 +381,7 @@ async def map_scoreset_urn( :param silent: if True, suppress console information output """ try: + raw_metadata = get_raw_scoreset_metadata(urn, store_path) metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, silent, store_path) metadata = patch_target_sequence_type(metadata, records, force=False) @@ -407,5 +417,11 @@ async def map_scoreset_urn( return await map_scoreset( - metadata, records, output_path, vrs_version, prefer_genomic, silent + metadata, + raw_metadata, + records, + output_path, + vrs_version, + prefer_genomic, + silent, ) diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 75f0be6..b0e6889 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -1,11 +1,19 @@ """Provide class definitions for commonly-used information objects.""" import datetime from enum import Enum -from typing import Any, Literal +from typing import Any, Literal, NamedTuple from cool_seq_tool.schemas import AnnotationLayer, Strand, TranscriptPriority from ga4gh.vrs._internal.models import Allele, Haplotype -from pydantic import BaseModel, ConfigDict, Field, StrictBool, StrictInt, StrictStr +from pydantic import ( + BaseModel, + ConfigDict, + Field, + StrictBool, + StrictInt, + StrictStr, + field_serializer, +) from dcd_mapping import vrs_v1_schemas from dcd_mapping.version import dcd_mapping_version @@ -76,6 +84,61 @@ class SequenceRange(BaseModel): end: int +class AlignmentQc(BaseModel): + """Aggregate QC for a BLAT pslx alignment. + + Mismatch positions and gap intervals are kept on the in-memory model so + downstream code can flag individual variants that fall on a mismatched + base or near a gap, but they are excluded from JSON serialization to keep + payloads compact -- consumers reconstruct per-base detail from the CIGAR + or use the per-variant flags emitted on each :class:`ScoreAnnotation`. + + **cDNA→genome BLAT limitation:** BLAT's pslx format is supposed to include + per-block target and query sequences (tseqs/qseqs), but in practice + Biopython's PSL parser may leave aligned-block bases as undefined when + the pslx output is incomplete or when blocks fall outside the portion of + the database sequence that BLAT includes in its output. When this occurs, + ``mismatch_count`` (derived from ``Alignment.counts()``, which is purely + coordinate-based) remains accurate, but per-position detail cannot be + extracted. The ``mismatch_positions_unavailable`` flag is set to ``True`` + in that case; consumers should treat ``at_mismatched_locus`` as ``None`` + (not evaluated) for affected rows rather than trusting a ``False`` value. + + A future improvement would be to fetch target slices directly via SeqRepo + when per-base detail is required, decoupling per-position accuracy from + what pslx happens to include. + + **Downstream effect** — ``annotate._stamp_alignment_locus_flags`` is the + sole consumer of ``mismatch_positions`` and ``gap_intervals``. When + ``mismatch_positions_unavailable`` is ``True`` it skips the + ``at_mismatched_locus`` assignment entirely (leaving it ``None``) so + that a ``False`` value is never incorrectly written for a variant whose + position could not actually be checked against the alignment. + """ + + percent_identity: float | None = None + alignment_length: int + mismatch_count: int + gap_count: int + cigar: StrictStr | None = None + # Compact Biopython alignment visualization. Consecutive all-gap display + # blocks (intronic regions rendered as '?' characters in cDNA→genome + # alignments) are collapsed to a single summary line of the form + # ``... [N bp gap: chrom:start-end] ...`` by ``_compact_alignment_string`` + # in align.py. This keeps the string proportional to exon count rather + # than genomic span. + alignment_string: StrictStr | None = None + # True when per-base sequence content was unavailable during QC computation + # (e.g. BLAT pslx did not populate tseqs/qseqs). mismatch_count is still + # correct in that case, but mismatch_positions is empty and at_mismatched_locus + # should be treated as None (not evaluated) for all variants in this target/level. + mismatch_positions_unavailable: bool = False + # In-memory only -- not serialized. Used to compute per-variant flags for variants that + # fall on a mismatched base or near a gap. + mismatch_positions: list[int] = Field(default_factory=list, exclude=True) + gap_intervals: list[tuple[int, int]] = Field(default_factory=list, exclude=True) + + class GeneLocation(BaseModel): """Gene location info, gathered from normalizer result. Likely to be incomplete.""" @@ -109,11 +172,28 @@ class AlignmentResult(BaseModel): chrom: str | None = None strand: Strand | None = None coverage: float | None = None - ident_pct: float | None = None + percent_identity: float | None = None query_range: SequenceRange query_subranges: list[SequenceRange] hit_range: SequenceRange hit_subranges: list[SequenceRange] + # BLAT PSL score (matches - misMatches - qNumInsert - tNumInsert) for the + # winning HSP. Only set for BLAT-derived results. + score: float | None = None + # BLAT PSL score (same units as ``score``) of the next-best HSP considered + # during selection. Presence signals that an ambiguity check was performed; + # None means no ambiguity check was run (not that there was no ambiguity). + # For instance, when returning results for protein->protein alignments, BLAT + # returns only the highest scoring HSP. + next_best_score: float | None = None + # Structured per-base QC derived from the pslx alignment (populated for + # BLAT-sourced alignments; None for cdot-fetched results). + alignment_qc: AlignmentQc | None = None + # Aligner invocation parameters recorded for reproducibility. + aligner_parameters: dict[str, Any] | None = None + # Human reference genome assembly used for this alignment (e.g. "GRCh38"). + # None for alignments that have no genomic coordinate frame (protein-vs-protein). + reference_assembly: StrictStr | None = None class TranscriptDescription(BaseModel): @@ -165,7 +245,7 @@ class MappedScore(BaseModel): accession_id: StrictStr score: str | None - annotation_layer: AnnotationLayer | None = None + alignment_level: AnnotationLayer | None = None pre_mapped: Allele | Haplotype | None = None post_mapped: Allele | Haplotype | None = None error_message: str | None = None @@ -181,11 +261,22 @@ class ScoreAnnotation(BaseModel): relation: Literal[ "SO:is_homologous_to" ] = "SO:is_homologous_to" # TODO this should probably be None if pre_mapped is false? + # Identifies which target gene this score belongs to. Required for the API + # to join mapped_scores → target_mappings unambiguously when a score set has + # more than one target gene (joining on alignment_level alone is ambiguous in + # that case). Nullable for backwards compatibility with older pipeline outputs. + target_gene_identifier: StrictStr | None = None pre_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype | None = None post_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype | None = None vrs_version: VrsVersion | None = None score: float | None = None error_message: str | None = None + alignment_level: AnnotationLayer | None = None + # Per-variant alignment-locus flags. None means "not evaluated" (e.g. no + # genomic alignment available, or non-genomic annotation layer); True/False + # mean the flag was computed against this run's alignment. + at_mismatched_locus: bool | None = None + near_gap: bool | None = None class GeneInfo(BaseModel): @@ -231,24 +322,146 @@ class TargetAnnotation(BaseModel): ] = {} -class ScoreAnnotationWithLayer(ScoreAnnotation): - """Couple annotations with an easily-computable definition of the annotation layer - from which they originate. - - Used for filtering individual annotations just before saving the final JSON product. - """ - - annotation_layer: AnnotationLayer | None = None - - class ScoresetMapping(BaseModel): """Provide all mapped scores for a scoreset.""" metadata: Any # TODO get exact MaveDB metadata structure? - dcd_mapping_version: str = Field(default=dcd_mapping_version) - mapped_date_utc: str = Field( - default=datetime.datetime.now(tz=datetime.UTC).isoformat() - ) + mapped_date: datetime.datetime | None = None reference_sequences: dict[str, TargetAnnotation] | None = None mapped_scores: list[ScoreAnnotation] | None = None + target_mappings: list["TargetMapping"] | None = None error_message: str | None = None + + @field_serializer("mapped_date") + def _serialize_mapped_date(self, value: datetime.datetime | None) -> str | None: + """Serialize mapped_date as an ISO 8601 string so it is always JSON-safe.""" + return value.isoformat() if value is not None else None + + +class TargetMapping(BaseModel): + """Per-(target, alignment_level) provenance and QC block. + + Field names mirror the corresponding columns on `target_gene_mappings` in the + MaveDB API so the API worker can deserialize directly with minimal transformation. + Any aligner-specific structured details go in ``tool_parameters`` / + ``alignment_metadata`` (JSONB on the API side). + + ``reference_assembly`` is a top-level column (not nested in ``tool_parameters``) + because it describes the coordinate frame of the mapping result, not aligner + configuration. It is ``None`` for alignments with no genomic frame (e.g. + protein-vs-protein). + + ``tool_parameters`` shape per aligner + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BLAT genomic (sequence-based, nucleotide or protein-vs-genome):: + + { + "aligner": "blat", + "aligner_version": "", + "min_score": , + "out_format": "", + "target_args": "", + } + + BLAT protein-vs-protein (sequence-based, protein annotation layer):: + + { + "aligner": "blat", + "aligner_version": "", + "min_score": , + "out_format": "", + "target_args": "-q=prot -t=prot", + } + + Direct contig accession (accession-based):: + + { + "aligner": "direct_contig_accession", + } + + cdot transcript placement (accession-based):: + + { + "aligner": "cdot_transcript_placement", + "aligner_version": "", + "cdot_url": "", + "cdot_data_version": "", + } + + ``cdot_data_version`` is ``null`` when the cdot REST response did not + include a ``cdot_data_version`` field; its presence (even as ``null``) + distinguishes "cdot run, version unknown" from "not a cdot run". + """ + + # Identity + target_gene_identifier: StrictStr + # Single-char cool-seq-tool AnnotationLayer value ("p", "c", "g"). + alignment_level: AnnotationLayer + preferred: StrictBool = False + + # Provenance (required on the API side: tool_name, tool_version) + tool_name: StrictStr = "dcd-mapping" + tool_version: StrictStr = dcd_mapping_version + tool_parameters: dict[str, Any] | None = None + # Human reference genome assembly (e.g. "GRCh38"). None for protein-vs-protein alignments. + reference_assembly: StrictStr | None = None + reference_accession: StrictStr | None = None + reference_sequence_id: StrictStr | None = None + vrs_version: VrsVersion | None = None + + # Alignment QC -- all optional; omit (leave None) fields the aligner doesn't produce. + percent_identity: float | None = None + alignment_score: float | None = None + next_best_alignment_score: float | None = None + alignment_length: int | None = None + mismatch_count: int | None = None + gap_count: int | None = None + # Compact alignment visualization (see AlignmentQc.alignment_string). + # Intronic gap runs are collapsed to ``... [N bp gap: chrom:start-end] ...`` + # summaries so the string length is proportional to exon count, not + # genomic span. None for protein-layer or cdot-derived alignments. + alignment_string: StrictStr | None = None + # Structured per-alignment metadata payload (JSONB on the API side). + # Keys present when available: + # "cigar" — CIGAR string for the winning HSP. + # "near_gap_window" — half-width (in ref bases) of the window + # used to flag ``near_gap`` on each variant. + # "at_mismatched_locus_evaluated" — False when per-base sequence content + # was unavailable (BLAT pslx omitted + # tseqs/qseqs); treat ``at_mismatched_locus`` + # as None (not evaluated) for all variants in + # this target/level. True otherwise. + alignment_metadata: dict[str, Any] | None = None + + # Annotation QC -- totals for this target x level. The two warning columns + # count disjoint things: + # * variants_with_mapping_warnings: the mapper itself attached an + # ``error_message`` despite producing a ``post_mapped`` value (e.g. + # fell back to RLE, ambiguous reference allele). + # * variants_with_alignment_warnings: the variant's reference position + # fell on a mismatched locus or near a gap in the underlying alignment + # (``at_mismatched_locus`` or ``near_gap`` was True). Mapping itself + # succeeded cleanly; this is purely about alignment context. + # A single variant can be in neither, either, or both. + total_variants: int | None = None + # Count of variants where post_mapped is not None AND error_message is None. + # Alignment warnings (at_mismatched_locus, near_gap) are counted separately in + # variants_with_alignment_warnings and do NOT reduce this count; a variant can + # be "cleanly mapped" and still sit at a mismatched locus or near a gap. + variants_mapped_cleanly: int | None = None + variants_with_mapping_warnings: int | None = None + variants_with_alignment_warnings: int | None = None + variants_failed: int | None = None + # Count of variants that failed before any annotation layer was determined + # (annotation_layer=None, pre_mapped=None in the VRS output). These are + # re-attributed to the preferred layer's variants_failed total so the join + # invariant holds, but keeping them separate lets consumers see a clean + # per-layer failure count on all other (non-preferred) rows. + variants_failed_pre_layer_selection: int | None = None + + +class VrsMapResult(NamedTuple): + """Provide mappings for an individual experiment score.""" + + mappings: list["MappedScore"] + protein_align_result: "AlignmentResult | None" diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index af17f8d..eb7364d 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -44,6 +44,7 @@ TargetSequenceType, TargetType, TxSelectResult, + VrsMapResult, ) from dcd_mapping.transcripts import TxSelectError @@ -472,7 +473,7 @@ def _map_protein_coding_pro( return MappedScore( accession_id=row.accession, score=row.score, - annotation_layer=AnnotationLayer.PROTEIN, + alignment_level=AnnotationLayer.PROTEIN, pre_mapped=vrs_variation, post_mapped=vrs_variation, ) @@ -527,7 +528,7 @@ def _map_protein_coding_pro( return MappedScore( accession_id=row.accession, score=row.score, - annotation_layer=AnnotationLayer.PROTEIN, + alignment_level=AnnotationLayer.PROTEIN, pre_mapped=pre_mapped_protein, error_message=f"{type(e).__name__}: {e}", ) @@ -535,7 +536,7 @@ def _map_protein_coding_pro( return MappedScore( accession_id=row.accession, score=row.score, - annotation_layer=AnnotationLayer.PROTEIN, + alignment_level=AnnotationLayer.PROTEIN, pre_mapped=pre_mapped_protein, post_mapped=post_mapped_protein, ) @@ -670,7 +671,7 @@ def _map_genomic( return MappedScore( accession_id=row.accession, score=row.score, - annotation_layer=AnnotationLayer.GENOMIC, + alignment_level=AnnotationLayer.GENOMIC, pre_mapped=pre_mapped_genomic, error_message=f"{type(e).__name__}: {e}", ) @@ -725,7 +726,7 @@ def _map_genomic( return MappedScore( accession_id=row.accession, score=row.score, - annotation_layer=AnnotationLayer.GENOMIC, + alignment_level=AnnotationLayer.GENOMIC, pre_mapped=pre_mapped_genomic, error_message=f"{type(e).__name__}: {e}", ) @@ -736,7 +737,7 @@ def _map_genomic( return MappedScore( accession_id=row.accession, score=row.score, - annotation_layer=AnnotationLayer.GENOMIC, + alignment_level=AnnotationLayer.GENOMIC, pre_mapped=pre_mapped_genomic, post_mapped=post_mapped_genomic, ) @@ -801,14 +802,17 @@ def _map_protein_coding( transcript: TxSelectResult | TxSelectError, align_result: AlignmentResult, silent: bool, -) -> list[MappedScore]: - """Perform mapping on protein coding experiment results +) -> tuple[list[MappedScore], AlignmentResult | None]: + """Perform mapping on protein coding experiment results. :param metadata: Target gene metadata from MaveDB API :param records: The list of MAVE variants in a given score set :param transcript: The transcript data for a score set, or an error message describing why an expected transcript is missing - :param align_results: The alignment data for a score set - :return: A list of mappings + :param align_result: The alignment data for a score set + :return: ``(mappings, protein_align_result)``. The target-protein-to-reference-protein + alignment is surfaced so downstream annotation can flag protein-layer + variants at mismatched/near-gap loci using its QC block; pipelines that + don't need it can ignore the second element. """ if metadata.target_sequence_type == TargetSequenceType.DNA: sequence = str( @@ -821,11 +825,24 @@ def _map_protein_coding( psequence_id = gsequence_id = store_sequence(sequence) # align protein sequence to selected reference protein sequence to get offsets and gaps - protein_align_result = None + protein_align_result: AlignmentResult | None = None if isinstance(transcript, TxSelectResult): protein_align_result = align_target_to_protein( sequence, transcript.sequence, silent ) + _logger.info( + "Protein-to-protein alignment produced for %s (ref protein: %s). " + "This alignment will be used for pro-layer variants in place of the genomic alignment.", + metadata.target_gene_name, + transcript.np, + ) + else: + _logger.warning( + "Skipping protein-to-protein alignment for %s: no valid transcript selected (%s). " + "Pro-layer variants will not be mapped.", + metadata.target_gene_name, + transcript, + ) variations: list[MappedScore] = [] for row in records: @@ -872,7 +889,8 @@ def _map_protein_coding( variations.append(hgvs_pro_mappings) if hgvs_nt_mappings: variations.append(hgvs_nt_mappings) - return variations + + return variations, protein_align_result def _map_regulatory_noncoding( @@ -1042,7 +1060,7 @@ def vrs_map( records: list[ScoreRow], transcript: TxSelectResult | TxSelectError | None = None, silent: bool = True, -) -> list[MappedScore]: +) -> VrsMapResult: """Given a description of a MAVE scoreset and an aligned transcript, generate the corresponding VRS objects. @@ -1051,20 +1069,32 @@ def vrs_map( :param records: scoreset records :param transcript: output of transcript selection process :param silent: If true, suppress console output - :return: A list of mapping results + :return: :class:`~dcd_mapping.schemas.VrsMapResult` with ``mappings`` and + ``protein_align_result``. ``protein_align_result`` is the + target-protein-to-reference-protein alignment used for protein-level + mapping, surfaced so downstream annotation can flag protein-layer + variants at mismatched/near-gap loci. It is ``None`` for accession-based + and regulatory/non-coding targets. """ if metadata.target_accession_id: - return _map_accession(metadata, records, align_result, transcript) + return VrsMapResult( + mappings=_map_accession(metadata, records, align_result, transcript), + protein_align_result=None, + ) + if metadata.target_gene_category == TargetType.PROTEIN_CODING and transcript: - return _map_protein_coding( + mappings, protein_align_result = _map_protein_coding( metadata, records, transcript, align_result, silent, ) - return _map_regulatory_noncoding( - metadata, - records, - align_result, + return VrsMapResult( + mappings=mappings, protein_align_result=protein_align_result + ) + + return VrsMapResult( + mappings=_map_regulatory_noncoding(metadata, records, align_result), + protein_align_result=None, ) diff --git a/tests/fixtures/align_result.json b/tests/fixtures/align_result.json index 6b2d086..b570171 100644 --- a/tests/fixtures/align_result.json +++ b/tests/fixtures/align_result.json @@ -4,7 +4,7 @@ "chrom": "chr11", "strand": 1, "coverage": 100.0, - "ident_pct": 77.45098039215686, + "percent_identity": 77.45098039215686, "query_range": { "start": 0, "end": 102 @@ -40,7 +40,7 @@ "chrom": "chr3", "strand": 1, "coverage": 100.0, - "ident_pct": 100.0, + "percent_identity": 100.0, "query_range": { "start": 0, "end": 1047 @@ -100,7 +100,7 @@ "chrom": "chr22", "strand": -1, "coverage": 100.0, - "ident_pct": 99.9071494893222, + "percent_identity": 99.9071494893222, "query_range": { "start": 0, "end": 360 @@ -184,7 +184,7 @@ "chrom": "chr20", "strand": 1, "coverage": 100.0, - "ident_pct": 99.86666666666666, + "percent_identity": 99.86666666666666, "query_range": { "start": 0, "end": 750 @@ -252,7 +252,7 @@ "chrom": "chr11", "strand": 1, "coverage": 100.0, - "ident_pct": 100.0, + "percent_identity": 100.0, "query_range": { "start": 0, "end": 187 @@ -280,7 +280,7 @@ "chrom": "chr16", "strand": 1, "coverage": 100.0, - "ident_pct": 100.0, + "percent_identity": 100.0, "query_range": { "start": 0, "end": 477 @@ -348,7 +348,7 @@ "chrom": "chr3", "strand": -1, "coverage": 100.0, - "ident_pct": 100.0, + "percent_identity": 100.0, "query_range": { "start": 0, "end": 12 @@ -376,7 +376,7 @@ "chrom": "chr3", "strand": -1, "coverage": 100.0, - "ident_pct": 100.0, + "percent_identity": 100.0, "query_range": { "start": 0, "end": 117 @@ -412,7 +412,7 @@ "chrom": "chr17", "strand": -1, "coverage": 99.83079526226734, - "ident_pct": 99.91525423728814, + "percent_identity": 99.91525423728814, "query_range": { "start": 0, "end": 1180 @@ -512,7 +512,7 @@ "chrom": "chr17", "strand": -1, "coverage": 100.0, - "ident_pct": 100.0, + "percent_identity": 100.0, "query_range": { "start": 0, "end": 300 @@ -564,7 +564,7 @@ "chrom": "chr2", "strand": -1, "coverage": 97.05882352941177, - "ident_pct": 100.0, + "percent_identity": 100.0, "query_range": { "start": 9, "end": 306 diff --git a/tests/test_align.py b/tests/test_align.py index 468d719..0dda37c 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -6,18 +6,332 @@ """ +from unittest.mock import patch + +import numpy as np import pytest +from Bio import Align as BioAlign +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +from dcd_mapping.align import ( + _blat_score, + _build_alignment_qc, + _compact_alignment_string, + _get_best_hsp, + align, +) +from dcd_mapping.exceptions import AlignmentError +from dcd_mapping.schemas import ( + AlignmentQc, + AlignmentResult, + ScoresetMetadata, + TargetGene, + TargetType, +) + + +def _make_aln( + query_seq: str, + target_seq: str, + q_start: int = 0, + t_start: int = 0, + coords: "np.ndarray | None" = None, +) -> BioAlign.Alignment: + """Build a minimal Bio.Align.Alignment for unit-testing _build_alignment_qc.""" + n = len(query_seq) + t_len = max(t_start + n + 10, len(target_seq) + 10) + # Pad target so sequence slicing is always in bounds + padded_target = target_seq.ljust(t_len, "N") + target = SeqRecord(Seq(padded_target), id="chr1") + query = SeqRecord(Seq(query_seq), id="query") + if coords is None: + coords = np.array([[t_start, t_start + n], [q_start, q_start + n]]) + return BioAlign.Alignment(sequences=[target, query], coordinates=coords) + + +class TestBuildAlignmentQc: + def test_exact_match(self): + seq = "ACGTACGTAC" + aln = _make_aln(seq, seq) + qc = _build_alignment_qc(aln) + assert isinstance(qc, AlignmentQc) + assert qc.mismatch_count == 0 + assert qc.gap_count == 0 + assert qc.mismatch_positions == [] + assert qc.mismatch_positions_unavailable is False + assert qc.gap_intervals == [] + assert qc.cigar == "10M" + assert qc.percent_identity == pytest.approx(100.0) + + def test_single_mismatch(self): + query = "ACGTACGTAC" + # position 3: T->G mismatch + target = "ACGGACGTAC" + aln = _make_aln(query, target) + qc = _build_alignment_qc(aln) + assert qc.mismatch_count == 1 + assert qc.mismatch_positions == [3] + + def test_gap_in_ref(self): + """A gap appears when target has fewer bases than query (insertion in query).""" + target_seq = "ACGTAACGTA" # 10 bases + query_seq = "ACGTAGGACGTA" # 12 bases (2 extra at position 5-7) + target = SeqRecord(Seq(target_seq + "N" * 10), id="chr1") + query = SeqRecord(Seq(query_seq), id="query") + coords = np.array([[0, 5, 5, 10], [0, 5, 7, 12]]) + aln = BioAlign.Alignment(sequences=[target, query], coordinates=coords) + qc = _build_alignment_qc(aln) + assert qc.gap_count == 1 + assert len(qc.gap_intervals) == 1 + + def test_gap_in_query(self): + """A gap appears when query has fewer bases than target (deletion in query).""" + target_seq = "ACGTANNNACGTA" # 13 bases + query_seq = "ACGTAACGTA" # 10 bases + target = SeqRecord(Seq(target_seq + "N" * 10), id="chr1") + query = SeqRecord(Seq(query_seq), id="query") + coords = np.array([[0, 5, 8, 13], [0, 5, 5, 10]]) + aln = BioAlign.Alignment(sequences=[target, query], coordinates=coords) + qc = _build_alignment_qc(aln) + assert qc.gap_count == 1 + assert len(qc.gap_intervals) == 1 + gs, ge = qc.gap_intervals[0] + assert ge - gs == 3 + + def test_multi_block_alignment(self): + """Two aligned blocks with a gap between them.""" + seq = "ACGTAACGTA" # 10-base query + target_seq = seq[:5] + "NNN" + seq[5:] # 13-base target with 3-base insertion + target = SeqRecord(Seq(target_seq + "N" * 10), id="chr1") + query = SeqRecord(Seq(seq), id="query") + coords = np.array([[0, 5, 8, 13], [0, 5, 5, 10]]) + aln = BioAlign.Alignment(sequences=[target, query], coordinates=coords) + qc = _build_alignment_qc(aln) + # alignment_length = target span = coords[0][-1] - coords[0][0] = 13 + assert qc.alignment_length == 13 + assert qc.gap_count == 1 + assert qc.mismatch_count == 0 + + def test_undefined_sequence_fallback(self): + """_build_alignment_qc falls back gracefully when sequence content is unavailable. + + Biopython's PSL parser leaves SeqRecords with undefined content when BLAT's + pslx output omits per-block sequences (common in protein-vs-protein mode). + Slicing raises UndefinedSequenceError; the function should fall back to an + empty mismatch_positions list rather than crashing, and must set + mismatch_positions_unavailable=True so callers know at_mismatched_locus is + unreliable when mismatch_count > 0. + """ + target = SeqRecord(Seq(None, length=20), id="reference") + query = SeqRecord(Seq(None, length=10), id="query") + coords = np.array([[0, 10], [0, 10]]) + aln = BioAlign.Alignment(sequences=[target, query], coordinates=coords) + + qc = _build_alignment_qc(aln) + + assert qc.mismatch_positions == [] + assert qc.mismatch_positions_unavailable is True + # gap_intervals and gap_count are coordinate-driven (no sequence needed) + assert qc.gap_count == 0 + + def test_mismatch_positions_after_sequence_attachment(self): + """Attaching sequences to an undefined alignment enables per-base mismatch detection. + + This mirrors what align_target_to_protein does before calling _build_alignment_qc: + the PSL gives coordinates; we supply the original sequences so that slicing works. + """ + reference = "MKALVDQSLR" + query_seq = "MKALXDQZLR" # mismatches at positions 4 ('X') and 7 ('Z') + + target = SeqRecord(Seq(None, length=len(reference)), id="reference") + query = SeqRecord(Seq(None, length=len(query_seq)), id="query") + coords = np.array([[0, len(reference)], [0, len(query_seq)]]) + aln = BioAlign.Alignment(sequences=[target, query], coordinates=coords) + + # Mimic align_target_to_protein: attach sequences before QC + aln.target.seq = Seq(reference) + aln.query.seq = Seq(query_seq) + + qc = _build_alignment_qc(aln) + + assert 4 in qc.mismatch_positions + assert 7 in qc.mismatch_positions + assert len(qc.mismatch_positions) == 2 + assert qc.mismatch_count == 2 + + def test_minus_strand_alignment(self): + """_build_alignment_qc works for minus-strand PSL alignments. + + Biopython's SAM formatter raises 'Unequal step sizes' for PSL + alignments where the query is on the minus strand, because query + coordinates are stored in descending order. We build CIGAR from + coordinates directly so this is a non-issue. + """ + # 8-base sequences; minus-strand: query[8:0] aligns to target[10:18]. + # Biopython stores minus-strand query coords as descending. + seq = "ACGTACGT" + target = SeqRecord(Seq(seq * 20), id="chr1") + query = SeqRecord(Seq(seq), id="query") + coords = np.array([[10, 18], [8, 0]]) + aln = BioAlign.Alignment(sequences=[target, query], coordinates=coords) + + qc = _build_alignment_qc(aln) + + assert qc.cigar == "8M" + assert qc.gap_count == 0 + + +class TestCompactAlignmentString: + """Unit tests for _compact_alignment_string. -from dcd_mapping.align import align -from dcd_mapping.schemas import AlignmentResult + The function collapses consecutive all-'?' target-sequence blocks (intronic + regions in cDNA→genome BLAT output) into summary lines of the form + ``... [N bp gap: chrom:start-end] ...``. + """ + + def _make_group( + self, chrom: str, start: int, seq: str, match_row: str, query_seq: str + ) -> str: + """Build a single Biopython-style 3-line display group.""" + return f"{chrom} {start} {seq}\n {match_row}\n query 0 {query_seq}" + + def test_noop_on_no_gap_blocks(self): + """A string with no all-'?' target rows is returned unchanged.""" + group = self._make_group("chr1", 100, "ACGT", "||||", "ACGT") + assert _compact_alignment_string(group) == group + + def test_single_gap_block_collapsed(self): + """One all-'?' block is replaced with a summary line.""" + exon1 = self._make_group("chr1", 100, "ACGT", "||||", "ACGT") + intron = self._make_group("chr1", 104, "????", " ", "----") + exon2 = self._make_group("chr1", 108, "TTTT", "||||", "TTTT") + inp = "\n\n".join([exon1, intron, exon2]) + out = _compact_alignment_string(inp) + # Intron block replaced; exon blocks preserved + assert "... [4 bp gap: chr1:104-108] ..." in out + assert "ACGT" in out + assert "TTTT" in out + # No raw '????' remaining + assert "????" not in out + + def test_consecutive_gap_blocks_merged(self): + """Multiple consecutive all-'?' blocks are merged into a single summary.""" + exon = self._make_group("chr1", 0, "ACGT", "||||", "ACGT") + intron1 = self._make_group("chr1", 4, "?" * 60, " " * 60, "-" * 60) + intron2 = self._make_group("chr1", 64, "?" * 60, " " * 60, "-" * 60) + inp = "\n\n".join([exon, intron1, intron2]) + out = _compact_alignment_string(inp) + # Both intron blocks merged into one summary + assert "... [120 bp gap: chr1:4-124] ..." in out + assert out.count("... [") == 1 + + def test_mixed_sequence_block_not_collapsed(self): + """A block with mixed '?' and real bases is left as-is.""" + mixed = self._make_group("chr1", 0, "AC?T", "|| |", "ACGT") + out = _compact_alignment_string(mixed) + assert "AC?T" in out + assert "... [" not in out + + def test_protein_alignment_noop(self): + """Protein alignments have no '?' characters; function is a no-op.""" + protein_group = "NP_001234.1 0 MAAAA\n |||||\n query 0 MAAAA" + assert _compact_alignment_string(protein_group) == protein_group + + def test_comma_formatting_in_large_gap(self): + """Gap length ≥ 1000 uses comma-formatted number.""" + exon = self._make_group("chr1", 0, "ACGT", "||||", "ACGT") + # Build consecutive 60-bp intron blocks totalling > 1000 bp + introns = [ + self._make_group("chr1", 4 + i * 60, "?" * 60, " " * 60, "-" * 60) + for i in range(17) # 17 * 60 = 1020 bp + ] + inp = "\n\n".join([exon, *introns]) + out = _compact_alignment_string(inp) + assert "1,020 bp gap" in out + + +class TestBlatScore: + """_blat_score and _get_best_hsp regression tests. + + The key invariant: _get_best_hsp must use BLAT PSL score + (identities - mismatches - qNumInsert - tNumInsert), NOT raw identity + count. These tests construct two alignments where the two metrics + disagree and verify that BLAT-score-based selection wins. + """ + + def _make_noisy_aln(self) -> BioAlign.Alignment: + """Alignment A: 10 identity matches + 8 mismatches in one block. + + BLAT score = 10 - 8 = 2 + identity count = 10 + """ + # First 10 chars identical, last 8 mismatched (N vs T) + target_seq = "ACGTACGTAC" + "TTTTTTTT" + query_seq = "ACGTACGTAC" + "NNNNNNNN" + padded = target_seq.ljust(len(target_seq) + 10, "A") + target = SeqRecord(Seq(padded), id="chr1") + query = SeqRecord(Seq(query_seq), id="query_noisy") + coords = np.array([[0, len(query_seq)], [0, len(query_seq)]]) + return BioAlign.Alignment(sequences=[target, query], coordinates=coords) + + def _make_clean_aln(self) -> BioAlign.Alignment: + """Alignment B: 7 perfect matches, no mismatches. + + BLAT score = 7 - 0 = 7 + identity count = 7 + + Shorter but cleaner: should be preferred by _get_best_hsp. + """ + seq = "ACGTACG" + padded = seq.ljust(len(seq) + 10, "A") + target = SeqRecord(Seq(padded), id="chr1") + query = SeqRecord(Seq(seq), id="query_clean") + coords = np.array([[0, len(seq)], [0, len(seq)]]) + return BioAlign.Alignment(sequences=[target, query], coordinates=coords) + + def test_blat_score_noisy_lower_than_clean(self): + """Noisy alignment has more identities but a lower BLAT score.""" + noisy = self._make_noisy_aln() + clean = self._make_clean_aln() + + noisy_counts = noisy.counts() + clean_counts = clean.counts() + + # Identity count: noisy wins + assert ( + noisy_counts.identities > clean_counts.identities + ), "Noisy alignment must have more raw identities for the test to be meaningful" + # BLAT score: clean wins + assert ( + _blat_score(noisy) < _blat_score(clean) + ), "Clean alignment must have the higher BLAT score for the test to be meaningful" + + def test_get_best_hsp_prefers_blat_score_over_identity_count(self): + """_get_best_hsp selects the alignment with the higher BLAT score, not + the higher raw identity count. + + Regression guard: before the fix, the selector used + ``a.counts().identities``, which would have returned the noisy + alignment. After the fix it must return the clean one. + """ + noisy = self._make_noisy_aln() + clean = self._make_clean_aln() + + best = _get_best_hsp([noisy, clean]) + + assert best is clean, ( + "_get_best_hsp chose the noisy alignment (higher identity count) " + "instead of the clean one (higher BLAT score). " + "The ranking criterion has regressed to identity-count-only." + ) def check_alignment_result_equality(actual: AlignmentResult, expected: AlignmentResult): - """Check correctness of alignment result against fixture""" assert actual.chrom == expected.chrom assert actual.strand == expected.strand assert actual.coverage == pytest.approx(expected.coverage) - assert actual.ident_pct == pytest.approx(expected.ident_pct) + assert actual.percent_identity == pytest.approx(expected.percent_identity) assert actual.query_range.start == expected.query_range.start assert actual.query_range.end == expected.query_range.end for a, e in zip(actual.query_subranges, expected.query_subranges, strict=False): @@ -90,3 +404,47 @@ def test_align_tp53(scoreset_metadata_fixture, align_result_fixture): expected = AlignmentResult(**align_result_fixture[urn]) assert align_result check_alignment_result_equality(align_result, expected) + + +def test_align_raises_on_ambiguous_blat_id(): + """align() must raise AlignmentError when a BLAT result ID matches more than one + target gene name after stripping whitespace and punctuation. + + Regression guard for the silent-pass bug where ``matches > 1`` constructed + an error message but never raised it. + """ + # Two gene names that both normalise to the same BLAT identifier "GENE". + # "GENE (isoform1)" -> split()[0].replace("(","").replace(")","") -> "GENE" + # "GENE (isoform2)" -> same result -> ambiguous. + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={ + "GENE (isoform1)": TargetGene( + target_gene_name="GENE (isoform1)", + target_gene_category=TargetType.PROTEIN_CODING, + target_sequence="ACGT", + target_sequence_type="dna", + target_accession_id=None, + ), + "GENE (isoform2)": TargetGene( + target_gene_name="GENE (isoform2)", + target_gene_category=TargetType.PROTEIN_CODING, + target_sequence="ACGT", + target_sequence_type="dna", + target_accession_id=None, + ), + }, + ) + # _get_blat_output returns a grouped dict keyed by the shortened BLAT id. + # We fake a single hit for the shared prefix so the disambiguation loop runs. + fake_grouped = {"GENE": []} + fake_params = {"aligner": "blat"} + + with ( + patch( + "dcd_mapping.align._get_blat_output", + return_value=(fake_grouped, fake_params), + ), + pytest.raises(AlignmentError, match="matches multiple target gene names"), + ): + align(metadata) diff --git a/tests/test_annotate.py b/tests/test_annotate.py index 6a7e18e..b7af03b 100644 --- a/tests/test_annotate.py +++ b/tests/test_annotate.py @@ -9,17 +9,21 @@ SequenceReference, ) +from dcd_mapping import vrs_v1_schemas from dcd_mapping.annotate import ( _compute_target_gene_info_from_alignment, _compute_target_gene_info_from_mapped_variant_spans, _covered_bases_from_overlapping_genes_of_chromosomal_intervals, + _stamp_alignment_locus_flags, compute_target_gene_info, ) from dcd_mapping.schemas import ( + AlignmentQc, AlignmentResult, AnnotationLayer, GeneInfo, MappedScore, + ScoreAnnotation, ScoresetMetadata, SequenceRange, TargetGene, @@ -56,7 +60,7 @@ def make_align(hit_intervals): chrom="NC_000001.11", strand=1, coverage=None, - ident_pct=None, + percent_identity=None, query_range=SequenceRange(start=1, end=10), query_subranges=[SequenceRange(start=1, end=10)], hit_range=SequenceRange(start=1, end=10), @@ -161,7 +165,7 @@ def test_compute_target_gene_info_mapped_variants_path(scoreset_metadata): accession_id="id", pre_mapped=None, post_mapped=allele, - annotation_layer=AnnotationLayer.GENOMIC, + alignment_level=AnnotationLayer.GENOMIC, score=None, error_message=None, ) @@ -246,7 +250,7 @@ def make_ms(start, end): accession_id=f"id_{start}", pre_mapped=None, post_mapped=allele, - annotation_layer=AnnotationLayer.GENOMIC, + alignment_level=AnnotationLayer.GENOMIC, score=None, error_message=None, ) @@ -281,3 +285,131 @@ def fake_overlap(chrom, s, e, features=None): assert isinstance(res, GeneInfo) assert res.hgnc_symbol == "GENEZ" assert res.selection_method == "variants_max_covered_bases" + + +# --------------------------------------------------------------------------- +# Tests for _apply_alignment_locus_flags / mismatch_positions_unavailable +# --------------------------------------------------------------------------- + + +def _make_genomic_annotation(start: int, end: int) -> "ScoreAnnotation": + """Return a ScoreAnnotation with a simple genomic post_mapped Allele.""" + allele = Allele( + location=SequenceLocation( + sequenceReference=SequenceReference( + refgetAccession="SQ.1234567890abcdef1234567890abcdef" + ), + start=start, + end=end, + ), + state=LiteralSequenceExpression(sequence="A"), + expressions=[], + ) + return ScoreAnnotation( + mavedb_id=f"id_{start}", + pre_mapped=None, + post_mapped=allele, + alignment_level=AnnotationLayer.GENOMIC, + score=None, + ) + + +def _make_align_result_with_qc(**qc_kwargs) -> "AlignmentResult": + """Build a minimal AlignmentResult with an AlignmentQc built from qc_kwargs.""" + qc = AlignmentQc( + alignment_length=100, + mismatch_count=qc_kwargs.pop("mismatch_count", 0), + gap_count=qc_kwargs.pop("gap_count", 0), + **qc_kwargs, + ) + return AlignmentResult( + chrom="NC_000001.11", + strand=1, + coverage=None, + percent_identity=None, + query_range=SequenceRange(start=0, end=100), + query_subranges=[SequenceRange(start=0, end=100)], + hit_range=SequenceRange(start=0, end=100), + hit_subranges=[SequenceRange(start=0, end=100)], + alignment_qc=qc, + ) + + +def test_apply_alignment_locus_flags_mismatch_positions_unavailable_leaves_at_mismatched_locus_none(): + """When mismatch_positions_unavailable=True and mismatch_count>0, at_mismatched_locus + must stay None (not evaluated) to prevent silent disagreement with mismatch_count. + """ + ann = _make_genomic_annotation(10, 11) + assert ann.at_mismatched_locus is None # pre-condition + + align_result = _make_align_result_with_qc( + mismatch_count=5, + mismatch_positions_unavailable=True, + ) + _stamp_alignment_locus_flags([ann], align_result, AnnotationLayer.GENOMIC) + + assert ann.at_mismatched_locus is None, ( + "at_mismatched_locus must remain None when mismatch_positions_unavailable=True " + "and mismatch_count>0 -- it cannot claim False when positions are unknown" + ) + assert ann.near_gap is False # gap_intervals empty → near_gap still evaluated + + +def test_apply_alignment_locus_flags_mismatch_positions_unavailable_zero_count_stamps_false(): + """When mismatch_positions_unavailable=True but mismatch_count==0, there are no + mismatches to miss, so at_mismatched_locus=False is safe. + """ + ann = _make_genomic_annotation(10, 11) + align_result = _make_align_result_with_qc( + mismatch_count=0, + mismatch_positions_unavailable=True, + ) + _stamp_alignment_locus_flags([ann], align_result, AnnotationLayer.GENOMIC) + + assert ann.at_mismatched_locus is False + assert ann.near_gap is False + + +def test_apply_alignment_locus_flags_early_exit_does_not_fire_when_count_nonzero(): + """The early-exit 'no mismatches and no gaps → stamp False' must NOT fire when + mismatch_positions_unavailable=True and mismatch_count>0, even if gap_intervals + is also empty. Previously the check was ``not mismatch_positions``, which is + always True under unavailability, causing the early-exit to stamp near_gap=False + on annotations whose positions could not be extracted (and thus were never evaluated). + The fix uses ``mismatch_count == 0`` as the ground-truth condition. + """ + # A VRS v1 Allele is not an instance of ga4gh.vrs.Allele (v2), so + # _allele_ref_positions returns [] and the main-loop body is skipped via `continue`. + v1_allele = vrs_v1_schemas.Allele( + id="ga4gh:VA.test", + location=vrs_v1_schemas.SequenceLocation( + id="loc1", + sequence_id="ga4gh:SQ.test", + interval=vrs_v1_schemas.SequenceInterval( + start=vrs_v1_schemas.Number(value=10), + end=vrs_v1_schemas.Number(value=11), + ), + ), + state=vrs_v1_schemas.LiteralSequenceExpression(sequence="A"), + ) + ann = ScoreAnnotation.model_construct( + mavedb_id="id_v1", + post_mapped=v1_allele, + alignment_level=AnnotationLayer.GENOMIC, + at_mismatched_locus=None, + near_gap=None, + ) + + align_result = _make_align_result_with_qc( + mismatch_count=3, + mismatch_positions_unavailable=True, + ) + _stamp_alignment_locus_flags([ann], align_result, AnnotationLayer.GENOMIC) + + # Both flags must remain None: positions could not be extracted, and the early-exit + # must not have fired (it would have incorrectly stamped near_gap=False). + assert ann.at_mismatched_locus is None + assert ann.near_gap is None, ( + "near_gap must be None when positions are unextractable and mismatch_count>0 " + "prevents the early-exit from firing -- old code stamped False via faulty early-exit" + ) diff --git a/tests/test_annotate_target_mapping.py b/tests/test_annotate_target_mapping.py new file mode 100644 index 0000000..744b225 --- /dev/null +++ b/tests/test_annotate_target_mapping.py @@ -0,0 +1,855 @@ +"""Tests for annotate._reference_accession_for_target_level and build_scoreset_mapping.""" +from unittest.mock import patch + +from cool_seq_tool.schemas import AnnotationLayer + +from dcd_mapping.annotate import ( + _align_result_for_target, + _reference_accession_for_target_level, + _stamp_alignment_locus_flags, + build_scoreset_mapping, +) +from dcd_mapping.schemas import ( + AlignmentQc, + AlignmentResult, + GeneInfo, + ScoreAnnotation, + ScoresetMapping, + ScoresetMetadata, + SequenceRange, + TargetGene, + TargetSequenceType, + TargetType, + TxSelectResult, + VrsVersion, +) + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +def _make_seq_target(name: str = "GENE1") -> TargetGene: + return TargetGene( + target_gene_name=name, + target_gene_category=TargetType.PROTEIN_CODING, + target_sequence="ATGCATGC", + target_sequence_type=TargetSequenceType.DNA, + target_accession_id=None, + ) + + +def _make_acc_target(accession_id: str, name: str = "GENE1") -> TargetGene: + return TargetGene( + target_gene_name=name, + target_gene_category=TargetType.PROTEIN_CODING, + target_sequence=None, + target_sequence_type=None, + target_accession_id=accession_id, + ) + + +def _make_tx(nm: str = "NM_000001.1", np: str = "NP_000001.1") -> TxSelectResult: + return TxSelectResult( + nm=nm, + np=np, + start=0, + is_full_match=True, + sequence="MAAAA", + hgnc_symbol="GENE1", + ) + + +def _make_align(chrom: str = "chr1") -> AlignmentResult: + return AlignmentResult( + chrom=chrom, + query_range=SequenceRange(start=0, end=100), + query_subranges=[SequenceRange(start=0, end=100)], + hit_range=SequenceRange(start=1000, end=1100), + hit_subranges=[SequenceRange(start=1000, end=1100)], + aligner_parameters={"aligner": "blat", "min_score": 20, "out_format": "pslx"}, + ) + + +# --------------------------------------------------------------------------- +# _reference_accession_for_target_level +# --------------------------------------------------------------------------- + + +class TestReferenceAccessionForTargetLevel: + def test_protein_layer_from_tx(self): + target = _make_seq_target() + tx = _make_tx(np="NP_001234.1") + result = _reference_accession_for_target_level( + AnnotationLayer.PROTEIN, target, tx, None + ) + assert result == "NP_001234.1" + + def test_protein_layer_from_accession(self): + target = _make_acc_target("NP_001234.1") + result = _reference_accession_for_target_level( + AnnotationLayer.PROTEIN, target, None, None + ) + assert result == "NP_001234.1" + + def test_protein_layer_ensp_accession(self): + target = _make_acc_target("ENSP00000001.1") + result = _reference_accession_for_target_level( + AnnotationLayer.PROTEIN, target, None, None + ) + assert result == "ENSP00000001.1" + + def test_protein_layer_no_info(self): + target = _make_seq_target() + result = _reference_accession_for_target_level( + AnnotationLayer.PROTEIN, target, None, None + ) + assert result is None + + def test_cdna_layer_from_tx(self): + target = _make_seq_target() + tx = _make_tx(nm="NM_000001.1") + result = _reference_accession_for_target_level( + AnnotationLayer.CDNA, target, tx, None + ) + assert result == "NM_000001.1" + + def test_cdna_layer_from_accession(self): + target = _make_acc_target("NM_000001.1") + result = _reference_accession_for_target_level( + AnnotationLayer.CDNA, target, None, None + ) + assert result == "NM_000001.1" + + def test_cdna_layer_no_info(self): + target = _make_seq_target() + result = _reference_accession_for_target_level( + AnnotationLayer.CDNA, target, None, None + ) + assert result is None + + def test_genomic_layer_from_alignment(self): + target = _make_seq_target() + align = _make_align(chrom="chr1") + with patch( + "dcd_mapping.annotate.get_chromosome_identifier", + return_value="refseq:NC_000001.11", + ): + result = _reference_accession_for_target_level( + AnnotationLayer.GENOMIC, target, None, align + ) + assert result == "NC_000001.11" + + def test_genomic_layer_nc_accession(self): + target = _make_acc_target("NC_000001.11") + result = _reference_accession_for_target_level( + AnnotationLayer.GENOMIC, target, None, None + ) + assert result == "NC_000001.11" + + def test_genomic_layer_no_info(self): + target = _make_seq_target() + result = _reference_accession_for_target_level( + AnnotationLayer.GENOMIC, target, None, None + ) + assert result is None + + +# --------------------------------------------------------------------------- +# build_scoreset_mapping golden test +# --------------------------------------------------------------------------- + + +def _make_annotation( + layer: AnnotationLayer | None, + post_mapped=None, + error_message: str | None = None, +) -> ScoreAnnotation: + """Minimal ScoreAnnotation for golden test.""" + return ScoreAnnotation( + mavedb_id="urn:mavedb:00000001-a-1#1", + alignment_level=layer, + pre_mapped=None, + post_mapped=post_mapped, + error_message=error_message, + score=None, + ) + + +class TestBuildScoresetMapping: + def test_emits_one_target_mapping_per_layer(self): + """Each (target, layer) pair appearing in mappings generates exactly one TargetMapping.""" + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={ + "GENE1": _make_seq_target("GENE1"), + }, + ) + # Two annotations: one genomic, one protein + g_ann = _make_annotation(AnnotationLayer.GENOMIC) + p_ann = _make_annotation(AnnotationLayer.PROTEIN) + mappings = {"GENE1": [g_ann, p_ann]} + + with ( + patch( + "dcd_mapping.annotate.get_vrs_id_from_identifier", + return_value="ga4gh:SQ.test", + ), + patch( + "dcd_mapping.annotate.get_chromosome_identifier", + return_value="NC_000001.11", + ), + patch( + "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer", + return_value=AnnotationLayer.GENOMIC, + ), + patch( + "dcd_mapping.annotate._get_computed_reference_sequence", + return_value=None, + ), + patch( + "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None + ), + ): + result = build_scoreset_mapping( + metadata=metadata, + raw_metadata={}, + mappings=mappings, + align_results={"GENE1": _make_align()}, + tx_output={"GENE1": _make_tx()}, + gene_info={"GENE1": GeneInfo(hgnc_symbol="GENE1")}, + preferred_layer_only=False, + vrs_version=VrsVersion.V_2, + ) + + assert isinstance(result, ScoresetMapping) + assert result.target_mappings is not None + layers_seen = {tm.alignment_level for tm in result.target_mappings} + assert AnnotationLayer.GENOMIC in layers_seen + assert AnnotationLayer.PROTEIN in layers_seen + # Exactly one mapping per layer + assert len(result.target_mappings) == 2 + + def test_preferred_flag_on_exactly_one_row_per_target(self): + """preferred=True must appear on exactly one TargetMapping per target.""" + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={"GENE1": _make_seq_target("GENE1")}, + ) + g_ann = _make_annotation(AnnotationLayer.GENOMIC) + p_ann = _make_annotation(AnnotationLayer.PROTEIN) + + with ( + patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None), + patch( + "dcd_mapping.annotate.get_chromosome_identifier", + return_value="NC_000001.11", + ), + patch( + "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer", + return_value=AnnotationLayer.GENOMIC, + ), + patch( + "dcd_mapping.annotate._get_computed_reference_sequence", + return_value=None, + ), + patch( + "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None + ), + ): + result = build_scoreset_mapping( + metadata=metadata, + raw_metadata={}, + mappings={"GENE1": [g_ann, p_ann]}, + align_results={"GENE1": _make_align()}, + tx_output={"GENE1": _make_tx()}, + gene_info={"GENE1": None}, + preferred_layer_only=False, + vrs_version=VrsVersion.V_2, + ) + + preferred = [tm for tm in result.target_mappings if tm.preferred] + assert len(preferred) == 1 + assert preferred[0].alignment_level == AnnotationLayer.GENOMIC + + def test_annotation_qc_counts_sum_correctly(self): + """total_variants == clean + warnings + failed.""" + from ga4gh.vrs._internal.models import ( + Allele, + LiteralSequenceExpression, + SequenceLocation, + SequenceReference, + ) + + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={"GENE1": _make_seq_target("GENE1")}, + ) + + allele = Allele( + location=SequenceLocation( + sequenceReference=SequenceReference(refgetAccession="SQ." + "A" * 32), + start=0, + end=1, + ), + state=LiteralSequenceExpression(sequence="A"), + ) + + annotations = [ + _make_annotation(AnnotationLayer.GENOMIC, post_mapped=allele), # clean + _make_annotation( + AnnotationLayer.GENOMIC, post_mapped=allele, error_message="warn" + ), # warning + _make_annotation(AnnotationLayer.GENOMIC), # failed + ] + + with ( + patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None), + patch( + "dcd_mapping.annotate.get_chromosome_identifier", + return_value="NC_000001.11", + ), + patch( + "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer", + return_value=AnnotationLayer.GENOMIC, + ), + patch( + "dcd_mapping.annotate._get_computed_reference_sequence", + return_value=None, + ), + patch( + "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None + ), + ): + result = build_scoreset_mapping( + metadata=metadata, + raw_metadata={}, + mappings={"GENE1": annotations}, + align_results={"GENE1": _make_align()}, + tx_output={"GENE1": _make_tx()}, + gene_info={"GENE1": None}, + preferred_layer_only=False, + vrs_version=VrsVersion.V_2, + ) + + assert result.target_mappings + tm = result.target_mappings[0] + assert tm.total_variants == 3 + assert tm.variants_mapped_cleanly == 1 + assert tm.variants_with_mapping_warnings == 1 + assert tm.variants_failed == 1 + assert ( + (tm.variants_mapped_cleanly or 0) + + (tm.variants_with_mapping_warnings or 0) + + (tm.variants_failed or 0) + ) == tm.total_variants + + def test_tool_parameters_for_sequence_based_target(self): + """tool_parameters should contain BLAT aligner key for sequence-based targets.""" + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={"GENE1": _make_seq_target("GENE1")}, + ) + ann = _make_annotation(AnnotationLayer.GENOMIC) + + with ( + patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None), + patch( + "dcd_mapping.annotate.get_chromosome_identifier", + return_value="NC_000001.11", + ), + patch( + "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer", + return_value=AnnotationLayer.GENOMIC, + ), + patch( + "dcd_mapping.annotate._get_computed_reference_sequence", + return_value=None, + ), + patch( + "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None + ), + ): + result = build_scoreset_mapping( + metadata=metadata, + raw_metadata={}, + mappings={"GENE1": [ann]}, + align_results={"GENE1": _make_align()}, + tx_output={"GENE1": _make_tx()}, + gene_info={"GENE1": None}, + preferred_layer_only=False, + vrs_version=VrsVersion.V_2, + ) + + tm = result.target_mappings[0] + assert tm.tool_parameters is not None + assert tm.tool_parameters.get("aligner") == "blat" + assert "min_score" in tm.tool_parameters + + def test_tool_parameters_for_accession_based_target(self): + """tool_parameters should reference cdot for accession-based targets.""" + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={"GENE1": _make_acc_target("NM_000001.1", "GENE1")}, + ) + ann = _make_annotation(AnnotationLayer.CDNA) + + with ( + patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None), + patch( + "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer", + return_value=AnnotationLayer.CDNA, + ), + patch( + "dcd_mapping.annotate._get_computed_reference_sequence", + return_value=None, + ), + patch( + "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None + ), + ): + result = build_scoreset_mapping( + metadata=metadata, + raw_metadata={}, + mappings={"GENE1": [ann]}, + align_results={"GENE1": _make_align()}, + tx_output={"GENE1": _make_tx()}, + gene_info={"GENE1": None}, + preferred_layer_only=False, + vrs_version=VrsVersion.V_2, + ) + + tm = result.target_mappings[0] + assert tm.tool_parameters is not None + assert tm.tool_parameters.get("aligner") == "cdot_transcript_placement" + assert "cdot_url" in tm.tool_parameters + + def test_tool_parameters_for_nc_accession_target(self): + """NC_ (chromosome/contig) targets should use 'direct_contig_accession', not cdot.""" + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={"GENE1": _make_acc_target("NC_000001.11", "GENE1")}, + ) + ann = _make_annotation(AnnotationLayer.GENOMIC) + + with ( + patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None), + patch( + "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer", + return_value=AnnotationLayer.GENOMIC, + ), + patch( + "dcd_mapping.annotate._get_computed_reference_sequence", + return_value=None, + ), + patch( + "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None + ), + ): + result = build_scoreset_mapping( + metadata=metadata, + raw_metadata={}, + mappings={"GENE1": [ann]}, + # NC_ targets produce align_result=None in fetch_alignment + align_results={"NC_000001.11": None}, + tx_output={"GENE1": _make_tx()}, + gene_info={"GENE1": None}, + preferred_layer_only=False, + vrs_version=VrsVersion.V_2, + ) + + tm = result.target_mappings[0] + assert tm.tool_parameters is not None + assert tm.tool_parameters.get("aligner") == "direct_contig_accession" + # Must not bleed cdot fields into a non-cdot path + assert "cdot_data_version" not in tm.tool_parameters + assert "cdot_url" not in tm.tool_parameters + # Accession is already in reference_accession; must not duplicate here + assert "accession" not in tm.tool_parameters + + def test_mapped_scores_alignment_levels_subset_of_target_mappings(self): + """Invariant: every alignment_level in mapped_scores must have a parent TargetMapping row. + + No mapped_score should be orphaned (i.e. have an alignment_level that + does not correspond to any TargetMapping.alignment_level for this run). + This mirrors the join the MaveDB API performs via target_gene_mapping_id. + """ + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={"GENE1": _make_seq_target("GENE1")}, + ) + # Genomic success, protein success, and a completely-failed (NULL-layer) variant + g_ann = _make_annotation(AnnotationLayer.GENOMIC) + p_ann = _make_annotation(AnnotationLayer.PROTEIN) + null_ann = _make_annotation(None) # completely failed - no layer attribution + + with ( + patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None), + patch( + "dcd_mapping.annotate.get_chromosome_identifier", + return_value="NC_000001.11", + ), + patch( + "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer", + return_value=AnnotationLayer.GENOMIC, + ), + patch( + "dcd_mapping.annotate._get_computed_reference_sequence", + return_value=None, + ), + patch( + "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None + ), + ): + result = build_scoreset_mapping( + metadata=metadata, + raw_metadata={}, + mappings={"GENE1": [g_ann, p_ann, null_ann]}, + align_results={"GENE1": _make_align()}, + tx_output={"GENE1": _make_tx()}, + gene_info={"GENE1": None}, + preferred_layer_only=False, + vrs_version=VrsVersion.V_2, + ) + + assert result.mapped_scores is not None + assert result.target_mappings is not None + + tm_levels = {tm.alignment_level for tm in result.target_mappings} + ms_levels = {ms.alignment_level for ms in result.mapped_scores} + + # Core invariant: every alignment_level in mapped_scores has a parent row + assert ( + ms_levels <= tm_levels + ), f"Orphaned alignment levels in mapped_scores: {ms_levels - tm_levels}" + + def test_null_layer_failures_attributed_to_preferred_layer(self): + """NULL-layer (completely-failed) variants must be re-attributed to the preferred + layer so the MaveDB API can join them via alignment_level. + """ + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={"GENE1": _make_seq_target("GENE1")}, + ) + g_ann = _make_annotation(AnnotationLayer.GENOMIC) + null_ann = _make_annotation(None) # completely failed + + with ( + patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None), + patch( + "dcd_mapping.annotate.get_chromosome_identifier", + return_value="NC_000001.11", + ), + patch( + "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer", + return_value=AnnotationLayer.GENOMIC, + ), + patch( + "dcd_mapping.annotate._get_computed_reference_sequence", + return_value=None, + ), + patch( + "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None + ), + ): + result = build_scoreset_mapping( + metadata=metadata, + raw_metadata={}, + mappings={"GENE1": [g_ann, null_ann]}, + align_results={"GENE1": _make_align()}, + tx_output={"GENE1": _make_tx()}, + gene_info={"GENE1": None}, + preferred_layer_only=True, + vrs_version=VrsVersion.V_2, + ) + + assert result.mapped_scores is not None + assert result.target_mappings is not None + + # NULL-layer failure should appear in mapped_scores as the preferred layer + for ms in result.mapped_scores: + assert ( + ms.alignment_level == AnnotationLayer.GENOMIC + ), f"Expected alignment_level=GENOMIC for all mapped_scores; got {ms.alignment_level}" + + # The preferred layer's TargetMapping must count the null failure + tm = result.target_mappings[0] + assert tm.alignment_level == AnnotationLayer.GENOMIC + assert tm.total_variants == 2 # one genomic + one null failure + + def test_preferred_layer_only_total_variants_includes_null_failures(self): + """With preferred_layer_only=True, total_variants in TargetMapping must + account for null-layer failures so len(mapped_scores) == total_variants. + """ + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={"GENE1": _make_seq_target("GENE1")}, + ) + # 3 genomic successes + 2 completely-failed variants + g_anns = [_make_annotation(AnnotationLayer.GENOMIC) for _ in range(3)] + null_anns = [_make_annotation(None) for _ in range(2)] + + with ( + patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None), + patch( + "dcd_mapping.annotate.get_chromosome_identifier", + return_value="NC_000001.11", + ), + patch( + "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer", + return_value=AnnotationLayer.GENOMIC, + ), + patch( + "dcd_mapping.annotate._get_computed_reference_sequence", + return_value=None, + ), + patch( + "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None + ), + ): + result = build_scoreset_mapping( + metadata=metadata, + raw_metadata={}, + mappings={"GENE1": g_anns + null_anns}, + align_results={"GENE1": _make_align()}, + tx_output={"GENE1": _make_tx()}, + gene_info={"GENE1": None}, + preferred_layer_only=True, + vrs_version=VrsVersion.V_2, + ) + + assert result.mapped_scores is not None + assert result.target_mappings is not None + + tm = result.target_mappings[0] + assert len(result.mapped_scores) == 5 + assert tm.total_variants == 5 + assert tm.variants_failed == 5 # all have pre_mapped=None (failed) + + +# --------------------------------------------------------------------------- +# Locus flag stamping +# --------------------------------------------------------------------------- + + +def _make_allele_annotation( + start: int, end: int, layer: AnnotationLayer +) -> ScoreAnnotation: + """Build a ScoreAnnotation with a minimal VRS 2 Allele post_mapped.""" + from ga4gh.vrs._internal.models import ( + Allele, + LiteralSequenceExpression, + SequenceLocation, + SequenceReference, + ) + + allele = Allele( + location=SequenceLocation( + sequenceReference=SequenceReference(refgetAccession="SQ." + "A" * 32), + start=start, + end=end, + ), + state=LiteralSequenceExpression(sequence="T"), + ) + return ScoreAnnotation( + mavedb_id=f"urn:mavedb:00000001-a-1#{start}", + alignment_level=layer, + post_mapped=allele, + score=None, + ) + + +def _make_rle_annotation(start: int, end: int) -> ScoreAnnotation: + """Build a ScoreAnnotation with a ReferenceLengthExpression (reference-identical).""" + from ga4gh.vrs._internal.models import ( + Allele, + ReferenceLengthExpression, + SequenceLocation, + SequenceReference, + ) + + allele = Allele( + location=SequenceLocation( + sequenceReference=SequenceReference(refgetAccession="SQ." + "A" * 32), + start=start, + end=end, + ), + state=ReferenceLengthExpression(length=end - start, sequence=None), + ) + return ScoreAnnotation( + mavedb_id=f"urn:mavedb:00000001-a-1#rle{start}", + alignment_level=AnnotationLayer.GENOMIC, + post_mapped=allele, + score=None, + ) + + +def _make_align_result_with_qc( + mismatches: list[int], + gap_intervals: list[tuple[int, int]], + mismatch_count: int | None = None, +) -> AlignmentResult: + qc = AlignmentQc( + alignment_length=1000, + mismatch_count=mismatch_count + if mismatch_count is not None + else len(mismatches), + gap_count=len(gap_intervals), + mismatch_positions=mismatches, + gap_intervals=gap_intervals, + ) + return AlignmentResult( + chrom="chr1", + query_range=SequenceRange(start=0, end=100), + query_subranges=[SequenceRange(start=0, end=100)], + hit_range=SequenceRange(start=0, end=1000), + hit_subranges=[SequenceRange(start=0, end=1000)], + alignment_qc=qc, + ) + + +class TestStampAlignmentLocusFlags: + def test_variant_at_mismatch(self): + ann = _make_allele_annotation(10, 11, AnnotationLayer.GENOMIC) + align = _make_align_result_with_qc(mismatches=[10], gap_intervals=[]) + _stamp_alignment_locus_flags([ann], align, AnnotationLayer.GENOMIC) + assert ann.at_mismatched_locus is True + assert ann.near_gap is False + + def test_variant_near_gap(self): + ann = _make_allele_annotation(10, 11, AnnotationLayer.GENOMIC) + # Gap at [20, 30); within 5-base window of position 10+5=15 < 20, so NOT near + # Place gap closer: [13, 20) + align = _make_align_result_with_qc(mismatches=[], gap_intervals=[(13, 20)]) + _stamp_alignment_locus_flags( + [ann], align, AnnotationLayer.GENOMIC, near_gap_window=5 + ) + assert ann.at_mismatched_locus is False + assert ann.near_gap is True # pos 10, window end 15, gap start 13 < 15 + + def test_variant_not_near_gap(self): + ann = _make_allele_annotation(0, 1, AnnotationLayer.GENOMIC) + # Gap far away + align = _make_align_result_with_qc(mismatches=[], gap_intervals=[(1000, 1100)]) + _stamp_alignment_locus_flags([ann], align, AnnotationLayer.GENOMIC) + assert ann.near_gap is False + + def test_rle_allele_flags_left_as_none(self): + """RLE alleles (reference-identical) must leave both flags as None.""" + ann = _make_rle_annotation(0, 198295559) # whole-chromosome span + # Use a real mismatch/gap so the function would flag a normal allele + align = _make_align_result_with_qc( + mismatches=list(range(100)), + gap_intervals=[(200, 300)], + ) + _stamp_alignment_locus_flags([ann], align, AnnotationLayer.GENOMIC) + assert ann.at_mismatched_locus is None + assert ann.near_gap is None + + def test_idempotent_double_call(self): + """Calling _stamp_alignment_locus_flags twice must not change the result.""" + ann = _make_allele_annotation(10, 11, AnnotationLayer.GENOMIC) + align = _make_align_result_with_qc(mismatches=[10], gap_intervals=[(15, 25)]) + _stamp_alignment_locus_flags([ann], align, AnnotationLayer.GENOMIC) + first_at = ann.at_mismatched_locus + first_gap = ann.near_gap + _stamp_alignment_locus_flags([ann], align, AnnotationLayer.GENOMIC) + assert ann.at_mismatched_locus == first_at + assert ann.near_gap == first_gap + + def test_no_align_result_leaves_flags_none(self): + ann = _make_allele_annotation(10, 11, AnnotationLayer.GENOMIC) + _stamp_alignment_locus_flags([ann], None, AnnotationLayer.GENOMIC) + assert ann.at_mismatched_locus is None + assert ann.near_gap is None + + +# --------------------------------------------------------------------------- +# _align_result_for_target +# --------------------------------------------------------------------------- + + +class TestAlignResultForTarget: + def test_sequence_based_found_by_gene_name(self): + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={"GENE1": _make_seq_target("GENE1")}, + ) + align = _make_align() + result = _align_result_for_target("GENE1", metadata, {"GENE1": align}) + assert result is align + + def test_accession_based_found_by_accession(self): + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={"GENE1": _make_acc_target("NM_000001.1", "GENE1")}, + ) + align = _make_align() + # align_results keyed by accession (as produced by parse_cdot_mapping) + result = _align_result_for_target("GENE1", metadata, {"NM_000001.1": align}) + assert result is align + + def test_returns_none_when_not_found(self): + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={"GENE1": _make_seq_target("GENE1")}, + ) + result = _align_result_for_target("GENE1", metadata, {}) + assert result is None + + +# --------------------------------------------------------------------------- +# accession-based tool_parameters: cdot_data_version propagation +# --------------------------------------------------------------------------- + + +class TestCdotDataVersionPropagation: + def test_cdot_data_version_propagates_when_present(self): + """tool_parameters.cdot_data_version is emitted even when explicitly None.""" + metadata = ScoresetMetadata( + urn="urn:mavedb:00000001-a-1", + target_genes={"GENE1": _make_acc_target("NM_000001.1", "GENE1")}, + ) + ann = _make_annotation(AnnotationLayer.CDNA) + + align_with_version = AlignmentResult( + chrom="chr1", + query_range=SequenceRange(start=0, end=100), + query_subranges=[SequenceRange(start=0, end=100)], + hit_range=SequenceRange(start=1000, end=1100), + hit_subranges=[SequenceRange(start=1000, end=1100)], + aligner_parameters={ + "aligner": "cdot_transcript_placement", + "cdot_data_version": "0.2.26", + }, + ) + + with ( + patch("dcd_mapping.annotate.get_vrs_id_from_identifier", return_value=None), + patch( + "dcd_mapping.annotate._pick_preferred_genomic_or_protein_layer", + return_value=AnnotationLayer.CDNA, + ), + patch( + "dcd_mapping.annotate._get_computed_reference_sequence", + return_value=None, + ), + patch( + "dcd_mapping.annotate._get_mapped_reference_sequence", return_value=None + ), + ): + result = build_scoreset_mapping( + metadata=metadata, + raw_metadata={}, + mappings={"GENE1": [ann]}, + align_results={"NM_000001.1": align_with_version}, + tx_output={"GENE1": _make_tx()}, + gene_info={"GENE1": None}, + preferred_layer_only=False, + vrs_version=VrsVersion.V_2, + ) + + tm = result.target_mappings[0] + assert tm.tool_parameters is not None + assert "cdot_data_version" in tm.tool_parameters + assert tm.tool_parameters["cdot_data_version"] == "0.2.26" diff --git a/tests/test_transcript.py b/tests/test_transcript.py index 9219e39..bd8548a 100644 --- a/tests/test_transcript.py +++ b/tests/test_transcript.py @@ -309,7 +309,7 @@ def fake_get_sequence(ac): chrom="chr1", strand=1, coverage=None, - ident_pct=None, + percent_identity=None, query_range=SequenceRange(start=1, end=6), query_subranges=[SequenceRange(start=1, end=6)], hit_range=SequenceRange(start=1, end=6), diff --git a/tests/test_vrs_map.py b/tests/test_vrs_map.py index 20b0244..a52b5ca 100644 --- a/tests/test_vrs_map.py +++ b/tests/test_vrs_map.py @@ -29,7 +29,7 @@ def _assert_correct_vrs_map( expected_mappings_data: dict[tuple[str, AnnotationLayer], dict], ): """Note that we're testing against VRS 1.3 VA IDs (temporary?).""" - key = (mapping.accession_id, mapping.annotation_layer) + key = (mapping.accession_id, mapping.alignment_level) assert ( key in expected_mappings_data ), "Score row/layer combination is not in expected mappings" @@ -116,7 +116,7 @@ def test_2_a_2( align_result=align_result[target_gene], records=records[target_gene], transcript=tx_result[target_gene], - ) + ).mappings assert mappings["hYAP65 WW domain"] is not None assert len(mappings["hYAP65 WW domain"]) == 1 @@ -178,7 +178,7 @@ def test_41_a_1( align_result=align_result[target_gene], records=records[target_gene], transcript=tx_result[target_gene], - ) + ).mappings assert mappings["Src catalytic domain"] is not None assert len(mappings["Src catalytic domain"]) == 5 @@ -251,7 +251,7 @@ def test_99_a_1( align_result=align_result[target_gene], records=records[target_gene], transcript=tx_result[target_gene], - ) + ).mappings assert mappings["RHO"] is not None assert len(mappings["RHO"]) == 8 # includes protein and genomic for all 4 rows @@ -309,7 +309,7 @@ def test_103_c_1( align_result=align_result[target_gene], records=records[target_gene], transcript=tx_result[target_gene], - ) + ).mappings assert mappings["MAPK1"] is not None assert len(mappings["MAPK1"]) == 4 for m in mappings["MAPK1"]: @@ -378,7 +378,7 @@ def test_1_b_2( align_result=align_result[target_gene], records=records[target_gene], transcript=tx_result[target_gene], - ) + ).mappings assert mappings["SUMO1"] is not None assert len(mappings["SUMO1"]) == 8 for m in mappings["SUMO1"]: From e017c701f18375ca2eab4f8373f61fd577b092f6 Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Tue, 28 Apr 2026 16:33:06 -0700 Subject: [PATCH 2/6] fix(align): handle protein-vs-DNA and malformed PSL blocks in _build_alignment_qc - protein-vs-DNA (-q=prot -t=dnax) BLAT runs store target coords in nucleotide space and query coords in amino-acid space (3:1 ratio); minus-strand target blocks have ts > te, making seq[ts:te] return "". Comparison was crashing with ValueError from zip(strict=True); the per-base mismatch loop is now skipped entirely for this mode, setting mismatch_positions_unavailable=True so at_mismatched_locus is correctly left as None (not evaluated) rather than a false False. The preferred layer for protein scoresets is PROTEIN, flagged from the downstream protein-to-protein alignment, so no signal is lost. - For nucleotide-vs-nucleotide runs, replace the bare zip(strict=True) with an explicit length-mismatch guard that logs a WARNING and falls through to zip(strict=False), preserving all mismatches in the overlapping prefix rather than crashing or discarding the block. --- src/dcd_mapping/align.py | 59 ++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index de547af..14dd060 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -495,7 +495,9 @@ def _cigar_from_coords(aln: "BioAlign.Alignment") -> str | None: return "".join(parts) if parts else None -def _build_alignment_qc(aln: "BioAlign.Alignment") -> "AlignmentQc": +def _build_alignment_qc( + aln: "BioAlign.Alignment", protein_vs_dna: bool = False +) -> "AlignmentQc": """Derive the per-(target, alignment_level) QC block from a Bio.Align ``Alignment`` parsed out of BLAT's pslx output. @@ -512,6 +514,10 @@ def _build_alignment_qc(aln: "BioAlign.Alignment") -> "AlignmentQc": must treat ``at_mismatched_locus`` as ``None`` (not evaluated) in that situation to avoid a silent disagreement where ``mismatch_count > 0`` but no variant is ever flagged. + + :param protein_vs_dna: set ``True`` for ``-q=prot -t=dnax`` BLAT runs. + Skips per-base mismatch detection (target/query coordinates are in + incompatible spaces) and sets ``mismatch_positions_unavailable=True``. """ counts = aln.counts() coords = aln.coordinates @@ -519,15 +525,23 @@ def _build_alignment_qc(aln: "BioAlign.Alignment") -> "AlignmentQc": gap_intervals: list[tuple[int, int]] = [] query_insert_blocks = 0 - # Per-base mismatch detection requires concrete sequence content for both - # target and query. Biopython's pslx parser sometimes leaves bases outside - # aligned blocks (or even within, depending on BLAT output) as undefined - # placeholders. The aggregate mismatch *count* still comes from - # ``aln.counts()`` below; we treat per-position recording as best-effort - # and set mismatch_positions_unavailable=True if the sequence content isn't - # materialized so callers know at_mismatched_locus is unreliable. - mismatch_positions_unavailable = False - sequences_available = True + # Per-base mismatch detection requires sequence content from pslx tseqs/qseqs. + # BioPython sometimes leaves these as undefined placeholders; aggregate + # mismatch *count* from counts() is still accurate in that case. We treat + # per-position recording as best-effort and set mismatch_positions_unavailable + # so callers leave at_mismatched_locus as None rather than a misleading False. + # + # For -q=prot -t=dnax runs, target coordinates are in nucleotide space and + # query coordinates are in amino-acid space (3:1 ratio). Minus-strand target + # blocks have ts > te, making aln.target.seq[ts:te] return "". Per-base + # comparison is impossible, so we skip it entirely (sequences_available=False). + # This is fine: for protein scoresets the preferred layer is PROTEIN, flagged + # from the protein-to-protein alignment in vrs_map.py — not this genomic one. + # The gap_intervals walk below still runs; its output is stored in AlignmentQc + # but not consumed for near_gap in the typical protein-only case (no GENOMIC + # layer variants are emitted when hgvs_nt is absent). + mismatch_positions_unavailable = protein_vs_dna + sequences_available = not protein_vs_dna for i in range(coords.shape[1] - 1): ts, te = int(coords[0][i]), int(coords[0][i + 1]) qs, qe = int(coords[1][i]), int(coords[1][i + 1]) @@ -570,7 +584,27 @@ def _build_alignment_qc(aln: "BioAlign.Alignment") -> "AlignmentQc": sequences_available = False continue - for j, (tb, qb) in enumerate(zip(t_bases, q_bases, strict=True)): + # PSL should guarantee equal-length aligned blocks for + # nucleotide-vs-nucleotide runs (protein-vs-DNA is excluded + # above via sequences_available=False). This branch is a + # defensive fallback for malformed BLAT output. Compare the + # overlapping prefix rather than discarding the entire block. + if len(t_bases) != len(q_bases): + _logger.warning( + "Aligned-block sequence slices have unequal lengths " + "(%d vs %d) for target block [%d:%d] query block [%d:%d]. " + "Comparing the overlapping prefix; %d base(s) at the tail " + "of the longer slice will not be assessed for mismatches.", + len(t_bases), + len(q_bases), + ts, + te, + qs, + qe, + abs(len(t_bases) - len(q_bases)), + ) + + for j, (tb, qb) in enumerate(zip(t_bases, q_bases, strict=False)): if qb.upper() != tb.upper(): mismatch_positions.append(ts + j) @@ -906,7 +940,8 @@ def _get_best_match( hit_subranges.append(SequenceRange(start=ts, end=te)) query_subranges.append(SequenceRange(start=min(qs, qe), end=max(qs, qe))) - alignment_qc = _build_alignment_qc(best_aln) + protein_vs_dna = "-q=prot" in blat_params.get("target_args", "") + alignment_qc = _build_alignment_qc(best_aln, protein_vs_dna=protein_vs_dna) return AlignmentResult( aligner_parameters=blat_params, From b449001a90b70db3f56a0c4a8b027d80684acb60 Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Wed, 29 Apr 2026 10:43:14 -0700 Subject: [PATCH 3/6] fix(annotate): ensure JSON output is expanded by serializing without JSON mode --- src/api/routers/map.py | 2 +- src/dcd_mapping/annotate.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index cc0149b..495babd 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -248,4 +248,4 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse ).model_dump(exclude_none=True) ) - return JSONResponse(content=output.model_dump(mode="json", exclude_none=True)) + return JSONResponse(content=output.model_dump(exclude_none=True)) diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 2f6c291..72296d9 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -1107,9 +1107,7 @@ def write_scoreset_mapping_to_json( _logger.info("Saving mapping output to %s", output_path) with output_path.open("w") as file: json.dump( - scoreset_mapping.model_dump( - mode="json", exclude_unset=False, exclude_none=True - ), + scoreset_mapping.model_dump(exclude_none=True), file, indent=4, ) From 736db82223ac80d57842a072a74837953dc7a167 Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Wed, 29 Apr 2026 11:54:03 -0700 Subject: [PATCH 4/6] feat(lookup): enhance gene symbol retrieval by tokenizing target names for better matching This is mostly useful in multi-word target names where gene information is available but not in the first word of the target name. --- src/dcd_mapping/lookup.py | 46 ++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py index f763b16..6b60cc6 100644 --- a/src/dcd_mapping/lookup.py +++ b/src/dcd_mapping/lookup.py @@ -368,10 +368,11 @@ def _get_hgnc_symbol(term: str) -> str | None: def get_gene_symbol(target_gene: TargetGene) -> str | None: """Acquire HGNC gene symbol given provided target gene metadata from MaveDB. - Right now, we use two sources for normalizing: - 1. UniProt ID, if available - 2. Target name: specifically, we try the first word in the name (this could - cause some problems and we should double-check it) + Tokenizes the target name on whitespace and tries each token against the + gene normalizer until one matches (gene symbols are not always the first + token, e.g. ``"Wildtype G6PD"``). Silently returns ``None`` if no token + resolves — see ``_get_normalized_gene_response`` for the full description + of this limitation. :param target_gene: target gene metadata given by MaveDB API :return: gene symbol if available @@ -381,10 +382,12 @@ def get_gene_symbol(target_gene: TargetGene) -> str | None: if result: return result - # try taking the first word in the target name if target_gene.target_gene_name: - parsed_name = target_gene.target_gene_name.split(" ")[0] - return _get_hgnc_symbol(parsed_name) + for word in target_gene.target_gene_name.split(" "): + result = _get_hgnc_symbol(word) + if result: + return result + return None @@ -406,7 +409,17 @@ def _get_normalized_gene_response( ) -> Gene | None: """Fetch best normalized concept given available scoreset metadata. - :param metadata: salient scoreset metadata items + **Limitation — heuristic name parsing**: when the target name is not itself + a valid HGNC symbol this function tokenizes it on whitespace and tries each + token in order (e.g. ``"Wildtype G6PD"`` resolves because ``"G6PD"`` is the + second token). This will silently fail for names whose tokens are all + non-HGNC strings (e.g. ``"my favourite protein"``). When it fails, + downstream chromosome selection has no anchor and BLAT's chromosome fallback + may land on the wrong chromosome, causing transcript selection to return no + results. The only reliable fix is to ensure the target name contains the + HGNC gene symbol as one of its whitespace-delimited tokens. + + :param target_gene: salient scoreset metadata items :return: Normalized gene if available """ if target_gene.target_uniprot_ref: @@ -414,12 +427,13 @@ def _get_normalized_gene_response( if gene_descriptor: return gene_descriptor - # try taking the first word in the target name + # Try each whitespace-delimited token from the target name. Gene symbols + # are not always the first word (e.g. "Wildtype G6PD"). if target_gene.target_gene_name: - parsed_name = target_gene.target_gene_name.split(" ")[0] - gene_descriptor = _normalize_gene(parsed_name) - if gene_descriptor: - return gene_descriptor + for word in target_gene.target_gene_name.split(" "): + gene_descriptor = _normalize_gene(word) + if gene_descriptor: + return gene_descriptor return None @@ -453,10 +467,8 @@ def get_gene_location(target_gene: TargetGene) -> GeneLocation | None: """Acquire gene location data from gene normalizer using metadata provided by scoreset. - As with ``get_gene_symbol()``, we try to normalize from the following: - 1. UniProt ID, if available - 2. Target name: specifically, we try the first word in the name (this could - cause some problems and we should double-check it) + Delegates to ``_get_normalized_gene_response`` — see that function for the + full description of the heuristic and its limitations. :param target_gene: data given by MaveDB API :return: gene location data if available From ef26cbd6eb1c3478637f87b727b060b677b47f5c Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Wed, 29 Apr 2026 15:25:29 -0700 Subject: [PATCH 5/6] fix(mapping): update aligner type to reference_accession_passthrough and adjust related annotations Co-authored-by: Copilot --- src/dcd_mapping/annotate.py | 17 ++-------------- src/dcd_mapping/schemas.py | 28 ++++++++------------------- tests/test_annotate_target_mapping.py | 4 ++-- 3 files changed, 12 insertions(+), 37 deletions(-) diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 72296d9..38e2bd5 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -1208,7 +1208,6 @@ def _build_target_mapping( vrs_version: VrsVersion, annotations: list[ScoreAnnotation], protein_align_result: AlignmentResult | None = None, - null_failure_count: int = 0, near_gap_window: int = NEAR_GAP_WINDOW, ) -> TargetMapping: """Assemble one target_mappings[] row for a (target, alignment_level) pair. @@ -1257,7 +1256,7 @@ def _build_target_mapping( # Direct chromosome/contig accession: the coordinate frame is defined by # the accession itself — no placement tool was invoked. tool_parameters = { - "aligner": "direct_contig_accession", + "aligner": "reference_accession_passthrough", } elif accession_id is not None: # Transcript accession (NM_/ENST): cdot was used for transcript placement. @@ -1308,12 +1307,7 @@ def _build_target_mapping( total = len(annotations) failed = sum(1 for a in annotations if a.post_mapped is None) - warnings = sum( - 1 - for a in annotations - if a.post_mapped is not None and a.error_message is not None - ) - clean = total - failed - warnings + clean = total - failed # near_gap is always evaluable (derived from gap positions in the alignment, # not per-base sequence content), so count it unconditionally. @@ -1352,12 +1346,8 @@ def _build_target_mapping( alignment_metadata=alignment_metadata, total_variants=total, variants_mapped_cleanly=clean, - variants_with_mapping_warnings=warnings, variants_with_alignment_warnings=alignment_warnings, variants_failed=failed, - variants_failed_pre_layer_selection=null_failure_count - if null_failure_count - else None, ) @@ -1556,9 +1546,6 @@ def build_scoreset_mapping( vrs_version=vrs_version, annotations=annotations_for_tm, protein_align_result=protein_align_for_target, - null_failure_count=len(null_failures) - if layer == preferred_layer_for_target - else 0, near_gap_window=near_gap_window, ) ) diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index b0e6889..fd92aed 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -373,10 +373,10 @@ class TargetMapping(BaseModel): "target_args": "-q=prot -t=prot", } - Direct contig accession (accession-based):: + Passthrough for fully qualified reference accessions (accession-based):: { - "aligner": "direct_contig_accession", + "aligner": "reference_accession_passthrough", } cdot transcript placement (accession-based):: @@ -433,31 +433,19 @@ class TargetMapping(BaseModel): # this target/level. True otherwise. alignment_metadata: dict[str, Any] | None = None - # Annotation QC -- totals for this target x level. The two warning columns - # count disjoint things: - # * variants_with_mapping_warnings: the mapper itself attached an - # ``error_message`` despite producing a ``post_mapped`` value (e.g. - # fell back to RLE, ambiguous reference allele). - # * variants_with_alignment_warnings: the variant's reference position - # fell on a mismatched locus or near a gap in the underlying alignment - # (``at_mismatched_locus`` or ``near_gap`` was True). Mapping itself - # succeeded cleanly; this is purely about alignment context. - # A single variant can be in neither, either, or both. + # Annotation QC -- totals for this target x level. + # variants_with_alignment_warnings: the variant's reference position fell on + # a mismatched locus or near a gap in the underlying alignment + # (``at_mismatched_locus`` or ``near_gap`` was True). Mapping itself + # succeeded cleanly; this is purely about alignment context. total_variants: int | None = None - # Count of variants where post_mapped is not None AND error_message is None. + # Count of variants where post_mapped is not None. # Alignment warnings (at_mismatched_locus, near_gap) are counted separately in # variants_with_alignment_warnings and do NOT reduce this count; a variant can # be "cleanly mapped" and still sit at a mismatched locus or near a gap. variants_mapped_cleanly: int | None = None - variants_with_mapping_warnings: int | None = None variants_with_alignment_warnings: int | None = None variants_failed: int | None = None - # Count of variants that failed before any annotation layer was determined - # (annotation_layer=None, pre_mapped=None in the VRS output). These are - # re-attributed to the preferred layer's variants_failed total so the join - # invariant holds, but keeping them separate lets consumers see a clean - # per-layer failure count on all other (non-preferred) rows. - variants_failed_pre_layer_selection: int | None = None class VrsMapResult(NamedTuple): diff --git a/tests/test_annotate_target_mapping.py b/tests/test_annotate_target_mapping.py index 744b225..2f66f13 100644 --- a/tests/test_annotate_target_mapping.py +++ b/tests/test_annotate_target_mapping.py @@ -425,7 +425,7 @@ def test_tool_parameters_for_accession_based_target(self): assert "cdot_url" in tm.tool_parameters def test_tool_parameters_for_nc_accession_target(self): - """NC_ (chromosome/contig) targets should use 'direct_contig_accession', not cdot.""" + """NC_ (chromosome/contig) targets should use 'reference_accession_passthrough', not cdot.""" metadata = ScoresetMetadata( urn="urn:mavedb:00000001-a-1", target_genes={"GENE1": _make_acc_target("NC_000001.11", "GENE1")}, @@ -460,7 +460,7 @@ def test_tool_parameters_for_nc_accession_target(self): tm = result.target_mappings[0] assert tm.tool_parameters is not None - assert tm.tool_parameters.get("aligner") == "direct_contig_accession" + assert tm.tool_parameters.get("aligner") == "reference_accession_passthrough" # Must not bleed cdot fields into a non-cdot path assert "cdot_data_version" not in tm.tool_parameters assert "cdot_url" not in tm.tool_parameters From 6cc9b664ca89bf3c04cc6425e8d26e60348db89f Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Thu, 30 Apr 2026 10:34:10 -0700 Subject: [PATCH 6/6] chore(schema): Update schema to reflect updated output --- schema.json | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/schema.json b/schema.json index 708e4d1..32a9bc0 100644 --- a/schema.json +++ b/schema.json @@ -564,7 +564,7 @@ "type": "object" }, "TargetMapping": { - "description": "Per-(target, alignment_level) provenance and QC block.\n\nField names mirror the corresponding columns on `target_gene_mappings` in the\nMaveDB API so the API worker can deserialize directly with minimal transformation.\nAny aligner-specific structured details go in ``tool_parameters`` /\n``alignment_metadata`` (JSONB on the API side).\n\n``reference_assembly`` is a top-level column (not nested in ``tool_parameters``)\nbecause it describes the coordinate frame of the mapping result, not aligner\nconfiguration. It is ``None`` for alignments with no genomic frame (e.g.\nprotein-vs-protein).\n\n``tool_parameters`` shape per aligner\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nBLAT genomic (sequence-based, nucleotide or protein-vs-genome)::\n\n {\n \"aligner\": \"blat\",\n \"aligner_version\": \"\",\n \"min_score\": ,\n \"out_format\": \"\",\n \"target_args\": \"\",\n }\n\nBLAT protein-vs-protein (sequence-based, protein annotation layer)::\n\n {\n \"aligner\": \"blat\",\n \"aligner_version\": \"\",\n \"min_score\": ,\n \"out_format\": \"\",\n \"target_args\": \"-q=prot -t=prot\",\n }\n\ncdot transcript placement (accession-based)::\n\n {\n \"aligner\": \"cdot_transcript_placement\",\n \"aligner_version\": \"\",\n \"cdot_url\": \"\",\n \"cdot_data_version\": \"\",\n }\n\n``cdot_data_version`` is ``null`` when the cdot REST response did not\ninclude a ``cdot_data_version`` field; its presence (even as ``null``)\ndistinguishes \"cdot run, version unknown\" from \"not a cdot run\".", + "description": "Per-(target, alignment_level) provenance and QC block.\n\nField names mirror the corresponding columns on `target_gene_mappings` in the\nMaveDB API so the API worker can deserialize directly with minimal transformation.\nAny aligner-specific structured details go in ``tool_parameters`` /\n``alignment_metadata`` (JSONB on the API side).\n\n``reference_assembly`` is a top-level column (not nested in ``tool_parameters``)\nbecause it describes the coordinate frame of the mapping result, not aligner\nconfiguration. It is ``None`` for alignments with no genomic frame (e.g.\nprotein-vs-protein).\n\n``tool_parameters`` shape per aligner\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nBLAT genomic (sequence-based, nucleotide or protein-vs-genome)::\n\n {\n \"aligner\": \"blat\",\n \"aligner_version\": \"\",\n \"min_score\": ,\n \"out_format\": \"\",\n \"target_args\": \"\",\n }\n\nBLAT protein-vs-protein (sequence-based, protein annotation layer)::\n\n {\n \"aligner\": \"blat\",\n \"aligner_version\": \"\",\n \"min_score\": ,\n \"out_format\": \"\",\n \"target_args\": \"-q=prot -t=prot\",\n }\n\nPassthrough for fully qualified reference accessions (accession-based)::\n\n {\n \"aligner\": \"reference_accession_passthrough\",\n }\n\ncdot transcript placement (accession-based)::\n\n {\n \"aligner\": \"cdot_transcript_placement\",\n \"aligner_version\": \"\",\n \"cdot_url\": \"\",\n \"cdot_data_version\": \"\",\n }\n\n``cdot_data_version`` is ``null`` when the cdot REST response did not\ninclude a ``cdot_data_version`` field; its presence (even as ``null``)\ndistinguishes \"cdot run, version unknown\" from \"not a cdot run\".", "properties": { "alignment_length": { "anyOf": [ @@ -758,18 +758,6 @@ "default": null, "title": "Variants Failed" }, - "variants_failed_pre_layer_selection": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Variants Failed Pre Layer Selection" - }, "variants_mapped_cleanly": { "anyOf": [ { @@ -794,18 +782,6 @@ "default": null, "title": "Variants With Alignment Warnings" }, - "variants_with_mapping_warnings": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Variants With Mapping Warnings" - }, "vrs_version": { "anyOf": [ {