openvar / variantValidator

Public repository for VariantValidator project
GNU Affero General Public License v3.0
70 stars 21 forks source link

New MANE transcript for ALMS1 gene causing issues #380

Open Peter-J-Freeman opened 2 years ago

Peter-J-Freeman commented 2 years ago

Describe the bug: When looking at variant 1 of the VariantValidator truth set, I tried updating to a MANE variant.

I got a strange delins in the MANE transcripts, which is a simple substitution in the former RefSeq Select transcript.

To Reproduce

>>> import json
>>> import VariantValidator
>>> vval = VariantValidator.Validator()
>>> variant = 'NC_000002.12:g.73385903T>C'
>>> genome_build = 'GRCh38'
>>> select_transcripts = 'all'
>>> validate = vval.validate(variant, genome_build, select_transcripts)
/Users/pjf9/anaconda3/envs/vvenv/lib/python3.6/site-packages/Bio/Seq.py:2715: BiopythonWarning: Partial codon, len(sequence) not a multiple of three. Explicitly trim the sequence or add trailing N before translation. This may become an error in future.
  BiopythonWarning)
>>> validation = validate.format_as_dict(with_meta=True)
>>> print(json.dumps(validation, sort_keys=True, indent=4, separators=(',', ': ')))
{
    "NM_001378454.1:c.35T>C": {  # As expected
        "alt_genomic_loci": [],
        "annotations": {
            "chromosome": "2",
            "db_xref": {
                "CCDS": null,
                "ensemblgene": null,
                "hgnc": "HGNC:428",
                "ncbigene": "7840",
                "select": "MANE"
            },
            "ensembl_select": false,
            "mane_plus_clinical": false,
            "mane_select": true,
            "map": "2p13.1",
            "note": "ALMS1 centrosome and basal body associated protein",
            "refseq_select": true,
            "variant": "2"
        },
        "gene_ids": {
            "ccds_ids": [
                "CCDS42697"
            ],
            "ensembl_gene_id": "ENSG00000116127",
            "entrez_gene_id": "7840",
            "hgnc_id": "HGNC:428",
            "omim_id": [
                "606844"
            ],
            "ucsc_id": "uc032nrd.1"
        },
        "gene_symbol": "ALMS1",
        "genome_context_intronic_sequence": "",
        "hgvs_lrg_transcript_variant": "",
        "hgvs_lrg_variant": "",
        "hgvs_predicted_protein_consequence": {
            "lrg_slr": "",
            "lrg_tlr": "",
            "slr": "NP_001365383.1:p.(L12P)",
            "tlr": "NP_001365383.1:p.(Leu12Pro)"
        },
        "hgvs_refseqgene_variant": "",
        "hgvs_transcript_variant": "NM_001378454.1:c.35T>C",
        "primary_assembly_loci": {
            "grch37": {
                "hgvs_genomic_description": "NC_000002.11:g.73613031T>C",
                "vcf": {
                    "alt": "C",
                    "chr": "2",
                    "pos": "73613031",
                    "ref": "T"
                }
            },
            "grch38": {
                "hgvs_genomic_description": "NC_000002.12:g.73385903T>C",
                "vcf": {
                    "alt": "C",
                    "chr": "2",
                    "pos": "73385903",
                    "ref": "T"
                }
            },
            "hg19": {
                "hgvs_genomic_description": "NC_000002.11:g.73613031T>C",
                "vcf": {
                    "alt": "C",
                    "chr": "chr2",
                    "pos": "73613031",
                    "ref": "T"
                }
            },
            "hg38": {
                "hgvs_genomic_description": "NC_000002.12:g.73385903T>C",
                "vcf": {
                    "alt": "C",
                    "chr": "chr2",
                    "pos": "73385903",
                    "ref": "T"
                }
            }
        },
        "reference_sequence_records": {
            "protein": "https://www.ncbi.nlm.nih.gov/nuccore/NP_001365383.1",
            "transcript": "https://www.ncbi.nlm.nih.gov/nuccore/NM_001378454.1"
        },
        "refseqgene_context_intronic_sequence": "",
        "selected_assembly": "GRCh38",
        "submitted_variant": "NC_000002.12:g.73385903T>C",
        "transcript_description": "Homo sapiens ALMS1 centrosome and basal body associated protein (ALMS1), transcript variant 2, mRNA",
        "validation_warnings": [
            "Removing redundant reference bases from variant description",
            "The displayed variants may be artefacts of aligning NM_015120.4 with genome build GRCh38: NM_015120.4:c.35_41 contains 3 transcript base(s) that fail to align to chromosome NC_000002.12: : Caution should be used when reporting the displayed variant descriptions: If you are unsure, please contact admin",
            "RefSeqGene record not available"
        ],
        "variant_exonic_positions": {
            "NC_000002.11": {
                "end_exon": "1",
                "start_exon": "1"
            },
            "NC_000002.12": {
                "end_exon": "1",
                "start_exon": "1"
            }
        }
    },
    "NM_015120.4:c.35_41delinsC": { # Weird delins
        "alt_genomic_loci": [],
        "annotations": {
            "chromosome": "2",
            "db_xref": {
                "CCDS": "CCDS42697.1",
                "ensemblgene": null,
                "hgnc": "HGNC:428",
                "ncbigene": "7840",
                "select": false
            },
            "ensembl_select": false,
            "mane_plus_clinical": false,
            "mane_select": false,
            "map": "2p13.1",
            "note": "ALMS1 centrosome and basal body associated protein",
            "refseq_select": false,
            "variant": "1"
        },
        "gene_ids": {
            "ccds_ids": [
                "CCDS42697"
            ],
            "ensembl_gene_id": "ENSG00000116127",
            "entrez_gene_id": "7840",
            "hgnc_id": "HGNC:428",
            "omim_id": [
                "606844"
            ],
            "ucsc_id": "uc032nrd.1"
        },
        "gene_symbol": "ALMS1",
        "genome_context_intronic_sequence": "",
        "hgvs_lrg_transcript_variant": "LRG_741t1:c.35_41delinsC",
        "hgvs_lrg_variant": "LRG_741:g.5146_5152delinsC",
        "hgvs_predicted_protein_consequence": {
            "lrg_slr": "LRG_741p1:p.(L12_E14delinsP)",
            "lrg_tlr": "LRG_741p1:p.(Leu12_Glu14delinsPro)",
            "slr": "NP_055935.4:p.(L12_E14delinsP)",
            "tlr": "NP_055935.4:p.(Leu12_Glu14delinsPro)"
        },
        "hgvs_refseqgene_variant": "NG_011690.1:g.5146_5152delinsC",
        "hgvs_transcript_variant": "NM_015120.4:c.35_41delinsC",
        "primary_assembly_loci": {
            "grch37": {
                "hgvs_genomic_description": "NC_000002.11:g.73613031_73613034delinsC",
                "vcf": {
                    "alt": "C",
                    "chr": "2",
                    "pos": "73613031",
                    "ref": "TGGA"
                }
            },
            "grch38": {
                "hgvs_genomic_description": "NC_000002.12:g.73385903_73385906delinsC",
                "vcf": {
                    "alt": "C",
                    "chr": "2",
                    "pos": "73385903",
                    "ref": "TGGA"
                }
            },
            "hg19": {
                "hgvs_genomic_description": "NC_000002.11:g.73613031_73613034delinsC",
                "vcf": {
                    "alt": "C",
                    "chr": "chr2",
                    "pos": "73613031",
                    "ref": "TGGA"
                }
            },
            "hg38": {
                "hgvs_genomic_description": "NC_000002.12:g.73385903_73385906delinsC",
                "vcf": {
                    "alt": "C",
                    "chr": "chr2",
                    "pos": "73385903",
                    "ref": "TGGA"
                }
            }
        },
        "reference_sequence_records": {
            "lrg": "http://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_741.xml",
            "protein": "https://www.ncbi.nlm.nih.gov/nuccore/NP_055935.4",
            "refseqgene": "https://www.ncbi.nlm.nih.gov/nuccore/NG_011690.1",
            "transcript": "https://www.ncbi.nlm.nih.gov/nuccore/NM_015120.4"
        },
        "refseqgene_context_intronic_sequence": "",
        "selected_assembly": "GRCh38",
        "submitted_variant": "NC_000002.12:g.73385903T>C",
        "transcript_description": "Homo sapiens ALMS1 centrosome and basal body associated protein (ALMS1), transcript variant 1, mRNA",
        "validation_warnings": [
            "Removing redundant reference bases from variant description",
            "The displayed variants may be artefacts of aligning NM_015120.4 with genome build GRCh38: NM_015120.4:c.35_41 contains 3 transcript base(s) that fail to align to chromosome NC_000002.12: : Caution should be used when reporting the displayed variant descriptions: If you are unsure, please contact admin",
            "RefSeqGene record not available"
        ],
        "variant_exonic_positions": {
            "NC_000002.11": {
                "end_exon": "1",
                "start_exon": "1"
            },
            "NC_000002.12": {
                "end_exon": "1",
                "start_exon": "1"
            },
            "NG_011690.1": {
                "end_exon": "1",
                "start_exon": "1"
            }
        }
    },
    "flag": "gene_variant",
    "metadata": {
        "variantvalidator_hgvs_version": "2.0.1.dev2+g58fc52a",
        "variantvalidator_version": "1.0.5.dev272+gd30cbb9",
        "vvdb_version": "vvdb_2022_04",
        "vvseqrepo_db": "VV_SR_2022_02/master",
        "vvta_version": "vvta_2022_02"
    }
}
>>> 

Expected behavior: To be confirmed.

Peter-J-Freeman commented 2 years ago

Seems OK when 'mane' is selected.

>>> import json
>>> import VariantValidator
>>> vval = VariantValidator.Validator()
>>> variant = 'NC_000002.12:g.73385903T>C'
>>> genome_build = 'GRCh38'
>>> select_transcripts = 'mane'
>>> validate = vval.validate(variant, genome_build, select_transcripts)
>>> validation = validate.format_as_dict(with_meta=True)
>>> print(json.dumps(validation, sort_keys=True, indent=4, separators=(',', ': ')))
{
    "NM_001378454.1:c.35T>C": { # Looks good
        "alt_genomic_loci": [],
        "annotations": {
            "chromosome": "2",
            "db_xref": {
                "CCDS": null,
                "ensemblgene": null,
                "hgnc": "HGNC:428",
                "ncbigene": "7840",
                "select": "MANE"
            },
            "ensembl_select": false,
            "mane_plus_clinical": false,
            "mane_select": true,
            "map": "2p13.1",
            "note": "ALMS1 centrosome and basal body associated protein",
            "refseq_select": true,
            "variant": "2"
        },
        "gene_ids": {
            "ccds_ids": [
                "CCDS42697"
            ],
            "ensembl_gene_id": "ENSG00000116127",
            "entrez_gene_id": "7840",
            "hgnc_id": "HGNC:428",
            "omim_id": [
                "606844"
            ],
            "ucsc_id": "uc032nrd.1"
        },
        "gene_symbol": "ALMS1",
        "genome_context_intronic_sequence": "",
        "hgvs_lrg_transcript_variant": "",
        "hgvs_lrg_variant": "",
        "hgvs_predicted_protein_consequence": {
            "lrg_slr": "",
            "lrg_tlr": "",
            "slr": "NP_001365383.1:p.(L12P)",
            "tlr": "NP_001365383.1:p.(Leu12Pro)"
        },
        "hgvs_refseqgene_variant": "",
        "hgvs_transcript_variant": "NM_001378454.1:c.35T>C",
        "primary_assembly_loci": {
            "grch37": {
                "hgvs_genomic_description": "NC_000002.11:g.73613031T>C",
                "vcf": {
                    "alt": "C",
                    "chr": "2",
                    "pos": "73613031",
                    "ref": "T"
                }
            },
            "grch38": {
                "hgvs_genomic_description": "NC_000002.12:g.73385903T>C",
                "vcf": {
                    "alt": "C",
                    "chr": "2",
                    "pos": "73385903",
                    "ref": "T"
                }
            },
            "hg19": {
                "hgvs_genomic_description": "NC_000002.11:g.73613031T>C",
                "vcf": {
                    "alt": "C",
                    "chr": "chr2",
                    "pos": "73613031",
                    "ref": "T"
                }
            },
            "hg38": {
                "hgvs_genomic_description": "NC_000002.12:g.73385903T>C",
                "vcf": {
                    "alt": "C",
                    "chr": "chr2",
                    "pos": "73385903",
                    "ref": "T"
                }
            }
        },
        "reference_sequence_records": {
            "protein": "https://www.ncbi.nlm.nih.gov/nuccore/NP_001365383.1",
            "transcript": "https://www.ncbi.nlm.nih.gov/nuccore/NM_001378454.1"
        },
        "refseqgene_context_intronic_sequence": "",
        "selected_assembly": "GRCh38",
        "submitted_variant": "NC_000002.12:g.73385903T>C",
        "transcript_description": "Homo sapiens ALMS1 centrosome and basal body associated protein (ALMS1), transcript variant 2, mRNA",
        "validation_warnings": [
            "RefSeqGene record not available"
        ],
        "variant_exonic_positions": {
            "NC_000002.11": {
                "end_exon": "1",
                "start_exon": "1"
            },
            "NC_000002.12": {
                "end_exon": "1",
                "start_exon": "1"
            }
        }
    },
    "flag": "gene_variant",
    "metadata": {
        "variantvalidator_hgvs_version": "2.0.1.dev2+g58fc52a",
        "variantvalidator_version": "1.0.5.dev272+gd30cbb9",
        "vvdb_version": "vvdb_2022_04",
        "vvseqrepo_db": "VV_SR_2022_02/master",
        "vvta_version": "vvta_2022_02"
    }
}
Peter-J-Freeman commented 2 years ago

Correction: the MANE transcript is OK; it is the other transcript (NM_015120.4) that gives the delins!