clingen-data-model / clinvar-ingest

Apache License 2.0
2 stars 0 forks source link

Results file includes an empty clinical assertion #101

Closed theferrit32 closed 9 months ago

theferrit32 commented 9 months ago

When the test data file is run through the workflow, the parse step outputs a clinical_assertion record whose fields are all null (or, for content, the string "null"), other than those added explicitly, like entity_type and release_date.

{
    "assertion_id": null,
    "title": null,
    "local_key": null,
    "assertion_accession": null,
    "version": null,
    "assertion_type": null,
    "date_created": null,
    "date_last_updated": null,
    "submitted_assembly": null,
    "record_status": null,
    "review_status": null,
    "interpretation_date_last_evaluated": null,
    "interpretation_description": null,
    "content": "null",
    "entity_type": "clinical_assertion",
    "release_date": "2023-10-07"
}

See:

gs://clinvar-ingest/executions/2023_10_07_2024_02_20T201140068660/clinvar_parsed/2023-10-07/clinical_assertion/clinical_assertion.ndjson

theferrit32 commented 9 months ago

It happens when the VCV has no interpretations.

For example, this record in ClinVar: https://www.ncbi.nlm.nih.gov/clinvar/variation/VCV001264328.1/?redir=vcv

2024-02-20 15:57:15 clinvar_ingest[79759] INFO VariationArchive.from_xml(inp={"@VariationID": "1264328", "@VariationName": "NM_000497.4(CYP11B1):c.1024C>T (p.Gln342Ter)", "@VariationType": "single nucleotide variant", "@Accession": "VCV001264328", "@Version": "1", "@RecordType": "included", "@NumberOfSubmissions": "0", "@NumberOfSubmitters": "0", "@DateLastUpdated": "2022-04-25", "@DateCreated": "2021-09-19", "RecordStatus": {"$": "current"}, "Species": {"$": "Homo sapiens"}, "IncludedRecord": {"SimpleAllele": {"@AlleleID": "1254229", "@VariationID": "1264328", "GeneList": {"Gene": [{"@Symbol": "CYP11B1", "@FullName": "cytochrome P450 family 11 subfamily B member 1", "@GeneID": "1584", "@HGNC_ID": "HGNC:2591", "@Source": "submitted", "@RelationshipType": "within multiple genes by overlap", "Location": {"CytogeneticLocation": {"$": "8q24.3"}, "SequenceLocation": [{"@Assembly": "GRCh38", "@AssemblyAccessionVersion": "GCF_000001405.38", "@AssemblyStatus": "current", "@Chr": "8", "@Accession": "NC_000008.11", "@start": "142872357", "@stop": "142879825", "@display_start": "142872357", "@display_stop": "142879825", "@Strand": "-"}, {"@Assembly": "GRCh37", "@AssemblyAccessionVersion": "GCF_000001405.25", "@AssemblyStatus": "previous", "@Chr": "8", "@Accession": "NC_000008.10", "@start": "143953772", "@stop": "143961235", "@display_start": "143953772", "@display_stop": "143961235", "@Strand": "-"}]}, "OMIM": {"$": "610613"}}, {"@Symbol": "LOC106799833", "@FullName": "CYP11B1 recombination region", "@GeneID": "106799833", "@Source": "calculated", "@RelationshipType": "within multiple genes by overlap", "Location": {"CytogeneticLocation": {"$": "8q24.3"}, "SequenceLocation": {"@Assembly": "GRCh38", "@AssemblyAccessionVersion": "GCF_000001405.38", "@AssemblyStatus": "current", "@Chr": "8", "@Accession": "NC_000008.11", "@start": "142874234", "@stop": "142879022", "@display_start": "142874234", "@display_stop": "142879022", "@Strand": "+"}}}]}, "Name": {"$": 
"NM_000497.4(CYP11B1):c.1024C>T (p.Gln342Ter)"}, "CanonicalSPDI": {"$": "NC_000008.11:142875808:G:A"}, "VariantType": {"$": "single nucleotide variant"}, "Location": {"CytogeneticLocation": {"$": "8q24.3"}, "SequenceLocation": [{"@Assembly": "GRCh38", "@AssemblyAccessionVersion": "GCF_000001405.38", "@forDisplay": "true", "@AssemblyStatus": "current", "@Chr": "8", "@Accession": "NC_000008.11", "@start": "142875809", "@stop": "142875809", "@display_start": "142875809", "@display_stop": "142875809", "@variantLength": "1", "@positionVCF": "142875809", "@referenceAlleleVCF": "G", "@alternateAlleleVCF": "A"}, {"@Assembly": "GRCh37", "@AssemblyAccessionVersion": "GCF_000001405.25", "@AssemblyStatus": "previous", "@Chr": "8", "@Accession": "NC_000008.10", "@start": "143957225", "@stop": "143957225", "@display_start": "143957225", "@display_stop": "143957225", "@variantLength": "1", "@positionVCF": "143957225", "@referenceAlleleVCF": "G", "@alternateAlleleVCF": "A"}]}, "ProteinChange": {"$": "Q342*"}, "HGVSlist": {"HGVS": [{"@Assembly": "GRCh37", "@Type": "genomic, top-level", "NucleotideExpression": {"@sequenceAccessionVersion": "NC_000008.10", "@sequenceAccession": "NC_000008", "@sequenceVersion": "10", "@change": "g.143957225G>A", "@Assembly": "GRCh37", "Expression": {"$": "NC_000008.10:g.143957225G>A"}}}, {"@Assembly": "GRCh38", "@Type": "genomic, top-level", "NucleotideExpression": {"@sequenceAccessionVersion": "NC_000008.11", "@sequenceAccession": "NC_000008", "@sequenceVersion": "11", "@change": "g.142875809G>A", "@Assembly": "GRCh38", "Expression": {"$": "NC_000008.11:g.142875809G>A"}}}, {"@Type": "genomic", "NucleotideExpression": {"@sequenceAccessionVersion": "NG_007954.1", "@sequenceAccession": "NG_007954", "@sequenceVersion": "1", "@change": "g.9012C>T", "Expression": {"$": "NG_007954.1:g.9012C>T"}}}, {"@Type": "genomic", "NucleotideExpression": {"@sequenceAccessionVersion": "NG_046132.1", "@sequenceAccession": "NG_046132", "@sequenceVersion": "1", "@change": 
"g.1676G>A", "Expression": {"$": "NG_046132.1:g.1676G>A"}}}, {"@Type": "coding", "NucleotideExpression": {"@sequenceAccessionVersion": "NM_000497.4", "@sequenceAccession": "NM_000497", "@sequenceVersion": "4", "@change": "c.1024C>T", "@MANESelect": "true", "Expression": {"$": "NM_000497.4:c.1024C>T"}}, "ProteinExpression": {"@sequenceAccessionVersion": "NP_000488.3", "@sequenceAccession": "NP_000488", "@sequenceVersion": "3", "@change": "p.Gln342Ter", "Expression": {"$": "NP_000488.3:p.Gln342Ter"}}, "MolecularConsequence": {"@ID": "SO:0001587", "@Type": "nonsense", "@DB": "SO"}}, {"@Type": "coding", "NucleotideExpression": {"@sequenceAccessionVersion": "NM_001026213.1", "@sequenceAccession": "NM_001026213", "@sequenceVersion": "1", "@change": "c.1024C>T", "Expression": {"$": "NM_001026213.1:c.1024C>T"}}, "ProteinExpression": {"@sequenceAccessionVersion": "NP_001021384.1", "@sequenceAccession": "NP_001021384", "@sequenceVersion": "1", "@change": "p.Gln342Ter", "Expression": {"$": "NP_001021384.1:p.Gln342Ter"}}, "MolecularConsequence": {"@ID": "SO:0001587", "@Type": "nonsense", "@DB": "SO"}}]}, "Interpretations": {"Interpretation": {"@NumberOfSubmissions": "0", "@NumberOfSubmitters": "0", "@Type": "Clinical significance", "Description": {"$": "no interpretation for the single variant"}}}, "XRefList": {"XRef": {"@Type": "Interpreted", "@ID": "1264342", "@DB": "ClinVar"}}}, "ReviewStatus": {"$": "no interpretation for the single variant"}, "Interpretations": {"Interpretation": {"@NumberOfSubmissions": "0", "@NumberOfSubmitters": "0", "@Type": "Clinical significance", "Description": {"$": "no interpretation for the single variant"}}}, "SubmittedInterpretationList": {"SCV": {"@Accession": "SCV001890911", "@Version": "1"}}, "InterpretedVariationList": {"InterpretedVariation": {"@VariationID": "1264342", "@Accession": "VCV001264342", "@Version": "1"}}}}, jsonify_content=True)

2024-02-20 15:57:15 clinvar_ingest[79759] INFO interpretation: {"@NumberOfSubmissions": "0", "@NumberOfSubmitters": "0", "@Type": "Clinical significance", "Description": {"$": "no interpretation for the single variant"}}

2024-02-20 15:57:15 clinvar_ingest[79759] INFO xml_clinical_assertions: []

theferrit32 commented 9 months ago

Issue here: https://github.com/clingen-data-model/clinvar-ingest/blob/e3c2af1456ba28872f924e03f5c24c745409c923/clinvar_ingest/model.py#L853-L863

The extract on line 857 returns None if there are no ClinicalAssertion entries where expected (in this case there are actually none), and ensure_list turns this into [None]. We can just make it an empty list instead.