clingen-data-model / genegraph

Presents an RDF triplestore of gene information using GraphQL APIs
5 stars 0 forks source link

Variation descriptor snapshot generation producing text variation instead of Allele #757

Closed toneillbroad closed 1 year ago

toneillbroad commented 1 year ago

The snapshot is producing a Text variation but the GRCh37/38 information is available as "candidate_expressions".

The two descriptors clearly come from different snapshot dates (2019-07-01 for this snapshot vs. 2022-06-26 for the gk-pilot snapshot).

{ "description": "NM_000152.4(GAA):c.716del (p.Leu239Argfs)", "subject_variation_descriptor": [], "type": "CanonicalVariationDescriptor", "xrefs": [ "https://www.ncbi.nlm.nih.gov/clinvar/556853", "https://identifiers.org/clinvar:556853" ], "alternate_labels": [], "canonical_variation": { "id": "ga4gh:VCC.1nBMtTE5PZK3YPxO-1ZFSN3-6YrgLpa-", "type": "CanonicalVariation", "complement": false, "variation": { "id": "ga4gh:VT.18o8JgGDMW_P8EJH67Fedwc_cFLKaxFl", "type": "Text", "definition": "clinvar:556853" } }, "record_metadata": { "type": "RecordMetadata", "is_version_of": "http://dataexchange.clinicalgenome.org/terms/VariationDescriptor_556853", "version": "2019-07-01" }, "extensions": [ { "type": "Extension", "name": "variation_type", "value": "Deletion" }, { "type": "Extension", "name": "entity_type", "value": "variation" }, { "type": "Extension", "name": "protein_change", "value": [] }, { "type": "Extension", "name": "clingen_version", "value": 0 }, { "type": "Extension", "name": "child_ids", "value": [] }, { "type": "Extension", "name": "allele_id", "value": "548398" }, { "type": "Extension", "name": "subclass_type", "value": "SimpleAllele" }, { "type": "Extension", "name": "clinvar_variation", "value": "https://identifiers.org/clinvar:556853" }, { "type": "Extension", "name": "descendant_ids", "value": [] }, { "type": "Extension", "name": "canonical_expression", "value": "clinvar:556853" }, { "type": "Extension", "name": "candidate_expressions", "value": [ { "expression": "NC_000017.11:g.80107580del", "label": "GRCh38" }, { "expression": "NC_000017.10:g.78081379del", "label": "GRCh37" }, { "expression": "clinvar:556853", "label": "Text" } ] } ], "label": "NM_000152.4(GAA):c.716del (p.Leu239Argfs)", "id": "http://dataexchange.clinicalgenome.org/terms/VariationDescriptor_556853.2019-07-01", "members": [ { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.g", "value": "LRG_673:g.11025del" } ] }, { "type": "VariationMember", 
"expressions": [ { "type": "Expression", "syntax": "hgvs.g", "value": "NG_009822.1:g.11025del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.g", "value": "NC_000017.10:g.78081379del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.g", "value": "NC_000017.11:g.80107580del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.c", "value": "NM_000152.3:c.716del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.p", "value": "NP_000143.2:p.Leu239Argfs" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.c", "value": "LRG_673t1:c.716del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.p", "value": "LRG_673p1:p.Leu239Argfs" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.c", "value": "NM_000152.4:c.716del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.p", "value": "NP_000143.2:p.Leu239Argfs" } ] } ] }

GK-pilot snapshot: { "description": "NM_000152.5(GAA):c.716del (p.Leu239fs)", "type": "CanonicalVariationDescriptor", "xrefs": [ "https://www.ncbi.nlm.nih.gov/clinvar/556853", "https://identifiers.org/clinvar:556853" ], "canonical_variation": { "id": "ga4gh:CLV.GoXYRvsXvWOat2OQUnuP5GUrz_wQVh16", "type": "CanonicalVariation", "canonical_context": { "id": "ga4gh:VA.eJMriiyLKeVohA5Obl5lwAWN80FkZlNp", "type": "Allele", "location": { "id": "ga4gh:SL.ii9TychibO-iCKbRehPkGb4lMQ6aB_18", "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", "start": { "type": "Number", "value": 80107579 }, "end": { "type": "Number", "value": 80107580 } }, "state": { "type": "LiteralSequenceExpression", "sequence": "" } } }, "extensions": [ { "type": "Extension", "name": "descendant_ids", "value": [] }, { "type": "Extension", "name": "candidate_expressions", "value": [ { "label": "GRCh37", "expression": "NC_000017.10:g.78081379del" }, { "label": "Text", "expression": "556853" }, { "label": "SPDI", "expression": "NC_000017.11:80107579:T:" }, { "label": "GRCh38", "expression": "NC_000017.11:g.80107580del" } ] }, { "type": "Extension", "name": "subclass_type", "value": "SimpleAllele" }, { "type": "Extension", "name": "child_ids", "value": [] }, { "type": "Extension", "name": "variation_type", "value": "Deletion" }, { "type": "Extension", "name": "canonical_expression", "value": "NC_000017.11:g.80107580del" }, { "type": "Extension", "name": "clinvar_variation", "value": "https://identifiers.org/clinvar:556853" }, { "type": "Extension", "name": "entity_type", "value": "variation" }, { "type": "Extension", "name": "allele_id", "value": "548398" }, { "type": "Extension", "name": "protein_change", "value": "L239fs" }, { "type": "Extension", "name": "clingen_version", "value": 0 } ], "label": "NM_000152.5(GAA):c.716del (p.Leu239fs)", "id": "cgterms:VariationDescriptor_556853.2022-06-26", "members": [ { "type": "VariationMember", "expressions": [ { "type": 
"Expression", "syntax": "hgvs.g", "value": "NC_000017.11:g.80107580del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.c", "value": "NM_000152.5:c.716del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.g", "value": "NC_000017.10:g.78081379del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.c", "value": "NM_001079804.3:c.716del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "spdi", "value": "NC_000017.11:80107579:T:" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.c", "value": "NM_000152.3:c.716del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.c", "value": "LRG_673t1:c.716del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.p", "value": "NP_001073271.1:p.Leu239fs" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.c", "value": "NM_001079803.3:c.716del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.g", "value": "NG_009822.1:g.11025del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.g", "value": "LRG_673:g.11025del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.c", "value": "NM_000152.5(GAA):c.716del" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.p", "value": "NP_000143.2:p.Leu239fs" } ] }, { "type": "VariationMember", "expressions": [ { "type": "Expression", "syntax": "hgvs.p", "value": "NP_001073272.1:p.Leu239fs" } ] } ] }

theferrit32 commented 1 year ago

I don't see the string "NC_000017.11:g.80107580del" in the logs of the vrs-cache pod.

Connecting to the REPL in the vrs-cache pod and calling the normalization function directly worked:

genegraph.transform.clinvar.cancervariants=> (normalize-canonical "NC_000017.11:g.80107580del" :hgvs)
{"id" "ga4gh:CAN.GoXYRvsXvWOat2OQUnuP5GUrz_wQVh16",
 "type" "CanonicalVariation",
 "canonical_context"
 {"id" "ga4gh:VA.eJMriiyLKeVohA5Obl5lwAWN80FkZlNp",
  "type" "Allele",
  "location"
  {"id" "ga4gh:SL.ii9TychibO-iCKbRehPkGb4lMQ6aB_18",
   "type" "SequenceLocation",
   "sequence_id" "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7",
   "start" {"type" "Number", "value" 80107579},
   "end" {"type" "Number", "value" 80107580}},
  "state" {"type" "LiteralSequenceExpression", "sequence" ""}},
 "@context"
 {"id" {"@id" "@id"},
  "_id" {"@id" "@id"},
  "type" {"@id" "@type", "@type" "@id"},
  "@vocab" "https://vrs.ga4gh.org/terms/",
  "normalize.variation" {"@id" "https://github.com/cancervariants/variation-normalization/", "@prefix" true}}}

It could be that this function wasn't getting called. The logic around special-case handling of del expressions may be related; the gk-pilot dataset might have been generated with different logic there. It could also be a backend issue in the normalization service causing it to intermittently return an error response, and it just happened to fail on this one.

theferrit32 commented 1 year ago

Log for this clinvar variation id:

20:04:55.952 [claypoole-0-19] INFO g.transform.clinvar.variation - {:candidate-expressions ({:expr "clinvar:556853", :type :text, :label "Text", :location nil}), :line 422}

Candidate expressions filtered out during normalization are not included in the log, but they are still included on the event and show up in the extensions.

theferrit32 commented 1 year ago

Using the function above genegraph.transform.clinvar.cancervariants/normalize-canonical in the REPL in the currently running vrs-cache pod:

genegraph.transform.clinvar.variation=> (normalize-canonical-expression {::canonical-candidate-expressions [{:expr "NC_000017.11:g.80107580del" :expr-type :hgvs}]})

Pretty-printed output:

{
    "normalized": {
        "id": "ga4gh:CAN.GoXYRvsXvWOat2OQUnuP5GUrz_wQVh16",
        "type": "CanonicalVariation",
        "canonical_context": {
            "id": "ga4gh:VA.eJMriiyLKeVohA5Obl5lwAWN80FkZlNp",
            "type": "Allele",
            "location": {
                "id": "ga4gh:SL.ii9TychibO-iCKbRehPkGb4lMQ6aB_18",
                "type": "SequenceLocation",
                "sequence_id": "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7",
                "start": {
                    "type": "Number",
                    "value": 80107579
                },
                "end": {
                    "type": "Number",
                    "value": 80107580
                }
            },
            "state": {
                "type": "LiteralSequenceExpression",
                "sequence": ""
            }
        },
        "@context": {
            "id": {
                "@id": "@id"
            },
            "type": {
                "@id": "@type",
                "@type": "@id"
            },
            "@vocab": "https://vrs.ga4gh.org/terms/",
            "normalize.variation": {
                "@id": "https://github.com/cancervariants/variation-normalization/",
                "@prefix": true
            }
        }
    },
    "expression": {
        "expr": "NC_000017.11:g.80107580del",
        "expr-type": "hgvs"
    },
    "label": null
}
theferrit32 commented 1 year ago

I can replicate this by running the vrs-rocks-db-snapshotting branch locally. This branch doesn't have the changes I added to handle more dup/del expressions instead of dropping them all into text. I will merge those changes in.

WARN g.transform.clinvar.variation - {:fn :normalize-canonical-expression, :msg "Removed some deldup candidate expressions", :removed #{{:expr "NC_000017.11:g.80107580del", :type :hgvs, :label "GRCh38", :location {:start "80107580", :stop "80107580", :variant-length 1}} {:expr "NC_000017.10:g.78081379del", :type :hgvs, :label "GRCh37", :location {:start "78081378", :stop "78081379", :variant-length 1}}}, :line 425}

INFO g.transform.clinvar.cancervariants - {:fn :vrs-allele-for-variation, :variation-expression "clinvar:556853", :expr-type :text, :line 282}

theferrit32 commented 1 year ago

Merging those changes fixed the issue. I performed a rebase of master onto vrs-rocks-db-snapshotting; the snapshot now produces an Allele instead of a Text variation:

{"description":"NM_000152.4(GAA):c.716del (p.Leu239Argfs)","type":"CanonicalVariationDescriptor","xrefs":["https://www.ncbi.nlm.nih.gov/clinvar/556853","https://identifiers.org/clinvar:556853"],"canonical_variation":{"id":"ga4gh:CAN.GoXYRvsXvWOat2OQUnuP5GUrz_wQVh16","type":"CanonicalVariation","canonical_context":{"id":"ga4gh:VA.eJMriiyLKeVohA5Obl5lwAWN80FkZlNp","type":"Allele","location":{"id":"ga4gh:SL.ii9TychibO-iCKbRehPkGb4lMQ6aB_18","type":"SequenceLocation","sequence_id":"ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7","start":{"type":"Number","value":80107579},"end":{"type":"Number","value":80107580}},"state":{"type":"LiteralSequenceExpression","sequence":""}}},"extensions":[{"type":"Extension","name":"variation_type","value":"Deletion"},{"type":"Extension","name":"entity_type","value":"variation"},{"type":"Extension","name":"protein_change","value":[]},{"type":"Extension","name":"clingen_version","value":0},{"type":"Extension","name":"child_ids","value":[]},{"type":"Extension","name":"allele_id","value":"548398"},{"type":"Extension","name":"subclass_type","value":"SimpleAllele"},{"type":"Extension","name":"clinvar_variation","value":"https://identifiers.org/clinvar:556853"},{"type":"Extension","name":"descendant_ids","value":[]},{"type":"Extension","name":"canonical_expression","value":"NC_000017.11:g.80107580del"},{"type":"Extension","name":"candidate_expressions","value":[{"expression":"NC_000017.11:g.80107580del","label":"GRCh38"},{"expression":"NC_000017.10:g.78081379del","label":"GRCh37"},{"expression":"clinvar:556853","label":"Text"}]}],"label":"NM_000152.4(GAA):c.716del 
(p.Leu239Argfs)","id":"http://dataexchange.clinicalgenome.org/terms/VariationDescriptor_556853.2019-07-01","members":[{"type":"VariationMember","expressions":[{"type":"Expression","syntax":"hgvs.g","value":"LRG_673:g.11025del"}]},{"type":"VariationMember","expressions":[{"type":"Expression","syntax":"hgvs.g","value":"NG_009822.1:g.11025del"}]},{"type":"VariationMember","expressions":[{"type":"Expression","syntax":"hgvs.g","value":"NC_000017.10:g.78081379del"}]},{"type":"VariationMember","expressions":[{"type":"Expression","syntax":"hgvs.g","value":"NC_000017.11:g.80107580del"}]},{"type":"VariationMember","expressions":[{"type":"Expression","syntax":"hgvs.c","value":"NM_000152.3:c.716del"}]},{"type":"VariationMember","expressions":[{"type":"Expression","syntax":"hgvs.p","value":"NP_000143.2:p.Leu239Argfs"}]},{"type":"VariationMember","expressions":[{"type":"Expression","syntax":"hgvs.c","value":"LRG_673t1:c.716del"}]},{"type":"VariationMember","expressions":[{"type":"Expression","syntax":"hgvs.p","value":"LRG_673p1:p.Leu239Argfs"}]},{"type":"VariationMember","expressions":[{"type":"Expression","syntax":"hgvs.c","value":"NM_000152.4:c.716del"}]},{"type":"VariationMember","expressions":[{"type":"Expression","syntax":"hgvs.p","value":"NP_000143.2:p.Leu239Argfs"}]}]}