Closed Peter-J-Freeman closed 1 year ago
I want to change how r. inputs are handled. Here is an example based on this issue
import json
import VariantValidator
vval = VariantValidator.Validator()
variant = 'NM_000484.3:r.2065_2211del' # variant 1
genome_build = 'GRCh38'
select_transcripts = 'all'
transcript_set = 'refseq'
validate = vval.validate(variant, genome_build, select_transcripts, transcript_set)
validation = validate.format_as_dict(with_meta=True)
print(json.dumps(validation, sort_keys=True, indent=4, separators=(',', ': ')))
{
"NM_000484.3:c.2067_2211+2del": { # r. assumed to be equivalent to the c. so converted then mapped as if it is a c.
"alt_genomic_loci": [],
"annotations": {
"chromosome": "21",
"db_xref": {
"CCDS": "CCDS13576.1",
"ensemblgene": null,
"hgnc": "HGNC:620",
"ncbigene": "351",
"select": false
},
"ensembl_select": false,
"mane_plus_clinical": false,
"mane_select": false,
"map": "21q21.3",
"note": "amyloid beta precursor protein",
"refseq_select": false,
"variant": "1"
},
"gene_ids": {
"ccds_ids": [
"CCDS13577",
"CCDS56211",
"CCDS56212",
"CCDS56213",
"CCDS33523",
"CCDS46638",
"CCDS46639",
"CCDS13576"
],
"ensembl_gene_id": "ENSG00000142192",
"entrez_gene_id": "351",
"hgnc_id": "HGNC:620",
"omim_id": [
"104760"
],
"ucsc_id": "uc002ylz.4"
},
"gene_symbol": "APP",
"genome_context_intronic_sequence": "NC_000021.9(NM_000484.3):c.2067_2211+2del",
"hgvs_lrg_transcript_variant": "",
"hgvs_lrg_variant": "",
"hgvs_predicted_protein_consequence": {
"lrg_slr": "",
"lrg_tlr": "",
"slr": "NP_000475.1:p.?",
"tlr": "NP_000475.1:p.?"
},
"hgvs_refseqgene_variant": "NG_007376.1:g.283955_284101del",
"hgvs_transcript_variant": "NM_000484.3:c.2067_2211+2del",
"primary_assembly_loci": {
"grch37": {
"hgvs_genomic_description": "NC_000021.8:g.27264036_27264182del",
"vcf": {
"alt": "T",
"chr": "21",
"pos": "27264031",
"ref": "TACCTCCACCACACCATGATGAATGGATGTGTACTGTTTCTTCTTCAGCATCACCAAGGTGATGACGATCACTGTCGCTATGACAACACCGCCCACCATGAGTCCAATGATTGCACCTTTGTTTGAACCCACATCTTCTGCAAAGAAC"
}
},
"grch38": {
"hgvs_genomic_description": "NC_000021.9:g.25891724_25891870del",
"vcf": {
"alt": "T",
"chr": "21",
"pos": "25891719",
"ref": "TACCTCCACCACACCATGATGAATGGATGTGTACTGTTTCTTCTTCAGCATCACCAAGGTGATGACGATCACTGTCGCTATGACAACACCGCCCACCATGAGTCCAATGATTGCACCTTTGTTTGAACCCACATCTTCTGCAAAGAAC"
}
},
"hg19": {
"hgvs_genomic_description": "NC_000021.8:g.27264036_27264182del",
"vcf": {
"alt": "T",
"chr": "chr21",
"pos": "27264031",
"ref": "TACCTCCACCACACCATGATGAATGGATGTGTACTGTTTCTTCTTCAGCATCACCAAGGTGATGACGATCACTGTCGCTATGACAACACCGCCCACCATGAGTCCAATGATTGCACCTTTGTTTGAACCCACATCTTCTGCAAAGAAC"
}
},
"hg38": {
"hgvs_genomic_description": "NC_000021.9:g.25891724_25891870del",
"vcf": {
"alt": "T",
"chr": "chr21",
"pos": "25891719",
"ref": "TACCTCCACCACACCATGATGAATGGATGTGTACTGTTTCTTCTTCAGCATCACCAAGGTGATGACGATCACTGTCGCTATGACAACACCGCCCACCATGAGTCCAATGATTGCACCTTTGTTTGAACCCACATCTTCTGCAAAGAAC"
}
}
},
"reference_sequence_records": {
"protein": "https://www.ncbi.nlm.nih.gov/nuccore/NP_000475.1",
"refseqgene": "https://www.ncbi.nlm.nih.gov/nuccore/NG_007376.1",
"transcript": "https://www.ncbi.nlm.nih.gov/nuccore/NM_000484.3"
},
"refseqgene_context_intronic_sequence": "NG_007376.1(NM_000484.3):c.2067_2211+2del",
"selected_assembly": "GRCh38",
"submitted_variant": "NM_000484.3:r.2065_2211del",
"transcript_description": "Homo sapiens amyloid beta precursor protein (APP), transcript variant 1, mRNA",
"validation_warnings": [
"NM_000484.3:r.2065_2211del automapped to NM_000484.3:c.2065_2211del",
"NM_000484.3:c.2065_2211del normalized to NM_000484.3:c.2067_2211+2del",
"A more recent version of the selected reference sequence NM_000484.3 is available (NM_000484.4): NM_000484.4:c.2067_2211+2del MUST be fully validated prior to use in reports: select_variants=NM_000484.4:c.2067_2211+2del"
],
"variant_exonic_positions": {
"NC_000021.8": {
"end_exon": "17i",
"start_exon": "17"
},
"NC_000021.9": {
"end_exon": "17i",
"start_exon": "17"
},
"NG_007376.1": {
"end_exon": "17i",
"start_exon": "17"
}
}
},
"flag": "gene_variant",
"metadata": {
"variantvalidator_hgvs_version": "2.0.2.dev1+g6ecbf8e",
"variantvalidator_version": "1.0.5.dev288+g3cf22ee",
"vvdb_version": "vvdb_2022_04",
"vvseqrepo_db": "VV_SR_2022_02/master",
"vvta_version": "vvta_2022_02"
}
}
I think what ought to happen is that r. should be treated as an individual variant, normalised with exon-boundary crossing enabled because we know we are dealing with a spliced transcript i.e. no introns, and then translate.
However, the assumptions about mapping r. to c. are not irrelevant and should still be included and appropriate warning provided. So I'm thinking of adding an additional key to the json like so
{
"NM_000484.3:c.2067_2211+2del": { # r. assumed to be equivalent to the c. so converted then mapped as if it is a c.
"alt_genomic_loci": [],
"annotations": {
"chromosome": "21",
"db_xref": {
"CCDS": "CCDS13576.1",
"ensemblgene": null,
"hgnc": "HGNC:620",
"ncbigene": "351",
"select": false
},
"ensembl_select": false,
"mane_plus_clinical": false,
"mane_select": false,
"map": "21q21.3",
"note": "amyloid beta precursor protein",
"refseq_select": false,
"variant": "1"
},
"gene_ids": {
"ccds_ids": [
"CCDS13577",
"CCDS56211",
"CCDS56212",
"CCDS56213",
"CCDS33523",
"CCDS46638",
"CCDS46639",
"CCDS13576"
],
"ensembl_gene_id": "ENSG00000142192",
"entrez_gene_id": "351",
"hgnc_id": "HGNC:620",
"omim_id": [
"104760"
],
"ucsc_id": "uc002ylz.4"
},
"gene_symbol": "APP",
"genome_context_intronic_sequence": "NC_000021.9(NM_000484.3):c.2067_2211+2del",
"hgvs_lrg_transcript_variant": "",
"hgvs_lrg_variant": "",
"hgvs_predicted_protein_consequence": {
"lrg_slr": "",
"lrg_tlr": "",
"slr": "NP_000475.1:p.?",
"tlr": "NP_000475.1:p.?"
},
"hgvs_refseqgene_variant": "NG_007376.1:g.283955_284101del",
"hgvs_transcript_variant": "NM_000484.3:c.2067_2211+2del",
"rna_variant": {
"hgvs_transcript_variant": "NM_000484.4:c.2065_2211del",
"hgvs_predicted_protein_consequence": {
"lrg_slr": "",
"lrg_tlr": "",
"slr": "NP_000475.1:p.Pos&Edit",
"tlr": "NP_000475.1:p.Pos&Edit"
"useage_information": "Some stuff relevant to r. variants being annotative etc"
},
"primary_assembly_loci": {
"grch37": {
"hgvs_genomic_description": "NC_000021.8:g.27264036_27264182del",
"vcf": {
"alt": "T",
"chr": "21",
"pos": "27264031",
"ref": "TACCTCCACCACACCATGATGAATGGATGTGTACTGTTTCTTCTTCAGCATCACCAAGGTGATGACGATCACTGTCGCTATGACAACACCGCCCACCATGAGTCCAATGATTGCACCTTTGTTTGAACCCACATCTTCTGCAAAGAAC"
}
},
"grch38": {
"hgvs_genomic_description": "NC_000021.9:g.25891724_25891870del",
"vcf": {
"alt": "T",
"chr": "21",
"pos": "25891719",
"ref": "TACCTCCACCACACCATGATGAATGGATGTGTACTGTTTCTTCTTCAGCATCACCAAGGTGATGACGATCACTGTCGCTATGACAACACCGCCCACCATGAGTCCAATGATTGCACCTTTGTTTGAACCCACATCTTCTGCAAAGAAC"
}
},
"hg19": {
"hgvs_genomic_description": "NC_000021.8:g.27264036_27264182del",
"vcf": {
"alt": "T",
"chr": "chr21",
"pos": "27264031",
"ref": "TACCTCCACCACACCATGATGAATGGATGTGTACTGTTTCTTCTTCAGCATCACCAAGGTGATGACGATCACTGTCGCTATGACAACACCGCCCACCATGAGTCCAATGATTGCACCTTTGTTTGAACCCACATCTTCTGCAAAGAAC"
}
},
"hg38": {
"hgvs_genomic_description": "NC_000021.9:g.25891724_25891870del",
"vcf": {
"alt": "T",
"chr": "chr21",
"pos": "25891719",
"ref": "TACCTCCACCACACCATGATGAATGGATGTGTACTGTTTCTTCTTCAGCATCACCAAGGTGATGACGATCACTGTCGCTATGACAACACCGCCCACCATGAGTCCAATGATTGCACCTTTGTTTGAACCCACATCTTCTGCAAAGAAC"
}
}
},
"reference_sequence_records": {
"protein": "https://www.ncbi.nlm.nih.gov/nuccore/NP_000475.1",
"refseqgene": "https://www.ncbi.nlm.nih.gov/nuccore/NG_007376.1",
"transcript": "https://www.ncbi.nlm.nih.gov/nuccore/NM_000484.3"
},
"refseqgene_context_intronic_sequence": "NG_007376.1(NM_000484.3):c.2067_2211+2del",
"selected_assembly": "GRCh38",
"submitted_variant": "NM_000484.3:r.2065_2211del",
"transcript_description": "Homo sapiens amyloid beta precursor protein (APP), transcript variant 1, mRNA",
"validation_warnings": [
"NM_000484.3:r.2065_2211del automapped to NM_000484.3:c.2065_2211del",
"NM_000484.3:c.2065_2211del normalized to NM_000484.3:c.2067_2211+2del",
"A more recent version of the selected reference sequence NM_000484.3 is available (NM_000484.4): NM_000484.4:c.2067_2211+2del MUST be fully validated prior to use in reports: select_variants=NM_000484.4:c.2067_2211+2del"
],
"variant_exonic_positions": {
"NC_000021.8": {
"end_exon": "17i",
"start_exon": "17"
},
"NC_000021.9": {
"end_exon": "17i",
"start_exon": "17"
},
"NG_007376.1": {
"end_exon": "17i",
"start_exon": "17"
}
}
},
"flag": "gene_variant",
"metadata": {
"variantvalidator_hgvs_version": "2.0.2.dev1+g6ecbf8e",
"variantvalidator_version": "1.0.5.dev288+g3cf22ee",
"vvdb_version": "vvdb_2022_04",
"vvseqrepo_db": "VV_SR_2022_02/master",
"vvta_version": "vvta_2022_02"
}
}
That way we correctly describe the variant at the g. level, c. level, r. level and p. levels. Any thoughta @ifokkema and all?
Personally, I'd be fine with a different endpoint. Integrating it into the VV endpoint will have several negative consequences.
rna_variant
.But, your time is obviously also scarce. So, if including it directly into VV would save you so much time/trouble, that it overrules the issues above, just go ahead :wink:
To be honest, this was my second option.
I am quite happy with that too, but will still likely add it to the VV output as an educational resource i.e. we can provide all the necessary descriptions in 1 output. The idea is to actually show users that r. is different to c. and explain how and why with warnings and tell them what goes where. Can't hide forever lol
We would still be interested in the genome mapping and protein prediction of the DNA variant which we already are getting from VV. Having the r. additionally is what will make it incredibly valuable for us, but also many more researchers interested exon-skipping therapies who try to evaluate which exon skips (deletions) cause which effects on protein level.
@DCRT-LUMC , exactly, that's why as well as a separate endpoint as requested by @ifokkema (which is a good idea as well) embedding the function directly into VV so that all the outputs are in 1 place will be really useful
My user story is
As a lecturer, I need to be able to train students that r. descriptions are different from c. descriptions. I need to be able to map the provided r. description to the c. and g. description with the correct p. description derived from the c., while also proving the r. description and corrected p. description based on the r. description. It is therefore useful to me if this is all available in 1 output
Are you happy with the proposed outputs, an embedded method and a separated method
@Peter-J-Freeman Yes, we are super happy with the proposed outputs as embedded and separate as well! Thank you very much.
Just to make sure I fully understand you, will there be an RNA section also when somebody enters a DNA variant? Or only when somebody provides an r. input? I.e., will there be an NM_000484.4:r.2065_2211del
handling also when somebody enters NM_000484.4:c.2065_2211del
, which is normalized to NM_000484.4:c.2067_2211+2del
(with genomic background)? I wouldn't recommend this, but if you do want to add it to c. input, we'd need to make absolutely sure that this is not an RNA prediction of the given variant, just an alternative interpretation of the input variant.
I was thinking to only fill the r. annotation if someone specifically inputs an r. description. So there will be no r. handling when NM_000484.4:c.2065_2211del is input, only when NM_000484.4:r.2065_2211del is input. It is possible that this will normalize into the next exon though, and I think the VV code can handle this!!?!. Will report back here if this is the case and also fully agree on the HGVS rules here. For example, HGVS rules would say we should go from NM_000484.4:c.2065_2211del > NM_000484.4:c.2067_2211+2del and variant descriptions should not cross the exon boundary. However, I would argue that NM_000484.4:r.2065_2211del should cross the boundary because by specifying r. you are presenting a spliced transcript (mature mRNA). What do you guys think?
I agree best to only get r. output if someone uses it as a specific input. However, people should be aware that this is now a possibility? I also agree that specifying r. is the mRNA, i.e. the spliced transcript and as thus can cross the exon boundaries (technically, there are no boundaries any more?).
I was thinking to only fill the r. annotation if someone specifically inputs an r. description.
Excellent!
It is possible that this will normalize into the next exon though (...) I would argue that NM_000484.4:r.2065_2211del should cross the boundary because by specifying r. you are presenting a spliced transcript (mature mRNA). What do you guys think?
Fully agree!
However, people should be aware that this is now a possibility?
Yes, I believe it may become the first tool that will allow this at all.
(...) (technically, there are no boundaries anymore?).
Exactly! And, normalizing 3' as the HGVS prescribes here simply results in the same protein prediction as splicing the desired exon out, so it fits your needs precisely.
Thanks both. We all agree then. I will start building ASAP. I just cracked my laptop screen though so might be a short delay. AAARGH
Oh, that sucks :cry: I hope it will be repaired soon!
Oh, that sucks 😢 I hope it will be repaired soon!
I'm getting a little annoyed. The computer will cost a few hundred pounds to repair. However the university wants me instead to get a staff machine. These machines are way lest powerful, would require hours/days for me to reconfigure and would not process my large data sets quickly enough. Also, each costs the uni ~2000 pounds. Also, the university claims to be environmentally but is encouraging a throw away society. Time to go to war!!!
Apologies for being late to the party.
I think that I agree with most of what has been said already. The key points from my perspective are:
@leicray . Think this agrees with our thinking. Excellent
I forgot one additional point:
I forgot one additional point:
- An r. description does not contain any information that can unequivocally identify the causative g. or c. sequence variant that gave rise to the the altered RNA described by the r. description
That's a valid point, and it argues against having a c.
variant described as the input for the variant. However, since the use case also includes providing genomic mappings, we have no choice but to go through a DNA variant description. It would justify, however, a reminder somewhere in the output about this "silent mapping" from RNA to cDNA.
In isolation, a variant that is described as r. could be caused by several possible c. or g. variants. However, in the case of a particular individual, the c. and g. descriptions will (should?) be known.
Hence, an r. description (or descriptions) relating to a single individual must always be in addition to a c. or g. description. For each r. description, the corresponding p. description should be generated.
That's not really possible in this use case - well, it could be faked, but it will only cause more issues and make things more complicated.
p.?
and that's it.c.100+1_200-1del
for deleting an exon running from 101 to 199 inclusive, effectively creating RNA, but VV doesn't predict the protein change then, either.c.101_199del
in combo with r.101_199del
, then you'd run into the issue that VV doesn't allow for submitting two "levels" of one variant. Also, it would just complicate things.The simplest is then to input the RNA; VV predicts the simplest possible DNA change that could have caused this RNA variant, maps that to the genome, predicts the protein change of the RNA change, and we're done.
The simplest is then to input the RNA; VV predicts the simplest possible DNA change that could have caused this RNA variant, maps that to the genome, predicts the protein change of the RNA change, and we're done.
That's not a viable solution for all use cases and would be misleading in most instances. The variant description r.123G>C might reasonably be considered to correspond to c.123G>C but it will be impossible to predict the simplest possible DNA change that could have cause RNA variants in most instances.
For example, let's say that the variant r.246_312del has been detected by analysis of RNA and that the deleted region corresponds precisely to an annotated exon. The most reasonable presumption is that the exon has been skipped during splicing. It is improbable that there is a DNA deletion corresponding to the deleted RNA. A much more probable explanation is that the cause is a sequence change at the splice sites either immediately upstream or downstream of the skipped exon. Such sequence changes occur most commonly at the -2, -1, +1, +2, and +5 positions, though other changes are entirely plausible.
My suggestion is that users be allowed to submit r. descriptions but that the validation process be limited to validation of the syntax, the description content, and translation of the mutant RNA if the reference RNA is a coding transcript.
I have been thinking about his also, and find I have to agree with @leicray. We cannot in many circumstances even predict the c. and g. for a description if r. is provided. I think it best to only show the r. dictionary in the output.
However, we could also design a conjoined description that the HGVS could adopt for these cases e.g.
NC_00000000099.9(NM_0123456.7):g.636363_636365del(r.8_9del)
or NM_0123456.7:c.8_8+1del(r.8_9del). # My preference
OK this is just a quick thought but please feel free to suggest similar ideas. To me, having a description that can combine the c. and the r. and explicitly show they are different would be very useful for the HGVS to adopt and will save a lot of headaches, like those @leicray describes.
Also, is p. not extensive enough? It does not allow for indication as to whether the p. is originated from r. or c. e.g. the description NM_0123456.7:c.8_8+1del(r.8_9del) could have 2 p.
Again, thoughts please.
For example, let's say that the variant r.246_312del has been detected by analysis of RNA (...)
The most basic translation will then be c.246_312del
. But I understand your point, and this can be dropped.
My suggestion is that users be allowed to submit r. descriptions but that the validation process be limited to validation of the syntax, the description content, and translation of the mutant RNA if the reference RNA is a coding transcript.
That's fine I guess, it would be more correct, and the user can then just create two requests.
However, we could also design a conjoined description that the HGVS could adopt for these cases e.g.
I think this will overcomplicate things - we'd need to go through the HGVS committee, LOVD and VV will then need a lot of code changes to support all of this... Let's keep things simple.
The simplest would then be:
It is acceptable to need two requests to get this information, and it will lead to fewer assumptions in the code and remove the chance (I think) of misreading interpretations.
Two things I thought of that we need to keep in mind:
@ifokkema, also a valid point. I am making sure we explore all options.
So as it stands, It look like we go with @leicray and if r. is provided, we only populate the r. dictionary. Then provide a guidance message that says, you need to provide the correct c./g. variant as well.
My issue with this is that it would be really useful to have the full validation in a single output for the purposes of storing the data and associating it with publications etc. VV does allow for batch processing, so we could insist that for r. descriptions that if the c. is available, it must be submitted along with the r. variant e.g. variant = NM_:r.edit|NMc.edit and must be provided in those pairs e.g. NM:r.edit|NMc.edit|NM:r.edit|NMc.edit|NM:r.edit|NMc.edit|NM:r.edit|NM_c.edit if multiple r. are being used.
If RNA is provided, the calculated protein change should not have parentheses.
Not sure about this. I thought removing the parentheses implied the protein sequence is known. We cannot assume the protein sequence from the r. because biology is crazy and will bite us if we do. @leicray , can you clarify?
If RNA is provided, T should be converted to U and all bases should be enforced to be lowercase.
Correct and it will be.
My issue with this is that it would be really useful to have the full validation in a single output for the purposes of storing the data and associating it with publications etc.
Ah, I now see why you'd like to do this. But cramping this in a single input is going to be difficult. And you'd need to add at least some cross-validation, too, e.g., make sure that at least the chromosome or the transcript is the same.
And, maybe just asking for whitespace as a separator would make copying/pasting from Excel easier, while at the same time staying away from HGVS issues as a pipe is a character used within the HGVS, but a space or horizontal tab is not.
If RNA is provided, the calculated protein change should not have parentheses.
Not sure about this. I thought removing the parentheses implied the protein sequence is known. We cannot assume the protein sequence from the r. because biology is crazy and will bite us if we do. @leicray , can you clarify?
Actually, the parentheses are dropped when RNA analysis has been done. The varnomen site states:
predicted consequences, i.e. without experimental evidence (no RNA or protein sequence analysed), should be given in parentheses
That's also how LOVD has been doing it for 20 years now :wink:
predicted consequences, i.e. without experimental evidence (no RNA or protein sequence analysed), should be given in parentheses
I thought they would know better. Well, their choice. We will go with it, but I bet you that butts will get bitten.
That's also how LOVD has been doing it for 20 years now 😉
Think we need to get out more!!!!
Ah, I now see why you'd like to do this. But cramping this in a single input is going to be difficult. And you'd need to add at least some cross-validation, too, e.g., make sure that at least the chromosome or the transcript is the same.
Yeah, I think that if we insist that users ought to provide consecutive r. c. submissions in the manuals this will be OK. We will say that if they do not do this, they will not get all the info they need!!
And, maybe just asking for whitespace as a separator would make copying/pasting from Excel easier, while at the same time staying away from HGVS issues as a pipe is a character used within the HGVS, but a space or horizontal tab is not.
I am pushing to have the unnecessary pipe character dropped because |1gom adds no more useful information than simply 1gom but needlessly wastes one of the last asci characters we can use as dividers. I was a member of the committee when this dropped out of the ether and do not remember being consulted on it. Neither does @leicray I don't believe. I don't like whitespace in code etc. However, for the interactive website, we will not insist users use pipe. Just for the API. Normal users will be unaware this is happening. Will be in the background.
predicted consequences, i.e. without experimental evidence (no RNA or protein sequence analysed), should be given in parentheses
I thought they would know better. Well, their choice. We will go with it, but I bet you that butts will get bitten.
Hehe, likely. I'm not sure how common protein sequencing is nowadays (quite rare still, I believe), but yeah, once validation on the protein level becomes more common, this will become an issue.
That's also how LOVD has been doing it for 20 years now wink
Think we need to get out more!!!!
Also true :rofl:
Ah, I now see why you'd like to do this. But cramping this in a single input is going to be difficult. And you'd need to add at least some cross-validation, too, e.g., make sure that at least the chromosome or the transcript is the same.
Yeah, I think that if we insist that users ought to provide consecutive r. c. submissions in the manuals this will be OK. We will say that if they do not do this, they will not get all the info they need!!
Makes sense. Perhaps make it c. r. since DNA will always be required (optional arguments should go last), it makes more sense logically (cause before the effect), it helps sorting, and perhaps even providing multiple RNA inputs for one c. input (although obviously, they should be using the r.[...]
syntax for that).
And, maybe just asking for whitespace as a separator would make copying/pasting from Excel easier, while at the same time staying away from HGVS issues as a pipe is a character used within the HGVS, but a space or horizontal tab is not.
I am pushing to have the unnecessary pipe character dropped because |1gom adds no more useful information than simply 1gom but needlessly wastes one of the last asci characters we can use as dividers.
The |
is planned to allow more annotation that belongs to the positions but says nothing about the sequence. Although, in theory, it makes sense, I do not know how much this feature is needed. The idea, I think, was that c.100
could then be parsed separately, the |
would indicate non-sequence related info came up, and annotation such as gom
, met=
, lom
, or other stuff would then sort of have their own namespace for annotation. As some kind of quick method to see if the sequence was affected without parsing the whole variant or so. In reality, for us, there's little to no difference when we already recognize c.100A>G
, c.100del
, c.100dup
, etc, to just add c.100gom
, c.100met=
, and c.100gom
instead of c.100|gom
, c.100|met=
, and c.100|gom
. So unless there will be some kind of desire to have c.100|whatever
, I suppose indeed the pipe can be dropped.
I was a member of the committee when this dropped out of the ether and do not remember being consulted on it. Neither does @leicray I don't believe.
When I write up the work done on our variant syntax recognition libraries, I'll dig into git to see if I can find where that came from.
I don't like whitespace in code etc.
I forgot that you used the |
for the batch API. The website splits the input based on newlines, I suppose, but for the API, that won't help indeed.
Thanks @ifokkema . Think I can put together a prototype now.
Agreed, c. before r.
Started building this.
The dict that will be added to the VV output currently looks like this. Comments please, especially around the warnings
You will notice that normalization between exons is working as it should because the introns are removed from mRNA which r. descriptions are based on
@leicray @ifokkema @DCRT-LUMC
{
"usage_warnings": [
"RNA (r.) descriptions are independent from cDNA descriptions (c.)",
"RNA descriptions must only be used if the RNA has been sequenced and must not be inferred from a cDNA description"
],
"submitted_variant": "NM_000089.3:r.1033_1035delguu",
"rna_variant": "NM_000089.3:r.1034_1036deluug",
"translation": "NP_000080.2:p.(Val345del)"
}
The original query will look like this currently
{
"usage_warnings": [
"RNA (r.) descriptions are independent from cDNA descriptions (c.)",
"RNA descriptions must only be used if the RNA has been sequenced and must not be inferred from a cDNA description"
],
"submitted_variant": "NM_000484.4:r.2065_2211del",
"rna_variant": "NM_000484.4:r.2067_2213delguucuuugcagaagauguggguucaaacaaaggugcaaucauuggacucauggugggcgguguugucauagcgacagugaucgucaucaccuuggugaugcugaagaagaaacaguacacauccauucaucauggugugguggaggu",
"translation": "NP_000475.1:p.(Phe690_Val738del)"
}
So I can immediately see an issue in that the bases need to be stripped from the reference, so will sort that first.
Also, @ifokkema , I need to remove the () from the p. description????????
{ "usage_warnings": [ "RNA (r.) descriptions are independent from cDNA descriptions (c.)", "RNA descriptions must only be used if the RNA has been sequenced and must not be inferred from a cDNA description" ], "submitted_variant": "NM_000484.4:r.2065_2211del", "rna_variant": "NM_000484.4:r.2067_2213delguucuuugcagaagauguggguucaaacaaaggugcaaucauuggacucauggugggcgguguugucauagcgacagugaucgucaucaccuuggugaugcugaagaagaaacaguacacauccauucaucauggugugguggaggu", "translation": "NP_000475.1:p.(Phe690_Val738del)" }
My comments;
rna_variant
is, as you already mentioned, non-HGVS compliant; the suffix is redundant, and it should be removed.translation
should, as you already mentioned, drop the parentheses, but only when the RNA description doesn't have parentheses. E.g., NM_000484.4:r.(2065_2211del)
should still result in NP_000475.1:p.(Phe690_Val738del)
while NM_000484.4:r.2065_2211del
should result in NP_000475.1:p.Phe690_Val738del
. I'm not sure how difficult that is to implement.The translation should, as you already mentioned, drop the parentheses, but only when the RNA description doesn't have parentheses. E.g., NM_000484.4:r.(2065_2211del) should still result in NP_000475.1:p.(Phe690_Val738del) while NM_000484.4:r.2065_2211del should result in NP_000475.1:p.Phe690_Val738del. I'm not sure how difficult that is to implement.
This should be fine in the new structure I have built. Object orientated makes this easier!!!
The rna_variant is, as you already mentioned, non-HGVS compliant; the suffix is redundant, and it should be removed.
I don't follow. Please clarify
My grammar checker says that "independent from" should be "independent of".
Sorted
The rna_variant is, as you already mentioned, non-HGVS compliant; the suffix is redundant, and it should be removed.
I don't follow. Please clarify
I meant that NM_000484.4:r.2067_2213delguucuuugcagaagauguggguucaaacaaaggugcaaucauuggacucauggugggcgguguugucauagcgacagugaucgucaucaccuuggugaugcugaagaagaaacaguacacauccauucaucauggugugguggaggu
should be NM_000484.4:r.2067_2213del
, but I'm pretty sure that's what you meant when you wrote:
So I can immediately see an issue in that the bases need to be stripped from the reference, so will sort that first.
NM_000484.4:r.2067_2213delguucuuugcagaagauguggguucaaacaaaggugcaaucauuggacucauggugggcgguguugucauagcgacagugaucgucaucaccuuggugaugcugaagaagaaacaguacacauccauucaucauggugugguggaggu should be NM_000484.4:r.2067_2213del, but I'm pretty sure that's what you meant when you wrote:
Ah, yes. Already sorted. Thanmks
The translation should, as you already mentioned, drop the parentheses, but only when the RNA description doesn't have parentheses. E.g., NM_000484.4:r.(2065_2211del) should still result in NP_000475.1:p.(Phe690_Val738del) while NM_000484.4:r.2065_2211del should result in NP_000475.1:p.Phe690_Val738del. I'm not sure how difficult that is to implement.
{'usage_warnings': ['RNA (r.) descriptions are independent of cDNA descriptions (c.)', 'RNA descriptions must only be used if the RNA has been sequenced and must not be inferred from a cDNA description'], 'submitted_variant': 'NM_000089.3:r.(1033_1035delguu)', 'rna_variant': 'NM_000089.3:r.(1034_1036del)', 'translation': 'NP_000080.2:p.(Val345del)'}
{'usage_warnings': ['RNA (r.) descriptions are independent of cDNA descriptions (c.)', 'RNA descriptions must only be used if the RNA has been sequenced and must not be inferred from a cDNA description'], 'submitted_variant': 'NM_000089.3:r.1033_1035delguu', 'rna_variant': 'NM_000089.3:r.1034_1036del', 'translation': 'NP_000080.2:p.Val345del'}
My grammar checker says that "independent from" should be "independent of".
Sorted, see above
The translation should, as you already mentioned, drop the parentheses, but only when the RNA description doesn't have parentheses. E.g., NM_000484.4:r.(2065_2211del) should still result in NP_000475.1:p.(Phe690_Val738del) while NM_000484.4:r.2065_2211del should result in NP_000475.1:p.Phe690_Val738del. I'm not sure how difficult that is to implement.
{'usage_warnings': ['RNA (r.) descriptions are independent of cDNA descriptions (c.)', 'RNA descriptions must only be used if the RNA has been sequenced and must not be inferred from a cDNA description'], 'submitted_variant': 'NM_000089.3:r.(1033_1035delguu)', 'rna_variant': 'NM_000089.3:r.(1034_1036del)', 'translation': 'NP_000080.2:p.(Val345del)'}
{'usage_warnings': ['RNA (r.) descriptions are independent of cDNA descriptions (c.)', 'RNA descriptions must only be used if the RNA has been sequenced and must not be inferred from a cDNA description'], 'submitted_variant': 'NM_000089.3:r.(1033_1035delguu)', 'rna_variant': 'NM_000089.3:r.(1034_1036del)', 'translation': 'NP_000080.2:p.Val345del'}
The second example is off, the input is the same but the output is different. I assume you meant this is the output for NM_000089.3:r.1033_1035delguu
?
I changed it, noticed an error
Output is now
{'usage_warnings': ['RNA (r.) descriptions are independent of cDNA descriptions (c.)', 'RNA descriptions must only be used if the RNA has been sequenced and must not be inferred from a cDNA description'], 'submitted_variant': 'NM_000089.3:r.1033_1035delguu', 'rna_variant': 'NM_000089.3:r.1034_1036del', 'translation': 'NP_000080.2:p.Val345del'}
The submission is NM_000089.3:r.1033_1035delguu
The rna_output is NM_000089.3:r.1034_1036del
Note how the position pushes into the next exon!
I need to add a warning about redundant characters as well.
Correction, the warning about removing redundant bases will be in the VV main warnings, this section is for notes about usage
Excellent, all sorted then! Well done! :partying_face:
Will plug it in to the main VV tool likely tomorrow then get it onto the API at least. Got a new computer coming so can't handle this on the VVweb until I have re-built all my environments. Its gonna take ages!!!!
Pfew, good luck!
Apologies, only had a chance to look into this now. Thanks so much for all the work and implementing this!
No worries @DCRT-LUMC. One step at a time, but it's coming together. See the next message because it shows integration into the VV engine. The next stage is to design an interface on the VVWeb site and embed into the batch tool. I will do everything in this issue to keep things simple for me
Here is the VV output with the RNA data embedded.
Note, the r. does still produce the most likely c. description. In this case, we know the variant at the c. r. and p. level so we can confirm all these are correct! @leicray can probably provide the paper. We used it in the VV manuscript to illustrate crossing into introns. Now we can expand the illustration to show that for r. Variants crossing into the next exon is allowed. See the json comments below
{
"NM_000089.3:c.1035_1035+2del": {
"alt_genomic_loci": [],
"annotations": {
"chromosome": "7",
"db_xref": {
"CCDS": "CCDS34682.1",
"ensemblgene": null,
"hgnc": "HGNC:2198",
"ncbigene": "1278",
"select": "RefSeq"
},
"ensembl_select": false,
"mane_plus_clinical": false,
"mane_select": false,
"map": "7q21.3",
"note": "collagen type I alpha 2 chain",
"refseq_select": true,
"variant": "0"
},
"gene_ids": {
"ccds_ids": [
"CCDS34682"
],
"ensembl_gene_id": "ENSG00000164692",
"entrez_gene_id": "1278",
"hgnc_id": "HGNC:2198",
"omim_id": [
"120160"
],
"ucsc_id": "uc003ung.1"
},
"gene_symbol": "COL1A2",
"genome_context_intronic_sequence": "NC_000007.13(NM_000089.3):c.1035_1035+2del", # The c. description is correctly normalized into the intron
"hgvs_lrg_transcript_variant": "LRG_2t1:c.1035_1035+2del",
"hgvs_lrg_variant": "LRG_2:g.20261_20263del",
"hgvs_predicted_protein_consequence": {
"lrg_slr": "LRG_2p1:p.?",
"lrg_tlr": "LRG_2p1:p.?",
"slr": "NP_000080.2:p.?",
"tlr": "NP_000080.2:p.?" # At the p. level, based on the c. description, the predicted p. is ?
},
"hgvs_refseqgene_variant": "NG_007405.1:g.20261_20263del",
"hgvs_transcript_variant": "NM_000089.3:c.1035_1035+2del",
"primary_assembly_loci": {
"grch37": {
"hgvs_genomic_description": "NC_000007.13:g.94039133_94039135del",
"vcf": {
"alt": "C",
"chr": "7",
"pos": "94039128",
"ref": "CTTG"
}
},
"grch38": {
"hgvs_genomic_description": "NC_000007.14:g.94409821_94409823del",
"vcf": {
"alt": "C",
"chr": "7",
"pos": "94409816",
"ref": "CTTG"
}
},
"hg19": {
"hgvs_genomic_description": "NC_000007.13:g.94039133_94039135del",
"vcf": {
"alt": "C",
"chr": "chr7",
"pos": "94039128",
"ref": "CTTG"
}
},
"hg38": {
"hgvs_genomic_description": "NC_000007.14:g.94409821_94409823del",
"vcf": {
"alt": "C",
"chr": "chr7",
"pos": "94409816",
"ref": "CTTG"
}
}
},
"reference_sequence_records": {
"lrg": "http://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_2.xml",
"protein": "https://www.ncbi.nlm.nih.gov/nuccore/NP_000080.2",
"refseqgene": "https://www.ncbi.nlm.nih.gov/nuccore/NG_007405.1",
"transcript": "https://www.ncbi.nlm.nih.gov/nuccore/NM_000089.3"
},
"refseqgene_context_intronic_sequence": "NG_007405.1(NM_000089.3):c.1035_1035+2del",
"rna_variant_descriptions": {
"rna_variant": "NM_000089.3:r.1034_1036del", # However, at the r. level, there are no introns, so the variant normalizes into the adjacent exon
"submitted_variant": "NM_000089.3:r.1033_1035delguu",
"translation": "NP_000080.2:p.Val345del", # And we can generate a p. description based on the accurate r. description
"usage_warnings": [
"RNA (r.) descriptions are independent of cDNA descriptions (c.)",
"RNA descriptions must only be used if the RNA has been sequenced and must not be inferred from a cDNA description",
"c. and g. descriptions provided by VariantValidator must only be used if the DNA sequence has been confirmed"
] # Providing additional usage warnings to users
},
"selected_assembly": "GRCh37",
"submitted_variant": "NM_000089.3:r.(1033_1035delguu)",
"transcript_description": "Homo sapiens collagen type I alpha 2 chain (COL1A2), mRNA",
"validation_warnings": [
"NM_000089.3:r.(1033_1035delGUU) automapped to NM_000089.3:c.1033_1035delGTT",
"NM_000089.3:c.1033_1035del normalized to NM_000089.3:c.1035_1035+2del",
"A more recent version of the selected reference sequence NM_000089.3 is available (NM_000089.4): NM_000089.4:c.1035_1035+2del MUST be fully validated prior to use in reports: select_variants=NM_000089.4:c.1035_1035+2del"
],
"variant_exonic_positions": {
"NC_000007.13": {
"end_exon": "19i",
"start_exon": "19"
},
"NC_000007.14": {
"end_exon": "19i",
"start_exon": "19"
},
"NG_007405.1": {
"end_exon": "19i",
"start_exon": "19"
}
}
},
"flag": "gene_variant",
"metadata": {
"variantvalidator_hgvs_version": "2.0.1.dev3+g6ecbf8e.d20220915",
"variantvalidator_version": "1.0.5.dev295+g795e091.d20221025",
"vvdb_version": "vvdb_2022_04",
"vvseqrepo_db": "VV_SR_2022_02/master",
"vvta_version": "vvta_2022_02"
}
}
What does everyone thing? @leicray @ifokkema @DCRT-LUMC
I would like to get feedback on this before I complete embedding into the VVweb. I will then get everything pushed up in the next major release. @John-F-Wagstaff and I will complete a new VVTA build for the new release in November.
Nice! Is there any difference between providing NM_000089.3:r.(1033_1035delguu)
as input or NM_000089.3:r.1033_1035delguu
? Only the latter indicates RNA is really known.
I do not think that people should be trying to input NM_000089.3:r.(1033_1035delguu)
because r. descriptions should only ever be used in instances where the sequence alteration has been confirmed at the RNA level. That confirmation means that there should be no brackets in the description.
The reference to the first reported instance of this variant is https://pubmed.ncbi.nlm.nih.gov/8444468/.
I tend to agree. It's just that I saw two different values for submitted_variant
in the output, which confused me a bit.
I also agree, but earlier comments from @ifokkema says the following
The translation should, as you already mentioned, drop the parentheses, but only when the RNA description doesn't have parentheses. E.g., NM_000484.4:r.(2065_2211del) should still result in NP_000475.1:p.(Phe690_Val738del) while NM_000484.4:r.2065_2211del should result in NP_000475.1:p.Phe690_Val738del. I'm not sure how difficult that is to implement.
So, if it's in the HGVS rules we need to keep it. This does allow for people to predict the RNA outcome.
I tend to agree. It's just that I saw two different values for submitted_variant in the output, which confused me a bit.
Not sure whether this key will stay. I need it currently while I am building.
I also agree, but earlier comments from @ifokkema says the following
The translation should, as you already mentioned, drop the parentheses, but only when the RNA description doesn't have parentheses. E.g., NM_000484.4:r.(2065_2211del) should still result in NP_000475.1:p.(Phe690_Val738del) while NM_000484.4:r.2065_2211del should result in NP_000475.1:p.Phe690_Val738del. I'm not sure how difficult that is to implement.
So, if it's in the HGVS rules we need to keep it. This does allow for people to predict the RNA outcome.
Yes, NM_000484.4:r.(2065_2211del)
is an HGVS-compliant variant description. If you wish to allow users to enter it, then keep the parentheses around the protein change prediction. Without the parentheses, the protein change prediction should also lose them.
I tend to agree. It's just that I saw two different values for submitted_variant in the output, which confused me a bit.
Not sure whether this key will stay. I need it currently while I am building.
Sure, it can stay, it's just a bit confusing that the values are different. Shouldn't they have the same value, as it's the user's input?
Sure, it can stay, it's just a bit confusing that the values are different. Shouldn't they have the same value, as it's the user's input?
Yes, however, I wasn't sure exactly at the time whether it was swapped out when the old code converted r. to c. so put it in, hence it's likely to be removed. Thankfully, it is handled correctly and as expected, so I will remove.
User Story Hi VariantValidator team, we got into some problems this morning using your tool and are trying to understand what is going wrong. We wanted to look at the consequence of deleting a whole exon from the APP gene and typed in the following: NM_000484.4:c.2065_2211del. Variant validator then corrects this somehow to: NM_000484.4:c.2065_2211del normalized to NM_000484.4:c.2067_2211+2del --> which is obviously not a whole exon but part of the intron. Checking with Mutalyzer gave us the correct output though. Since we use the VariantValidator API for a tool of ours, we are a bit worried what is happening here. Could you please have a look? I also checked the genome browser etc and there our initial description is correct, I do not understand why VV changes it.
What we are doing is developing a tool to prioritze patients for tailormade genetic therapies and need to predict what happens if we delete single exons, i.e. will we create a termination codon etc. So happy to work together on this. We are fully aware that we need to have the HGVS nomenclature correct, so we are not trying to give a wrong description, we really need to see what happens to the AA sequence when we take single exons out.
Describe the solution you'd like Tools like VV and Mutalyzer are not intended to predict the consequences of variations beyond the scope of direct mapping of variation at the DNA level to its correct position with respect to HGVS guidelines then interpret the outcome. You should not use non-compliant descriptions because they are more convenient so I recommend against using NC_000021.9(NM_000484.3):c.2065_2211del because it will not be accepted by clinical journals going forward, nor will it be accepted by databases such as ClinVar and LOVD. Correct nomenclature must be used and will in the very near future be enforced. Happy to discuss what you intend to do, and look at options, but remember NC_000021.9(NM_000484.3):c.2065_2211del is exactly the same as NC_000021.9(NM_000484.4):c.2067_2211+2del, but only the latter is correctly described. What exactly do you want to achieve? For example, if you are trying to describe a known deletion at the RNA level, i.e. you know the sequence of the RNA, the correct description would be NM_000484.3:r.2065_2211del (not c.) and this could be translated into a protein. It would make sense for us to implement a method that would accurately apply HGVS r. rules to make sure NM_000484.3:r.2065_2211del is indeed correctly described, whether it needs to be shuffled internally within the RNA sequence and what the protein outcome would be.
Additional context Guidance that r. descriptions will not necessarily directly map to a g. c. description and must be used in conjunction with correct c. and r. descriptions.