Closed Peter-J-Freeman closed 2 years ago
I have checked the APIs based on this afternoons chat and both tools have a light-weight batch mode.
The APIs are here http://rest.variantvalidator.org/
VariantValidator
NM_000088.3:c.589G>T|NM_000088.3:c.589-1G>T
{
"NM_000088.3:c.589-1G>T": {
"alt_genomic_loci": [],
"annotations": {
"chromosome": "17",
"db_xref": {
"CCDS": "CCDS11561.1",
"GeneID": "1277",
"HGNC": "2197",
"MIM": "120150",
"select": "RefSeq"
},
"gene_synonym": [
"EDSARTH1",
"EDSC",
"OI1",
"OI2",
"OI3",
"OI4"
],
"map": "17q21.33",
"note": "collagen type I alpha 1 chain",
"variant": "0"
},
"gene_ids": {
"ccds_ids": [
"CCDS11561"
],
"ensembl_gene_id": "ENSG00000108821",
"entrez_gene_id": "1277",
"hgnc_id": "HGNC:2197",
"omim_id": [
"120150"
],
"ucsc_id": "uc002iqm.4"
},
"gene_symbol": "COL1A1",
"genome_context_intronic_sequence": "NC_000017.11(NM_000088.3):c.589-1G>T",
"hgvs_lrg_transcript_variant": "LRG_1t1:c.589-1G>T",
"hgvs_lrg_variant": "LRG_1:g.8637G>T",
"hgvs_predicted_protein_consequence": {
"lrg_slr": "LRG_1p1:p.?",
"lrg_tlr": "LRG_1p1:p.?",
"slr": "NP_000079.2:p.?",
"tlr": "NP_000079.2:p.?"
},
"hgvs_refseqgene_variant": "NG_007400.1:g.8637G>T",
"hgvs_transcript_variant": "NM_000088.3:c.589-1G>T",
"primary_assembly_loci": {
"grch37": {
"hgvs_genomic_description": "NC_000017.10:g.48275364C>A",
"vcf": {
"alt": "A",
"chr": "17",
"pos": "48275364",
"ref": "C"
}
},
"grch38": {
"hgvs_genomic_description": "NC_000017.11:g.50198003C>A",
"vcf": {
"alt": "A",
"chr": "17",
"pos": "50198003",
"ref": "C"
}
},
"hg19": {
"hgvs_genomic_description": "NC_000017.10:g.48275364C>A",
"vcf": {
"alt": "A",
"chr": "chr17",
"pos": "48275364",
"ref": "C"
}
},
"hg38": {
"hgvs_genomic_description": "NC_000017.11:g.50198003C>A",
"vcf": {
"alt": "A",
"chr": "chr17",
"pos": "50198003",
"ref": "C"
}
}
},
"reference_sequence_records": {
"lrg": "http://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_1.xml",
"protein": "https://www.ncbi.nlm.nih.gov/nuccore/NP_000079.2",
"refseqgene": "https://www.ncbi.nlm.nih.gov/nuccore/NG_007400.1",
"transcript": "https://www.ncbi.nlm.nih.gov/nuccore/NM_000088.3"
},
"refseqgene_context_intronic_sequence": "NG_007400.1(NM_000088.3):c.589-1G>T",
"selected_assembly": "GRCh38",
"submitted_variant": "NM_000088.3:c.589-1G>T",
"transcript_description": "Homo sapiens collagen type I alpha 1 chain (COL1A1), mRNA",
"validation_warnings": []
},
"NM_000088.3:c.589G>T": {
"alt_genomic_loci": [],
"annotations": {
"chromosome": "17",
"db_xref": {
"CCDS": "CCDS11561.1",
"GeneID": "1277",
"HGNC": "2197",
"MIM": "120150",
"select": "RefSeq"
},
"gene_synonym": [
"EDSARTH1",
"EDSC",
"OI1",
"OI2",
"OI3",
"OI4"
],
"map": "17q21.33",
"note": "collagen type I alpha 1 chain",
"variant": "0"
},
"gene_ids": {
"ccds_ids": [
"CCDS11561"
],
"ensembl_gene_id": "ENSG00000108821",
"entrez_gene_id": "1277",
"hgnc_id": "HGNC:2197",
"omim_id": [
"120150"
],
"ucsc_id": "uc002iqm.4"
},
"gene_symbol": "COL1A1",
"genome_context_intronic_sequence": "",
"hgvs_lrg_transcript_variant": "LRG_1t1:c.589G>T",
"hgvs_lrg_variant": "LRG_1:g.8638G>T",
"hgvs_predicted_protein_consequence": {
"lrg_slr": "LRG_1p1:p.(G197C)",
"lrg_tlr": "LRG_1p1:p.(Gly197Cys)",
"slr": "NP_000079.2:p.(G197C)",
"tlr": "NP_000079.2:p.(Gly197Cys)"
},
"hgvs_refseqgene_variant": "NG_007400.1:g.8638G>T",
"hgvs_transcript_variant": "NM_000088.3:c.589G>T",
"primary_assembly_loci": {
"grch37": {
"hgvs_genomic_description": "NC_000017.10:g.48275363C>A",
"vcf": {
"alt": "A",
"chr": "17",
"pos": "48275363",
"ref": "C"
}
},
"grch38": {
"hgvs_genomic_description": "NC_000017.11:g.50198002C>A",
"vcf": {
"alt": "A",
"chr": "17",
"pos": "50198002",
"ref": "C"
}
},
"hg19": {
"hgvs_genomic_description": "NC_000017.10:g.48275363C>A",
"vcf": {
"alt": "A",
"chr": "chr17",
"pos": "48275363",
"ref": "C"
}
},
"hg38": {
"hgvs_genomic_description": "NC_000017.11:g.50198002C>A",
"vcf": {
"alt": "A",
"chr": "chr17",
"pos": "50198002",
"ref": "C"
}
}
},
"reference_sequence_records": {
"lrg": "http://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_1.xml",
"protein": "https://www.ncbi.nlm.nih.gov/nuccore/NP_000079.2",
"refseqgene": "https://www.ncbi.nlm.nih.gov/nuccore/NG_007400.1",
"transcript": "https://www.ncbi.nlm.nih.gov/nuccore/NM_000088.3"
},
"refseqgene_context_intronic_sequence": "",
"selected_assembly": "GRCh38",
"submitted_variant": "NM_000088.3:c.589G>T",
"transcript_description": "Homo sapiens collagen type I alpha 1 chain (COL1A1), mRNA",
"validation_warnings": []
},
"flag": "gene_variant",
"metadata": {
"seqrepo_db": "2018-08-21",
"uta_schema": "uta_20180821",
"variantvalidator_hgvs_version": "1.2.5.vv1",
"variantvalidator_version": "1.0.4.dev125+gd3e875e"
}
}
LOVD
NC_000017.10:g.48275363C>A|NC_000017.11:g.50198003C>A
{
"NC_000017.10:g.48275363C>A": {
"NC_000017.10:g.48275363C>A": {
"g_hgvs": null,
"genomic_variant_error": "chromosome ID NC_000017.10 is not associated with genome build GRCh38",
"hgvs_t_and_p": null,
"p_vcf": null,
"selected_build": "GRCh38"
},
"errors": [],
"flag": "genomic_variant_warning"
},
"NC_000017.11:g.50198003C>A": {
"NC_000017.11:g.50198003C>A": {
"g_hgvs": "NC_000017.11:g.50198003C>A",
"genomic_variant_error": null,
"hgvs_t_and_p": {
"NM_000088.3": {
"alt_genomic_loci": {
"grch37": {},
"grch38": {},
"hg19": {},
"hg38": {}
},
"gap_statement": null,
"gapped_alignment_warning": null,
"p_hgvs_slc": "NP_000079.2:p.?",
"p_hgvs_tlc": "NP_000079.2:p.?",
"primary_assembly_loci": {
"grch37": {
"NC_000017.11": {
"hgvs_genomic_description": "NC_000017.11:g.50198003C>A",
"vcf": {
"alt": "A",
"chr": "NC_000017.11",
"pos": "50198003",
"ref": "C"
}
}
},
"grch38": {
"NC_000017.11": {
"hgvs_genomic_description": "NC_000017.11:g.50198003C>A",
"vcf": {
"alt": "A",
"chr": "NC_000017.11",
"pos": "50198003",
"ref": "C"
}
}
},
"hg19": {
"NC_000017.11": {
"hgvs_genomic_description": "NC_000017.11:g.50198003C>A",
"vcf": {
"alt": "A",
"chr": "NC_000017.11",
"pos": "50198003",
"ref": "C"
}
}
},
"hg38": {
"NC_000017.11": {
"hgvs_genomic_description": "NC_000017.11:g.50198003C>A",
"vcf": {
"alt": "A",
"chr": "NC_000017.11",
"pos": "50198003",
"ref": "C"
}
}
}
},
"t_hgvs": "NM_000088.3:c.589-1G>T",
"transcript_variant_error": null
}
},
"p_vcf": "17:50198003:C:A",
"selected_build": "GRCh38"
},
"errors": [],
"flag": null
},
"metadata": {
"seqrepo_db": "/local/seqrepo/2018-08-21",
"uta_schema": "uta_20180821",
"variantformatter_version": "1.0.2.dev26+gcbd8fb1",
"variantvalidator_hgvs_version": "1.2.5.vv1",
"variantvalidator_version": "1.0.4.dev125+gd3e875e"
}
}
When we build, I think that Validator should be used as the transcript validation tool and LOVD as the genomic variant validation tool. We can add gene information to the output very easily if needed.
The only slight issues is that, as @ifokkema once pointed out, that the |
is now used by HGVS. I need to have a chat with Johan about reserving at least 1 symbol for delimiting. Perhaps the backslash?
Basing the examples on the input NC_000017.11:g.50198002del Note: The variant is incorrect and needs to be normalised (as asked by Johan)
The complete output is
{
"NC_000017.11:g.50198002del": {
"NC_000017.11:g.50198002del": {
"g_hgvs": "NC_000017.11:g.50198003del",
"genomic_variant_error": null, # There really ought to be a warning here, opening an issue
"hgvs_t_and_p": {
"NM_000088.3": {
"alt_genomic_loci": {
"grch37": {},
"grch38": {},
"hg19": {},
"hg38": {}
},
"gap_statement": null,
"gapped_alignment_warning": null,
"p_hgvs_slc": "NP_000079.2:p.(G197Vfs*68)",
"p_hgvs_tlc": "NP_000079.2:p.(Gly197ValfsTer68)",
"primary_assembly_loci": {
"grch37": {
"NC_000017.11": {
"hgvs_genomic_description": "NC_000017.11:g.50198003del",
"vcf": {
"alt": "A",
"chr": "NC_000017.11",
"pos": "50198000",
"ref": "AC"
}
}
},
"grch38": {
"NC_000017.11": {
"hgvs_genomic_description": "NC_000017.11:g.50198003del",
"vcf": {
"alt": "A",
"chr": "NC_000017.11",
"pos": "50198000",
"ref": "AC"
}
}
},
"hg19": {
"NC_000017.11": {
"hgvs_genomic_description": "NC_000017.11:g.50198003del",
"vcf": {
"alt": "A",
"chr": "NC_000017.11",
"pos": "50198000",
"ref": "AC"
}
}
},
"hg38": {
"NC_000017.11": {
"hgvs_genomic_description": "NC_000017.11:g.50198003del",
"vcf": {
"alt": "A",
"chr": "NC_000017.11",
"pos": "50198000",
"ref": "AC"
}
}
}
},
"t_hgvs": "NM_000088.3:c.590del",
"transcript_variant_error": null
}
},
"p_vcf": "17:50198000:AC:A",
"selected_build": "GRCh38"
},
"errors": [],
"flag": null
},
"metadata": {
"seqrepo_db": "/local/seqrepo/2018-08-21",
"uta_schema": "uta_20180821",
"variantformatter_version": "1.0.2.dev26+gcbd8fb1",
"variantvalidator_hgvs_version": "1.2.5.vv1",
"variantvalidator_version": "1.0.4.dev125+gd3e875e"
}
}
Bug report for LOVD endpoint opened https://github.com/openvar/variantValidator/issues/272
This is the bare minimum and could be used for checking g. HGVS without any mapping and was requested by Johan
{
"NC_000017.11:g.50198002del": {
"NC_000017.11:g.50198002del": {
"g_hgvs": "NC_000017.11:g.50198003del",
"genomic_variant_error": "SOME WARNING HERE",
},
"errors": [],
"flag": null
},
"metadata": {
"seqrepo_db": "/local/seqrepo/2018-08-21",
"uta_schema": "uta_20180821",
"variantformatter_version": "1.0.2.dev26+gcbd8fb1",
"variantvalidator_hgvs_version": "1.2.5.vv1",
"variantvalidator_version": "1.0.4.dev125+gd3e875e"
}
}
@leicray pointed out that this could lead to accidental mistakes by not having any expected transcript level data available so we could do something like this instead/too by selecting a "canonical" (not that I believe in such things) transcript)
Note: This would require the addition of some additional data which can be seen in the VV output above
{
"NC_000017.11:g.50198002del": {
"NC_000017.11:g.50198002del": {
"g_hgvs": "NC_000017.11:g.50198003del",
"genomic_variant_error": "SOME WARNING HERE",
"hgvs_t_and_p": {
"NM_000088.3": {
"gap_statement": null,
"gapped_alignment_warning": null,
"p_hgvs_slc": "NP_000079.2:p.(G197Vfs*68)",
"p_hgvs_tlc": "NP_000079.2:p.(Gly197ValfsTer68)",
"t_hgvs": "NM_000088.3:c.590del",
"transcript_variant_error": null,
"db_xref": {
"GeneID": "1277",
"HGNC": "2197",
"select": "RefSeq"
},
},
"errors": [],
"flag": null
},
"metadata": {
"seqrepo_db": "/local/seqrepo/2018-08-21",
"uta_schema": "uta_20180821",
"variantformatter_version": "1.0.2.dev26+gcbd8fb1",
"variantvalidator_hgvs_version": "1.2.5.vv1",
"variantvalidator_version": "1.0.4.dev125+gd3e875e"
}
}
Note: This can also be over-ridden and a different transcript/transcripts can be specified by the user by setting select_transcripts - instructions can be provided
We have an app for that. https://variantvalidator.org/service/gene2trans/ Try it out with gene symbol COL1A1
The only slight issues is that, as @ifokkema once pointed out, that the
|
is now used by HGVS. I need to have a chat with Johan about reserving at least 1 symbol for delimiting. Perhaps the backslash?
How about something that parsers already exist for, like:
NC_000017.10:g.48275363C>A
for one variant, and
["NC_000017.10:g.48275363C>A";"NC_000017.11:g.50198003C>A"]
for a set; throw it through a JSON parser if it starts with [
and if that's successful, you already have your array.
@leicray pointed out that this could lead to accidental mistakes by not having any expected transcript level data available so we could do something like this instead/too by selecting a "canonical" (not that I believe in such things) transcript)
We touched upon this subject before, what would be needed for that automated selection. If VVTA is ready and can provide some info, then I'd suggest having that option ("choose transcript for me") in the interface, which then runs gene2transcripts
to figure out what's best for the first variant, then runs the API calls with that transcript selected until we get no more mappings (meaning we're now in a different gene), and then repeats the logic (run gene2transcripts
, select new transcript, continue API calls). Or, have the user provide a list of transcripts, I believe the API will then always ignore the transcripts that are given but don't return a mapping... right?
The only slight issues is that, as @ifokkema once pointed out, that the
|
is now used by HGVS. I need to have a chat with Johan about reserving at least 1 symbol for delimiting. Perhaps the backslash?How about something that parsers already exist for, like:
NC_000017.10:g.48275363C>A
for one variant, and["NC_000017.10:g.48275363C>A";"NC_000017.11:g.50198003C>A"]
for a set; throw it through a JSON parser if it starts with[
and if that's successful, you already have your array.
I caution against the use of backslash because it is currently misused by some to indicate fusion genes, e.g. HBD\HBB though it has now been resolved by agreement with HGNC that fusion genes be indicated as HBD::HBB.
Using square brackets looks okay, but we must avoid their use possibly causing future complications that are not obvious at present. Formally, square brackets are used to group alleles.
There are few other symbols that not already defined in HGVS for some specific use. As far as I can see, we could use any of the printable symbols in the ASCII 0–127 range that are not already defined for some other purpose:
$, #, @, ~, !
Hi @leicray
It might be just me, but I also want a separator that "looks" like a separator. In my opinion the '~' character looks nice. @ifokkema , any thoughts. The only reason I'd avoid ["NC_000017.10:g.48275363C>A";"NC_000017.11:g.50198003C>A"]
is that this looks very like the "allele" syntax and could confuse both me and our users. I'm easily confused. Nice idea though.
As for the auto selection of transcripts, there would be no need at all to hit the gene2transcripts. The LOVD endpoint hits this natively. If you don't mind me adding
"db_xref": {
"GeneID": "1277",
"HGNC": "2197",
"select": "RefSeq"
},
to the LOVD endpoint output then we have all we need.
The selection process Loop through the transcripts until "select": "RefSeq" or "select": "MANE" is found (if both exist, MANE takes priority), return this transcript. If no "select" exists, return the first transcript LOVD endpoint fonds which is usually the "longest"
@leicray @ifokkema , this seem reasonable logic? Also, @ifokkema can I make these changes please, i.e. the addition of some basic gene info as above?
How about something that parsers already exist for, like:
NC_000017.10:g.48275363C>A
for one variant, and["NC_000017.10:g.48275363C>A","NC_000017.11:g.50198003C>A"]
for a set; throw it through a JSON parser if it starts with[
and if that's successful, you already have your array.(...) Using square brackets looks okay, but we must avoid their use possibly causing future complications that are not obvious at present. Formally, square brackets are used to group alleles.
There are few other symbols that not already defined in HGVS for some specific use. As far as I can see, we could use any of the printable symbols in the ASCII 0–127 range that are not already defined for some other purpose:
$, #, @, ~, !
Please note that my intention was to provide a syntax for the API/Bioinformaticians, not for non-programmer users! The syntax that I suggested (by error I used ;
which should have been ,
) is a generic method to pass arrays/lists rather than single entries through APIs. It's not intended as an extension to HGVS or even to be seen by users, it's irrespective of whatever values are included in the array/list that's passed, it's merely to be used by a tool or bioinformatician to submit various variants to the API, like (pseudocode):
// Current situation.
$sVariant = "NC_000017.10:g.48275363C>A";
callVV($sVariant);
$sVariant = "NC_000017.11:g.50198003C>A";
callVV($sVariant);
// New situation, reducing overhead.
$aVariants = [
"NC_000017.10:g.48275363C>A",
"NC_000017.11:g.50198003C>A"
];
callVV(json_encode($aVariants));
Any other choice of a single character is "hacky" and can cause potential problems if they're ever added to HGVS, and/or whenever somebody makes a typo. My suggestion won't have this issue at all. If valid JSON is passed, parse it and process the array. Otherwise, it's a single variant.
As for the auto selection of transcripts, there would be no need at all to hit the gene2transcripts. The LOVD endpoint hits this natively. If you don't mind me adding
"db_xref": { "GeneID": "1277", "HGNC": "2197", "select": "RefSeq" },
to the LOVD endpoint output then we have all we need.
Oh, that's a great idea! But, from a data modeling point of view, I would suggest some changes. db_xref
looks to me like a list of cross-references regarding the gene rather than the transcript, even though I've also seen gene_ids
in the VV endpoint with references there. It seems repeated data and perhaps this needs to be thought out a bit more. Also, select
is not a database source, it's more like a flag (with values null
, refseq
or MANE
). Finally, maybe it's good to know when something is both MANE select and RefSeq select, so better not overwrite anything? More logical would be making it flags like refseq_select
(true or false) and mane_select
(true or false). Having both true gives the client stronger confidence that this transcript is the correct one to use for reporting than just seeing one flag.
The selection process Loop through the transcripts until "select": "RefSeq" or "select": "MANE" is found (if both exist, MANE takes priority), return this transcript.
Then, as a speed improvement, further API calls in the same region (perhaps that's a bit flexible of a definition), can already pass the selected transcript to VV so no time will be wasted on mapping to more transcripts and predicting protein changes on them.
If no "select" exists, return the first transcript LOVD endpoint fonds which is usually the "longest"
Ah, so the order of transcripts returned is specified?
@leicray @ifokkema , this seem reasonable logic? Also, @ifokkema can I make these changes please, i.e. the addition of some basic gene info as above?
Adding data doesn't break my parser, so that's totally fine! (although I recommend a different format, see above)
I see what you mean about the inputs being for programmers and not general users, but the Swagger interfaces do attract general users. That being said, I do have instructions in the swagger documentation. I can still see a few drawbacks, might be easier to discuss this on a call and hash it out?
As for the data structures, these are my thoughts but this is getting complicated so a call may be useful.
I kinda spotted myself that the data is duplicated in VV. I totally agree there is duplication. My fault for rushing. will fix that. In VV I will alter the structure from
"db_xref": {
"CCDS": "CCDS11561.1",
"GeneID": "1277",
"HGNC": "2197",
"MIM": "120150",
"select": "RefSeq"
},
"gene_synonym": [
"EDSARTH1",
"EDSC",
"OI1",
"OI2",
"OI3",
"OI4"
],
"map": "17q21.33",
"note": "collagen type I alpha 1 chain",
"variant": "0"
to
"db_xref": {
"CCDS": "CCDS11561.1", #Note, this is not duplication because the version number makes it transcript specific I think
"select": "RefSeq"
},
"map": "17q21.33",
"note": "collagen type I alpha 1 chain",
"variant": "0"
The main reason to keep the gene info with the transcript is that you cannot assume that a user will only submit queries from a single gene. This is why the data is always nested in the transcript level info
Since the LOVD endpoint has no gene information currently, I think this structure
"db_xref": {
"GeneID": "1277",
"HGNC": "2197",
"select": "RefSeq"
},
is plenty. Not overwhelming for a light-weight tool. Note, for Ensembl data the GeneID will be an Ensembl gene. Why do we provide both HGNC and a source ID? Because mapping via gene symbol can be unreliable. Especially for Ensembl data.
I agree that an individual transcript will be MANE, RefSeq of Null in terms of SELECT status. However, we also record historic transcripts, therefor it is likely that many genes will have something like the following
NM_12345.3 - SELECT: MANE (Maped to GRCh38 only)
NM_12345.2 - SELECT: RefSeq (Mapped to GRCh37 and GRCh38)
NM_12345.1 - Null (Mapped to GRCh38 only)
or even
NM_12345.3 - SELECT: MANE (Mapped to GRCh37 and GRCh38)
NM_12345.2 - SELECT: RefSeq (Mapped to GRCh37 and GRCh38) # Pre-dates the MANE so is RefSeqSelect validly
NM_12345.1 - SELECT: RefSeq (Mapped to GRCh37 only)
Its a total mess and cannot be simplified
This was behind my logic for the filtering MANE >> RefSeq >> Null but select longest transcript.
However, if you record this data in LOVD databases, you can benefit from the speed improvements by specifying the exact transcript you are interested in. I am also looking into adding "SELECT" information to the gene2transcripts tool so a user can look-up in advance for the MANE transcript for a gene and just specify it. Automation is great, but sometimes Human I beats AI ;)
I see what you mean about the inputs being for programmers and not general users, but the Swagger interfaces do attract general users.
Sure, but what's the use case here? I thought it's for the new interface for journals, not for individual users using the Swagger interface (who can just submit variant-by-variant and not a set). Therefore, it doesn't need to get (clearly) documented in Swagger as usable for users typing in their variants, but it could be implemented for API usage by programs.
I can still see a few drawbacks, might be easier to discuss this on a call and hash it out?
Sure! I have time tomorrow (from 13:00 CET) or otherwise next week?
The main reason to keep the gene info with the transcript is that you cannot assume that a user will only submit queries from a single gene. This is why the data is always nested in the transcript level info
That's totally fine, I get that, and don't expect to see that any different :wink:
Since the LOVD endpoint has no gene information currently, I think this structure
"db_xref": { "GeneID": "1277", "HGNC": "2197", "select": "RefSeq" },
If you want to stick to Identifiers.org IDs for links, "GeneID" should be "ncbigene": https://registry.identifiers.org/registry/ncbigene
That self-documents your cross-references. HGNC:2197 translates into https://identifiers.org/hgnc:2197 which resolves directly to what you need. https://registry.identifiers.org/registry/hgnc
Note, for Ensembl data the GeneID will be an Ensembl gene.
That's not a good idea, in my opinion. (see below)
Why do we provide both HGNC and a source ID? Because mapping via gene symbol can be unreliable. Especially for Ensembl data.
Not using gene symbols is fine, I understand that. But considering Entrez an official "GeneID" without any reference specific to Entrez or the NCBI isn't technically correct as Entrez/NCBI isn't the authority on genes, the HGNC is. Althought the Ensembl IDs are obvious to humans because of their prefix, it's not immediately obvious what ID the Entrez Gene ID really is, especially if I wouldn't see the HGNC ID there, too. Best would be to have an NCBIGene
there, and an Ensembl
key if appropriate:
https://registry.identifiers.org/registry/ensembl
Another reason why the select
key doesn't belong there :grin:
I've been reading up a lot on how APIs ought to be more self-explanatory, and I really agree. The perfect API wouldn't need documentation other than its entry point. When using Identifier.org IDs allows you to easily create references without the need to specify them in the docs.
I agree that an individual transcript will be MANE, RefSeq of Null in terms of SELECT status.
No, actually, I propose something different. MANE, RefSeq, MANE+RefSeq, or null. Therefore better not to use one field but use multiple. Don't call it select
, but have a boolean for each set. select_mane
true or false; select_refseq
true or false. If ever another set would come by, it could simply be a new field, e.g., select_lsdbs
true or false.
(...) therefore it is likely that many genes will have something like the following (...)
NM_12345.3 - SELECT: MANE (Mapped to GRCh37 and GRCh38) NM_12345.2 - SELECT: RefSeq (Mapped to GRCh37 and GRCh38) # Pre-dates the MANE so is RefSeqSelect validly NM_12345.1 - SELECT: RefSeq (Mapped to GRCh37 only)
Totally fine!
This was behind my logic for the filtering MANE >> RefSeq >> Null but select longest transcript.
I just really wouldn't throw data away. Not mentioning that something is RefSeq select if it's MANE select discards data and I don't see any advantage doing that... Why not let the user decide what's important? Plus, it's inflexible if there will be more "sets". Validated sets, high-quality no-gap sets, Curated sets, whatever the future may bring.
However, if you record this data in LOVD databases, you can benefit from the speed improvements by specifying the exact transcript you are interested in. I am also looking into adding "SELECT" information to the gene2transcripts tool so a user can look-up in advance for the MANE transcript for a gene and just specify it.
Totally agree, we're no the same page; that's the same as I suggested earlier for automation in the Journal file interface :wink:
Sure, but what's the use case here? I thought it's for the new interface for journals, not for individual users using the Swagger interface (who can just submit variant-by-variant and not a set). Therefore, it doesn't need to get (clearly) documented in Swagger as usable for users typing in their variants, but it could be implemented for API usage by programs.
Nope the swagger users can submit batch data. Says so on LOVD but not on VV. The thing is, we just need a simple way of indicating that one variant starts and the other stops. This is used to split a string of variants into a list. Perhaps something like ||
. Keep it simple.
If you want to stick to Identifiers.org IDs for links, "GeneID" should be "ncbigene": https://registry.identifiers.org/registry/ncbigene
That self-documents your cross-references. HGNC:2197 translates into https://identifiers.org/hgnc:2197 which resolves directly to what you need. https://registry.identifiers.org/registry/hgnc
Thanks for this. I was not aware. Although these are the field names provided by Entrez, so perhaps tell them LOL! I'll take a look at identifiers.org.
No, actually, I propose something different. MANE, RefSeq, MANE+RefSeq, or null. Therefore better not to use one field but use multiple. Don't call it select, but have a boolean for each set. select_mane true or false; select_refseq true or false. If ever another set would come by, it could simply be a new field, e.g., select_lsdbs true or false.
I'm not sure whether MANE select superceeds RefSeq SELECT. Perhaps the pragmatic automated solution is to return all transcripts listed as MANE SELECT or REFSEQ select. That way we have provided sensible options and this is useful because some editors will be working with legacy transcripts and not realise there has been an update. Note: This can be overridden by the user and all transcripts will be returned. Or, as you know, only user-selected transcripts returned.
Note, for Ensembl data the GeneID will be an Ensembl gene.
That's not a good idea, in my opinion. (see below)
The issue is that an Ensembl gene is not equivalent to a RefSeq gene (entrez gene). So providing an Entrez ID for an ensembl record is pretty much meaningless. The structures are frequently very different. So I don't think you can assume that Ensembl view a particular gene the way that RefSeq would. Total pain!!!! Especially when you are hanling both RefSeq and Ensembl data.
Perhaps I could use the dentifiers.org keys, and have one for the ensembl gene Id and one for the ncbigene and the one not being used will be set to null.
To save me a headache, these keys will be changed server side :)
Actually, I will be re-building the database. Will do it in the hard code. Great timing.
Sure, but what's the use case here? I thought it's for the new interface for journals, not for individual users using the Swagger interface (who can just submit variant-by-variant and not a set). Therefore, it doesn't need to get (clearly) documented in Swagger as usable for users typing in their variants, but it could be implemented for API usage by programs.
Nope the swagger users can submit batch data.
I know they can (currently using the pipe), but are we discussing how to implement an efficient VV light for journals (where batch-submission will speed things up), or are we discussing helping Swagger users submit batches? I personally don't care for the latter but if you think/know they need this, then I understand you prefer a method using a single separator (using two chars will decrease possible issues indeed).
If you want to stick to Identifiers.org IDs for links, "GeneID" should be "ncbigene": https://registry.identifiers.org/registry/ncbigene
That self-documents your cross-references. HGNC:2197 translates into https://identifiers.org/hgnc:2197 which resolves directly to what you need. https://registry.identifiers.org/registry/hgnc
Thanks for this. I was not aware. Although these are the field names provided by Entrez, so perhaps tell them LOL! I'll take a look at identifiers.org.
I can imagine that Entrez calls their gene ID "GeneID" :laughing: When removed out of context and applied as a DB XRef, then it's a bit different :wink:
No, actually, I propose something different. MANE, RefSeq, MANE+RefSeq, or null. Therefore better not to use one field but use multiple. Don't call it select, but have a boolean for each set. select_mane true or false; select_refseq true or false. If ever another set would come by, it could simply be a new field, e.g., select_lsdbs true or false.
I'm not sure whether MANE select superceeds RefSeq SELECT. Perhaps the pragmatic automated solution is to return all transcripts listed as MANE SELECT or REFSEQ select. That way we have provided sensible options and this is useful because some editors will be working with legacy transcripts and not realise there has been an update. Note: This can be overridden by the user and all transcripts will be returned. Or, as you know, only user-selected transcripts returned.
So have all
or None
to select all transcripts, select
to only show MANE Select and RefSeq Select transcripts, or specifying IDs to select certain transcripts?
Note, for Ensembl data the GeneID will be an Ensembl gene.
That's not a good idea, in my opinion. (see below)
The issue is that an Ensembl gene is not equivalent to a RefSeq gene (entrez gene). So providing an Entrez ID for an ensembl record is pretty much meaningless.
I'm afraid I have to disagree, and all the resources I checked seem to disagree as well. HGNC, Ensembl, GeneCards, Entrez, they're all linking Ensembl ENSG
IDs to HGNC and Entrez Gene IDs...
Ensembl provides an FAQ entry on how to do that. GeneCards links Ensembl IDs to NCBI and HGNC IDs, and from each of these pages, you can find links to the other resources. The only link that seems to be missing is from Ensembl to Entrez, but Entrez to Ensembl, HGNC to Entrez and Ensembl, all those links exist and are given freely. Maybe you mean linking Ensembl ENST
IDs to RefSeq IDs?
The structures are frequently very different. So I don't think you can assume that Ensembl view a particular gene the way that RefSeq would. Total pain!!!! Especially when you are hanling both RefSeq and Ensembl data.
I know the gene structure can be different, but you can still identify which gene this Ensembl transcript belongs to using HGNC and Entrez gene IDs. I, for instance, don't store ENSG
IDs, so with just an ENST
and an ENSG
I can't identify the gene and I'd have to discard the Ensembl transcripts unless I'll start storing the ENSG
IDs as well.
Perhaps I could use the dentifiers.org keys, and have one for the ensembl gene Id and one for the ncbigene and the one not being used will be set to null.
I'd be good to know which genes are missing one of these IDs! When you can't make that link, yeah, the missing value could just be null
!
To save me a headache, these keys will be changed server side :)
Sounds good! Right now I do the matching based on the RefSeq transcripts so I don't think I'm parsing any of those fields now (I will when I'll be adapting more code to be using VV instead of Mutalyzer).
I know they can (currently using the pipe), but are we discussing how to implement an efficient VV light for journals (where batch-submission will speed things up), or are we discussing helping Swagger users submit batches? I personally don't care for the latter but if you think/know they need this, then I understand you prefer a method using a single separator (using two chars will decrease possible issues indeed).
Sorry, I should have said, we use the APIs extensively in teaching and training bioinformaticians and non bioinformaticians. They love the swagger interface because it lets them build queries, including batch queries. That's why I try to make sure it can do everything and simply. Hopefully you will see the benefit of this in the co-development meeting next week. Thats not to say we couldn't make the API handle purpose built structures for our specific needs. For now I'll use ||
and get Johan to add it to reserved fields so HGVS cannot use it.
Sounds good! Right now I do the matching based on the RefSeq transcripts so I don't think I'm parsing any of those fields now (I will when I'll be adapting more code to be using VV instead of Mutalyzer).
I know we knew this was happening but its great to hear it. Talk about a step forward :)
I'd be good to know which genes are missing one of these IDs! When you can't make that link, yeah, the missing value could just be null!
Good to know. I'm going to take a step back from the discussion and show you a new data-model I designed which I will implement in the new build and how to use it. This is useful because it will also form the basis of the variants in journals interface to the LOVD API (same functions, but will look pretty).
# Derived from an Ensembl transcript
{'db_xref': {'ensemblgene': 'ENSG00000243710', 'ncbigene': '149465', 'hgnc': 'HGNC:26485', 'CCDS': None, 'select': False}, 'chromosome': '1', 'map': '1p34.2', 'note': 'cilia and flagella associated protein 57', 'variant': '005'}
# Derived from a RefSeq Transcript
{'db_xref': {'hgnc': 'HGNC:26485', 'CCDS': 'CCDS72768.1', 'select': 'False', 'ncbigene': '149465', 'ensemblgene': 'ENSG00000243710'}, 'chromosome': '1', 'map': '1p34.2', 'note': 'cilia and flagella associated protein 57', 'variant': '5'}
For your purposes, you will switch the setting "transcript_model" = "refseq" and the following will be displayed
{'db_xref': {'hgnc': 'HGNC:26485', 'CCDS': 'CCDS72768.1', 'select': 'False', 'ncbigene': '149465'}, 'chromosome': '1', 'map': '1p34.2', 'note': 'cilia and flagella associated protein 57', 'variant': '5'}
Note, omits the Ensembl. I'll leave you to guess what happens when transcript_model is "ensembl" or "all" ;P
So have all or None to select all transcripts, select to only show MANE Select and RefSeq Select transcripts, or specifying IDs to select certain transcripts?
My thoughts entirely. Let's hash it out. How about
select_transcripts = None : will check genomic variant only
select_transcripts = Select : returns all transcripts which are either MANE or RefSeq select. (or Mane or Ensembl select)
My reasoning here is that many genes still have no mane select.
select_transcripts = all : No real explanation needed
select_transcripts = NM_12345.6|NM_7890.1 etc : Returns the requested transcript(s)
Do you want to tweak this or add options?
I know they can (currently using the pipe), but are we discussing how to implement an efficient VV light for journals (where batch-submission will speed things up), or are we discussing helping Swagger users submit batches? I personally don't care for the latter but if you think/know they need this, then I understand you prefer a method using a single separator (using two chars will decrease possible issues indeed).
Sorry, I should have said, we use the APIs extensively in teaching and training bioinformaticians and non bioinformaticians. They love the swagger interface because it lets them build queries, including batch queries. That's why I try to make sure it can do everything and simply. Hopefully you will see the benefit of this in the co-development meeting next week. Thats not to say we couldn't make the API handle purpose built structures for our specific needs. For now I'll use
||
and get Johan to add it to reserved fields so HGVS cannot use it.
Ah, clear, thanks! And ||
makes sense also for programmers :wink:
Sounds good! Right now I do the matching based on the RefSeq transcripts so I don't think I'm parsing any of those fields now (I will when I'll be adapting more code to be using VV instead of Mutalyzer).
I know we knew this was happening but its great to hear it. Talk about a step forward :)
I'd say a leap :grin:
# Derived from an Ensembl transcript {'db_xref': {'ensemblgene': 'ENSG00000243710', 'ncbigene': '149465', 'hgnc': 'HGNC:26485', 'CCDS': None, 'select': False}, 'chromosome': '1', 'map': '1p34.2', 'note': 'cilia and flagella associated protein 57', 'variant': '005'}
Cool!
Not sure if this matters, but my JSON parser didn't like None
or False
, but they can be replaced by null
and false
.
So... stop me if you'd like already, but with one select
field I can't see what's MANE Select and RefSeq Select, and it's not a database cross-reference :stuck_out_tongue_closed_eyes:
{'db_xref': {'hgnc': 'HGNC:26485', 'CCDS': 'CCDS72768.1', 'select': 'False', 'ncbigene': '149465'}, 'chromosome': '1', 'map': '1p34.2', 'note': 'cilia and flagella associated protein 57', 'variant': '5'}
Note, omits the Ensembl. I'll leave you to guess what happens when transcript_model is "ensembl" or "all" ;P
Haha clear, thanks! :grin:
So have all or None to select all transcripts, select to only show MANE Select and RefSeq Select transcripts, or specifying IDs to select certain transcripts?
My thoughts entirely. Let's hash it out. How about
select_transcripts = None : will check genomic variant only
So this will replace or imply checkonly=True
?
select_transcripts = Select : returns all transcripts which are either MANE or RefSeq select. (or Mane or Ensembl select) My reasoning here is that many genes still have no mane select. select_transcripts = all : No real explanation needed select_transcripts = NM_12345.6|NM_7890.1 etc : Returns the requested transcript(s)
Do you want to tweak this or add options?
Nope, it looks great! :+1:
I'd say a leap 😁
Definately, but clearly us brits are too modest 😁
Not sure if this matters, but my JSON parser didn't like None or False, but they can be replaced by null and false.
I couldn't be bothered to serialise the dictionary. Its Py dict not Json! Sorry
So... stop me if you'd like already, but with one select field I can't see what's MANE Select and RefSeq Select,
No no, it's a fair comment. However, I have not yet had chance to find out if there are situations where RefSeq have decided that a gene has a different MANE select to a RefSeq Select outside of the following instances
Therefore if select = MANE it's MANE select. If select = RefSeq it's RefSeq select. If its Null it's not select. I may be missing something though
and it's not a database cross-reference 😝
Take that up with RefSeq, it's their data 😝. True enough though, although I'd like to see a MANE database similar to the CCDS database which also shows the Ensembl SELECT and RefSeq SELECT for each gene. Are you particularly fussy about moving it? If not it will save me a headache because VV interactive site already calls it
So this will replace or imply checkonly=True?
Nope, look at the call
/VariantFormatter/variantformatter/{genome_build}/{variant_description}/{transcript_model}/{select_transcripts}/{checkonly}
we can use additional fields in {transcript_model}, {select_transcripts} and {checkonly} for our needs.
Any other functions you fancy? Just double checkin
Sorry, I overlooked this one!
I'd say a leap :grin:
Definately, but clearly us brits are too modest :grin:
Hahaha :laughing:
So... stop me if you'd like already, but with one select field I can't see what's MANE Select and RefSeq Select,
No no, it's a fair comment. However, I have not yet had chance to find out if there are situations where RefSeq have decided that a gene has a different MANE select to a RefSeq Select outside of the following instances (...) Therefore if select = MANE it's MANE select. If select = RefSeq it's RefSeq select. If its Null it's not select. I may be missing something though
If MANE and RefSeq select the same transcript, you can't indicate that with one field. If you choose to use "MANE" in such cases, overwriting "RefSeq", then users will need to read the docs to understand that "select": "MANE"
implies "select": "RefSeq"
UNLESS "select": "RefSeq"
is found in a different transcript assigned to the same gene UNLESS the user has selected certain transcripts and therefore doesn't know if they're maybe not seeing the RefSeq select transcript, in which case "select": "MANE"
COULD mean also RefSeq, or not, it's unknown. And, of course, when either MANE or RefSeq will select more than one transcript, nobody knows anything anymore, and you can't draw any conclusions at all.
and it's not a database cross-reference stuck_out_tongue_closed_eyes
Take that up with RefSeq, it's their data stuck_out_tongue_closed_eyes.
They don't put a select
field in a db_xref
array, do they? :open_mouth:
Are you particularly fussy about moving it? If not it will save me a headache because VV interactive site already calls it
No, if it's annoying for you to change, please leave it. It's more for whenever you're redesigning anyway.
So this will replace or imply checkonly=True?
Nope, look at the call
/VariantFormatter/variantformatter/{genome_build}/{variant_description}/{transcript_model}/{select_transcripts}/{checkonly}
we can use additional fields in {transcript_model}, {select_transcripts} and {checkonly} for our needs.
* {checkonly} = True will basically give the minimal response with no transcript info, * {checkonly} = Select could in my eyes return all transcripts where "Select" is not null, i.e. MANE, RefSeq, Ensembl
You mean select_transcripts
here, right?
* {checkonly} = Select and {transcript_model} = refseq in my eyes return all transcripts where "Select" is not null, i.e. MANE, RefSeq
Also here, right?
* {checkonly} = False and {select_transcripts} = populated will return mappings to a user-defined transcript(s)
Yup, makes sense!
Any other functions you fancy? Just double check-in
No, that's fine, but with this structure, some arguments are ignored when certain other values are set. e.g., checkonly
set to True
will ignore select_transcripts
(implying select_transcripts
= None
, actually) and select_transcripts
set to None
(as you wrote before) will imply checkonly
= True
, which is fine, but just should be documented. So:
/(...)/{variant_description}/{transcript_model}/None/True
is the same as
/(...)/{variant_description}/{transcript_model}/None/False
is the same as
/(...)/{variant_description}/{transcript_model}/None/Whatever
and
/(...)/{variant_description}/{transcript_model}/all/True
is the same as
/(...)/{variant_description}/{transcript_model}/NM_123456.1/True
is the same as
/(...)/{variant_description}/{transcript_model}/Whatever/True
They don't put a select field in a db_xref array, do they? 😮
Ah, no. it looks like I may have done it. I might make the database myself ;). I'm definately thinking about adding search terms for this database so we can quickly identify SELECT transcripts. Think that would benefit us both. Anyway, for now it can stay where it is. Will come back to it.
If MANE and RefSeq select the same transcript, you can't indicate that with one field. If you choose to use "MANE" in such cases, overwriting "RefSeq", then users will need to read the docs to understand that "select": "MANE" implies "select": "RefSeq" UNLESS "select": "RefSeq" is found in a different transcript assigned to the same gene UNLESS the user has selected certain transcripts and therefore doesn't know if they're maybe not seeing the RefSeq select transcript, in which case "select": "MANE" COULD mean also RefSeq, or not, it's unknown. And, of course, when either MANE or RefSeq will select more than one transcript, nobody knows anything anymore, and you can't draw any conclusions at all.
What I want to do then is keep the current key because it works as a top-level look as to whether a transcript has ever been selected as Canonical. I could also add keys for additional information. THIS IS THE VITAL BIT FOR FEEDBACK
mane_select: True/False
refseq_select: True/False
ensembl_select: True/False
I'll add these outside the db xref.
These keys will need to expand when MANE looks at anything outside of canonical (SELECT) transcripts. Or we come up with some selection strategy (still on our to-do-list)
Note, records are not described as RefSeq SELECT and MANE Select. Its one or the other (so far as I have seen)
/VariantFormatter/variantformatter/{genome_build}/{variant_description}/{transcript_model}/{select_transcripts}/{checkonly}
Assuming I am remembering correctly, if {checkonly} is True, then no transcript mappings are performed, regardless of whether transcripts are specified
checkonly *string(path) | Accepted:True (return ONLY the genomic variant descriptions and not transcript and protein descriptions)Falsetx (Stop at transcript level, exclude protein) -- | --
So this will mean that select_transcripts is ignored if checkonly is True because no transcript mappings will be performed
Looking at the calls again and reviewing based on comments from @ifokkema
/VariantFormatter/variantformatter/{genome_build}/{variant_description}/{transcript_model}/{select_transcripts}/{checkonly}
{checkonly} = True will basically give the minimal response with no transcript info i.e. will only return genomic description
{checkonly} = False and {select_transcripts} = mane_select will return transcripts where the new mane_select = True Extrapolate out for Ensembl and RefSeqSelect
{checkonly} = False and {select_transcripts} = populated with transcript IDs will return mappings to a user-defined transcript(s)
{transcript_model} = refseq will only return RefSeq transcripts
Does this look better?
We will need to model these calls when we build the interface and provide clear guidance in the API manuals for API users. Creating a user-friendly interface will help us model the calls. Always best to build it around what users ask for
I am probably confused about what is being asked here.
I cannot really comment without knowing what the possible formats of {variant_description}
might be. If it's a transcript-based description, I'd probably be a bit confused to receive back a genomic description. Call me naïve, but i would expect {checkonly}
= True to trigger a check and return an indication of whether the syntax and the content of the description are each individually correct.
All the accepted formats are documented in Swagger on the API site. LOVD endpoint.
https://rest.variantvalidator.org/
{checkonly} = True does check the genomic description which is submitted and returns the corrected descriptions where necessary. The LOVD endpoint only accepts genomic descriptions (VCF or HGVS). {checkonly} = True performs checks at the genomic description level (also transforms from VCF to HGVS and vice versa but blocks mapping to transcripts.
Here is the output for a vcf description
{
"17-50198002-C-A": {
"17-50198002-C-A": {
"g_hgvs": "NC_000017.11:g.50198002C>A",
"genomic_variant_error": null,
"hgvs_t_and_p": null,
"p_vcf": "17-50198002-C-A",
"selected_build": "GRCh38"
},
"errors": [],
"flag": null
},
"metadata": {
"seqrepo_db": "/local/seqrepo/2018-08-21",
"uta_schema": "uta_20180821",
"variantformatter_version": "1.0.2.dev26+gcbd8fb1",
"variantvalidator_hgvs_version": "1.2.5.vv1",
"variantvalidator_version": "1.0.4.dev125+gd3e875e"
}
}
Just to clarify the Select status.
So far, the only instance I have seen where MANE Select transcripts and RefSeq select transcripts for a gene have been to do with versions that pre-date the MANE project.
Example https://www.ncbi.nlm.nih.gov/nuccore/NM_007294.4
variant 1, mRNA.
ACCESSION NM_007294
VERSION NM_007294.4
KEYWORDS RefSeq; MANE Select.
https://www.ncbi.nlm.nih.gov/nuccore/NM_007294.3
ACCESSION NM_007294
VERSION NM_007294.3
KEYWORDS RefSeq; RefSeq Select.
*Note, the KEYWORDS refer to "source, Select status". If a transcript is not Select (see below) this flag is not present
Records do not state whether transcripts are MANE and RefSeqSelect. At least I have not seen any. It is possible two up-to-date transcripts could conflict with RefSeq Select and Mane select disagreeing.
https://www.ncbi.nlm.nih.gov/nuccore/NM_007297.4
ACCESSION NM_007297
VERSION NM_007297.4
KEYWORDS RefSeq.
If anyone spots anything that contradicts this, please let me know.
The question is, if we see MANE select, do we assume this is also RefSeq Select. I do not like assuming so I would say no, but I would encourage users to search for MANE Select or RefSeq Select, not Mane Select only or RefSeq Select only otherwise they will most likely accidentally omit data, i.e. if they state RefSeq select only, they return only old transcript versions, if they say MANE Select only, for many genes there will be no response. This isn't easy.
Don't get me started on the Ensembl flags!!!!
According to https://www.ncbi.nlm.nih.gov/refseq/refseq_select/#MANE
Mane Select over-writes RefSeq Select
Look in section https://www.ncbi.nlm.nih.gov/refseq/refseq_select/#relationship-between-refseq-sele
The MANE Select dataset is a subset of RefSeq Select. For a given human protein-coding gene, the RefSeq Select transcript is designated as the ‘MANE Select’ when it matches the Ensembl ‘Select’ transcript and is included in the public MANE set. In the case of MANE Select transcripts, the keyword ‘RefSeq Select’ is replaced by ‘MANE Select’ in the RefSeq flatfile
This is not a useful way of doing things because it suggests we cannot assume that MANE Select and RefSeq Selct are the same. It's easy for Ensembl because they give a designated flag "is_canonical" for every transcript which is separate data from the MANE project information. I need to find a gene where the MANE Select and the RefSeq Select disagree and the transcripts are both the latest versions!!!!
Hey, sorry about the late reply, I ignored GitHub for a bit to focus on the CZI application.
If MANE and RefSeq select the same transcript, you can't indicate that with one field. If you choose to use "MANE" in such cases, overwriting "RefSeq", then users will need to read the docs to understand that "select": "MANE" implies "select": "RefSeq" UNLESS "select": "RefSeq" is found in a different transcript assigned to the same gene UNLESS the user has selected certain transcripts and therefore doesn't know if they're maybe not seeing the RefSeq select transcript, in which case "select": "MANE" COULD mean also RefSeq, or not, it's unknown. And, of course, when either MANE or RefSeq will select more than one transcript, nobody knows anything anymore, and you can't draw any conclusions at all.
What I want to do then is keep the current key because it works as a top-level look as to whether a transcript has ever been selected as Canonical. I could also add keys for additional information. THIS IS THE VITAL BIT FOR FEEDBACK
mane_select: True/False refseq_select: True/False ensembl_select: True/False
I'll add these outside the db xref.
These will be perfect, and they will solve all possible issues with unclarities/uncertainties in select sets!
These keys will need to expand when MANE looks at anything outside of canonical (SELECT) transcripts. Or we come up with some selection strategy (still on our to-do-list)
Yes, we indeed still have that on our to-do list. This is the flexible approach that will always work.
Note, records are not described as RefSeq SELECT and MANE Select. Its one or the other (so far as I have seen)
But we can set refseq_select
to true whenever mane_select
is true, right?
/VariantFormatter/variantformatter/{genome_build}/{variant_description}/{transcript_model}/{select_transcripts}/{checkonly}
Assuming I am remembering correctly, if {checkonly} is True, then no transcript mappings are performed, regardless of whether transcripts are specified (...) So this will mean that select_transcripts is ignored if checkonly is True because no transcript mappings will be performed
Exactly, yes.
Looking at the calls again and reviewing based on comments from @ifokkema
/VariantFormatter/variantformatter/{genome_build}/{variant_description}/{transcript_model}/{select_transcripts}/{checkonly}
* {checkonly} = True will basically give the minimal response with no transcript info i.e. will only return genomic description * {checkonly} = False and {select_transcripts} = mane_select will return transcripts where the new mane_select = True _Extrapolate out for Ensembl and RefSeqSelect_
Note, will passing refseq_select
also return all mane_select
transcripts? Or should we pass refseq_select|mane_select
to get all RefSeq select transcripts? I prefer the former. As MANE is a subset of RefSeq, refseq_select
should be true for all mane_select
transcripts.
* {checkonly} = False and {select_transcripts} = populated with transcript IDs will return mappings to a user-defined transcript(s) * {transcript_model} = refseq will only return RefSeq transcripts
Does this look better?
Yes, this matches the behavior well!
Records do not state whether transcripts are MANE and RefSeqSelect. At least I have not seen any. It is possible two up-to-date transcripts could conflict with RefSeq Select and Mane select disagreeing.
This could also happen when RefSeq select is expanded to two transcripts per gene, but Ensembl doesn't do this and thus MANE doesn't annotate the other transcript. Then you'll have a MANE select transcript, and a RefSeq select transcript (although in reality, the RefSeq select transcript also includes the MANE select transcript).
According to https://www.ncbi.nlm.nih.gov/refseq/refseq_select/#MANE (...)
The MANE Select dataset is a subset of RefSeq Select. For a given human protein-coding gene, the RefSeq Select transcript is designated as the ‘MANE Select’ when it matches the Ensembl ‘Select’ transcript and is included in the public MANE set. In the case of MANE Select transcripts, the keyword ‘RefSeq Select’ is replaced by ‘MANE Select’ in the RefSeq flatfile
I found the same, indeed!
It's easy for Ensembl because they give a designated flag "is_canonical" for every transcript which is separate data from the MANE project information.
More logical, indeed. And we can do the same by adding the refseq_select
flag automatically for each mane_select
transcript.
I need to find a gene where the MANE Select and the RefSeq Select disagree and the transcripts are both the latest versions!!!!
If VVTA is storing this, a database query would provide this list? I just checked CDKN2A as a quick check, but it has only one RefSeq select transcript which is insane. Otherwise we could have a look at genes with multiple transcripts in their LRGs... :thinking:
VVTA currently does not store tags or subset information that could allow this kind of query. Adding this would require another change to the transcript, table structure and derived views, as well as the addition of more database management functions to the VVTA data loading script. It would also require decisions around some 'interesting' trade offs on the subject of flexibility/forward compatibility versus performance for getting subsets/subset tags or getting transcripts marked with specific tags. On the other hand if you want a subset database containing just the transcripts within any specific subset, that can be done, with the tooling as is.
VVTA is not storing information about Select transcripts, but I am storing it in the validator database. We will likely combine this with VVTA at a later date. So, a search for the flags will be made available. I do not want to alter the current VVTA to store this info.
But we can set refseq_select to true whenever mane_select is true, right?
This is the kicker
The difficulty is that there is no mechanism to tell if a transcript is RefSeq Select of Mane Select. I have only ever seen transcripts that are either listed as MANE or RefSeq. Note: This links into the following comment I just checked CDKN2A as a quick check, but it has only one RefSeq select transcript which is insane
. This is as I would expect. Select means Canonical. There should only be one Canonical transcript per gene. MANE is going to provide a list of additional transcripts in the long run (MANE + CLINICAL or similar). In terms of RefSeq, there is the Select which is the canonical and all NM_ transcripts. Same for Ensmebl really, but we have to apply our own quality metrics here because most transcripts in the Ensembl genes fail QC. So This brings me back to here These keys will need to expand when MANE looks at anything outside of canonical (SELECT) transcripts. Or we come up with some selection strategy (still on our to-do-list)
. Personally, I think that we could provide this in addition to MANE + Clinical based on info gathered from clinical labs we deal with (ideally within the NIHR grant).
Anyway, I really need to build the database so I can get this stuff up and running. Consequently, I am going to apply the following assumptions until it is disproven.
If transcript is MANE Select:
mane_select = True
refseq_select = True
elif transcript is RefSeq Select:
mane_select = False
refseq_select = True
Once the database is built, we can take a look at the data but I predict that for genes where there is a MANE Select and a separate non MANE RefSeq SELECT, the RefSeq SELECT will be a previous version of the newly created transcript that is now MANE Select (e.g. removed the poly A tail or something)
P.S. The Mane + Clinical flag id available through the Ensembl APIs. I am adding a keyword to capture this.
I have not yet seen a record which is annotated with this data
I do not yet know how RefSeq will display this data.
All a bit of a mess at the moment. This database we are building could become very useful
ENST00000252934.10
{'db_xref': {'ensemblgene': 'ENSG00000130638', 'ncbigene': None, 'CCDS': None, 'select': 'MANE', 'hgnc': 'HGNC:10549'}, 'chromosome': '22', 'map': '22q13.31', 'note': 'ataxin 10', 'variant': '201', 'mane_select': True, 'mane_plus_clinical': False, 'ensembl_select': True, 'refseq_select': False}
NM_013236.4
{'db_xref': {'CCDS': 'CCDS14070.1', 'select': 'MANE', 'ncbigene': '25814', 'ensemblgene': None, 'hgnc': 'HGNC:10549'}, 'chromosome': '22', 'map': '22q13.31', 'note': 'ataxin 10', 'variant': '1', 'mane_select': True, 'refseq_select': True, 'ensembl_select': False, 'mane_plus_clinical': False}
Here are some example jsons for current transcripts and the previous version
Updating - ENST00000252934.10
{
"db_xref": {
"ensemblgene": "ENSG00000130638",
"ncbigene": null,
"CCDS": null,
"select": "MANE",
"hgnc": "HGNC:10549"
},
"chromosome": "22",
"map": "22q13.31",
"note": "ataxin 10",
"variant": "201",
"mane_select": true,
"mane_plus_clinical": false,
"ensembl_select": true,
"refseq_select": false
}
Updating - ENST00000252934.5
{
"db_xref": {
"ensemblgene": "ENSG00000130638",
"ncbigene": null,
"CCDS": null,
"select": "Ensembl",
"hgnc": "10549"
},
"chromosome": "22",
"map": "22q13.31",
"note": "ataxin 10",
"variant": "001",
"mane_select": false,
"mane_plus_clinical": false,
"ensembl_select": true,
"refseq_select": false
}
Updating - NM_013236.4
{
"db_xref": {
"CCDS": "CCDS14070.1",
"select": "MANE",
"ncbigene": "25814",
"ensemblgene": null,
"hgnc": "HGNC:10549"
},
"chromosome": "22",
"map": "22q13.31",
"note": "ataxin 10",
"variant": "1",
"mane_select": true,
"refseq_select": true,
"ensembl_select": false,
"mane_plus_clinical": false
}
Updating - NM_013236.3
{
"db_xref": {
"CCDS": "CCDS14070.1",
"select": "False",
"ncbigene": "25814",
"ensemblgene": null,
"hgnc": "HGNC:10549"
},
"chromosome": "22",
"map": "22q13.31",
"note": "ataxin 10",
"variant": "1",
"ensembl_select": false,
"mane_plus_clinical": false
}
I've just noticed the refseq_select is missing for NM_013236.3
fixing this now I'm also going to dig because it looks like NM_013236 was not the RefSeqSect prior to MANE. So what was? To be continued with more examples
Update, there is no transcript labelled RefSeq Select for this gene, only a MANE Select
Updating - NM_013236.3
{
"db_xref": {
"CCDS": "CCDS14070.1",
"select": "False",
"ncbigene": "25814",
"ensemblgene": null,
"hgnc": "HGNC:10549"
},
"chromosome": "22",
"map": "22q13.31",
"note": "ataxin 10",
"variant": "1",
"refseq_select": false,
"ensembl_select": false,
"mane_plus_clinical": false
}
Examples for COL1A1
Updating - NM_000088.4
{
"db_xref": {
"CCDS": "CCDS11561.1",
"select": "MANE",
"ncbigene": "1277",
"ensemblgene": null,
"hgnc": "HGNC:2197"
},
"chromosome": "17",
"map": "17q21.33",
"note": "collagen type I alpha 1 chain",
"variant": "0",
"refseq_select": true,
"mane_select": true,
"ensembl_select": false,
"mane_plus_clinical": false
}
Updating - NM_000088.3
{
"db_xref": {
"CCDS": "CCDS11561.1",
"select": "RefSeq",
"ncbigene": "1277",
"ensemblgene": null,
"hgnc": "HGNC:2197"
},
"chromosome": "17",
"map": "17q21.33",
"note": "collagen type I alpha 1 chain",
"variant": "0",
"refseq_select": true,
"ensembl_select": false,
"mane_plus_clinical": false
}
In this instance, the previous version i.e. .3 is RefSeq Select
The difficulty is that there is no mechanism to tell if a transcript is RefSeq Select of Mane Select. I have only ever seen transcripts that are either listed as MANE or RefSeq. Note: This links into the following comment
I just checked CDKN2A as a quick check, but it has only one RefSeq select transcript which is insane
. This is as I would expect. Select means Canonical. There should only be one Canonical transcript per gene.
Canonical to what? What tissue? What phenotype? I'm pretty sure they'll find themselves getting lots of requests for multiple select transcripts because lots of genes have multiple clinically relevant transcripts, there isn't always just one. I am hoping they just thought it was easiest to start with one, but they do know that there shouldn't be just one - there may be just one.
MANE is going to provide a list of additional transcripts in the long run (MANE + CLINICAL or similar).
All the more reason to have the separate flags, so, good!
If transcript is MANE Select: mane_select = True refseq_select = True elif transcript is RefSeq Select: mane_select = False refseq_select = True
Sounds good!
P.S. The Mane + Clinical flag id available through the Ensembl APIs. I am adding a keyword to capture this.
I have not yet seen a record which is annotated with this data
But good to include already, to be prepared!
Updating - NM_013236.3
{ "db_xref": { "CCDS": "CCDS14070.1", "select": "False",
Shouldn't this be false
instead of "False"
?
"ncbigene": "25814", "ensemblgene": null, "hgnc": "HGNC:10549" }, "chromosome": "22", "map": "22q13.31", "note": "ataxin 10", "variant": "1", "refseq_select": false, "ensembl_select": false, "mane_plus_clinical": false }
mane_select
is missing here, it's better to define it and set it to false
, no?
Updating - NM_000088.3
{ "db_xref": { "CCDS": "CCDS11561.1", "select": "RefSeq", "ncbigene": "1277", "ensemblgene": null, "hgnc": "HGNC:2197" }, "chromosome": "17", "map": "17q21.33", "note": "collagen type I alpha 1 chain", "variant": "0", "refseq_select": true, "ensembl_select": false, "mane_plus_clinical": false }
Also here, mane_select
is suddenly missing, where for other examples it was defined but set to false
.
It all looks great, though! :grin:
So 2 days into building the database you spot issues!!! AAAAAAARGH Haha.
I need to check. I may well have corrected these.
Canonical to what? What tissue? What phenotype? I'm pretty sure they'll find themselves getting lots of requests for multiple select transcripts because lots of genes have multiple clinically relevant transcripts, there isn't always just one. I am hoping they just thought it was easiest to start with one, but they do know that there shouldn't be just one - there may be just one.
Speak to Ensembl and RefSeq. This is part of the process of defining a Select or alternatively canoonical transctipt. Not that I agree with such things ;)
Nope. I missed tat the mane_select key was missing. Killing the database update.
I'm gonna find a beer!!!!
All fixed. Thanks for spotting @ifokkema. Really appreciate the eye for detail. Think I'm slipping a bit Haha :)
Update to new VVTA going very well. I have time to try get the new dev server working while this database builds.
Sorry for all the stress late last week :sweat_smile: I hope you got yourself that beer! :laughing:
I waited. Pubs have opened again for outdoor service. Think I'll pop in tonight :)P
The database is still building. Takes ages because it is API driven, but all other components in place.
I've ordered up a new dev server. Soon as it's up and running I'll let you know.
I'm also updating the docker containers so that containers can all be accessed externally. This means that docker can be used to completely configure a containerised version of VV (also VV API). However, you can now configure the database ports that VV uses so use the docker database builds to power a local install of VV. Saves users having to install MySQL, Postgres and SQL:ite
Exciting developments on the horizon.
I waited. Pubs have opened again for outdoor service. Think I'll pop in tonight :)P
Haha great! We, unfortunately, need to wait longer... End of April at least!
I'm also updating the docker containers so that containers can all be accessed externally. This means that docker can be used to completely configure a containerised version of VV (also VV API). However, you can now configure the database ports that VV uses so use the docker database builds to power a local install of VV. Saves users having to install MySQL, Postgres and SQL:it
Sounds great! Furthermore, it stops the container from interfering when the user does actually already have existing installations of MySQL or Postgres :wink: Looking forward to it!
Sounds great! Furthermore, it stops the container from interfering when the user does actually already have existing installations of MySQL or Postgres 😉
Exactly. The config builds Postgres on 54320 and MySQL on 33050. Can't test it till the database is built, but expect to release soon.
This is superceeded by Aries EM integration
Opening an issue to plan the integration.
Inviting comment from @leicray @John-F-Wagstaff @ifokkema.
Note. I will be starting this process next week (22nd/23rd March) and will use this issue to log progress and take feedback from you all.
Cheers