Closed ktsirigos closed 3 years ago
Example of the new evidence. A few disclaimers:
id
).w2 = Window.partitionBy("sourceId").orderBy(col("score").desc())
out = df.withColumn("row", row_number().over(w2)).filter(col("row") == 1).drop("row")
out.write.json("/Users/ochoa/exampleEvidence.json")
@ktsirigos you can try to find the same evidence in the old pipeline (ask @AsierGonzalez if you don't know how to do it). @ireneisdoomed, we should iterate further on the desired evidence.
A note here: the next evidence corresponds to `uniprot_literature`.
{
"diseaseFromSourceId": "Orphanet_98977",
"resourceScore": 1,
"literature": [
"12860809",
"17499207",
"10798654",
"9345106",
"9792882",
"12189160",
"9490287",
"15534471",
"25524706",
"15025728",
"10873982",
"12356829",
"10980537",
"12362081",
"9005853",
"9328473",
"9863594",
"9361308",
"12442283",
"12872267",
"10196380",
"9510647",
"9535666",
"11004290",
"10644174",
"10330365",
"11774072",
"9521427",
"10340788",
"17210859",
"10819638",
"10916185",
"15255110",
"16401791",
"9697688",
"15795224"
],
"targetId": "ENSG00000034971",
"targetFromSourceId": "Q99972",
"datasourceId": "uniprot_literature",
"datatypeId": "genetic_literature",
"diseaseFromSource": "Glaucoma 1, open angle, A",
"diseaseId": "Orphanet_98977",
"studyId": "Q99972#pathology_and_biotech",
"id": "41531baa86e90e7ae31d7b41176aa754777bb7d3",
"score": 1,
"sourceId": "uniprot_literature"
}
This is the original submitted evidence string in the 20.11 UniProt evidence file:
jq-1.6 'select(.unique_association_fields.target == "http://identifiers.org/uniprot/Q99972" and .unique_association_fields.disease_uri == "http://www.orpha.net/ORDO/Orphanet_98977" and .disease.name == "Glaucoma 1, open angle, A")' cttv011-20-10-2020.json
{
"sourceID": "uniprot_literature",
"access_level": "public",
"validated_against_schema_version": "1.7.3",
"unique_association_fields": {
"target": "http://identifiers.org/uniprot/Q99972",
"disease_acronym": "GLC1A",
"disease_uri": "http://www.orpha.net/ORDO/Orphanet_98977"
},
"target": {
"target_type": "http://identifiers.org/cttv.target/protein_evidence",
"activity": "http://identifiers.org/cttv.activity/up_or_down",
"id": "http://identifiers.org/uniprot/Q99972"
},
"disease": {
"id": "http://www.orpha.net/ORDO/Orphanet_98977",
"name": "Glaucoma 1, open angle, A",
"acronym": "GLC1A"
},
"literature": {
"references": [
{
"lit_id": "http://europepmc.org/abstract/MED/12860809"
},
{
"lit_id": "http://europepmc.org/abstract/MED/17499207"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10798654"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9345106"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9792882"
},
{
"lit_id": "http://europepmc.org/abstract/MED/12189160"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9490287"
},
{
"lit_id": "http://europepmc.org/abstract/MED/15534471"
},
{
"lit_id": "http://europepmc.org/abstract/MED/25524706"
},
{
"lit_id": "http://europepmc.org/abstract/MED/15025728"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10873982"
},
{
"lit_id": "http://europepmc.org/abstract/MED/12356829"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10980537"
},
{
"lit_id": "http://europepmc.org/abstract/MED/12362081"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9005853"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9328473"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9863594"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9361308"
},
{
"lit_id": "http://europepmc.org/abstract/MED/12442283"
},
{
"lit_id": "http://europepmc.org/abstract/MED/12872267"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10196380"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9510647"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9535666"
},
{
"lit_id": "http://europepmc.org/abstract/MED/11004290"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10644174"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10330365"
},
{
"lit_id": "http://europepmc.org/abstract/MED/11774072"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9521427"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10340788"
},
{
"lit_id": "http://europepmc.org/abstract/MED/17210859"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10819638"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10916185"
},
{
"lit_id": "http://europepmc.org/abstract/MED/15255110"
},
{
"lit_id": "http://europepmc.org/abstract/MED/16401791"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9697688"
},
{
"lit_id": "http://europepmc.org/abstract/MED/15795224"
}
]
},
"type": "genetic_literature",
"evidence": {
"resource_score": {
"type": "probability",
"value": 1
},
"date_asserted": "2020-10-06T23:00:00Z",
"is_associated": true,
"provenance_type": {
"literature": {
"references": [
{
"lit_id": "http://europepmc.org/abstract/MED/12860809"
},
{
"lit_id": "http://europepmc.org/abstract/MED/17499207"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10798654"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9345106"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9792882"
},
{
"lit_id": "http://europepmc.org/abstract/MED/12189160"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9490287"
},
{
"lit_id": "http://europepmc.org/abstract/MED/15534471"
},
{
"lit_id": "http://europepmc.org/abstract/MED/25524706"
},
{
"lit_id": "http://europepmc.org/abstract/MED/15025728"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10873982"
},
{
"lit_id": "http://europepmc.org/abstract/MED/12356829"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10980537"
},
{
"lit_id": "http://europepmc.org/abstract/MED/12362081"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9005853"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9328473"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9863594"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9361308"
},
{
"lit_id": "http://europepmc.org/abstract/MED/12442283"
},
{
"lit_id": "http://europepmc.org/abstract/MED/12872267"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10196380"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9510647"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9535666"
},
{
"lit_id": "http://europepmc.org/abstract/MED/11004290"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10644174"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10330365"
},
{
"lit_id": "http://europepmc.org/abstract/MED/11774072"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9521427"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10340788"
},
{
"lit_id": "http://europepmc.org/abstract/MED/17210859"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10819638"
},
{
"lit_id": "http://europepmc.org/abstract/MED/10916185"
},
{
"lit_id": "http://europepmc.org/abstract/MED/15255110"
},
{
"lit_id": "http://europepmc.org/abstract/MED/16401791"
},
{
"lit_id": "http://europepmc.org/abstract/MED/9697688"
},
{
"lit_id": "http://europepmc.org/abstract/MED/15795224"
}
]
},
"database": {
"id": "uniprot",
"version": "2020_05"
}
},
"evidence_codes": [
"http://purl.obolibrary.org/obo/ECO_0000205"
],
"urls": [
{
"nice_name": "Further details in UniProt database",
"url": "http://www.uniprot.org/uniprot/Q99972#pathology_and_biotech"
}
]
}
}
New evidence for `uniprot_literature`:
{
"datasourceId" : "uniprot_literature",
"datatypeId" : "genetic_literature",
"diseaseFromSource" : "Glaucoma 1, open angle, A",
"diseaseFromSourceMappedId" : "Orphanet_98977",
"literature" : [
"12860809",
"17499207",
"10798654",
"9345106",
"9792882",
"12189160",
"9490287",
"15534471",
"25524706",
"15025728",
"10873982",
"12356829",
"10980537",
"12362081",
"9005853",
"9328473",
"9863594",
"9361308",
"12442283",
"12872267",
"10196380",
"9510647",
"9535666",
"11004290",
"10644174",
"10330365",
"11774072",
"9521427",
"10340788",
"17210859",
"10819638",
"10916185",
"15255110",
"16401791",
"9697688",
"15795224"
],
"targetFromSourceId" : "Q99972",
"targetModulation" : "up_or_down"
}
In addition to the information contained in the old evidence:
- `confidence`: field based on internal criteria: "high" or "medium".

Please note: this only applies to uniprot_literature; we have yet to decide whether we are definitely dropping uniprot. Once that analysis is done, we will do the same comparison for this datatype.
As agreed in #1141, we need an example of old and new evidence for the `uniprot` datasource. I vote to rename it `uniprot_variants` to avoid confusion from now on.
Example of an old evidence string from `uniprot_variants`:
{
"sourceID": "uniprot",
"access_level": "public",
"validated_against_schema_version": "1.6.7",
"unique_association_fields": {
"target": "Q14896",
"disease_acronym": "CMD1MM",
"disease_uri": "http://www.orpha.net/ORDO/Orphanet_217607",
"uniprot_release": "2020_04",
"variant_id": "VAR_070455",
"dbSnps": "rs397514751",
"alleleOrigin": "germline"
},
"target": {
"target_type": "http://identifiers.org/cttv.target/protein_evidence",
"activity": "http://identifiers.org/cttv.activity/up_or_down",
"id": "http://identifiers.org/uniprot/Q14896"
},
"disease": {
"id": "http://www.orpha.net/ORDO/Orphanet_217607",
"name": "Cardiomyopathy, dilated 1MM",
"acronym": "CMD1MM"
},
"type": "genetic_association",
"variant": {
"id": "http://identifiers.org/dbsnp/rs397514751",
"type": "snp single"
},
"evidence": {
"gene2variant": {
"date_asserted": "2020-08-11T23:00:00Z",
"is_associated": true,
"provenance_type": {
"literature": {
"references": [
{
"lit_id": "http://europepmc.org/abstract/MED/20215591"
}
]
},
"database": {
"id": "uniprot",
"version": "2020_04"
}
},
"evidence_codes": [
"http://purl.obolibrary.org/obo/ECO_0000205"
],
"functional_consequence": "http://purl.obolibrary.org/obo/SO_0001583",
"urls": [
{
"nice_name": "Further details in UniProt database",
"url": "http://www.uniprot.org/uniprot/Q14896#pathology_and_biotech"
}
]
},
"variant2disease": {
"resource_score": {
"type": "probability",
"value": 1.0
},
"date_asserted": "2020-08-11T23:00:00Z",
"is_associated": true,
"unique_experiment_reference": "http://europepmc.org/abstract/MED/20215591",
"provenance_type": {
"literature": {
"references": [
{
"lit_id": "http://europepmc.org/abstract/MED/20215591"
}
]
},
"database": {
"id": "uniprot",
"version": "2020_04"
}
},
"gwas_panel_resolution": 1,
"gwas_sample_size": 1,
"evidence_codes": [
"http://purl.obolibrary.org/obo/ECO_0000205"
],
"urls": [
{
"nice_name": "Further details in UniProt database",
"url": "http://www.uniprot.org/uniprot/Q14896#pathology_and_biotech"
},
{
"nice_name": "Published reference",
"url": "http://europepmc.org/abstract/MED/20215591"
}
]
}
}
}
Example of a new evidence string from `uniprot_variants`:
{
"datasourceId" : "uniprot_variants",
"datatypeId" : "genetic_association",
"diseaseFromSource" : "Cardiomyopathy, dilated 1MM",
"diseaseFromSourceMappedId" : "Orphanet_217607",
"functionalConsequenceId" : "SO_0001583",
"literature" : [
"20215591"
],
"targetFromSourceId" : "Q14896",
"targetModulation" : "up_or_down",
"variantRsId" : "rs397514751"
}
- `diseaseFromSourceId`: include if available in the resource.
- `confidence`: include a confidence field based on internal criteria: "high" or "medium".
- `variantId`: with the chrom_pos_ref_alt notation.

The document to submit to the provider has been reviewed and can be consulted here.
I think UniProt requires a specific communication of the decision to include `uniprot_variants` as a data source.
Provide an example of UniProt evidence showing how it looks with the current JSON schema and how it should look when they use the new schema.