opentargets / issues

Issue tracker for Open Targets Platform and Open Targets Genetics Portal
https://platform.opentargets.org https://genetics.opentargets.org
Apache License 2.0
12 stars 2 forks source link

I would like to have an example for Uniprot for the new JSON schema #1338

Closed ktsirigos closed 3 years ago

ktsirigos commented 3 years ago

Provide an example evidence from Uniprot of how it looks with the current JSON schema and how it should look when they use the new schema.

d0choa commented 3 years ago

Example of the new evidence. A few disclaimers:

w2 = Window.partitionBy("sourceId").orderBy(col("score").desc())
out = df.withColumn("row", row_number().over(w2)).filter(col("row") == 1).drop("row")
out.write.json("/Users/ochoa/exampleEvidence.json")

@ktsirigos you can try to find the same evidence in the old pipeline (ask @AsierGonzalez if you don't know how to do it). @ireneisdoomed, we should iterate on the desired evidence further.

A note here. The next evidence corresponds to uniprot_literature.

{
  "diseaseFromSourceId": "Orphanet_98977",
  "resourceScore": 1,
  "literature": [
    "12860809",
    "17499207",
    "10798654",
    "9345106",
    "9792882",
    "12189160",
    "9490287",
    "15534471",
    "25524706",
    "15025728",
    "10873982",
    "12356829",
    "10980537",
    "12362081",
    "9005853",
    "9328473",
    "9863594",
    "9361308",
    "12442283",
    "12872267",
    "10196380",
    "9510647",
    "9535666",
    "11004290",
    "10644174",
    "10330365",
    "11774072",
    "9521427",
    "10340788",
    "17210859",
    "10819638",
    "10916185",
    "15255110",
    "16401791",
    "9697688",
    "15795224"
  ],
  "targetId": "ENSG00000034971",
  "targetFromSourceId": "Q99972",
  "datasourceId": "uniprot_literature",
  "datatypeId": "genetic_literature",
  "diseaseFromSource": "Glaucoma 1, open angle, A",
  "diseaseId": "Orphanet_98977",
  "studyId": "Q99972#pathology_and_biotech",
  "id": "41531baa86e90e7ae31d7b41176aa754777bb7d3",
  "score": 1,
  "sourceId": "uniprot_literature"
}
AsierGonzalez commented 3 years ago

This is the original submitted evidence string in the 20.11 UniProt evidence file:

jq-1.6 'select(.unique_association_fields.target == "http://identifiers.org/uniprot/Q99972" and .unique_association_fields.disease_uri == "http://www.orpha.net/ORDO/Orphanet_98977" and .disease.name == "Glaucoma 1, open angle, A")' cttv011-20-10-2020.json
{
  "sourceID": "uniprot_literature",
  "access_level": "public",
  "validated_against_schema_version": "1.7.3",
  "unique_association_fields": {
    "target": "http://identifiers.org/uniprot/Q99972",
    "disease_acronym": "GLC1A",
    "disease_uri": "http://www.orpha.net/ORDO/Orphanet_98977"
  },
  "target": {
    "target_type": "http://identifiers.org/cttv.target/protein_evidence",
    "activity": "http://identifiers.org/cttv.activity/up_or_down",
    "id": "http://identifiers.org/uniprot/Q99972"
  },
  "disease": {
    "id": "http://www.orpha.net/ORDO/Orphanet_98977",
    "name": "Glaucoma 1, open angle, A",
    "acronym": "GLC1A"
  },
  "literature": {
    "references": [
      {
        "lit_id": "http://europepmc.org/abstract/MED/12860809"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/17499207"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/10798654"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/9345106"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/9792882"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/12189160"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/9490287"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/15534471"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/25524706"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/15025728"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/10873982"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/12356829"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/10980537"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/12362081"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/9005853"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/9328473"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/9863594"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/9361308"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/12442283"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/12872267"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/10196380"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/9510647"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/9535666"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/11004290"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/10644174"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/10330365"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/11774072"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/9521427"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/10340788"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/17210859"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/10819638"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/10916185"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/15255110"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/16401791"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/9697688"
      },
      {
        "lit_id": "http://europepmc.org/abstract/MED/15795224"
      }
    ]
  },
  "type": "genetic_literature",
  "evidence": {
    "resource_score": {
      "type": "probability",
      "value": 1
    },
    "date_asserted": "2020-10-06T23:00:00Z",
    "is_associated": true,
    "provenance_type": {
      "literature": {
        "references": [
          {
            "lit_id": "http://europepmc.org/abstract/MED/12860809"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/17499207"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/10798654"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/9345106"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/9792882"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/12189160"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/9490287"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/15534471"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/25524706"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/15025728"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/10873982"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/12356829"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/10980537"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/12362081"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/9005853"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/9328473"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/9863594"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/9361308"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/12442283"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/12872267"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/10196380"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/9510647"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/9535666"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/11004290"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/10644174"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/10330365"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/11774072"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/9521427"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/10340788"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/17210859"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/10819638"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/10916185"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/15255110"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/16401791"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/9697688"
          },
          {
            "lit_id": "http://europepmc.org/abstract/MED/15795224"
          }
        ]
      },
      "database": {
        "id": "uniprot",
        "version": "2020_05"
      }
    },
    "evidence_codes": [
      "http://purl.obolibrary.org/obo/ECO_0000205"
    ],
    "urls": [
      {
        "nice_name": "Further details in UniProt database",
        "url": "http://www.uniprot.org/uniprot/Q99972#pathology_and_biotech"
      }
    ]
  }
}
ireneisdoomed commented 3 years ago

New evidence for uniprot_literature.

{
    "datasourceId" : "uniprot_literature",
    "datatypeId" : "genetic_literature",
    "diseaseFromSource" : "Glaucoma 1, open angle, A",
    "diseaseFromSourceMappedId" : "Orphanet_98977",
    "literature" : [
        "12860809",
        "17499207",
        "10798654",
        "9345106",
        "9792882",
        "12189160",
        "9490287",
        "15534471",
        "25524706",
        "15025728",
        "10873982",
        "12356829",
        "10980537",
        "12362081",
        "9005853",
        "9328473",
        "9863594",
        "9361308",
        "12442283",
        "12872267",
        "10196380",
        "9510647",
        "9535666",
        "11004290",
        "10644174",
        "10330365",
        "11774072",
        "9521427",
        "10340788",
        "17210859",
        "10819638",
        "10916185",
        "15255110",
        "16401791",
        "9697688",
        "15795224"
    ],
    "targetFromSourceId" : "Q99972",
    "targetModulation" : "up_or_down"
}

In addition to the information contained in the old evidence:

Please note: this only applies to uniprot_literature; we have yet to decide whether we are definitely dropping uniprot. Once that analysis is done, we will do the same comparison for this datatype.

d0choa commented 3 years ago

As agreed in #1141, we need an example of old and new evidence for the uniprot datasource. I vote to rename it uniprot_variants to avoid confusion from now on.

ireneisdoomed commented 3 years ago

Example of an old evidence from uniprot_variants:

{
    "sourceID": "uniprot",
    "access_level": "public",
    "validated_against_schema_version": "1.6.7",
    "unique_association_fields": {
        "target": "Q14896",
        "disease_acronym": "CMD1MM",
        "disease_uri": "http://www.orpha.net/ORDO/Orphanet_217607",
        "uniprot_release": "2020_04",
        "variant_id": "VAR_070455",
        "dbSnps": "rs397514751",
        "alleleOrigin": "germline"
    },
    "target": {
        "target_type": "http://identifiers.org/cttv.target/protein_evidence",
        "activity": "http://identifiers.org/cttv.activity/up_or_down",
        "id": "http://identifiers.org/uniprot/Q14896"
    },
    "disease": {
        "id": "http://www.orpha.net/ORDO/Orphanet_217607",
        "name": "Cardiomyopathy, dilated 1MM",
        "acronym": "CMD1MM"
    },
    "type": "genetic_association",
    "variant": {
        "id": "http://identifiers.org/dbsnp/rs397514751",
        "type": "snp single"
    },
    "evidence": {
        "gene2variant": {
            "date_asserted": "2020-08-11T23:00:00Z",
            "is_associated": true,
            "provenance_type": {
                "literature": {
                    "references": [
                        {
                            "lit_id": "http://europepmc.org/abstract/MED/20215591"
                        }
                    ]
                },
                "database": {
                    "id": "uniprot",
                    "version": "2020_04"
                }
            },
            "evidence_codes": [
                "http://purl.obolibrary.org/obo/ECO_0000205"
            ],
            "functional_consequence": "http://purl.obolibrary.org/obo/SO_0001583",
            "urls": [
                {
                    "nice_name": "Further details in UniProt database",
                    "url": "http://www.uniprot.org/uniprot/Q14896#pathology_and_biotech"
                }
            ]
        },
        "variant2disease": {
            "resource_score": {
                "type": "probability",
                "value": 1.0
            },
            "date_asserted": "2020-08-11T23:00:00Z",
            "is_associated": true,
            "unique_experiment_reference": "http://europepmc.org/abstract/MED/20215591",
            "provenance_type": {
                "literature": {
                    "references": [
                        {
                            "lit_id": "http://europepmc.org/abstract/MED/20215591"
                        }
                    ]
                },
                "database": {
                    "id": "uniprot",
                    "version": "2020_04"
                }
            },
            "gwas_panel_resolution": 1,
            "gwas_sample_size": 1,
            "evidence_codes": [
                "http://purl.obolibrary.org/obo/ECO_0000205"
            ],
            "urls": [
                {
                    "nice_name": "Further details in UniProt database",
                    "url": "http://www.uniprot.org/uniprot/Q14896#pathology_and_biotech"
                },
                {
                    "nice_name": "Published reference",
                    "url": "http://europepmc.org/abstract/MED/20215591"
                }
            ]
        }
    }
}
ireneisdoomed commented 3 years ago

Example of a new evidence from uniprot_variants:

{
    "datasourceId" : "uniprot_variants",
    "datatypeId" : "genetic_association",
    "diseaseFromSource" : "Cardiomyopathy, dilated 1MM",
    "diseaseFromSourceMappedId" : "Orphanet_217607",
    "functionalConsequenceId" : "SO_0001583",
    "literature" : [
        "20215591"
    ],
    "targetFromSourceId" : "Q14896",
    "targetModulation" : "up_or_down",
    "variantRsId" : "rs397514751"
}
ireneisdoomed commented 3 years ago

The document to submit to the provider has been reviewed and can be consulted here.

I think uniprot requires a specific communication of the decision to include uniprot_variants as a data source.