robinandeer / scout

Frontend web UI for scouting clinical DNA variants
Other
6 stars 0 forks source link

New JSON format #184

Open robinandeer opened 9 years ago

robinandeer commented 9 years ago

Hej, det kommer att bli ganska stor skillnad i den information som du får från mongo-databasen Robin.

Om man ser på den data som detta anrop ger idag http://clinical-db:8084/variants/77787685 så får du data enligt nedan:

{
    "rating": "",
    "location_reliability": "",
    "functional_annotation": "nonsynonymous SNV",
    "stop_bp": 6645448,
    "dbsnp129": null,
    "lrt_whole_exome": 0.0121,
    "omim_morbid_desc": "",
    "rank_score": 21,
    "hgnc_synonyms": "GCSP, NKH;",
    "unscaled_cscore_snv": 3.2367,
    "phast_const_elements": "Score=389;Name=lod=51",
    "snorna_mirna_annotation": "-",
    "chr": "chr9",
    "hgnc_transcript_id": "GLDC:NM_000170:exon1:c.52G>T:p.G18C,",
    "esp6500": null,
    "hgnc_symbol": "GLDC",
    "GT_call_filter": "PASS",
    "reduced_penetrance": "",
    "pseudogene": "processed_pseudogene;",
    "clinical_db_gene_annotation": "IEM:EP",
    "main_location": "",
    "dbsnp132": null,
    "disease_group": "Nonketotic hyperglycinemia",
    "ref_nt": "C",
    "mutation_taster": 0.009623,
    "hgmd_accession": null,
    "hgmd": "",
    "gene_annotation": "exonic",
    "omim_gene_desc": "",
    "start_bp": 6645448,
    "hgmd_variant_type": null,
    "otherVariants": [
        {
            "pk": 75559832,
            "family": "183"
        },
        {
            "pk": 62008206,
            "family": "43"
        },
        {
            "pk": 65943750,
            "family": "92"
        },
        {
            "pk": 66075846,
            "family": "93"
        },
        {
            "pk": 77048406,
            "family": "trio22X"
        }
    ],
    "dbsnp": "-",
    "sift_whole_exome": 0.1,
    "id": 77787685,
    "gerp_element": 598.0,
    "dbsnp_id": "-",
    "scaled_cscore_snv": 16.85,
    "compounds": [
        {
            "alt_nt": "T",
            "ref_nt": "C",
            "variant": 93754058,
            "combined_score": 38,
            "chr": "chr9",
            "start_bp": 6588403
        },
        {
            "alt_nt": "T",
            "ref_nt": "C",
            "variant": 93754059,
            "combined_score": 20,
            "chr": "chr9",
            "start_bp": 6610326
        },
        {
            "alt_nt": "C",
            "ref_nt": "T",
            "variant": 93754056,
            "combined_score": 16,
            "chr": "chr9",
            "start_bp": 6591972
        },
        {
            "alt_nt": "T",
            "ref_nt": "C",
            "variant": 93754057,
            "combined_score": 16,
            "chr": "chr9",
            "start_bp": 6602374
        },
        {
            "alt_nt": "C",
            "ref_nt": "T",
            "variant": 93754060,
            "combined_score": 16,
            "chr": "chr9",
            "start_bp": 6606746
        },
        {
            "alt_nt": "-",
            "ref_nt": "T",
            "variant": 93754061,
            "combined_score": 16,
            "chr": "chr9",
            "start_bp": 6602378
        }
    ],
    "hgmd_variant_pmid": null,
    "polyphen_var_human": 0.671,
    "unscaled_cscore_thousand_g": null,
    "other_location": "",
    "scaled_cscore_thousand_g": null,
    "disease_gene_model": "AR",
    "hbvdb": 0.0127,
    "alt_nt": "A",
    "phylop_whole_exome": 1.27,
    "expression_type": "",
    "gene_model": "AR_compound:AD:AD_denovo",
    "variant_count": null,
    "individual_rank_score": 21,
    "thousand_g": null,
    "ensembl_geneid": "ENSG00000178445;",
    "hgnc_approved_name": "glycine dehydrogenase (decarboxylating);",
    "genomic_super_dups": "-",
    "polyphen_div_human": 0.975,
    "gerp_whole_exome": 3.62
}

Med den data som jag kommer att ha i mongo-databasen så kommer informationen för en variant typiskt att se ut som nedan:

{
    "CHROM":1,
    "POS":2332391,
    "REF": "C",
    "ALT": ["T"],
    "KGPhase1": true,
    "RSPOS": 2332391,
    "OTHERKG": true,
    "set": "Intersection",
    "SLO": true,
    "HGVS_PROTEIN_VAR": [
        null
    ],
    "POSITIVE_TRAIN_SITE": true,
    "Comp": [
        "-"
    ],
    "VC": "SNV",
    "EA_GTC": [
        "845",
        "2007",
        "1257"
    ],
    "Dbsnp129MAF": 0.3503,
    "GTC": [
        "1009",
        "2709",
        "2273"
    ],
    "CADD": 0.35,
    "dbSNPBuildID": 120,
    "FG": [
        "NM_007033.4:intron"
    ],
    "AA_AC": [
        "1030",
        "2734"
    ],
    "EA_AC": [
        "3697",
        "4521"
    ],
    "CAF": [
        "[0.6497",
        "0.3503]"
    ],
    "DP": 5362,
    "CDS_SIZES": [
        "NM_007033.4:591"
    ],
    "GL": [
        "RER1"
    ],
    "SAO": 0,
    "culprit": "FS",
    "RS": 12037485,
    "HGVScp": [
        "PEX10:NM_002617.3:downstream_gene_variant:s.-",
        "PEX10:NM_153818.1:downstream_gene_variant:s.-"
    ],
    "HGVS_CDNA_VAR": [
        "NM_007033.4:c.365+17C>T"
    ],
    "VLD": true,
    "GNO": true,
    "Dels": 0.0,
    "COMMON": 1,
    "pop": "ALL",
    "TAC": [
        "4727",
        "7255"
    ],
    "HaplotypeScore": 0.2072,
    "DBSNP": [
        "dbSNP_120"
    ],
    "GWAS_PUBMED": [
        null
    ],
    "DbsnpMAF": 0.3503,
    "SB": -20674.8,
    "AA": "C",
    "AC": [
        3
    ],
    "G5A": true,
    "GS": [
        null
    ],
    "ESPMAF": 0.394508,
    "AF": [
        0.5
    ],
    "AA_AGE": [
        null
    ],
    "CA": [
        null
    ],
    "CG": -6.4,
    "G5": true,
    "AN": 6,
    "VP": "0x05010008000117011e000100",
    "MQ0": 0,
    "GTS": [
        "TT",
        "TC",
        "CC"
    ],
    "EA_AGE": [
        null
    ],
    "AA_GTC": [
        "164",
        "702",
        "1016"
    ],
    "BVDMAF": 0.4522,
    "GQ_MEAN": 514.92,
    "GM": [
        "NA"
    ],
    "CP": 0.0,
    "CSQ": [
        "T|5192|NM_002617.3|Transcript|downstream_gene_variant|||||||||3850|-1|PEX10|||||",
        "T|5192|NM_153818.1|Transcript|downstream_gene_variant|||||||||3850|-1|PEX10|||||"
    ],
    "PH": [
        null
    ],
    "KGPROD": true,
    "WGT": 1,
    "INT": true,
    "1000GMAF": 0.34547,
    "HRun": 0,
    "GeneticRegionAnnotation": [
        "PEX10:downstream"
    ],
    "GQ_STDDEV": 561.47,
    "SSR": 0,
    "EXOME_CHIP": [
        "no"
    ],
    "MS": null,
    "MAF": [
        "44.9866",
        "27.3645",
        "39.4508"
    ],
    "KGPilot123": true,
    "MostSevereConsequence": [
        "PEX10:downstream_gene_variant"
    ],
    "NCC": 0,
    "ANN": [
        "PEX10"
    ]
}

Så det blir ganska stor skillnad. /Mats

robinandeer commented 9 years ago

Japp, det blir mer oklart vad som är vad (för din del). Men jag tror att det är det som är meningen, all tolkning av data och namn på taggar flyttas nu från mitt program som parsar data och insertar det i mysql till den som söker i databasen. Tror att det är det som är meningen med att använda en schema-lös databas.

Mitt dataladdningsprogram kommer nu bara att ta rad för rad i VCF-filen och lagra det i mongo-databasen.

Ska försöka hitta exempel på Compounds.

/Mats

robinandeer commented 9 years ago

Hej igen, jag hittade en variant där Comp inte var tom. Den ser ut som nedan. Vet inte om det räcker eller om du kommer behöva ytterligare info?

{
    "CHROM":1,
    "POS":11856378,
    "REF": "G",
    "ALT": ["A"],
    "RSPOS": 11856378,
    "BVDMAF": 0.2994,
    "CLNDSDB": [
        "MedGen"
    ],
    "CLNACC": [
        "RCV000003697.3"
    ],
    "POSITIVE_TRAIN_SITE": true,
    "CLNDBN": [
        "MTHFR_deficiency\\x2c_thermolabile_type"
    ],
    "AA_AC": [
        "536",
        "3870"
    ],
    "G5": true,
    "GL": [
        "MTHFR"
    ],
    "SAO": 0,
    "RS": 1801133,
    "HGVScp": [
        "MTHFR:NM_005957.4:missense_variant:s.-:e.5/12:c.665C>T:p.Ala222Val"
    ],
    "CLNSIG": [
        "2"
    ],
    "Sift": [
        "MTHFR:0.06"
    ],
    "TAC": [
        "3519",
        "9487"
    ],
    "GS": [
        "64"
    ],
    "TPA": true,
    "dbNSFP_phyloP100way_vertebrate": [
        9.044,
        9.044
    ],
    "HGNC_id": [
        "MTHFR"
    ],
    "MQ0": 0,
    "EA_AGE": [
        "461.7+/-365.5"
    ],
    "AA_GTC": [
        "31",
        "474",
        "1698"
    ],
    "GQ_MEAN": 399.65,
    "GM": [
        "AR_comp"
    ],
    "OTHERKG": true,
    "PH": [
        "probably-damaging:0.999"
    ],
    "OM": true,
    "1000GMAF": 0.32667,
    "HRun": 0,
    "MAF": [
        "34.686",
        "12.1652",
        "27.0567"
    ],
    "dbNSFP_SIFT_score": [
        0.05,
        0.05
    ],
    "dbNSFP_MutationTaster_pred": [
        "P",
        "P"
    ],
    "dbNSFP_phastCons100way_vertebrate": [
        1.0,
        1.0
    ],
    "FG": [
        "NM_005957.4:missense"
    ],
    "PH3": true,
    "dbNSFP_Polyphen2_HVAR_pred": [
        "D",
        "D",
        "D",
        "D"
    ],
    "Disease_group_pathway": [
        "Cobalamin_and_folate_transport_and_metab"
    ],
    "HD": true,
    "LSD": true,
    "dbNSFP_FATHMM_pred": [
        "D",
        "D",
        "D",
        "D",
        "D",
        "D",
        "D",
        "D"
    ],
    "culprit": "ReadPosRankSum",
    "dbNSFP_phyloP46way_primate": [
        0.655,
        0.655
    ],
    "VLD": true,
    "COMMON": 1,
    "DBSNP": [
        "dbSNP_89"
    ],
    "KGPROD": true,
    "dbNSFP_RadialSVM_pred": [
        "T",
        "T"
    ],
    "PM": true,
    "dbNSFP_phastCons46way_primate": [
        0.116,
        0.116
    ],
    "ESPMAF": 0.270567,
    "AA_AGE": [
        "242.1+/-252.3"
    ],
    "KGPilot123": true,
    "RV": true,
    "WGT": 1,
    "CLNSRC": [
        "GTR|GTR|GTR|GTR|GTR|GTR|OMIM_Allelic_Variant"
    ],
    "CLNORIGIN": [
        "1"
    ],
    "SB": -20836.01,
    "dbNSFP_MutationTaster_score": [
        0.9999999135526,
        0.9999999135526
    ],
    "CSQ": [
        "A|4524|NM_005957.4|Transcript|missense_variant|894|665|222|A/V|gCc/gTc||5/12|||-1|MTHFR||0.06|1|NM_005957.4:c.665C>T|NP_005948.3:p.Ala222Val"
    ],
    "dbNSFP_GERP++_NR": [
        5.08,
        5.08
    ],
    "ANN": [
        "MTHFR"
    ],
    "GQ_STDDEV": 559.15,
    "SSR": 0,
    "MS": 60.0,
    "dbNSFP_MutationAssessor_score": [
        "2.425",
        "2.425"
    ],
    "dbNSFP_SiPhy_29way_logOdds": [
        17.4321,
        17.4321
    ],
    "HGVS_PROTEIN_VAR": [
        "NM_005957.4:p.(A222V)"
    ],
    "dbNSFP_GERP++_RS": [
        5.08,
        5.08
    ],
    "pop": "ALL",
    "EA_GTC": [
        "519",
        "1945",
        "1836"
    ],
    "Dbsnp129MAF": 0.3246,
    "set": "Intersection",
    "GENEINFO": "MTHFR:4524",
    "REF": true,
    "CDS_SIZES": [
        "NM_005957.4:1971"
    ],
    "ASP": true,
    "Comp": [
        "1_11851003_G_C:1_11848139_G_A:1_11854671_C_A:1_11844214_A_T:1_11854476_T_G:1_11860458_C_T:1_11852300_C_T"
    ],
    "PolyPhen": [
        "MTHFR:1"
    ],
    "PMC": true,
    "DbsnpMAF": 0.3246,
    "GTC": [
        "550",
        "2419",
        "3534"
    ],
    "G5A": true,
    "CG": 5.1,
    "GTS": [
        "AA",
        "AG",
        "GG"
    ],
    "CLNDSDBID": [
        "C1856059"
    ],
    "CP": 1.0,
    "dbNSFP_LR_score": [
        0.0,
        0.0
    ],
    "Clinical_db_gene_annotation": [
        "IEM"
    ],
    "GeneticRegionAnnotation": [
        "MTHFR:exonic"
    ],
    "dbNSFP_LR_pred": [
        "T",
        "T"
    ],
    "CLNHGVS": [
        "NC_000001.10:g.11856378G>A"
    ],
    "MostSevereConsequence": [
        "MTHFR:missense_variant"
    ],
    "CLNSRCID": [
        "GTR000017233|GTR000194242|GTR000254993|GTR000327733|GTR000500035|GTR000500311|607093.0003"
    ],
    "KGPhase1": true,
    "dbSNPBuildID": 89,
    "GWAS_PUBMED": [
        "http://www.ncbi.nlm.nih.gov/pubmed?term"
    ],
    "CADD": 21.9,
    "EA_AC": [
        "2983",
        "5617"
    ],
    "CAF": [
        "[0.6754",
        "0.3246]"
    ],
    "DP": 6452,
    "Disease_gene_model": [
        "AR"
    ],
    "NSM": true,
    "HGVS_CDNA_VAR": [
        "NM_005957.4:c.665C>T"
    ],
    "GNO": true,
    "dbNSFP_SIFT_pred": [
        "D",
        "D"
    ],
    "Dels": 0.0,
    "CLNALLE": [
        1
    ],
    "dbNSFP_LRT_score": [
        "0.000000",
        "0.000000"
    ],
    "dbNSFP_LRT_pred": [
        "D",
        "D"
    ],
    "HaplotypeScore": 0.3201,
    "CA": [
        "http://www.ncbi.nlm.nih.gov/pubmed?term"
    ],
    "dbNSFP_RadialSVM_score": [
        -1.4547,
        -1.4547
    ],
    "AA": "G",
    "AC": [
        2
    ],
    "VC": "SNV",
    "dbNSFP_Reliability_index": [
        9,
        9
    ],
    "AF": [
        0.333
    ],
    "MTP": true,
    "AN": 6,
    "SLO": true,
    "S3D": true,
    "OTH": true,
    "dbNSFP_FATHMM_score": [
        "-4.03",
        "-4.03",
        "-4.03",
        "-4.03",
        "-4.03",
        "-4.03",
        "-4.03",
        "-4.03"
    ],
    "dbNSFP_Polyphen2_HVAR_score": [
        "0.941",
        "0.987",
        "0.941",
        "0.987"
    ],
    "Clinical_db_genome_build": [
        "GRCh37.p8"
    ],
    "Ensembl_gene_id": [
        "ENSG00000177000"
    ],
    "EXOME_CHIP": [
        "yes"
    ],
    "NCC": 0
}
robinandeer commented 9 years ago

Vi kan nog be Måns att byta ut separatorn ':' mot ',' för det är standarden i VCF för att generera listor. Comp entryt skulle i så fall bli:

    "Comp": [
        "1_11851003_G_C",
        "1_11848139_G_A",
        "1_11854671_C_A",
        "1_11844214_A_T",
        "1_11854476_T_G",
        "1_11860458_C_T",
        "1_11852300_C_T"
    ]

Vilket är bättre tycker jag.