varfish-org / varfish-server-worker

Rust-based background worker for varfish-server
MIT License
2 stars 1 forks source link

Add "spec" JSON files #95

Closed holtgrewe closed 3 months ago

holtgrewe commented 1 year ago

Is your feature request related to a problem? Please describe. Each data file currently comes with very little metadata.

Describe the solution you'd like: We should add a "specification JSON" ("spec") file for each database file. We need to specify this JSON format and then provide such a spec file for each database file.

Describe alternatives you've considered N/A

Additional context N/A

holtgrewe commented 1 year ago

Here is an example for the hESC TAD spec JSON file:

{
    "dc:format": "text/x-bed",
    "dc:identifier": "features/GRCh37/tads/hesc.bed:dixon2015",
    "dc:title": "Topological Associated Domains (TADs) in hESC cell line for GRCh37",
    "dc:description": "This BED file contains the human embryonic stem cells TAD domains as published by Dixon et al. (2015).",
    "dc:created": "2023-02-06",
    "dc:creator": "Dixon et al. (2015)",
    "dc:contributor": [
        "VarFish Developer Team"
    ],
    "dc:source": [
        "PMID:25693564",
        "http://compbio.med.harvard.edu/modencode/webpage/hic/"
    ],
    "tsv:columns": {
        "chrom": "Chromosome name without chr prefix",
        "begin": "0-based begin position (inclusive)",
        "end": "0-based end position (exclusive)"
    }
}

And one for the hand-curated ACMG one:

{
    "dc:format": "text/tsv",
    "dc:identifier": "genes/acmg/acmg-sf-genes.tsv:3.1",
    "dc:title": "ACMG Secondary Findings Gene List (v3.1)",
    "dc:description": "This is version 3.1 of the ACMG gene list for reporting incidental findings. The file was curated from PMID:35802134 as gene symbols and then translated to ENSEMBL and Entrez/NCBI gene ID with the HGNC BioMart.",
    "dc:created": "2022-02-03",
    "dc:creator": "American College of Medical Genetics",
    "dc:contributor": [
        "VarFish Developer Team"
    ],
    "dc:source": [
        "PMID:35802134",
        "https://www.ncbi.nlm.nih.gov/clinvar/docs/acmg/",
        "https://biomart.genenames.org/"
    ],
    "tsv:columns": {
        "hgnc_id": "HGNC gene ID",
        "ensembl_gene_id": "ENSEMBL gene ID",
        "ncbi_gene_id": "Entrez/NCBI gene ID",
        "gene_symbol": "HGNC approved gene symbol",
        "gene_mim": "MIM code of the gene",
        "disease_phenotype": "Name of the disease",
        "disorder_mim": "MIM code of the disorder/disease",
        "phenotype_category": "Phenotype category",
        "inheritance": "Free text describing the mode of inheritance",
        "sf_list_version": "First SF list version that included the gene",
        "variants_to_report": "Free text describing which variants to report (based on ACMG rating)"
    }
}

Here is one for the NCBI gene information file. This is an example of documenting JSON structures in a JSONL file.

{
    "dc:format": "application/jsonl",
    "dc:identifier": "genes/ncbi/gene_info.jsonl:2023-05-08",
    "dc:title": "NCBI gene information (retrieved 2023-05-08)",
    "dc:description": "This is an extract from the NCBI gene information/Entrez database created on 2023-05-08. Note that later versions may use a different upstream NCBI database and extend the data.",
    "dc:created": "2023-05-08",
    "dc:creator": "NCBI",
    "dc:contributor": [
        "VarFish Developer Team"
    ],
    "dc:source": [
        "PMID:34850941",
        "https://www.ncbi.nlm.nih.gov/gene/",
        "http://ftp.ncbi.nih.gov/gene/DATA/ASN_BINARY/Mammalia/Homo_sapiens.ags.gz"
    ],
    "json:fields": {
        "gene_id": "Entrez/NCBI gene ID",
        "summary": "Gene summary text",
        "rif_entries": "List of reference-into-function entries",
        "rif_entries.*.text": "Text of the reference-into-function entry",
        "rif_entries.*.pmids": "List of (string) PubMed IDs of the entry"
    }
}

And here is an example describing a RocksDB database. Note that the path part of the identifier (the part before the ":" and the version) ends in "/" to indicate that this is a directory.

{
    "dc:format": "application/x-rocksdb",
    "dc:identifier": "genes/db/:2023-05-08",
    "dc:title": "VarFish genes RocksDB (built 2023-05-08)",
    "dc:description": "This is a RocksDB that contains the gene information.",
    "dc:created": "2023-05-08",
    "dc:creator": "VarFish Developer Team",
    "dc:source": [
        "PMID:35802134",
        "https://www.ncbi.nlm.nih.gov/clinvar/docs/acmg/",
        "https://biomart.genenames.org/",
        "PMID:34850941",
        "https://www.ncbi.nlm.nih.gov/gene/",
        "http://ftp.ncbi.nih.gov/gene/DATA/ASN_BINARY/Mammalia/Homo_sapiens.ags.gz"
    ],
    "rocksdb:column_families": {
        "meta": {
            "description": "Metadata"
        },
        "genes": {
            "description": "JSON with gene information with HGNC ID as the key (e.g., 'HGNC:123')",
            "json:fields": {
                "acmg_sf": "ACMG secondary findings information",
                "gnomad_constraints": "gnomAD constraint information",
                "hgnc": "HGNC gene information",
                "ncbi": "NCBI gene information"
            }
        }
    }
}