genome-nexus / genome-nexus-importer

Import data into MongoDB for use by https://github.com/genome-nexus/genome-nexus/
MIT License
4 stars 16 forks source link

Fix missing pfam in grch38 #78

Closed leexgh closed 1 year ago

leexgh commented 1 year ago

Fixes https://github.com/genome-nexus/genome-nexus/issues/684. The input file ensembl_biomart_pfam.txt is downloaded by retrieve_biomart_tables.R. We haven't updated the input files for a while, and the old file has an empty "Protein stable ID" column that causes problems with parsing.

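Previous EGFR transcript (values are shifted by one column: the domain start appears as the ID, the domain end as the start, and the end is null):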

  "domains": [
    {
      "pfam_domain_id": 185.0,
      "pfam_domain_start": 338.0,
      "pfam_domain_end": null
    },
    {
      "pfam_domain_id": 361.0,
      "pfam_domain_start": 480.0,
      "pfam_domain_end": null
    },
    {
      "pfam_domain_id": 57.0,
      "pfam_domain_start": 167.0,
      "pfam_domain_end": null
    },
    {
      "pfam_domain_id": 713.0,
      "pfam_domain_start": 965.0,
      "pfam_domain_end": null
    },
    {
      "pfam_domain_id": 505.0,
      "pfam_domain_start": 636.0,
      "pfam_domain_end": null
    }

New EGFR transcript:

  "domains": [
    {
      "pfam_domain_id": "PF14843",
      "pfam_domain_start": 505.0,
      "pfam_domain_end": 636.0
    },
    {
      "pfam_domain_id": "PF01030",
      "pfam_domain_start": 361.0,
      "pfam_domain_end": 480.0
    },
    {
      "pfam_domain_id": "PF01030",
      "pfam_domain_start": 57.0,
      "pfam_domain_end": 167.0
    },
    {
      "pfam_domain_id": "PF07714",
      "pfam_domain_start": 713.0,
      "pfam_domain_end": 965.0
    },
    {
      "pfam_domain_id": "PF00757",
      "pfam_domain_start": 185.0,
      "pfam_domain_end": 338.0
    }
  ],