Illumina / Nirvana

The nimble & robust variant annotator
https://illumina.github.io/NirvanaDocumentation/
GNU General Public License v3.0
170 stars 44 forks source link

MNV variants without variantFrequencies annotation #57

Closed geocarvalho closed 2 years ago

geocarvalho commented 2 years ago

Hello, thanks for making this great project available as open source. I was creating a table for some variants annotated with Nirvana and found some cases (21 MNVs out of 300k variants) where the variant doesn't have the "variantFrequencies", "fisherStrandBias", "mappingQuality", and "alleleDepths" annotations, even though I can see the "AD" and "AF" information in the VCF for the individual SNVs. Three examples:

{
  "chromosome": "chr7",
  "position": 72728899,
  "refAllele": "AA",
  "altAlleles": [
    "CA",
    "CG"
  ],
  "quality": 47.85,
  "filters": [
    "PASS"
  ],
  "cytogeneticBand": "7q11.23",
  "samples": [
    {
      "genotype": "1|2",
      "genotypeQuality": 40
    }
  ],
  "variants": [
    {
      "vid": "7-72728899-A-C",
      "chromosome": "chr7",
      "begin": 72728899,
      "end": 72728899,
      "refAllele": "A",
      "altAllele": "C",
      "variantType": "SNV",
      "isRecomposedVariant": true,
      "linkedVids": [
        "7-72728899-A-C"
      ],
      "hgvsg": "NC_000007.14:g.72728899A>C",
      "phylopScore": -0.2,
      "dbsnp": [
        "rs117110249"
      ],
      "gnomad": {
        "coverage": 34,
        "allAf": 0.935019,
        "allAn": 151922,
        "allAc": 142050,
        "allHc": 66519,
        "afrAf": 0.983839,
        "afrAn": 41396,
        "afrAc": 40727,
        "afrHc": 20040,
        "amrAf": 0.897849,
        "amrAn": 15252,
        "amrAc": 13694,
        "amrHc": 6161,
        "easAf": 0.883012,
        "easAn": 5180,
        "easAc": 4574,
        "easHc": 2026,
        "finAf": 0.951595,
        "finAn": 10598,
        "finAc": 10085,
        "finHc": 4803,
        "nfeAf": 0.91339,
        "nfeAn": 67902,
        "nfeAc": 62021,
        "nfeHc": 28309,
        "asjAf": 0.933968,
        "asjAn": 3468,
        "asjAc": 3239,
        "asjHc": 1514,
        "sasAf": 0.975973,
        "sasAn": 4828,
        "sasAc": 4712,
        "sasHc": 2300,
        "othAf": 0.916587,
        "othAn": 2086,
        "othAc": 1912,
        "othHc": 877,
        "maleAf": 0.936853,
        "maleAn": 74192,
        "maleAc": 69507,
        "maleHc": 32619,
        "femaleAf": 0.933269,
        "femaleAn": 77730,
        "femaleAc": 72543,
        "femaleHc": 33900,
        "controlsAllAf": 0.942508,
        "controlsAllAn": 32822,
        "controlsAllAc": 30935
      },
      "oneKg": {
        "allAf": 0.938099,
        "afrAf": 0.996218,
        "amrAf": 0.897695,
        "easAf": 0.882937,
        "eurAf": 0.900596,
        "sasAf": 0.98364,
        "allAn": 5008,
        "afrAn": 1322,
        "amrAn": 694,
        "easAn": 1008,
        "eurAn": 1006,
        "sasAn": 978,
        "allAc": 4698,
        "afrAc": 1317,
        "amrAc": 623,
        "easAc": 890,
        "eurAc": 906,
        "sasAc": 962
      },
      "topmed": {
        "allAf": 0.929664,
        "allAn": 125568,
        "allAc": 116736,
        "allHc": 54328
      },
      "transcripts": [
        {
          "transcript": "ENST00000612372.4",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "codons": "tTg/tGg",
          "aminoAcids": "L/W",
          "cdnaPos": "728",
          "cdsPos": "629",
          "exons": "7/12",
          "proteinPos": "210",
          "geneId": "ENSG00000277149",
          "hgnc": "TYW1B",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "ENST00000612372.4:c.629T>G",
          "hgvsp": "ENSP00000480534.1:p.(Leu210Trp)",
          "polyPhenScore": 0.45,
          "polyPhenPrediction": "possibly damaging",
          "proteinId": "ENSP00000480534.1",
          "siftScore": 0,
          "siftPrediction": "deleterious"
        },
        {
          "transcript": "NM_001145440.2",
          "source": "RefSeq",
          "bioType": "protein_coding",
          "codons": "tTg/tGg",
          "aminoAcids": "L/W",
          "cdnaPos": "1274",
          "cdsPos": "1115",
          "exons": "9/14",
          "proteinPos": "372",
          "geneId": "441250",
          "hgnc": "TYW1B",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "NM_001145440.2:c.1115T>G",
          "hgvsp": "NP_001138912.2:p.(Leu372Trp)",
          "isCanonical": true,
          "polyPhenScore": 0.77,
          "polyPhenPrediction": "possibly damaging",
          "proteinId": "NP_001138912.2",
          "siftScore": 0,
          "siftPrediction": "deleterious"
        },
        {
          "transcript": "ENST00000620995.4",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "codons": "tTg/tGg",
          "aminoAcids": "L/W",
          "cdnaPos": "1238",
          "cdsPos": "1115",
          "exons": "9/14",
          "proteinPos": "372",
          "geneId": "ENSG00000277149",
          "hgnc": "TYW1B",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "ENST00000620995.4:c.1115T>G",
          "hgvsp": "ENSP00000482502.1:p.(Leu372Trp)",
          "isCanonical": true,
          "polyPhenScore": 0.77,
          "polyPhenPrediction": "possibly damaging",
          "proteinId": "ENSP00000482502.1",
          "siftScore": 0,
          "siftPrediction": "deleterious"
        },
        {
          "transcript": "ENST00000610600.1",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "codons": "tTg/tGg",
          "aminoAcids": "L/W",
          "cdnaPos": "1031",
          "cdsPos": "920",
          "exons": "8/8",
          "proteinPos": "307",
          "geneId": "ENSG00000277149",
          "hgnc": "TYW1B",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "ENST00000610600.1:c.920T>G",
          "hgvsp": "ENSP00000484480.1:p.(Leu307Trp)",
          "polyPhenScore": 0.375,
          "polyPhenPrediction": "benign",
          "proteinId": "ENSP00000484480.1",
          "siftScore": 0,
          "siftPrediction": "deleterious",
          "aminoAcidConservation": {
            "scores": [
              0.01
            ]
          }
        }
      ]
    },
    {
      "vid": "7-72728899-AA-CG",
      "chromosome": "chr7",
      "begin": 72728899,
      "end": 72728900,
      "refAllele": "AA",
      "altAllele": "CG",
      "variantType": "MNV",
      "isRecomposedVariant": true,
      "linkedVids": [
        "7-72728899-A-C",
        "7-72728900-A-G"
      ],
      "hgvsg": "NC_000007.14:g.72728899_72728900delinsCG",
      "dbsnp": [
        "rs386714630"
      ],
      "transcripts": [
        {
          "transcript": "ENST00000612372.4",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "codons": "TTg/CGg",
          "aminoAcids": "L/R",
          "cdnaPos": "727-728",
          "cdsPos": "628-629",
          "exons": "7/12",
          "proteinPos": "210",
          "geneId": "ENSG00000277149",
          "hgnc": "TYW1B",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "ENST00000612372.4:c.628_629delinsCG",
          "hgvsp": "ENSP00000480534.1:p.(Leu210Arg)",
          "polyPhenScore": 0,
          "polyPhenPrediction": "benign",
          "proteinId": "ENSP00000480534.1",
          "siftScore": 1,
          "siftPrediction": "tolerated"
        },
        {
          "transcript": "NM_001145440.2",
          "source": "RefSeq",
          "bioType": "protein_coding",
          "codons": "TTg/CGg",
          "aminoAcids": "L/R",
          "cdnaPos": "1273-1274",
          "cdsPos": "1114-1115",
          "exons": "9/14",
          "proteinPos": "372",
          "geneId": "441250",
          "hgnc": "TYW1B",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "NM_001145440.2:c.1114_1115delinsCG",
          "hgvsp": "NP_001138912.2:p.(Leu372Arg)",
          "isCanonical": true,
          "polyPhenScore": 0,
          "polyPhenPrediction": "benign",
          "proteinId": "NP_001138912.2",
          "siftScore": 1,
          "siftPrediction": "tolerated"
        },
        {
          "transcript": "ENST00000620995.4",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "codons": "TTg/CGg",
          "aminoAcids": "L/R",
          "cdnaPos": "1237-1238",
          "cdsPos": "1114-1115",
          "exons": "9/14",
          "proteinPos": "372",
          "geneId": "ENSG00000277149",
          "hgnc": "TYW1B",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "ENST00000620995.4:c.1114_1115delinsCG",
          "hgvsp": "ENSP00000482502.1:p.(Leu372Arg)",
          "isCanonical": true,
          "polyPhenScore": 0,
          "polyPhenPrediction": "benign",
          "proteinId": "ENSP00000482502.1",
          "siftScore": 1,
          "siftPrediction": "tolerated"
        },
        {
          "transcript": "ENST00000610600.1",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "codons": "TTg/CGg",
          "aminoAcids": "L/R",
          "cdnaPos": "1030-1031",
          "cdsPos": "919-920",
          "exons": "8/8",
          "proteinPos": "307",
          "geneId": "ENSG00000277149",
          "hgnc": "TYW1B",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "ENST00000610600.1:c.919_920delinsCG",
          "hgvsp": "ENSP00000484480.1:p.(Leu307Arg)",
          "polyPhenScore": 0,
          "polyPhenPrediction": "benign",
          "proteinId": "ENSP00000484480.1",
          "siftScore": 1,
          "siftPrediction": "tolerated",
          "aminoAcidConservation": {
            "scores": [
              0.01
            ]
          }
        }
      ]
    }
  ]
},
{
  "chromosome": "chr7",
  "position": 100956478,
  "refAllele": "GTG",
  "altAlleles": [
    "CTC"
  ],
  "quality": 14.89,
  "filters": [
    "PASS"
  ],
  "cytogeneticBand": "7q22.1",
  "samples": [
    {
      "genotype": "0|1",
      "genotypeQuality": 15
    }
  ],
  "variants": [
    {
      "vid": "7-100956478-GTG-CTC",
      "chromosome": "chr7",
      "begin": 100956478,
      "end": 100956480,
      "refAllele": "GTG",
      "altAllele": "CTC",
      "variantType": "MNV",
      "isRecomposedVariant": true,
      "linkedVids": [
        "7-100956478-G-C",
        "7-100956480-G-C"
      ],
      "hgvsg": "NC_000007.14:g.100956478_100956480delinsCTC",
      "transcripts": [
        {
          "transcript": "NM_005960.1",
          "source": "RefSeq",
          "bioType": "protein_coding",
          "codons": "GTG/CTC",
          "aminoAcids": "V/L",
          "cdnaPos": "4904-4906",
          "cdsPos": "4699-4701",
          "exons": "2/12",
          "proteinPos": "1567",
          "geneId": "4584",
          "hgnc": "MUC3A",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "NM_005960.1:c.4699_4701delinsCTC",
          "hgvsp": "NP_005951.1:p.(Val1567Leu)",
          "isCanonical": true,
          "polyPhenScore": 0,
          "polyPhenPrediction": "unknown",
          "proteinId": "NP_005951.1",
          "siftScore": 1,
          "siftPrediction": "tolerated - low confidence"
        },
        {
          "transcript": "ENST00000379458.8",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "codons": "GTG/CTC",
          "aminoAcids": "V/L",
          "cdnaPos": "4769-4771",
          "cdsPos": "4699-4701",
          "exons": "2/12",
          "proteinPos": "1567",
          "geneId": "ENSG00000169894",
          "hgnc": "MUC3A",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "ENST00000379458.8:c.4699_4701delinsCTC",
          "hgvsp": "ENSP00000368771.5:p.(Val1567Leu)",
          "isCanonical": true,
          "polyPhenScore": 0,
          "polyPhenPrediction": "unknown",
          "proteinId": "ENSP00000368771.5",
          "siftScore": 1,
          "siftPrediction": "tolerated - low confidence"
        },
        {
          "transcript": "ENST00000483366.5",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "codons": "GTG/CTC",
          "aminoAcids": "V/L",
          "cdnaPos": "4699-4701",
          "cdsPos": "4699-4701",
          "exons": "2/11",
          "proteinPos": "1567",
          "geneId": "ENSG00000169894",
          "hgnc": "MUC3A",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "ENST00000483366.5:c.4699_4701delinsCTC",
          "hgvsp": "ENSP00000483541.1:p.(Val1567Leu)",
          "polyPhenScore": 0,
          "polyPhenPrediction": "unknown",
          "proteinId": "ENSP00000483541.1",
          "siftScore": 1,
          "siftPrediction": "tolerated - low confidence"
        },
        {
          "transcript": "ENST00000414964.5",
          "source": "Ensembl",
          "bioType": "nonsense_mediated_decay",
          "geneId": "ENSG00000169894",
          "hgnc": "MUC3A",
          "consequence": [
            "upstream_gene_variant"
          ],
          "proteinId": "ENSP00000393306.2"
        }
      ]
    }
  ]
},
{
  "chromosome": "chr7",
  "position": 150858967,
  "refAllele": "CG",
  "altAlleles": [
    "GC"
  ],
  "quality": 144.55,
  "filters": [
    "PASS"
  ],
  "cytogeneticBand": "7q36.1",
  "samples": [
    {
      "genotype": "1|1",
      "genotypeQuality": 95
    }
  ],
  "variants": [
    {
      "vid": "7-150858967-CG-GC",
      "chromosome": "chr7",
      "begin": 150858967,
      "end": 150858968,
      "refAllele": "CG",
      "altAllele": "GC",
      "variantType": "MNV",
      "isRecomposedVariant": true,
      "linkedVids": [
        "7-150858967-C-G",
        "7-150858968-G-C"
      ],
      "hgvsg": "NC_000007.14:g.150858967_150858968delinsGC",
      "dbsnp": [
        "rs71516432"
      ],
      "transcripts": [
        {
          "transcript": "ENST00000493429.5",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "codons": "aCG/aGC",
          "aminoAcids": "T/S",
          "cdnaPos": "2359-2360",
          "cdsPos": "1775-1776",
          "exons": "5/7",
          "proteinPos": "592",
          "geneId": "ENSG00000002726",
          "hgnc": "AOC1",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "ENST00000493429.5:c.1775_1776delinsGC",
          "hgvsp": "ENSP00000418614.1:p.(Thr592Ser)",
          "polyPhenScore": 0,
          "polyPhenPrediction": "benign",
          "proteinId": "ENSP00000418614.1",
          "siftScore": 1,
          "siftPrediction": "tolerated"
        },
        {
          "transcript": "ENST00000467291.5",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "codons": "aCG/aGC",
          "aminoAcids": "T/S",
          "cdnaPos": "2081-2082",
          "cdsPos": "1775-1776",
          "exons": "5/7",
          "proteinPos": "592",
          "geneId": "ENSG00000002726",
          "hgnc": "AOC1",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "ENST00000467291.5:c.1775_1776delinsGC",
          "hgvsp": "ENSP00000418328.1:p.(Thr592Ser)",
          "polyPhenScore": 0,
          "polyPhenPrediction": "benign",
          "proteinId": "ENSP00000418328.1",
          "siftScore": 1,
          "siftPrediction": "tolerated"
        },
        {
          "transcript": "ENST00000460213.1",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "geneId": "ENSG00000002726",
          "hgnc": "AOC1",
          "consequence": [
            "downstream_gene_variant"
          ],
          "proteinId": "ENSP00000418557.1"
        },
        {
          "transcript": "NM_001091.3",
          "source": "RefSeq",
          "bioType": "protein_coding",
          "codons": "aCG/aGC",
          "aminoAcids": "T/S",
          "cdnaPos": "1873-1874",
          "cdsPos": "1775-1776",
          "exons": "3/5",
          "proteinPos": "592",
          "geneId": "26",
          "hgnc": "AOC1",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "NM_001091.3:c.1775_1776delinsGC",
          "hgvsp": "NP_001082.2:p.(Thr592Ser)",
          "polyPhenScore": 0,
          "polyPhenPrediction": "benign",
          "proteinId": "NP_001082.2",
          "siftScore": 1,
          "siftPrediction": "tolerated"
        },
        {
          "transcript": "NM_001272072.1",
          "source": "RefSeq",
          "bioType": "protein_coding",
          "codons": "aCG/aGC",
          "aminoAcids": "T/S",
          "cdnaPos": "1873-1874",
          "cdsPos": "1775-1776",
          "exons": "3/5",
          "proteinPos": "592",
          "geneId": "26",
          "hgnc": "AOC1",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "NM_001272072.1:c.1775_1776delinsGC",
          "hgvsp": "NP_001259001.1:p.(Thr592Ser)",
          "isCanonical": true,
          "polyPhenScore": 0,
          "polyPhenPrediction": "benign",
          "proteinId": "NP_001259001.1",
          "siftScore": 1,
          "siftPrediction": "tolerated"
        },
        {
          "transcript": "ENST00000360937.8",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "codons": "aCG/aGC",
          "aminoAcids": "T/S",
          "cdnaPos": "1873-1874",
          "cdsPos": "1775-1776",
          "exons": "3/5",
          "proteinPos": "592",
          "geneId": "ENSG00000002726",
          "hgnc": "AOC1",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "ENST00000360937.8:c.1775_1776delinsGC",
          "hgvsp": "ENSP00000354193.4:p.(Thr592Ser)",
          "polyPhenScore": 0,
          "polyPhenPrediction": "benign",
          "proteinId": "ENSP00000354193.4",
          "siftScore": 1,
          "siftPrediction": "tolerated"
        },
        {
          "transcript": "ENST00000416793.6",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "codons": "aCG/aGC",
          "aminoAcids": "T/S",
          "cdnaPos": "1835-1836",
          "cdsPos": "1775-1776",
          "exons": "3/5",
          "proteinPos": "592",
          "geneId": "ENSG00000002726",
          "hgnc": "AOC1",
          "consequence": [
            "missense_variant"
          ],
          "hgvsc": "ENST00000416793.6:c.1775_1776delinsGC",
          "hgvsp": "ENSP00000411613.2:p.(Thr592Ser)",
          "isCanonical": true,
          "polyPhenScore": 0,
          "polyPhenPrediction": "benign",
          "proteinId": "ENSP00000411613.2",
          "siftScore": 1,
          "siftPrediction": "tolerated"
        },
        {
          "transcript": "ENST00000483043.1",
          "source": "Ensembl",
          "bioType": "protein_coding",
          "geneId": "ENSG00000002726",
          "hgnc": "AOC1",
          "consequence": [
            "downstream_gene_variant"
          ],
          "proteinId": "ENSP00000417392.1"
        },
        {
          "transcript": "ENST00000480582.1",
          "source": "Ensembl",
          "bioType": "processed_transcript",
          "cdnaPos": "313-314",
          "exons": "2/5",
          "geneId": "ENSG00000002726",
          "hgnc": "AOC1",
          "consequence": [
            "non_coding_transcript_exon_variant"
          ],
          "hgvsc": "ENST00000480582.1:n.313_314delinsGC"
        }
      ]
    }
  ]
}

It would be great to have all the annotations. Is this behavior expected?

Nirvana 3.16.1 Cache version: 27, Supplementary annotation version: 63, Reference version: 7 using GRCh38

Best, George.

rajatshuvro commented 2 years ago

Hello @geocarvalho, if you look carefully, you will find that all of these variants were composed by Nirvana ("isRecomposedVariant": true). They do not come from the variant caller. For these variants, we cannot provide the "variantFrequencies", "fisherStrandBias", "mappingQuality", and "alleleDepths" annotations. We decided against attempting to aggregate these values from the constituent SNVs since it was not clear how that could be done uncontroversially.

Best, Rajat