brentp / vcfanno

annotate a VCF with other VCFs/BEDs/tabixed files
https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0973-5
MIT License
357 stars 55 forks source link

Incorrect AC, AF, AN, DP values after merging multiple annotated vcfs. #89

Closed gezthxbio closed 6 years ago

gezthxbio commented 6 years ago

vcfanno version 0.2.9 [built with go1.10]. I tried pynnotator to annotate a query VCF file. Pynnotator uses vcfanno in its final step to merge multiple annotated VCF files, and it creates the config.toml file required to run vcfanno. After merging, I noticed that the DP values had increased and that, correspondingly, AC, AF, and AN had also changed.

QUERY VCF Entry

chr1    723798  .       CAG     C       80.17   .       AC=2;AF=1.000;AN=2;DP=13;FS=0.000;MQ=60.00;QD=6.17;SOR=1.270;FractionInformativeReads=0.846     GT:AD:AF:DP:GQ:PL:GL:GP:PRI:SB:MB    1/1:0,11:1.000:11:30:109,33,0:-10.917,-3.317,0.000:8.017e+01,3.017e+01,4.177e-03:0.00,26.00,29.00:0,0,7,4:0,0,6,5

I ran vcfanno with the command `vcfanno_linux64 -p 20 test.toml ../sanity_check/sorted.vcf > test.vcf`, using the following test.toml:

[[annotation]]
file="/home/biouser/AGBL_LABS/Projects/BS009/Analysis/Annotation/NEW/Pynnotator/ann_Lynn_S1/snpsift/snpsift.final.vcf.gz"
fields = ["VARTYPE", "SNP", "MNP", "INS", "DEL", "MIXED", "HOM", "HET"]
ops=["first", "first", "first", "first", "first", "first", "first", "first"]

[[annotation]]
file="/home/biouser/bin/pynnotator/pynnotator/data/dbsnp/clinvar_20180429.vcf.gz"
fields = ["AF_ESP", "AF_EXAC", "AF_TGP", "ALLELEID", "CLNDN", "CLNDNINCL", "CLNDISDB", "CLNDISDBINCL", "CLNHGVS", "CLNREVSTAT", "CLNSIG", "CLNSIGCONF", "CLNSIGINCL", "CLNVC", "CLNVCSO", "CLNVI", "DBVARID", "GENEINFO", "MC", "ORIGIN", "RS", "SSR", ]
ops=["first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first"]

[[annotation]]
file="/home/biouser/bin/pynnotator/pynnotator/data/decipher/HI_Predictions_Version3.bed.gz"
names=["HI_PREDICTION"]
columns=[4]
ops=["uniq"]

[[annotation]]
file="/home/biouser/AGBL_LABS/Projects/BS009/Analysis/Annotation/NEW/Pynnotator/ann_Lynn_S1/snpeff/snpeff.output.vcf.gz"
fields = ["ANN"]
names = ["snpeff_eff"]
ops=["first"]

[[annotation]]
file="/home/biouser/AGBL_LABS/Projects/BS009/Analysis/Annotation/NEW/Pynnotator/ann_Lynn_S1/vep/vep.output.sorted.vcf.gz"
fields = ["CSQ"]
names = ["vep_csq"]
ops=["first"]

VCF entry after merging (no issue)

chr1    723798  .       CAG     C       80.2    .       AC=2;AF=1.000;AN=2;DP=13;FS=0.000;MQ=60.00;QD=6.17;SOR=1.270;FractionInformativeReads=0.846;VARTYPE=DEL;DEL;HOM GT:AD:AF:DP:GQ:PL:GL:GP:PRI:SB:MB    1/1:0,11:1.000:11:30:109,33,0:-10.917,-3.317,0.000:8.017e+01,3.017e+01,4.177e-03:0.00,26.00,29.00:0,0,7,4:0,0,6,5

Using test2.toml

[[annotation]]
file="/home/biouser/AGBL_LABS/Projects/BS009/Analysis/Annotation/NEW/Pynnotator/ann_Lynn_S1/snpsift/snpsift.final.vcf.gz"
fields = ["VARTYPE", "SNP", "MNP", "INS", "DEL", "MIXED", "HOM", "HET"]
ops=["first", "first", "first", "first", "first", "first", "first", "first"]

[[annotation]]
file="/home/biouser/bin/pynnotator/pynnotator/data/dbsnp/All_20180423.vcf.gz"
fields = ["ID", "RS", "RSPOS", "RV", "VP", "GENEINFO", "dbSNPBuildID", "SAO", "SSR", "WGT", "VC", "PM", "TPA", "PMC", "S3D", "SLO", "NSF", "NSM", "NSN", "REF", "SYN", "U3", "U5", "ASS", "DSS", "INT", "R3", "R5", "OTH", "CFL", "ASP", "MUT", "VLD", "G5A", "G5", "HD", "GNO", "KGPhase1", "KGPhase3", "CDA", "LSD", "MTP", "OM", "NOC", "WTD", "NOV", "NC", "CAF", "COMMON", "TOPMED"]
ops=["first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first"]

[[annotation]]
file="/home/biouser/bin/pynnotator/pynnotator/data/dbsnp/clinvar_20180429.vcf.gz"
fields = ["AF_ESP", "AF_EXAC", "AF_TGP", "ALLELEID", "CLNDN", "CLNDNINCL", "CLNDISDB", "CLNDISDBINCL", "CLNHGVS", "CLNREVSTAT", "CLNSIG", "CLNSIGCONF", "CLNSIGINCL", "CLNVC", "CLNVCSO", "CLNVI", "DBVARID", "GENEINFO", "MC", "ORIGIN", "RS", "SSR", ]
ops=["first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first"]

[[annotation]]
file="/home/biouser/bin/pynnotator/pynnotator/data/1000genomes/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.vcf.gz"
fields = ["CIEND", "CIPOS", "CS", "END", "IMPRECISE", "MC", "MEINFO", "MEND", "MLEN", "MSTART", "SVLEN", "SVTYPE", "TSD", "AC", "AF", "NS", "AN", "EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF", "DP", "AA", "VT", "EX_TARGET", "MULTI_ALLELIC", "OLD_VARIANT"]
ops=["first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first"]

[[annotation]]
file="/home/biouser/bin/pynnotator/pynnotator/data/decipher/HI_Predictions_Version3.bed.gz"
names=["HI_PREDICTION"]
columns=[4]
ops=["uniq"]

[[annotation]]
file="/home/biouser/bin/pynnotator/pynnotator/data/gnomead/gnomad.exomes.r2.0.2.sites.vcf.bgz"
fields = ["AC", "AF", "AN", "BaseQRankSum", "ClippingRankSum", "DB", "DP", "FS", "InbreedingCoeff", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "VQSLOD", "VQSR_culprit", "VQSR_NEGATIVE_TRAIN_SITE", "VQSR_POSITIVE_TRAIN_SITE", "GQ_HIST_ALT", "DP_HIST_ALT", "AB_HIST_ALT", "GQ_HIST_ALL", "DP_HIST_ALL", "AB_HIST_ALL", "AC_AFR", "AC_AMR", "AC_ASJ", "AC_EAS", "AC_FIN", "AC_NFE", "AC_OTH", "AC_SAS", "AC_Male", "AC_Female", "AN_AFR", "AN_AMR", "AN_ASJ", "AN_EAS", "AN_FIN", "AN_NFE", "AN_OTH", "AN_SAS", "AN_Male", "AN_Female", "AF_AFR", "AF_AMR", "AF_ASJ", "AF_EAS", "AF_FIN", "AF_NFE", "AF_OTH", "AF_SAS", "AF_Male", "AF_Female", "GC_AFR", "GC_AMR", "GC_ASJ", "GC_EAS", "GC_FIN", "GC_NFE", "GC_OTH", "GC_SAS", "GC_Male", "GC_Female", "AC_raw", "AN_raw", "AF_raw", "GC_raw", "GC", "Hom_AFR", "Hom_AMR", "Hom_ASJ", "Hom_EAS", "Hom_FIN", "Hom_NFE", "Hom_OTH", "Hom_SAS", "Hom_Male", "Hom_Female", "Hom_raw", "Hom", "STAR_AC", "STAR_AC_raw", "STAR_Hom", "POPMAX", "AC_POPMAX", "AN_POPMAX", "AF_POPMAX", "DP_MEDIAN", "DREF_MEDIAN", "GQ_MEDIAN", "AB_MEDIAN", "AS_RF", "AS_FilterStatus", "AS_RF_POSITIVE_TRAIN", "AS_RF_NEGATIVE_TRAIN", "CSQ", "AN_FIN_Male", "AN_EAS_Female", "AN_NFE_Female", "AC_AFR_Male", "AN_AMR_Female", "AF_AMR_Male", "Hemi_NFE", "Hemi_AFR", "AC_ASJ_Female", "AF_FIN_Female", "AN_ASJ_Male", "AC_OTH_Female", "GC_OTH_Male", "GC_FIN_Male", "AC_NFE_Female", "AC_EAS_Male", "AC_OTH_Male", "GC_SAS_Male", "Hemi_AMR", "AC_NFE_Male", "Hemi", "AN_FIN_Female", "GC_EAS_Male", "GC_ASJ_Female", "GC_SAS_Female", "GC_ASJ_Male", "Hemi_SAS", "AN_ASJ_Female", "AF_FIN_Male", "AN_OTH_Male", "AF_AFR_Male", "STAR_Hemi", "AF_SAS_Male", "Hemi_ASJ", "AN_SAS_Female", "AN_AFR_Female", "Hemi_raw", "AF_OTH_Male", "AC_SAS_Female", "AF_NFE_Female", "AF_EAS_Female", "AN_OTH_Female", "AF_EAS_Male", "AF_SAS_Female", "GC_AFR_Female", "AF_AFR_Female", "AC_FIN_Female", "Hemi_OTH", "GC_AMR_Male", "AC_AFR_Female", "GC_NFE_Male", "AF_AMR_Female", "GC_NFE_Female", "AN_AFR_Male", "AN_NFE_Male", "AC_AMR_Male", 
"GC_AMR_Female", "AC_SAS_Male", "AF_ASJ_Male", "GC_FIN_Female", "AC_EAS_Female", "AC_AMR_Female", "Hemi_FIN", "AC_FIN_Male", "GC_EAS_Female", "AF_ASJ_Female", "AF_OTH_Female", "GC_AFR_Male", "AN_SAS_Male", "AF_NFE_Male", "AN_EAS_Male", "AC_ASJ_Male", "Hemi_EAS", "AN_AMR_Male", "GC_OTH_Female", "segdup", "lcr"]
ops=["first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first","first"]

[[annotation]]
file="/home/biouser/AGBL_LABS/Projects/BS009/Analysis/Annotation/NEW/Pynnotator/ann_Lynn_S1/snpeff/snpeff.output.vcf.gz"
fields = ["ANN"]
names = ["snpeff_eff"]
ops=["first"]

[[annotation]]
file="/home/biouser/AGBL_LABS/Projects/BS009/Analysis/Annotation/NEW/Pynnotator/ann_Lynn_S1/vep/vep.output.sorted.vcf.gz"
fields = ["CSQ"]
names = ["vep_csq"]
ops=["first"]

VCF entry after merging (with the issue)

chr1    723798  .       CAG     C       80.2    .       AC=4012;AF=0.8011;AN=5008;DP=24752;FS=0.000;MQ=60.00;QD=6.17;SOR=1.270;FractionInformativeReads=0.846;VARTYPE=DEL;DEL;HOM;ID=rs34882115;RS=34882115;RSPOS=723799;VP=0x050000000005110026000200;dbSNPBuildID=134;SAO=0;SSR=0;WGT=1;VC=DIV;ASP;G5;KGPhase3;CAF=0.1989,0.8011;COMMON=1;TOPMED=0.17555428134556574,0.82444571865443425;NS=2504;EAS_AF=0.7946;EUR_AF=0.9602;AFR_AF=0.5416;AMR_AF=0.8775;SAS_AF=0.9407;VT=INDEL GT:AD:AF:DP:GQ:PL:GL:GP:PRI:SB:MB       1/1:0,11:1.000:11:30:109,33,0:-10.917,-3.317,0.000:8.017e+01,3.017e+01,4.177e-03:0.00,26.00,29.00:0,0,7,4:0,0,6,5
gezthxbio commented 6 years ago

I guess this is coming from ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.vcf.gz: vcfanno is merging that file's AC, AF, AN, and DP fields into the query VCF's fields of the same name, so the query's values are overwritten. The corresponding record in that file is:

1       723798  rs34882115      CAG     C       100     PASS    AC=4012;AF=0.801118;AN=5008;NS=2504;DP=24752;EAS_AF=0.7946;AMR_AF=0.8775;AFR_AF=0.5416;EUR_AF=0.9602;SAS_AF=0.9407;VT=INDEL

Fixed by removing the AC, AF, AN, and DP fields (and their matching ops entries) from the ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.vcf.gz annotation block in the TOML:

[[annotation]]
file="/home/biouser/bin/pynnotator/pynnotator/data/1000genomes/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.vcf.gz"
fields = ["CIEND", "CIPOS", "CS", "END", "IMPRECISE", "MC", "MEINFO", "MEND", "MLEN", "MSTART", "SVLEN", "SVTYPE", "TSD", "NS", "EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF", "AA", "VT", "EX_TARGET", "MULTI_ALLELIC", "OLD_VARIANT"]
ops=["first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first", "first"]