Closed JuanitaGil closed 1 year ago
Hi Juanita,
Thank you for providing an example, that really helps. Please check the below example that I've modified from yours to get the result I think you're looking for. Please let me know if it's a result you're happy with. Note the that I used some 'creativity' to get 'MQ'. A better solution may be to go back to variant calling and see if your variant caller will output the information you're looking for. But I see that as a 'high labor' option. My hope is that this post-hoc example is lower labor but still gets you what you want.
Thanks! Brian
library(vcfR)
#>
#> ***** *** vcfR *** *****
#> This is vcfR 1.13.0
#> browseVignettes('vcfR') # Documentation
#> citation('vcfR') # Citation
#> ***** ***** ***** *****
library(ggplot2)
library(pinfsc50)
library(reshape2)
my_path <- "~/Desktop/sandbox/juanitagil/DP_issue_example_data/"
vcf <- read.vcfR(paste(my_path, "Rsolani_test.vcf.gz", sep = ""), verbose = FALSE)
dna <- ape::read.dna(paste(my_path, "Rsolani_test_dna.fa", sep = ""), format = "fasta")
gff <- read.table(paste(my_path, "Rsolani_AG1-IA_ref_genome.gff3", sep = ""), sep = "\t", quote = "")
queryMETA(vcf)
#> [1] "INFO=ID=CNV" "INFO=ID=TA" "INFO=ID=TID" "INFO=ID=TGN"
#> [5] "INFO=ID=TCO" "INFO=ID=TACH" "INFO=ID=NS" "INFO=ID=MAF"
#> [9] "INFO=ID=OH" "INFO=ID=AN" "INFO=ID=AFS" "INFO=ID=TYPE"
#> [13] "INFO=ID=FS" "FORMAT=ID=GT" "FORMAT=ID=PL" "FORMAT=ID=GQ"
#> [17] "FORMAT=ID=DP" "FORMAT=ID=ADP" "FORMAT=ID=BSDP" "FORMAT=ID=ACN"
queryMETA(vcf, element = "DP")
#> [[1]]
#> [1] "FORMAT=ID=DP" "Number=1" "Type=Integer"
#> [4] "Description=Read depth"
#>
#> [[2]]
#> [1] "FORMAT=ID=ADP"
#> [2] "Number=R"
#> [3] "Type=Integer"
#> [4] "Description=Counts for observed alleles"
#> [5] " including the reference allele"
#>
#> [[3]]
#> [1] "FORMAT=ID=BSDP"
#> [2] "Number=4"
#> [3] "Type=Integer"
#> [4] "Description=Number of base calls (depth) for the 4 nucleotides in called SNVs sorted as A"
#> [5] "C"
#> [6] "G"
#> [7] "T"
getINFO(vcf)[1:3]
#> [1] "NS=1;AN=1;AFS=0,2;OH=0.0;MAF=0.0;CNV=38;TYPE=INDEL;TA=frameshift_variant;TID=rna-gnl|WGS:JACYCF|g1.1_mrna;TGN=RHS01_00001;TCO=98.2;TACH=NH98NS"
#> [2] "NS=6;AN=2;AFS=8,4;OH=0.0;MAF=0.33;CNV=61;TA=intron_variant;TID=rna-gnl|WGS:JACYCF|g1.1_mrna;TGN=RHS01_00001"
#> [3] "NS=5;AN=1;AFS=0,10;OH=0.0;MAF=0.0;CNV=63;TA=intron_variant;TID=rna-gnl|WGS:JACYCF|g1.1_mrna;TGN=RHS01_00001"
vcf@gt[1:3, 1:2]
#> FORMAT 117T
#> [1,] "GT:PL:GQ:DP:ADP:ACN" "./.:80,6,0:0:2:0,2:2,0"
#> [2,] "GT:PL:GQ:DP:BSDP:ACN" "1/1:169,15,0:42:6:5,0,0,0:0,2"
#> [3,] "GT:PL:GQ:DP:BSDP:ACN" "1/1:174,15,0:42:6:0,0,0,5:0,2"
dp <- extract.gt(vcf, "DP", as.numeric = TRUE)
Add DP to INFO
dp[1:3, 1:4]
#> 117T AC13 AC14 AC15
#> JACYCF010000001.1_929_1 2 0 0 1
#> JACYCF010000001.1_956_2 6 1 1 4
#> JACYCF010000001.1_957_3 6 1 1 4
myDP <- rowSums(dp)
myDP <- paste("DP=", myDP, sep = "")
myDP[1:3]
#> [1] "DP=66" "DP=153" "DP=157"
vcf@fix[, 'INFO'] <- paste(vcf@fix[, 'INFO'], myDP, sep = ";")
vcf@fix[, 'INFO'][1:3]
#> [1] "NS=1;AN=1;AFS=0,2;OH=0.0;MAF=0.0;CNV=38;TYPE=INDEL;TA=frameshift_variant;TID=rna-gnl|WGS:JACYCF|g1.1_mrna;TGN=RHS01_00001;TCO=98.2;TACH=NH98NS;DP=66"
#> [2] "NS=6;AN=2;AFS=8,4;OH=0.0;MAF=0.33;CNV=61;TA=intron_variant;TID=rna-gnl|WGS:JACYCF|g1.1_mrna;TGN=RHS01_00001;DP=153"
#> [3] "NS=5;AN=1;AFS=0,10;OH=0.0;MAF=0.0;CNV=63;TA=intron_variant;TID=rna-gnl|WGS:JACYCF|g1.1_mrna;TGN=RHS01_00001;DP=157"
The MQ is less straight forward. We can summarize ‘GQ’ and use it as a surrogate. We need to name our summary ‘MQ’ so that ‘create.chromR()’ will recognize it.
dp <- extract.gt(vcf, "GQ", as.numeric = TRUE)
myDP <- rowSums(dp)
myDP <- paste("MQ=", myDP, sep = "")
vcf@fix[, 'INFO'] <- paste(vcf@fix[, 'INFO'], myDP, sep = ";")
vcf@fix[, 'INFO'][1:3]
#> [1] "NS=1;AN=1;AFS=0,2;OH=0.0;MAF=0.0;CNV=38;TYPE=INDEL;TA=frameshift_variant;TID=rna-gnl|WGS:JACYCF|g1.1_mrna;TGN=RHS01_00001;TCO=98.2;TACH=NH98NS;DP=66;MQ=42"
#> [2] "NS=6;AN=2;AFS=8,4;OH=0.0;MAF=0.33;CNV=61;TA=intron_variant;TID=rna-gnl|WGS:JACYCF|g1.1_mrna;TGN=RHS01_00001;DP=153;MQ=264"
#> [3] "NS=5;AN=1;AFS=0,10;OH=0.0;MAF=0.0;CNV=63;TA=intron_variant;TID=rna-gnl|WGS:JACYCF|g1.1_mrna;TGN=RHS01_00001;DP=157;MQ=222"
Recreate the ‘DP’ matrix that we ‘clobbered’ above.
dp <- extract.gt(vcf, "DP", as.numeric = TRUE)
chrom <- create.chromR(vcf, name="CHROM", seq=dna, ann=gff, verbose=TRUE)
#> Annotations include more than one chromosome.
#> JACYCF010000001.1, JACYCF010000010.1, JACYCF010000011.1, JACYCF010000012.1, JACYCF010000013.1, JACYCF010000014.1, JACYCF010000015.1, JACYCF010000016.1, JACYCF010000017.1, JACYCF010000018.1, JACYCF010000019.1, JACYCF010000002.1, JACYCF010000020.1, JACYCF010000021.1, JACYCF010000022.1, JACYCF010000023.1, JACYCF010000024.1, JACYCF010000025.1, JACYCF010000026.1, JACYCF010000027.1, JACYCF010000028.1, JACYCF010000029.1, JACYCF010000003.1, JACYCF010000030.1, JACYCF010000031.1, JACYCF010000032.1, JACYCF010000033.1, JACYCF010000034.1, JACYCF010000035.1, JACYCF010000036.1, JACYCF010000037.1, JACYCF010000038.1, JACYCF010000039.1, JACYCF010000004.1, JACYCF010000040.1, JACYCF010000041.1, JACYCF010000042.1, JACYCF010000043.1, JACYCF010000044.1, JACYCF010000045.1, JACYCF010000046.1, JACYCF010000047.1, JACYCF010000048.1, JACYCF010000049.1, JACYCF010000005.1, JACYCF010000050.1, JACYCF010000051.1, JACYCF010000052.1, JACYCF010000053.1, JACYCF010000054.1, JACYCF010000055.1, JACYCF010000056.1, JACYCF010000057.1, JACYCF010000058.1, JACYCF010000059.1, JACYCF010000006.1, JACYCF010000060.1, JACYCF010000061.1, JACYCF010000062.1, JACYCF010000063.1, JACYCF010000064.1, JACYCF010000065.1, JACYCF010000066.1, JACYCF010000067.1, JACYCF010000068.1, JACYCF010000069.1, JACYCF010000007.1, JACYCF010000070.1, JACYCF010000071.1, JACYCF010000072.1, JACYCF010000073.1, JACYCF010000074.1, JACYCF010000075.1, JACYCF010000076.1, JACYCF010000077.1, JACYCF010000078.1, JACYCF010000079.1, JACYCF010000008.1, JACYCF010000080.1, JACYCF010000081.1, JACYCF010000082.1, JACYCF010000083.1, JACYCF010000084.1, JACYCF010000085.1, JACYCF010000086.1, JACYCF010000087.1, JACYCF010000088.1, JACYCF010000089.1, JACYCF010000009.1, JACYCF010000090.1, JACYCF010000091.1, JACYCF010000092.1, JACYCF010000093.1, JACYCF010000094.1, JACYCF010000095.1, JACYCF010000096.1
#> Subsetting to the first chromosome
#> Names in vcf:
#> JACYCF010000001.1
#> Names of sequences:
#> JACYCF010000001.1
#> Names in annotation:
#> JACYCF010000001.1
#> Initializing var.info slot.
#> var.info slot initialized.
sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: aarch64-apple-darwin20 (64-bit)
#> Running under: macOS Monterey 12.4
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] reshape2_1.4.4 pinfsc50_1.2.0 ggplot2_3.3.6 vcfR_1.13.0
#>
#> loaded via a namespace (and not attached):
#> [1] tidyselect_1.1.2 xfun_0.32 memuse_4.2-1 purrr_0.3.4
#> [5] splines_4.2.1 lattice_0.20-45 colorspace_2.0-3 vctrs_0.4.1
#> [9] generics_0.1.3 htmltools_0.5.3 viridisLite_0.4.1 yaml_2.3.5
#> [13] mgcv_1.8-40 utf8_1.2.2 rlang_1.0.5 pillar_1.8.1
#> [17] glue_1.6.2 withr_2.5.0 lifecycle_1.0.1 plyr_1.8.8
#> [21] stringr_1.4.1 munsell_0.5.0 gtable_0.3.1 evaluate_0.16
#> [25] knitr_1.40 permute_0.9-7 fastmap_1.1.0 curl_4.3.2
#> [29] parallel_4.2.1 fansi_1.0.3 highr_0.9 Rcpp_1.0.9
#> [33] scales_1.2.1 vegan_2.6-2 mime_0.12 fs_1.5.2
#> [37] digest_0.6.29 stringi_1.7.8 dplyr_1.0.10 grid_4.2.1
#> [41] cli_3.4.0 tools_4.2.1 magrittr_2.0.3 tibble_3.1.8
#> [45] cluster_2.1.3 ape_5.6-2 pkgconfig_2.0.3 MASS_7.3-57
#> [49] Matrix_1.4-1 xml2_1.3.3 reprex_2.0.2 rmarkdown_2.16
#> [53] httr_1.4.4 rstudioapi_0.14 R6_2.5.1 nlme_3.1-157
#> [57] compiler_4.2.1
Created on 2023-01-17 with reprex v2.0.2
Hi Brian,
Thank you so much for fixing the problem so promptly! I think this is just what we were looking for. I actually think that reporting the GQ values makes more sense for us, because this is the population VCF and we already filter by mapping quality while calling variants.
Best, Juanita
Hi Brian,
Thank you so much for providing an answer to my previous post. I've created this new issue to address the problem and generated an example that hopefully will allow you to look at the problem and create a "fix" for the data. Below is also my session info.
Thanks, Juanita
DP_issue_example_data.zip
Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1 LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages: [1] stats graphics grDevices utils datasets methods base
other attached packages: [1] reshape2_1.4.4
loaded via a namespace (and not attached): [1] Rcpp_1.0.9 knitr_1.41 magrittr_2.0.3 tidyselect_1.2.0 munsell_0.5.0
[6] lattice_0.20-45 colorspace_2.0-3 ape_5.6-2 R6_2.5.1 rlang_1.0.6
[11] fastmap_1.1.0 fansi_1.0.3 stringr_1.5.0 plyr_1.8.8 dplyr_1.0.10
[16] tools_4.2.2 parallel_4.2.2 grid_4.2.2 nlme_3.1-160 gtable_0.3.1
[21] xfun_0.36 utf8_1.2.2 cli_3.6.0 htmltools_0.5.4 digest_0.6.31
[26] yaml_2.3.6 tibble_3.1.8 lifecycle_1.0.3 ggplot2_3.4.0 vctrs_0.5.1
[31] evaluate_0.19 glue_1.6.2 rmarkdown_2.19 stringi_1.7.12 compiler_4.2.2
[36] pillar_1.8.1 generics_0.1.3 scales_1.2.1 pkgconfig_2.0.3