knausb / vcfR

Tools to work with variant call format files
240 stars 54 forks source link

extract.gt() missing heterozygotes #208

Open samarth8392 opened 1 year ago

samarth8392 commented 1 year ago

Hi, I have a vcf file that I read using vcfR

> head(vcf)
[1] "***** Object of class 'vcfR' *****"
[1] "***** Meta section *****"
[1] "##fileformat=VCFv4.2"
[1] "##ALT=<ID=NON_REF,Description=\"Represents any possible alternative a [Truncated]"
[1] "##FILTER=<ID=AN323,Description=\"AN < 323.0\">"
[1] "##FILTER=<ID=FS40,Description=\"FS > 40.0\">"
[1] "##FILTER=<ID=LowQual,Description=\"Low quality\">"
[1] "##FILTER=<ID=MQ20,Description=\"MQ < 20.0\">"
[1] "First 6 rows."
[1] 
[1] "***** Fixed section *****"
     CHROM       POS       ID REF ALT QUAL      FILTER
[1,] "Scate-ma1" "67509"   NA "C" "A" "738.06"  "PASS"
[2,] "Scate-ma1" "67559"   NA "C" "T" "1630.99" "PASS"
[3,] "Scate-ma1" "219664"  NA "G" "A" "228.43"  "PASS"
[4,] "Scate-ma1" "1152241" NA "A" "C" "34006.6" "PASS"
[5,] "Scate-ma1" "1152242" NA "G" "C" "34016.8" "PASS"
[6,] "Scate-ma1" "1311761" NA "G" "A" "891.18"  "PASS"
[1] 
[1] "***** Genotype section *****"
     FORMAT                     
[1,] "GT:AD:DP:GQ:PL"           
[2,] "GT:AD:DP:GQ:PL"           
[3,] "GT:AD:DP:GQ:PGT:PID:PL:PS"
[4,] "GT:AD:DP:GQ:PGT:PID:PL:PS"
[5,] "GT:AD:DP:GQ:PGT:PID:PL:PS"
[6,] "GT:AD:DP:GQ:PL"           
     sca0947                                            
[1,] "0/0:10,0:10:30:0,30,274"                          
[2,] "0/0:11,0:11:33:0,33,341"                          
[3,] "0/0:9,0:9:27:.:.:0,27,236:."                      
[4,] "0|1:17,11:29:99:0|1:1152235_C_A:411,0,681:1152235"
[5,] "0|1:17,11:29:99:0|1:1152235_C_A:411,0,681:1152235"
[6,] "0/0:16,0:16:48:0,48,426"                          
     sca0949                                            
[1,] "0/0:27,0:27:72:0,72,1080"                         
[2,] "0/0:25,0:25:69:0,69,1035"                         
[3,] "0/0:13,0:13:39:.:.:0,39,331:."                    
[4,] "0|1:24,10:34:99:0|1:1152215_G_T:348,0,978:1152215"
[5,] "0|1:24,10:34:99:0|1:1152215_G_T:348,0,978:1152215"
[6,] "0/0:15,0:15:45:0,45,403"                          
     sca0950                                            
[1,] "0/0:10,0:10:27:0,27,405"                          
[2,] "0/0:12,0:12:36:0,36,321"                          
[3,] "0/0:20,0:20:54:.:.:0,54,810:."                    
[4,] "0|1:15,17:38:99:0|1:1152235_C_A:669,0,579:1152235"
[5,] "0|1:15,17:38:99:0|1:1152235_C_A:669,0,579:1152235"
[6,] "0/0:19,0:19:57:0,57,488"                          
     sca0951                                            
[1,] "0/0:11,0:11:33:0,33,306"                          
[2,] "0/0:17,0:17:51:0,51,464"                          
[3,] "0/0:12,0:12:36:.:.:0,36,300:."                    
[4,] "0|1:19,11:34:99:0|1:1152215_G_T:405,0,765:1152215"
[5,] "0|1:19,11:34:99:0|1:1152215_G_T:405,0,765:1152215"
[6,] "0/0:16,0:16:48:0,48,416"                          
     sca1113                                          
[1,] "0/0:12,0:12:24:0,24,360"                        
[2,] "0/0:16,0:16:48:0,48,488"                        
[3,] "0/0:7,0:7:21:.:.:0,21,185:."                    
[4,] "0|1:6,4:10:99:0|1:1152215_G_T:150,0,240:1152215"
[5,] "0|1:6,4:10:99:0|1:1152215_G_T:150,0,240:1152215"
[6,] "0/0:15,0:15:42:0,42,630"                        
[1] "First 6 columns only."
[1] 
[1] "Unique GT formats:"
[1] "GT:AD:DP:GQ:PL"            "GT:AD:DP:GQ:PGT:PID:PL:PS"
[1] 

and then I used extract.gt() with return.alleles = T del.gt <- extract.gt(vcf, element = "GT",return.alleles = T)

and I get:

                 sca0947 sca0949 sca0950 sca0951 sca1113 sca1115 sca1117
Scate-ma1_67509   "C/C"   "C/C"   "C/C"   "C/C"   "C/C"   "C/C"   "C/C"  
Scate-ma1_67559   "C/C"   "C/C"   "C/C"   "C/C"   "C/C"   "C/C"   "C/C"  
Scate-ma1_219664  "G/G"   "G/G"   "G/G"   "G/G"   "G/G"   "G/G"   "G/G"  
Scate-ma1_1152241 "A|C"   "A|C"   "A|C"   "A|C"   "A|C"   "A|C"   "A|C"  
Scate-ma1_1152242 "G|C"   "G|C"   "G|C"   "G|C"   "G|C"   "G|C"   "G|C"  
Scate-ma1_1311761 "G/G"   "G/G"   "G/G"   "G/G"   "G/G"   "G/G"   "G/G"  
                  sca1120
Scate-ma1_67509   "C/C"  
Scate-ma1_67559   "C/C"  
Scate-ma1_219664  "G/G"  
Scate-ma1_1152241 "A|C"  
Scate-ma1_1152242 "G|C"  
Scate-ma1_1311761 "G/G"  

HOWEVER, if I use extract.gt() with as.numeric = TRUE del.gt <- extract.gt(vcf, element = "GT",as.numeric = TRUE)

I get:

                  sca0947 sca0949 sca0950 sca0951 sca1113 sca1115 sca1117
Scate-ma1_67509         0       0       0       0       0       0       0
Scate-ma1_67559         0       0       0       0       0       0       0
Scate-ma1_219664        0       0       0       0       0       0       0
Scate-ma1_1152241       0       0       0       0       0       0       0
Scate-ma1_1152242       0       0       0       0       0       0       0
Scate-ma1_1311761       0       0       0       0       0       0       0
                  sca1120
Scate-ma1_67509         0
Scate-ma1_67559         0
Scate-ma1_219664        0
Scate-ma1_1152241       0
Scate-ma1_1152242       0
Scate-ma1_1311761       0

As you can clearly see, at SNP Scate-ma1_1152241, all individuals are heterozygotes, but the numeric value is 0 for all of them. The only reason I could think of for this discrepancy is some genotypes are Ref/Alt while others are Ref|Alt

Do you think there's an issue with 0/1 or 0|1 notation for genotypes that's causing extract.gt() to miss heterozygotes or am I doing something wrong here?

Any help figuring this out would be appreciated.

Thanks, Samarth

samarth8392 commented 1 year ago

I forgot to mention, the 0/1 vs 0|1 issue is captured if I run extract.gt() with return.alleles = F

                  sca0947 sca0949 sca0950 sca0951 sca1113 sca1115 sca1117
Scate-ma1_67509   "0/0"   "0/0"   "0/0"   "0/0"   "0/0"   "0/0"   "0/0"  
Scate-ma1_67559   "0/0"   "0/0"   "0/0"   "0/0"   "0/0"   "0/0"   "0/0"  
Scate-ma1_219664  "0/0"   "0/0"   "0/0"   "0/0"   "0/0"   "0/0"   "0/0"  
Scate-ma1_1152241 "0|1"   "0|1"   "0|1"   "0|1"   "0|1"   "0|1"   "0|1"  
Scate-ma1_1152242 "0|1"   "0|1"   "0|1"   "0|1"   "0|1"   "0|1"   "0|1"  
Scate-ma1_1311761 "0/0"   "0/0"   "0/0"   "0/0"   "0/0"   "0/0"   "0/0"  
                  sca1120
Scate-ma1_67509   "0/0"  
Scate-ma1_67559   "0/0"  
Scate-ma1_219664  "0/0"  
Scate-ma1_1152241 "0|1"  
Scate-ma1_1152242 "0|1"  
Scate-ma1_1311761 "0/0"