zhengxwen / SeqArray

Data management of large-scale whole-genome sequence variant calls (Development version only)
http://www.bioconductor.org/packages/SeqArray
44 stars 12 forks source link

inconsistent behavior with seqSetFilter #27

Closed smgogarten closed 6 years ago

smgogarten commented 6 years ago

Setting a filter with action="intersect" results in different lengths returned for variant.id and chromosome:

> gds <- seqOpen(seqExampleFileName("gds"))
> sel <- c(rep(FALSE, 348), rep(TRUE, 1000))
> seqSetFilter(gds, variant.sel=sel)
# of selected variants: 1,000
> table(seqGetFilter(gds)$variant.sel)

FALSE  TRUE 
  348  1000 
> length(seqGetData(gds, "variant.id"))
[1] 1000
> length(seqGetData(gds, "chromosome"))
[1] 1000
> sel <- c(rep(FALSE, 500), rep(TRUE, 500))
> seqSetFilter(gds, variant.sel=sel, action="intersect")
# of selected variants: 1,000
> table(seqGetFilter(gds)$variant.sel)

FALSE  TRUE 
  848   500 
> length(seqGetData(gds, "variant.id"))
[1] 500
> length(seqGetData(gds, "chromosome"))
[1] 1000
> sessionInfo()
R Under development (unstable) (2018-01-01 r74017)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] SeqArray_1.19.6 gdsfmt_1.15.1  

loaded via a namespace (and not attached):
 [1] IRanges_2.13.28        Biostrings_2.47.9      crayon_1.3.4          
 [4] bitops_1.0-6           GenomeInfoDb_1.15.5    stats4_3.5.0          
 [7] zlibbioc_1.25.0        XVector_0.19.8         S4Vectors_0.17.35     
[10] tools_3.5.0            RCurl_1.95-4.10        parallel_3.5.0        
[13] compiler_3.5.0         BiocGenerics_0.25.3    GenomicRanges_1.31.22 
[16] GenomeInfoDbData_1.1.0
jonocarroll commented 6 years ago

I found another, perhaps more general instance of this bug but I believe it too has been remedied by this timely fix. Thank you!

Minimal reprex for posterity/in case this surfaces again/others using locked versions:

In 1.18.2:

library(SeqArray)
packageVersion("SeqArray")
# 1.18.2

gds <- seqOpen(seqExampleFileName("gds"))

table(seqGetFilter(gds)$variant.sel)
# TRUE 
# 1348

seqSetFilterCond(gds, maf=0.1, missing.rate=0, .progress=TRUE)
## of selected variants: 36 <------------- :-) 

table(seqGetFilter(gds)$variant.sel)
# FALSE  TRUE 
#  1312    36

dim(seqGetData(gds, "$dosage"))
# [1] 90 36

seqClose(gds)

In 1.19.6:

library(SeqArray)
packageVersion("SeqArray")
# 1.19.6

gds <- seqOpen(seqExampleFileName("gds"))

table(seqGetFilter(gds)$variant.sel)
# TRUE 
# 1348

seqSetFilterCond(gds, maf=0.1, missing.rate=0, .progress=TRUE)
## of selected variants: 1348 <------------- ! (too many)

table(seqGetFilter(gds)$variant.sel)
# FALSE  TRUE 
#  1312    36  <------------- !

dim(seqGetData(gds, "$dosage"))
# [1]   90 1348  <------------- ! (too many)

seqClose(gds)

In most recent update, 1.19.7:

library(SeqArray)
packageVersion("SeqArray")
# 1.19.7

gds <- seqOpen(seqExampleFileName("gds"))

table(seqGetFilter(gds)$variant.sel)
# TRUE 
# 1348

seqSetFilterCond(gds, maf=0.1, missing.rate=0, .progress=TRUE)
## of selected variants: 36 <------------- :-) 

table(seqGetFilter(gds)$variant.sel)
# FALSE  TRUE 
#  1312    36

dim(seqGetData(gds, "$dosage"))
# [1] 90 36

seqClose(gds)