zhengxwen / SeqArray

Data management of large-scale whole-genome sequence variant calls (Development version only)
http://www.bioconductor.org/packages/SeqArray
43 stars 12 forks source link

how to creat an empty SeqArray GDS file #62

Closed terrryliu closed 4 years ago

terrryliu commented 4 years ago

Dear SeqArry Creater, Thanks for the hard work. I went through the help pages for creating an empty SeqArray GDS file, however I only find the function seqVCF2GDS to convert a VCF file or open the example data embeded in the package. I want to find the parallel function createfn.gds of gdsfmtpackage to create a new empty SeqArray GDS file. Can you please help me? Thanks a lot.

terrryliu commented 4 years ago

Or how to create an empty SeqVarGDSClass Object, then I can use seqAddValue to storage information and seqExport to create the SeqArray GDS file. I tried to use seqAddValue on the empty GDS file created by function createfn.gds unfortunately it didn't work.

terrryliu commented 4 years ago

I have to use the Package SAIGEgds in simulation while the function only accept the SeqArray GDS file format not the general gds.class format object which is created by createfn.gds. Sorry to bother, I really want find a simple way to creat SeqArray GDS file and write the simulated genotype data into it.

zhengxwen commented 4 years ago

Here is an example:

library(gdsfmt)
library(parallel)

# Create a simulated genotype file given by a specified MAC under HWE
# Input:
#   outfn    -- the output file name
#   samp_id  -- sample IDs (length(samp_id) is the number of samples)
#   num      -- the total number of variants
#   MAC      -- minor allele count
#   njob     -- the number of processes for parallel computing
#   compress -- the compression algorithm
CreateGDS <- function(outfn, samp_id, num, MAC, njob=1L, compress="LZMA_RA")
{
    # check
    stopifnot(is.character(outfn), length(outfn)==1L)
    stopifnot(is.vector(samp_id))
    stopifnot(is.numeric(num), length(num)==1L)
    stopifnot(is.numeric(MAC), length(MAC)==1L)
    stopifnot(is.numeric(njob), length(njob)==1L)
    stopifnot(is.character(compress), length(compress)==1L)

    tm <- proc.time()
    cat("Output: ", outfn, "\n", sep="")
    outf <- createfn.gds(outfn)
    on.exit({
        closefn.gds(outf)
        cat("Done.\n")
        print(proc.time() - tm)
    })

    put.attr.gdsn(outf$root, "FileFormat", "SEQ_ARRAY")
    put.attr.gdsn(outf$root, "FileVersion", "v1.0")
    addfolder.gdsn(outf, "description")

    val <- seq_len(num)
    cat("    sample.id\n")
    add.gdsn(outf, "sample.id", samp_id, compress=compress, closezip=TRUE)
    cat("    variant.id\n")
    add.gdsn(outf, "variant.id", val, compress=compress, closezip=TRUE)
    cat("    position\n")
    add.gdsn(outf, "position", val, compress=compress, closezip=TRUE)
    cat("    chromosome\n")
    add.gdsn(outf, "chromosome", rep(1L, num), compress=compress, closezip=TRUE)
    cat("    allele\n")
    add.gdsn(outf, "allele", rep("A,T", num), compress=compress, closezip=TRUE)

    # add genotypes
    addgeno <- function(nd, n)
    {
        nzero <- length(samp_id)*2L
        zero <- raw(nzero)
        for (i in seq_len(n))
        {
            x <- zero
            x[sample.int(nzero, MAC)] <- as.raw(1L)
            append.gdsn(nd, x)
        }
    }

    cat("    genotype\n")
    nfolder <- addfolder.gdsn(outf, "genotype")
    nd <- add.gdsn(nfolder, "data", valdim=c(2L, length(samp_id), 0L),
        storage="bit2", compress=compress)
    if (njob <= 1)
    {
        addgeno(nd, num)
    } else {
        tmpfn <- tempfile(pattern=sprintf("%s_%02d_",
            sub("^([^.]*).*", "\\1", basename(outfn)), 1:njob),
            tmpdir=dirname(outfn))
        cat("Writing to:\n")
        for (fn in tmpfn) cat("    ", fn, " ...\n", sep="")
        tmpst <- SeqArray:::.file_split(num, njob)
        mclapply(1:njob, function(i) {
            f <- createfn.gds(tmpfn[i])
            on.exit(closefn.gds(f))
            n <- add.gdsn(f, "data", valdim=c(2L, length(samp_id), 0L),
                storage="bit2", compress=compress)
            addgeno(n, tmpst[[2L]][i])
            readmode.gdsn(n)
            NULL
        }, mc.cores=njob)
        cat("Merging ...\n")
        for (fn in tmpfn)
        {
            cat("    ", fn, "\n", sep="")
            f <- openfn.gds(fn)
            append.gdsn(nd, index.gdsn(f, "data"))
            closefn.gds(f)
        }
        unlink(tmpfn, force=TRUE)
    }
    readmode.gdsn(nd)

    add.gdsn(nfolder, "@data", rep(as.raw(1L), num), storage="uint8",
        compress=compress, closezip=TRUE)

    invisible()
}
terrryliu commented 4 years ago

Thanks for the advice, but the file created by createfn.gds is not a SeqArray GDS file. I tried the simple codes below

library(gdsfmt)
gfile <- createfn.gds("test.gds")
add.gdsn(gfile, "sample.id", val=1:1000)
add.gdsn(gfile,"genotype",val=array(sample(c(0,1),2*1000*10000,replace=T),dim=c(2,1000,10000)))
closefn.gds(gfile)
testgds <- "test.gds"
pheno <- data.frame(sample.id = 1:1000, y=sample(c(0,1),1000,replace=T))
library(SAIGEgds)
glmm <- seqFitNullGLMM_SPA(y ~ 1, pheno, testgds, trait.type="binary", sample.col="sample.id")

The returned error is Error in seqOpen(gdsfile) : 'test.gds' is not a SeqArray GDS file. Can you please provide a way to do the random simulation by SAIGEgds. Thank you so much :)

zhengxwen commented 4 years ago

You should add a complete structure to the gds file:

    put.attr.gdsn(outf$root, "FileFormat", "SEQ_ARRAY")
    put.attr.gdsn(outf$root, "FileVersion", "v1.0")
    addfolder.gdsn(outf, "description")

    val <- seq_len(num)
    cat("    sample.id\n")
    add.gdsn(outf, "sample.id", samp_id, compress=compress, closezip=TRUE)
    cat("    variant.id\n")
    add.gdsn(outf, "variant.id", val, compress=compress, closezip=TRUE)
    cat("    position\n")
    add.gdsn(outf, "position", val, compress=compress, closezip=TRUE)
    cat("    chromosome\n")
    add.gdsn(outf, "chromosome", rep(1L, num), compress=compress, closezip=TRUE)
    cat("    allele\n")
    add.gdsn(outf, "allele", rep("A,T", num), compress=compress, closezip=TRUE)
terrryliu commented 4 years ago

Thank you so much, problem is solved. ;) Best wishes.