Closed zhengxwen closed 9 years ago
The R code of snpgdsVCF2GDS
in SNPRelate_0.9.19 runs correctly, if you need an immediate solution:
#######################################################################
# Convert a VCF (sequence) file to a GDS file (extract SNP data)
#
# INPUT:
# vcf.fn -- the file name of VCF format
# outfn.gds -- the output gds file
# nblock -- the number of lines in buffer
# method -- biallelic SNPs, or copy number of variants
# compress.annotation -- the compression method for sample and snp annotations
# verbose -- show information
#
snpgdsVCF2GDS <- function(vcf.fn, outfn.gds, nblock=1024,
method = c("biallelic.only", "copy.num.of.ref"),
compress.annotation="ZIP.max", snpfirstdim=FALSE, option = NULL,
verbose=TRUE)
{
# check
stopifnot(is.character(vcf.fn))
stopifnot(is.character(outfn.gds))
stopifnot(is.logical(snpfirstdim) & (length(snpfirstdim)==1))
method <- match.arg(method)
if (is.null(option)) option <- snpgdsOption()
######################################################################
# Scan VCF file -- get sample id
scan.vcf.sampid <- function(fn)
{
# open the vcf file
opfile <- file(fn, open="r")
# read header
fmtstr <- substring(readLines(opfile, n=1), 3)
samp.id <- NULL
while (length(s <- readLines(opfile, n=1)) > 0)
{
if (substr(s, 1, 6) == "#CHROM")
{
samp.id <- scan(text=s, what=character(0), sep="\t", quiet=TRUE)[-c(1:9)]
break
}
}
if (is.null(samp.id))
{
close(opfile)
stop("Error VCF format: invalid sample id!")
}
# close the file
close(opfile)
return(samp.id)
}
######################################################################
# Scan VCF file -- get marker information
scan.vcf.marker <- function(fn, method)
{
if (verbose)
cat(sprintf("\tfile: %s\n", fn))
# total number of rows and columns
Cnt <- count.fields(fn, sep="\t")
# check
if (any(Cnt != Cnt[1]))
stop(sprintf("The file (%s) has different numbers of columns.", fn))
line.cnt <- length(Cnt)
col.cnt <- max(Cnt)
if (verbose)
cat(sprintf("\tcontent: %d rows x %d columns\n", line.cnt, col.cnt))
# open the vcf file
opfile <- file(fn, open="r")
# read header
fmtstr <- substring(readLines(opfile, n=1), 3)
while (length(s <- readLines(opfile, n=1)) > 0)
{
if (substr(s, 1, 6) == "#CHROM")
break
}
# init ...
chr <- character(line.cnt); position <- integer(line.cnt)
snpidx <- integer(line.cnt); snp.rs <- character(line.cnt)
snp.allele <- character(line.cnt)
snp.cnt <- 0; var.cnt <- 0
if (method == "biallelic.only")
{
while (length(s <- readLines(opfile, n=nblock)) > 0)
{
for (i in 1:length(s))
{
var.cnt <- var.cnt + 1
ss <- scan(text=s[i], what=character(0), sep="\t", quiet=TRUE, n=5)
if (all(ss[c(4,5)] %in% c("A", "G", "C", "T", "a", "g", "c", "t")))
{
snp.cnt <- snp.cnt + 1
chr[snp.cnt] <- ss[1]
position[snp.cnt] <- as.integer(ss[2])
snpidx[snp.cnt] <- var.cnt
snp.rs[snp.cnt] <- ss[3]
snp.allele[snp.cnt] <- paste(ss[4], ss[5], sep="/")
}
}
}
} else {
while (length(s <- readLines(opfile, n=nblock)) > 0)
{
for (i in 1:length(s))
{
var.cnt <- var.cnt + 1
ss <- scan(text=s[i], what=character(0), sep="\t", quiet=TRUE, n=5)
snp.cnt <- snp.cnt + 1
chr[snp.cnt] <- ss[1]
position[snp.cnt] <- as.integer(ss[2])
snpidx[snp.cnt] <- var.cnt
snp.rs[snp.cnt] <- ss[3]
snp.allele[snp.cnt] <- paste(ss[4], ss[5], sep="/")
}
}
}
# close the file
close(opfile)
# chromosomes
chr <- chr[1:snp.cnt]
flag <- match(chr, names(option$chromosome.code))
chr[!is.na(flag)] <- unlist(option$chromosome.code)[ flag[!is.na(flag)] ]
chr <- suppressWarnings(as.integer(chr))
chr[is.na(chr)] <- -1
snp.allele <- gsub(".", "/", snp.allele[1:snp.cnt], fixed=TRUE)
list(chr = chr, position = position[1:snp.cnt],
snpidx = snpidx[1:snp.cnt], snp.rs = snp.rs[1:snp.cnt],
snp.allele = snp.allele
)
}
######################################################################
# Scan VCF file -- get marker information
scan.vcf.geno <- function(fn, gGeno, method, start)
{
# matching codes
geno.str <- c("0|0", "0|1", "1|0", "1|1", "0/0", "0/1", "1/0", "1/1",
"0", "1",
"0|0|0", "0|0|1", "0|1|0", "0|1|1", "1|0|0", "1|0|1", "1|1|0", "1|1|1",
"0/0/0", "0/0/1", "0/1/0", "0/1/1", "1/0/0", "1/0/1", "1/1/0", "1/1/1")
geno.code <- as.integer(c(2, 1, 1, 0, 2, 1, 1, 0,
1, 0,
2, 1, 1, 1, 1, 1, 1, 0,
2, 1, 1, 1, 1, 1, 1, 0))
# open the vcf file
opfile <- file(fn, open="r")
# read header
fmtstr <- substring(readLines(opfile, n=1), 3)
while (length(s <- readLines(opfile, n=1)) > 0)
{
if (substr(s, 1, 6) == "#CHROM")
break
}
# scan
snp.cnt <- start
if (method == "biallelic.only")
{
while (length(s <- readLines(opfile, n=nblock)) > 0)
{
gx <- NULL
for (i in 1:length(s))
{
ss <- scan(text=s[i], what=character(0), sep="\t", quiet=TRUE, n=5)
if (all(ss[c(4,5)] %in% c("A", "G", "C", "T", "a", "g", "c", "t")))
{
ss <- scan(text=s[i], what=character(0), sep="\t", quiet=TRUE)[-c(1:9)]
ss <- sapply(strsplit(ss, ":"), FUN = function(x) x[1])
x <- match(ss, geno.str)
x <- geno.code[x]
x[is.na(x)] <- as.integer(3)
gx <- cbind(gx, x)
}
}
if (!is.null(gx))
{
if (snpfirstdim)
write.gdsn(gGeno, t(gx), start=c(snp.cnt,1), count=c(ncol(gx),-1))
else {
print(snp.cnt)
write.gdsn(gGeno, gx, start=c(1,snp.cnt), count=c(-1,ncol(gx)))
}
snp.cnt <- snp.cnt + ncol(gx)
}
}
} else {
while (length(s <- readLines(opfile, n=nblock)) > 0)
{
gx <- NULL
for (i in 1:length(s))
{
ss <- scan(text=s[i], what=character(0), sep="\t", quiet=TRUE)[-c(1:9)]
x <- sapply(strsplit(ss, ":"), FUN = function(x) {
a <- unlist(strsplit(x[1], ""))
if (any(a == "."))
NA
else
sum(a == "0")
})
x[x > 2] <- 2
x[is.na(x)] <- as.integer(3)
gx <- cbind(gx, x)
}
if (!is.null(gx))
{
if (snpfirstdim)
write.gdsn(gGeno, t(gx), start=c(snp.cnt,1), count=c(ncol(gx),-1))
else
write.gdsn(gGeno, gx, start=c(1,snp.cnt), count=c(-1,ncol(gx)))
snp.cnt <- snp.cnt + ncol(gx)
}
}
}
# close the file
close(opfile)
snp.cnt - start
}
######################################################################
######################################################################
# Starting ...
######################################################################
######################################################################
if (verbose)
{
cat("Start snpgdsVCF2GDS ...\n")
if (method == "biallelic.only")
cat("\tExtracting bi-allelic and polymorhpic SNPs.\n")
else
cat("\tStoring dosage of the reference allele for all variant sites, including bi-allelic SNPs, multi-allelic SNPs, indels and structural variants.\n")
cat("\tScanning ...\n")
}
####################################
# sample.id
sample.id <- NULL
for (fn in vcf.fn)
{
s <- scan.vcf.sampid(fn)
if (!is.null(sample.id))
{
if (length(sample.id) != length(s))
stop("All VCF files should have the same sample id.")
if (any(sample.id != s))
stop("All VCF files should have the same sample id.")
} else
sample.id <- s
}
####################################
# genetic markers
all.chr <- integer()
all.position <- integer()
all.snpidx <- integer()
all.snp.rs <- character()
all.snp.allele <- character()
for (fn in vcf.fn)
{
v <- scan.vcf.marker(fn, method)
all.chr <- c(all.chr, v$chr)
all.position <- c(all.position, v$position)
all.snpidx <- c(all.snpidx, length(all.snpidx) + v$snpidx)
all.snp.rs <- c(all.snp.rs, v$snp.rs)
all.snp.allele <- c(all.snp.allele, v$snp.allele)
}
####################################
# genetic variants
nSamp <- length(sample.id)
nSNP <- length(all.chr)
if (verbose)
{
cat(date(), "\tstore sample id, snp id, position, and chromosome.\n")
cat(sprintf("\tstart writing: %d samples, %d SNPs ...\n", nSamp, nSNP))
}
######################################################################
# create GDS file
#
gfile <- createfn.gds(outfn.gds)
# add "sample.id"
add.gdsn(gfile, "sample.id", sample.id, compress=compress.annotation, closezip=TRUE)
# add "snp.id"
add.gdsn(gfile, "snp.id", as.integer(all.snpidx), compress=compress.annotation, closezip=TRUE)
# add "snp.rs.id"
add.gdsn(gfile, "snp.rs.id", all.snp.rs, compress=compress.annotation, closezip=TRUE)
# add "snp.position"
add.gdsn(gfile, "snp.position", all.position, compress=compress.annotation, closezip=TRUE)
# add "snp.chromosome"
v.chr <- add.gdsn(gfile, "snp.chromosome", all.chr, storage="int32", compress=compress.annotation, closezip=TRUE)
# add "snp.allele"
add.gdsn(gfile, "snp.allele", all.snp.allele, compress=compress.annotation, closezip=TRUE)
# snp.chromosome
put.attr.gdsn(v.chr, "autosome.start", option$autosome.start)
put.attr.gdsn(v.chr, "autosome.end", option$autosome.end)
for (i in 1:length(option$chromosome.code))
{
put.attr.gdsn(v.chr, names(option$chromosome.code)[i],
option$chromosome.code[[i]])
}
# sync file
sync.gds(gfile)
# add "gonetype", 2 bits to store one genotype
if (snpfirstdim)
{
gGeno <- add.gdsn(gfile, "genotype", storage="bit2", valdim=c(nSNP, nSamp))
put.attr.gdsn(gGeno, "snp.order")
} else {
gGeno <- add.gdsn(gfile, "genotype", storage="bit2", valdim=c(nSamp, nSNP))
put.attr.gdsn(gGeno, "sample.order")
}
# sync file
sync.gds(gfile)
####################################
# genetic genotypes
snp.start <- 1
for (fn in vcf.fn)
{
if (verbose)
cat(sprintf("\tfile: %s\n", fn))
s <- scan.vcf.geno(fn, gGeno, method, start=snp.start)
snp.start <- snp.start + s
sync.gds(gfile) # sync file
}
# close files
closefn.gds(gfile)
if (verbose) cat(date(), "\tDone.\n")
return(invisible(NULL))
}
Or upgrade gdsfmt to v1.1.1.1
It is fixed in SNPRelate_1.1.1!
It is observed that when using SNPRelate 0.99 in R 3.1, the function of snpgdsVCF2GDS seems randomly to exit.
But the snpgdsVCF2GDS in SNPRelate 0.9.1 worked properly in the same VCF file.