Closed panxiaoguang closed 1 year ago
Hi @panxiaoguang ,
Here is a bash script that performs what is mentioned in the doc.
function ExtractSNPs {
AF_MIN=0.35
AF_MAX=0.65
if [[ $1 == 'X' ]]; then
VERSION='1c'
else
VERSION='5b'
fi
bcftools view -i "((EAS_AF>${AF_MIN} && EAS_AF<${AF_MAX}) || (SAS_AF>${AF_MIN} && SAS_AF<${AF_MAX}) || (EUR_AF>${AF_MIN} && EUR_AF<${AF_MAX}) || (AFR_AF>${AF_MIN} && AFR_AF<${AF_MAX}) || (AMR_AF>${AF_MIN} && AMR_AF<${AF_MAX})) && VT=\"SNP\" && MULTI_ALLELIC=0" ALL.chr${1}.phase3_shapeit2_mvncall_integrated_v${VERSION}.20130502.genotypes.vcf.gz | bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\n' - > OUT/G1000_chr${1}.txt
Rscript Clean_SNPs.R OUT/G1000_chr${1}.txt
}
echo 'Extracting SNPs'
export -f ExtractSNPs
echo -e "`seq 1 22`\nX" | xargs -n 1 -P 12 -t -I {} bash -c 'ExtractSNPs "$@"' _ {}
echo -e "\tChr\tPosition" > OUT/G1000.txt
for i in {1..22} X; do cat OUT/G1000_chr${i}.txt >> OUT/G1000.txt && rm OUT/G1000_chr${i}.txt; done
It requires to create an R script called Clean_SNPs.R
with the following content:
suppressPackageStartupMessages(library(GenomicRanges))
args=commandArgs(trailingOnly=T)
FILE=args[1]
stopifnot(file.exists(FILE) && file.info(FILE)$size>0)
LOCI=read.table(FILE,sep='\t',header=F,stringsAsFactors=F)
stopifnot(length(unique(LOCI$V1))==1)
LOCI=LOCI[order(LOCI$V2),]
#############################
# Remove duplicated entries #
#############################
IDs=paste0(LOCI$V1,'_',LOCI$V2)
TO_REMOVE=which(IDs %in% IDs[which(duplicated(IDs))])
if (length(TO_REMOVE)>0) {
LOCI=LOCI[-TO_REMOVE,]
}
rm(IDs,TO_REMOVE)
colnames(LOCI)=c('Chr','Position','Ref','Alt')
LOCI=makeGRangesFromDataFrame(LOCI,seqnames.field='Chr',start.field='Position',end.field='Position',ignore.strand=T,keep.extra.columns=T,starts.in.df.are.0based=F)
#####################################
# Remove ENCODE blacklisted regions #
#####################################
ENCODE_blacklist='hg19-blacklist.v2.bed'
ENCODE_blacklist=read.table(ENCODE_blacklist,sep='\t',header=F,stringsAsFactors=F)
ENCODE_blacklist$V4=NULL
ENCODE_blacklist$V1=gsub('^chr','',ENCODE_blacklist$V1)
ENCODE_blacklist=makeGRangesFromDataFrame(ENCODE_blacklist,seqnames.field='V1',start.field='V2',end.field='V3',ignore.strand=T,keep.extra.columns=F,starts.in.df.are.0based=T)
LOCI=LOCI[-findOverlaps(LOCI,ENCODE_blacklist)@from]
rm(ENCODE_blacklist)
###################
# Remove probloci #
###################
PROBLOCI='probloci_270415.txt.gz'
PROBLOCI=read.table(PROBLOCI,sep='\t',header=T,stringsAsFactors=F)
PROBLOCI=makeGRangesFromDataFrame(PROBLOCI,seqnames.field='Chr',start.field='Pos',end.field='Pos',ignore.strand=T,keep.extra.columns=F,starts.in.df.are.0based=F)
LOCI=LOCI[-findOverlaps(LOCI,PROBLOCI)@from]
rm(PROBLOCI)
#################
# Reformat data #
#################
LOCI=data.frame(LOCI,stringsAsFactors=F)
LOCI$seqnames=as.character(LOCI$seqnames)
LOCI=LOCI[,c(1,2,6,7)]
colnames(LOCI)=c('chromosome','position','a0','a1')
rownames(LOCI)=paste0(LOCI$chromosome,'_',LOCI$position)
ACGT_index=1:4
names(ACGT_index)=c('A','C','G','T')
stopifnot(all(LOCI$a0 %in% names(ACGT_index)))
stopifnot(all(LOCI$a1 %in% names(ACGT_index)))
LOCI$a0=ACGT_index[LOCI$a0]
LOCI$a1=ACGT_index[LOCI$a1]
write.table(LOCI[,1:2],file=gsub('_chr','_loci_chr',FILE),sep='\t',col.names=F,row.names=F,quote=F)
write.table(LOCI[,2:4],file=gsub('_chr','_alleles_chr',FILE),sep='\t',col.names=T,row.names=F,quote=F)
write.table(LOCI[,1:2],file=FILE,sep='\t',col.names=F,row.names=T,quote=F)
Then, GC and RT file can be generated using our scripts. Please mind the 'chr' string for chromosome names.
Cheers,
Tom.
Thank you so much!
I want to use my own normal samples to call germline het snps and then get snp_locis. could you please share your scripts to convert VCF file into snp_loci for WGS data