hypotheses
Aj. kub
Could you please check whether my code conforms to your guideline?
In the code, Steps 1 and 2 originally come from P'mint.
You can skip the setup part.
Setup part
library(doParallel)

# rbind two data frames, keeping only the columns they share
rbind.match.columns <- function(input1 = 0, input2 = 0) {
  n.input1 <- ncol(input1)
  n.input2 <- ncol(input2)
  if (is.null(n.input1)) {
    n.input1 <- 0
  }
  if (is.null(n.input2)) {
    n.input2 <- 0
  }
  # take the shared column names from the narrower input
  if (n.input2 < n.input1) {
    TF.names <- which(names(input2) %in% names(input1))
    column.names <- names(input2)[TF.names]
  } else {
    TF.names <- which(names(input1) %in% names(input2))
    column.names <- names(input1)[TF.names]
  }
  return(rbind(input1[, column.names], input2[, column.names]))
}
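# For reference, a tiny usage example of this helper on toy data frames (not
# part of the pipeline): it row-binds the two inputs, keeping only the columns
# they share.
#   a <- data.frame(x = 1:2, y = c("u", "v"))
#   b <- data.frame(y = "w", x = 3, z = TRUE)
#   rbind.match.columns(a, b)   # three rows, shared columns x and y only, z dropped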
# set up the parallel backend to use multiple processors
cores <- detectCores()
cl <- makeCluster(cores[1] - 3)  # leave some cores free so the machine is not overloaded (e.g. 7 - 3 = 4 workers)
registerDoParallel(cl)
filename <- "b212wgs_annotation_output.vep.chr17.hg19.gz"
index <- 0
counter <- 0
chunks <- 100000   # rows per chunk
con <- file(filename, "r")
skip_headerLine <- 343   # lines to skip before the column header
data_header <- names(read.csv(filename, nrows = 1, sep = "\t",
                              skip = skip_headerLine, header = TRUE,
                              check.names = FALSE))
ready_to_write <- data.frame(matrix(ncol = length(data_header), nrow = 0))
names(ready_to_write) <- data_header
Processing Part
system.time(
  repeat {
    eof <- tryCatch({
      vepdata <- read.table(con, nrows = chunks, header = FALSE, fill = FALSE,
                            sep = "\t", comment.char = "#", check.names = FALSE)
      # process the chunk: attach the header, convert data types
      colnames(vepdata) <- data_header
      # NOTE: foreach() over the single value nrow(vepdata) runs this body once per
      # chunk; .combine = rbind only matters if the iterator is later changed to
      # split the chunk across the registered workers
      finalMatrix <- foreach(i = nrow(vepdata), .combine = rbind) %dopar% {
        # Step 1: recode variant impact as an integer and sort by descending severity
        vepdata$sort_impact <- as.integer(ifelse(vepdata$IMPACT == "LOW", "1",
                                          ifelse(vepdata$IMPACT == "MODIFIER", "2",
                                          ifelse(vepdata$IMPACT == "MODERATE", "3",
                                          ifelse(vepdata$IMPACT == "HIGH", "4", NA)))))
        vepdata <- vepdata[order(-vepdata$sort_impact), ]
        # keep only HIGH- and MODERATE-impact variants
        filterimpact <- vepdata[which(vepdata$IMPACT == "HIGH" | vepdata$IMPACT == "MODERATE"), ]
        # Step 2: filter on clinical significance ("CLIN_SIG")
        ## if CLIN_SIG reports benign or likely_benign, exclude the variant
        uncertain <- grepl("uncer", filterimpact$CLIN_SIG)  # "uncertain_significance"
        notpro    <- grepl("pro",   filterimpact$CLIN_SIG)  # "not_provided"
        benign    <- grepl("ben",   filterimpact$CLIN_SIG)  # "benign" or "likely_benign"
        patho     <- grepl("patho", filterimpact$CLIN_SIG)  # "pathogenic" or "likely_pathogenic"
        na        <- grepl("-",     filterimpact$CLIN_SIG)  # missing value "-"
        # keep pathogenic calls, plus uncertain/missing calls that are neither not_provided nor benign
        select_clinsig <- patho | ((uncertain | na) & !notpro & !benign)
        clinsig <- cbind.data.frame(filterimpact, select_clinsig)
        clinsig <- clinsig[clinsig$select_clinsig, ]  # select_clinsig is already logical
        # Step 3: filter MODERATE-impact records by MetaSVM (D = deleterious, T = tolerated)
        ## if MetaSVM reports T, exclude the variant
        # convert CADD_phred to numeric
        clinsig$CADD_phred <- as.numeric(as.character(clinsig$CADD_phred))
        # keep records where (MetaSVM == "D" or "-") AND (CADD Phred score >= 15)
        temp_data <- clinsig[!(clinsig$IMPACT == "MODERATE" & clinsig$CADD_phred < 15), ]
        temp_data <- temp_data[!(temp_data$IMPACT == "MODERATE" & temp_data$MetaSVM_pred == "T"), ]
        # Step 4: filter out common variants
        # select the allele-frequency (AF) columns
        afdata <- temp_data[, grep("^AF$|_AF$", names(temp_data))]
        col <- c("AF","AFR_AF","AMR_AF","EAS_AF","EUR_AF","SAS_AF","AA_AF","EA_AF","gnomAD_AF","gnomAD_AFR_AF",
                 "gnomAD_AMR_AF","gnomAD_ASJ_AF","gnomAD_EAS_AF","gnomAD_FIN_AF","gnomAD_NFE_AF","gnomAD_OTH_AF",
                 "gnomAD_SAS_AF","1000Gp3_AF","1000Gp3_AFR_AF","1000Gp3_AMR_AF","1000Gp3_EUR_AF","1000Gp3_SAS_AF",
                 "ESP6500_AA_AF","ESP6500_EA_AF","ExAC_AF","ExAC_AFR_AF",
                 "ExAC_AMR_AF","ExAC_Adj_AF","ExAC_EAS_AF","ExAC_FIN_AF","ExAC_NFE_AF","ExAC_SAS_AF")
        afdata[col] <- sapply(afdata[col], as.numeric)
        # flag variants whose allele frequency is at most 0.1 in every population
        afdata <- afdata <= 0.1
        afdata[is.na(afdata)] <- TRUE   # treat a missing frequency as rare
        mafdata <- temp_data[which(apply(as.data.frame(afdata), 1, all)), ]
        mafdata  # the value of the foreach body becomes finalMatrix
      }
      ready_to_write <- rbind.match.columns(ready_to_write, finalMatrix)
      # increment the index and report progress
      index <- index + 1
      print(paste('Processing rows:', index * chunks))
      # a short read means the end of the file has been reached
      nrow(vepdata) < chunks
    }, error = function(err) {
      ## matching the condition message only works when R's messages are not translated
      if (identical(conditionMessage(err), "no lines available in input"))
        TRUE   # read.table found no more lines: end of file
      else
        stop(err)
    })
    # break cannot be used inside the error handler (it is a separate function),
    # so the handler returns TRUE and the loop exits here instead
    if (eof) break
  }
)
Clean and Write output
## drop rows that are NA in every column (unexpected events)
ready_to_write <- ready_to_write[rowSums(is.na(ready_to_write)) != ncol(ready_to_write), ]
write.csv(ready_to_write, "filter.v2.csv", na = "", row.names = FALSE)
close(con)
#stop cluster
stopCluster(cl)
Thank you kub, K2
Changes to add to VEP_filter.Chunky.bwgs.R
[ ] When using read.table, specifying as.is=TRUE avoids converting character strings to factors and can save quite a lot of time. Please check whether this option breaks any existing code in the logic tests; if not, it should be safe to incorporate into the template. A sketch of the change follows below.
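A minimal sketch of that change, reusing the chunked read from the script above (as.is is a standard read.table argument; the surrounding variable names come from the script):

vepdata <- read.table(con, nrows = chunks, header = FALSE, fill = FALSE,
                      sep = "\t", comment.char = "#", check.names = FALSE,
                      as.is = TRUE)   # keep strings as character, not factor
colnames(vepdata) <- data_header
# logic tests such as vepdata$IMPACT == "HIGH" behave the same on character
# columns, and as.numeric(as.character(x)) can then be simplified to as.numeric(x)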
[ ] When merging pathogenic_clinvar into the final mafdata, there are still duplicated rows at the moment. We need to fix this as well; see the sketch below.
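One possible fix, sketched under the assumption that both tables are plain data frames; the name merged is a placeholder, and pathogenic_clinvar follows the item above (the merge itself is not shown in this thread):

merged <- rbind.match.columns(mafdata, pathogenic_clinvar)   # align shared columns
merged <- merged[!duplicated(merged), ]   # drop exact duplicate rows
# equivalently: merged <- unique(merged)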
Note krub, please help fix the order of the filters using the guideline below.
Comments: mainly, I filter on Impact early on to remove as many variants as possible based on the "Gene Structure" model, which relies the least on any prediction algorithm: the Impact classification is based on the known structural location of the change within the gene and on biological knowledge of how changes to the gene or protein may affect protein function. Regulatory-region variants might reduce or increase the gene expression level, but they will not completely disrupt the gene function; hence, the impact of these regulatory variants is less severe than disruption of the protein function. However, regulatory variants can still cause disease. We usually consider them when we have more evidence, such as evidence from an association test, e.g. GWAS.
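To make the intended order concrete, here is a short sketch of the filters in that sequence, assuming the column names used in the script above (the ordering is the point; the thresholds are copied from Steps 1-4):

# 1. Gene-structure model first: keep HIGH/MODERATE impact (fewest assumptions)
step1 <- vepdata[vepdata$IMPACT %in% c("HIGH", "MODERATE"), ]
# 2. Clinical significance: drop benign / likely_benign calls
step2 <- step1[!grepl("ben", step1$CLIN_SIG), ]
# 3. Prediction algorithms (MetaSVM, CADD) on MODERATE-impact records only
keep <- !(step2$IMPACT == "MODERATE" &
          (step2$MetaSVM_pred == "T" |
           as.numeric(as.character(step2$CADD_phred)) < 15))
keep[is.na(keep)] <- TRUE   # keep records whose scores are missing
step3 <- step2[keep, ]
# 4. Population allele frequency last: drop common variants (AF > 0.1 in any
#    population), as in Step 4 of the script above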