si-medbif / VEP_filter.R

Filtering SUDS variants from VEP output with R

Filter criteria suggestion #2

hypotheses opened this issue 5 years ago (Open)

hypotheses commented 5 years ago

Note krub, please help fix the order of the filters using the guideline below:

  1. Keep all known pathogenic variants from ClinVar.
  2. Filter on impact on gene function, using the VEP IMPACT prediction; keep only HIGH and MODERATE impact. This should reduce the number of variants drastically.
     2.1 Variants with LOW or MODIFIER impact might have functional roles, but we have no other supporting evidence at present; therefore, the weight we give to these variants as disease-causing will be lower than for variants with HIGH/MODERATE impact on function.
  3. Further filter the MODERATE-impact variants using prediction algorithms, namely MetaSVM and CADD.
     3.1 Keep all variants with (MetaSVM == "D" or "-") AND (CADD Phred scale >= 15).
  4. Further filter out known benign or likely benign variants, using CLIN_SIG.
  5. Further filter out common variants using frequency data from public databases, removing variants whose maximum allele frequency in any population is > 0.10. (Cancer germline variants might be relatively common, so I am not quite sure at the moment what the best cut-off frequency is.) A condensed R sketch of all five steps follows this list.
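
A condensed sketch of the five criteria above as a single dplyr filter (a hypothetical illustration, not the script itself): the column names IMPACT, CLIN_SIG, MetaSVM_pred and CADD_phred match the VEP output used in the code below, while max_af is an assumed precomputed column holding the maximum allele frequency across all populations.

library(dplyr)

filter_variants <- function(vep) {
  vep %>%
    mutate(known_patho = grepl("patho", CLIN_SIG)) %>%   # step 1: keep ClinVar pathogenic regardless of the later filters
    filter(
      known_patho |
        (IMPACT %in% c("HIGH", "MODERATE") &             # step 2: impact on gene function
           (IMPACT == "HIGH" |
              (MetaSVM_pred %in% c("D", "-") &
                 CADD_phred >= 15)) &                    # step 3: MetaSVM + CADD on MODERATE impact
           !grepl("ben", CLIN_SIG) &                     # step 4: drop benign / likely_benign
           (is.na(max_af) | max_af <= 0.10))             # step 5: drop common variants (assumed max_af column)
    )
}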

Comments: mainly, I filter on Impact early on to remove as many variants as possible based on the "Gene Structure" model, which relies the least on any prediction algorithm; the Impact classification is based on the known structural location within the gene and biological knowledge about how changes to the protein or gene may affect protein function. Regulatory-region variants might reduce or increase the gene expression level, but they will not completely disrupt the gene function. Hence, the impact of these regulatory variants is less severe than disruption of the protein function. However, regulatory variants can still cause disease. We usually consider them only when we have more evidence, such as evidence from association tests, e.g. GWAS.
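
For context, here is a small illustrative subset (drawn from the VEP documentation; the full consequence table is much longer) of how consequence terms map onto the four IMPACT tiers, which is why regulatory-region variants fall under MODIFIER:

# Illustrative subset of VEP consequence terms per IMPACT tier
impact_examples <- list(
  HIGH     = c("stop_gained", "frameshift_variant", "splice_donor_variant"),
  MODERATE = c("missense_variant", "inframe_deletion"),
  LOW      = c("synonymous_variant", "splice_region_variant"),
  MODIFIER = c("intron_variant", "regulatory_region_variant", "5_prime_UTR_variant")
)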

silenus092 commented 5 years ago

Aj. kub

Could you please check whether my code conforms to your guideline?

In the code, Steps 1 and 2 originally come from P'mint.

You can skip the setup part.

Setup part

library(doParallel)

# rbind two data frames on their shared columns: take the column set of the
# frame with fewer columns, then stack both frames restricted to those names
rbind.match.columns <- function(input1 = 0, input2 = 0) {
  n.input1 <- ncol(input1)
  n.input2 <- ncol(input2)
  if (is.null(n.input1)) {
    n.input1 <- 0
  }

  if (is.null(n.input2)) {
    n.input2 <- 0
  }

  if (n.input2 < n.input1) {
    TF.names <- which(names(input2) %in% names(input1))
    column.names <- names(input2[, TF.names, drop = FALSE])  # drop = FALSE keeps a one-column result a data frame
  } else {
    TF.names <- which(names(input1) %in% names(input2))
    column.names <- names(input1[, TF.names, drop = FALSE])
  }

  return(rbind(input1[, column.names], input2[, column.names]))
}
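
A quick sanity check with hypothetical toy frames, showing that only the shared columns survive the bind:

# Toy frames with overlapping columns; only a and b are common to both
df1 <- data.frame(a = 1:2, b = c("x", "y"), c = c(TRUE, FALSE))
df2 <- data.frame(a = 3:4, b = c("z", "w"))
rbind.match.columns(df1, df2)  # 4 rows, columns a and b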

# set up a parallel backend to use several processors
cores <- detectCores()
cl <- makeCluster(cores[1] - 3)  # leave some cores free so the machine stays responsive (e.g. 7 - 3 = 4, use only 4 cores)
registerDoParallel(cl)

filename <- "b212wgs_annotation_output.vep.chr17.hg19.gz"
index <- 0
counter <- 0
chunks <- 100000  # rows per chunk
con <- file(filename, "r")  # open a connection so successive read.table calls resume where the last one stopped
skip_headerLine <- 343      # number of VEP "##" meta lines before the column header
data_header <- names(read.csv(filename, nrows = 1, sep = "\t", skip = skip_headerLine, header = TRUE, check.names = FALSE))
ready_to_write <- data.frame(matrix(ncol = length(data_header), nrow = 0))
names(ready_to_write) <- data_header

Processing Part

eof <- FALSE  # set by the error handler when the connection runs dry
system.time(
repeat{

  tryCatch({
    vepdata <- read.table(con, nrows = chunks, header = FALSE, fill = FALSE, sep = "\t", comment.char = "#", check.names = FALSE)
    # do processing on the data chunk (i.e. adding the header, converting data types)

    colnames(vepdata) <- data_header
    # NOTE: i takes the single value nrow(vepdata), so the body runs once per chunk
    finalMatrix <- foreach(i = nrow(vepdata), .combine = cbind) %dopar% {

      # Step 1: recode IMPACT to numeric, then sort in descending order of severity
      vepdata$sort_impact <- as.integer(ifelse(vepdata$IMPACT == "LOW", "1",
                                               ifelse(vepdata$IMPACT == "MODIFIER", "2",
                                                      ifelse(vepdata$IMPACT == "MODERATE", "3",
                                                             ifelse(vepdata$IMPACT == "HIGH", "4", "")))))
      vepdata <- vepdata[order(-vepdata$sort_impact), ]

      filterimpact <- vepdata[which(vepdata$IMPACT == "HIGH" | vepdata$IMPACT == "MODERATE"), ]

      # Step 2: filter on clinical significance ("CLIN_SIG")
      ## If CLIN_SIG reports benign or likely_benign, exclude the variant from the dataset
      uncertain <- grepl("uncer", filterimpact$CLIN_SIG) # find "uncertain_significance"
      notpro <- grepl("pro", filterimpact$CLIN_SIG)      # find "not_provided"
      benign <- grepl("ben", filterimpact$CLIN_SIG)      # find "benign" or "likely_benign"
      patho <- grepl("patho", filterimpact$CLIN_SIG)     # find "pathogenic" or "likely_pathogenic"
      na <- grepl("-", filterimpact$CLIN_SIG)            # find "-" (no ClinVar annotation)
      select_clinsig <- patho | ((uncertain | na) & !notpro & !benign) # logical statement to filter out benign and likely benign
      clinsig <- cbind.data.frame(filterimpact, select_clinsig)
      clinsig <- clinsig[which(clinsig$select_clinsig), ]

      # Step 3: filter MODERATE-impact records by MetaSVM (D = Deleterious, T = Tolerable) and CADD
      ## If MetaSVM reports T, exclude the variant from the data

      # Convert CADD_phred to numeric (via character, so factor levels are not turned into level codes)
      clinsig$CADD_phred <- as.numeric(as.character(clinsig$CADD_phred))
      # Keep all (MetaSVM == "D" or "-") AND (CADD Phred scale >= 15)
      temp_data <- clinsig[!(clinsig$IMPACT == "MODERATE" & (clinsig$CADD_phred < 15)), ]
      temp_data <- temp_data[!(temp_data$IMPACT == "MODERATE" & (temp_data$MetaSVM_pred == "T")), ]

      # Step 4: filter out common variants
      # select the allele-frequency (AF) columns
      afdata <- temp_data[, grep("^AF$|_AF", names(temp_data))]
      col <- c("AF", "AFR_AF", "AMR_AF", "EAS_AF", "EUR_AF", "SAS_AF", "AA_AF", "EA_AF", "gnomAD_AF", "gnomAD_AFR_AF",
               "gnomAD_AMR_AF", "gnomAD_ASJ_AF", "gnomAD_EAS_AF", "gnomAD_FIN_AF", "gnomAD_NFE_AF", "gnomAD_OTH_AF",
               "gnomAD_SAS_AF", "1000Gp3_AF", "1000Gp3_AFR_AF", "1000Gp3_AMR_AF", "1000Gp3_EUR_AF", "1000Gp3_SAS_AF",
               "ESP6500_AA_AF", "ESP6500_EA_AF", "ExAC_AF", "ExAC_AFR_AF",
               "ExAC_AMR_AF", "ExAC_Adj_AF", "ExAC_EAS_AF", "ExAC_FIN_AF", "ExAC_NFE_AF", "ExAC_SAS_AF")
      afdata[col] <- sapply(afdata[col], function(x) as.numeric(as.character(x)))

      # Filter out common variants whose allele frequency in any population is greater than 0.1
      afdata <- afdata <= 0.1
      afdata[is.na(afdata)] <- TRUE  # missing frequencies count as "keep"
      mafdata <- temp_data[which(apply(as.data.frame(afdata), 1, all)), ]

      mafdata # the loop's last value becomes finalMatrix (returning temp_data here would discard step 4)
    }

    ready_to_write <- rbind.match.columns(ready_to_write, finalMatrix)

    # check whether the end of the file has been reached and break out of repeat
    if(nrow(vepdata) < chunks){
      break
    }

    # increment the index to read the next chunk
    index <- index + 1
    print(paste('Processing rows:', index * chunks))
  }, error = function(err) {
    ## matching on the condition message only works when the message is not translated
    if (identical(conditionMessage(err), "no lines available in input"))
      eof <<- TRUE  # break is not allowed inside a handler function, so set a flag instead
    else stop(err)
  })
  if (eof) break
}
)
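
As a sanity check on the step-4 logic, here is a toy run with made-up numbers of the elementwise comparison plus apply(..., 1, all) pattern used above:

# Toy allele-frequency table: row 2 is common and should be dropped
af <- data.frame(AF = c(0.005, 0.30, NA), gnomAD_AF = c(0.002, 0.25, 0.01))
keep <- af <= 0.1           # elementwise logical matrix
keep[is.na(keep)] <- TRUE   # missing frequencies count as "keep"
which(apply(keep, 1, all))  # returns 1 and 3; row 2 exceeds 0.1 in both columns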

Clean and Write output

## Remove rows that are NA in every column (an unexpected event)
ready_to_write <- ready_to_write[rowSums(is.na(ready_to_write)) != ncol(ready_to_write), ]

write.csv(ready_to_write, "filter.v2.csv", na = "", row.names = FALSE)
close(con)

#stop cluster
stopCluster(cl)

Thank you kub, K2

hypotheses commented 5 years ago

Changes added to VEP_filter.Chunky.bwgs.R