envmetagen / metabinkit

Set of programs to perform taxonomic binning.
GNU General Public License v3.0
2 stars 1 forks source link

-taxidlist option #8

Closed bastianegeter closed 4 years ago

bastianegeter commented 4 years ago

Perhaps a personal wish, but I often use the -taxidlist option to limit the blast to a certain taxonomical section of the database (and we used it for mussels). I see you have the negative taxids option, but not the "positive" option. Suggest adding this functionality.

example of previous blast function, in case it helps

> blast.min.bas2
function(infasta,refdb,blast_exec="blastn",wait=T,taxidlimit=NULL,inverse=F,ncbiTaxDir=NULL,overWrite=F,out=NULL,
                         opts=c("-task","blastn","-outfmt", "6 qseqid evalue pident qcovs saccver staxid ssciname sseq","-num_threads", 64,
                                "-max_target_seqs", 100, "-max_hsps",1,"-word_size", 11, "-perc_identity", 50,
                                "-qcov_hsp_perc", 98, "-gapopen", 0, "-gapextend", 2, "-reward", 1, "-penalty", -1)){

  t1<-Sys.time()

  require(processx)

  if(!is.null(taxidlimit)) if(is.null(ncbiTaxDir)) stop("to use taxidlimit, ncbiTaxDir must be supplied")
  if(is.null(out)) out<-paste0(gsub(".fasta", ".blast.txt",infasta))

  outdircheck<-

  if(overWrite==F) if(file.exists(out)) stop("The following file already exists ", out, "Use overWrite=T to overwrite")

  if(!is.null(taxidlimit)){

    h<-list()

    #generate children of taxids and store in file
    taxid.list<-list()

    taxids_fileA<-paste0("taxids",as.numeric(Sys.time()),".txt")

    for(i in 1:length(taxidlimit)){
      system2(command = "taxonkit",args = c("list", "--ids", taxidlimit[i], "--indent", '""',"--data-dir",ncbiTaxDir)
              ,wait=T,stdout = taxids_fileA)
      taxid.list[i]<-read.table(taxids_fileA)
    }

    write.table(unlist(taxid.list),taxids_fileA,row.names = F,quote = F,col.names = F,)

    #change options to include taxidlimit
    opts<-c(opts,"-taxidlist",taxids_fileA)

    if(inverse) opts<-gsub("-taxidlimit","-negative_taxids",opts)
  }

  #run BLAST  

  error.log.file<-paste0("blast.error.temp.processx.file",as.numeric(Sys.time()),".txt")

  h<-process$new(command = blast_exec, args=c("-query", infasta, "-db",refdb,opts, "-out", out),echo_cmd = T,
                 stderr = error.log.file)

  Sys.sleep(time = 2)

  #report PID
  message(paste("PID:",h$get_pid()))

  #check immediate exit status
  exits<-h$get_exit_status()

  if(1 %in% exits){
    message("************
             There was a problem with ", infasta[match(1,exits)], ", aborting blast
             ************")
    print(grep("Error",readLines(error.log.file),value = T))

    h$kill()
  }

  if(wait==T){
    h$wait()
    message(readLines(error.log.file))
    message("exit_status=",exits)
    file.remove(error.log.file)
    if(!is.null(taxidlimit)) file.remove(taxids_fileA)
  }

  headers<-paste0(paste("'1i",paste(unlist(strsplit(opts[match("-outfmt",opts)+1]," "))[-1],collapse = "\t"),collapse = "\t"),"'")

  system2("sed",c("-i", headers, out),wait = T)

  t2<-Sys.time()
  t3<-round(difftime(t2,t1,units = "mins"),digits = 2)

  message(c("All blasts complete in ",t3," mins."))

  return(h)
}
nunofonseca commented 4 years ago

Implemented.