Open duzc-Repos opened 5 years ago
Hello there,
Sorry for the delayed reply. To speed things up, you can split your data into multiple parts. Say you have an 8-core CPU: you can divide your job into 8 subjobs. Some example code is below. Don't run it in RStudio, though.
library('cleanUpdTSeq')
library("future")
library("future.apply")
library("parallelly")
# Read the input and output paths from the command line.
# `Args` keeps the full argument vector, as before. The two paths are taken
# from the trailing arguments (those supplied after the script name), so they
# are found no matter how many interpreter options precede them; the previous
# code relied on them sitting at positions 6 and 7 of the full vector.
Args <- commandArgs()
script_args <- commandArgs(trailingOnly = TRUE)
if (length(script_args) < 2L) {
  stop("Usage: Rscript <script.R> <input_file> <output_file>", call. = FALSE)
}
input_file <- script_args[1]
output_file <- script_args[2]
# cat(Args)

# Choose the parallel backend: separate background R sessions on Windows,
# the multicore backend everywhere else.
if (.Platform$OS.type == "windows") {
  plan(multisession)
} else {
  plan(multicore)
}

# Load the tab-separated input table; its first line holds the column names.
testSet <- read.table(input_file, sep = "\t", header = TRUE)

# Assign each row a group id and split the table into a list of chunks.
# seq_len() is used instead of 1:nrow() so that an empty table yields no
# groups rather than the sequence c(1, 0).
# NOTE(review): dividing the row index by (availableCores() - 1) produces
# groups of that many ROWS each, not that many groups -- confirm this is the
# intended chunking. When only one core is available the divisor is 0, every
# id becomes Inf, and all rows end up in a single group.
split_group <- ceiling(seq_len(nrow(testSet)) / (availableCores() - 1))
testSet_list <- split(testSet, split_group)
# Run the cleanUpdTSeq prediction steps on one chunk of the input table.
#
# testSet: a slice of the table read from the input file. It is passed to
#   BED2GRangesSeq() with columns 7 and 8 given as the upstream and downstream
#   sequence columns.
# Returns (invisibly) the result of predictTestSet() for this chunk.
parallel_cleanUpdTSeq <- function(testSet) {
  # Convert the rows to ranges, taking the flanking sequences from the table.
  chunk_peaks <- BED2GRangesSeq(
    testSet,
    upstream.seq.ind = 7,
    downstream.seq.ind = 8,
    withSeq = TRUE
  )

  # Build the feature vector for the NaiveBayes method. fetchSeq = FALSE is
  # passed because the sequences were already supplied with the peaks above.
  chunk_features <- buildFeatureVector(
    chunk_peaks,
    upstream = 40,
    downstream = 30,
    wordSize = 6,
    alphabet = c("ACGT"),
    sampleType = "unknown",
    replaceNAdistance = 30,
    method = "NaiveBayes",
    ZeroBasedIndex = 1,
    fetchSeq = FALSE
  )

  # Load the `classifier` dataset so it can be handed to predictTestSet().
  data(classifier)

  # outputFile = NULL: the predictions are returned to the caller, which
  # combines the chunks and writes the final file itself.
  chunk_predictions <- predictTestSet(
    testSet.NaiveBayes = chunk_features,
    classifier = classifier,
    outputFile = NULL,
    assignmentCutoff = 0.5
  )
  invisible(chunk_predictions)
}
# Classify every chunk in parallel. `future.chunk.size` sets how many list
# elements each future processes. The argument name must be spelled exactly:
# a misspelled name is not recognised by future_lapply() and is forwarded to
# parallel_cleanUpdTSeq() instead, which fails with an "unused argument" error.
test_result <- future_lapply(testSet_list, parallel_cleanUpdTSeq,
                             future.chunk.size = 10)

# Stack the per-chunk results into one table. The combined table is stored
# under the name the filtering step below reads (`test_results`).
test_results <- do.call("rbind", test_result)

# Keep column 1 for the rows whose value in column 2 is below 0.5.
# NOTE(review): presumably column 1 is the peak name and column 2 the
# probability of a false (internally primed) site -- confirm against the
# predictTestSet() output.
test_results <- test_results[test_results[[2]] < 0.5, 1]

# Write the retained values, one per line, without quotes or headers.
write.table(test_results, output_file,
            quote = FALSE, row.names = FALSE, col.names = FALSE)
Recently, I used cleanUpdTSeq to filter out internal priming events. It runs well on test data, but on real data (PAS-seq, polyA-seq, 3'-seq, etc.) it is very slow and takes nearly a day per sample. Is there any way to speed this up? The following is the code I am running:
```r
library('cleanUpdTSeq')
Args <- commandArgs()
input_file <- Args[6]
output_file <- Args[7]
cat(Args)
testSet <- read.table(input_file, sep = "\t", header = T)
peaks <- BED2GRangesSeq(testSet, upstream.seq.ind=7, downstream.seq.ind=8, withSeq=TRUE)
testSet.NaiveBayes <- buildFeatureVector(peaks, upstream=40, downstream=30, wordSize=6, alphabet=c("ACGT"), sampleType="unknown", replaceNAdistance=30, method="NaiveBayes", ZeroBasedIndex=1, fetchSeq=FALSE)
data(classifier)
test_results <- predictTestSet(testSet.NaiveBayes=testSet.NaiveBayes, classifier=classifier, outputFile=NULL, assignmentCutoff=0.5)
test_results <- test_results[test_results[2] < 0.5, 1]
write.table(test_results, output_file, quote = F, row.names = F, col.names = F)
```
Thank you very much and looking forward to your reply