jianhong / cleanUpdTSeq

This package classifies putative polyadenylation sites as true or false/internally oligodT primed

How to speed up? #1

Open · duzc-Repos opened this issue 5 years ago

duzc-Repos commented 5 years ago

Recently, I have been using cleanUpdTSeq to filter out internally primed events. It runs well on the test data, but on real data (PAS-seq, PolyA-seq, 3'-seq, etc.) it is very slow, taking nearly a day per sample. Is there any way to speed this up? The code I am running is below.

library('cleanUpdTSeq')

Args <- commandArgs()
input_file <- Args[6]
output_file <- Args[7]

cat(Args)

testSet <- read.table(input_file, sep = "\t", header = TRUE)
peaks <- BED2GRangesSeq(testSet, upstream.seq.ind = 7,
                        downstream.seq.ind = 8, withSeq = TRUE)
testSet.NaiveBayes <- buildFeatureVector(peaks, upstream = 40, downstream = 30,
                                         wordSize = 6, alphabet = c("ACGT"),
                                         sampleType = "unknown", replaceNAdistance = 30,
                                         method = "NaiveBayes", ZeroBasedIndex = 1,
                                         fetchSeq = FALSE)

data(classifier)
test_results <- predictTestSet(testSet.NaiveBayes = testSet.NaiveBayes,
                               classifier = classifier,
                               outputFile = NULL,
                               assignmentCutoff = 0.5)

test_results <- test_results[test_results[[2]] < 0.5, 1]
write.table(test_results, output_file, quote = FALSE, row.names = FALSE, col.names = FALSE)

Thank you very much, and I look forward to your reply.

haibol2016 commented 2 years ago

Hello there,

Sorry for the delayed reply. To speed things up, you can split your data into multiple parts and classify them in parallel. Say you have an 8-core machine: you can divide the job into that many subjobs. Some example code is below (it leaves one core free for the main R process). Don't run it in RStudio, though.

library('cleanUpdTSeq')
library("future")
library("future.apply")
library("parallelly")

Args <- commandArgs()
input_file <- Args[6]
output_file <- Args[7]
#cat(Args)

# forked (multicore) workers are not supported on Windows, so use
# separate R sessions (multisession) there
if (.Platform$OS.type == "windows") {
    plan(multisession)
} else {
    plan(multicore)
}

testSet <- read.table(input_file, sep = "\t", header = TRUE)

# split the peaks into roughly equal-sized chunks, one per worker,
# leaving one core free for the main R process
n_workers <- max(1, availableCores() - 1)
split_group <- ceiling(seq_len(nrow(testSet)) / ceiling(nrow(testSet) / n_workers))

testSet_list <- split(testSet, split_group)

# classify one chunk of peaks with the pre-built Naive Bayes classifier
parallel_cleanUpdTSeq <- function(testSet)
{
    peaks <- BED2GRangesSeq(testSet, upstream.seq.ind = 7,
                            downstream.seq.ind = 8, withSeq = TRUE)
    testSet.NaiveBayes <- buildFeatureVector(peaks,
                                             upstream = 40, downstream = 30,
                                             wordSize = 6, alphabet = c("ACGT"),
                                             sampleType = "unknown",
                                             replaceNAdistance = 30,
                                             method = "NaiveBayes",
                                             ZeroBasedIndex = 1, fetchSeq = FALSE)

    data(classifier)
    # the last expression is the chunk's result, returned to future_lapply
    predictTestSet(testSet.NaiveBayes = testSet.NaiveBayes,
                   classifier = classifier,
                   outputFile = NULL,
                   assignmentCutoff = 0.5)
}

# run the chunks in parallel and combine the per-chunk results
test_results <- future_lapply(testSet_list, parallel_cleanUpdTSeq,
                              future.chunk.size = 1)
test_results <- do.call("rbind", test_results)

# keep peaks whose probability of being a false/internally primed site is below 0.5
test_results <- test_results[test_results[[2]] < 0.5, 1]
write.table(test_results, output_file, quote = FALSE,
            row.names = FALSE, col.names = FALSE)
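
For what it's worth, here is how the arguments line up when the script is run from the command line (the script and file names are just placeholders): with something like Rscript parallel_cleanUpdTSeq.R input_peaks.bed true_pA_peaks.txt, commandArgs() typically returns the path to R, Rscript's own flags, --args, and then the user arguments, so positions 6 and 7 hold the input and output paths, matching Args[6] and Args[7] above. A slightly more defensive sketch uses trailingOnly = TRUE so the positions do not depend on how many flags Rscript prepends:

# a minimal sketch, not part of the original script: read only the
# user-supplied arguments, regardless of what Rscript adds in front
args <- commandArgs(trailingOnly = TRUE)  # e.g. c("input_peaks.bed", "true_pA_peaks.txt")
stopifnot(length(args) == 2)
input_file  <- args[1]
output_file <- args[2]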