Open cybog337 opened 5 years ago
Hi Chul,
The code in TCGAbiolinks is extremely old. Below is a code example that should be faster and use less memory. And If you have enough samples, you probably can run a t.test
which should be faster than the wilcoxon
.
nrows <- 200; ncols <- 400
met <- matrix(runif(nrows * ncols, 0, 1), nrows)
rowRanges <- GenomicRanges::GRanges(rep(c("chr1"),nrows),
IRanges::IRanges(floor(runif(nrows, 1e5, 1e6)), width=100),
strand=sample(c("+", "-"), nrows, TRUE),
feature_id=sprintf("ID%03d", 1:nrows))
names(rowRanges) <- paste0("probe_",1:nrows)
colData <- S4Vectors::DataFrame(row.names = paste0("sample_",1:400),
Treatment=rep(c("ChIP", "Input"), 200),
group=rep(c("group1","group2"),c(200,200)))
data <- SummarizedExperiment::SummarizedExperiment(
assays=S4Vectors::SimpleList(met=met),
rowRanges=rowRanges,
colData=colData)
#--------------------------------------------
# wilcoxon test
#--------------------------------------------
# with progress bar and time
group1.idx <- which(data$group == "group1")
group2.idx <- which(data$group == "group2")
results <- plyr::adply(.data = assay(data),.margins = 1, .fun = function(probe){
result <- stats::wilcox.test(probe[group1.idx],probe[group2.idx],conf.int = TRUE)
data.frame("raw_p_value" = result$p.value, "diff_mean_group1_minus_group2" = result$estimate)
}, .progress = "time",.parallel = FALSE,.id = "probe")
cores <- parallel::detectCores()
doParallel::registerDoParallel(cores)
group1.idx <- which(data$group == "group1")
group2.idx <- which(data$group == "group2")
results <- plyr::adply(.data = assay(data),.margins = 1, .fun = function(probe){
result <- stats::wilcox.test(probe[group1.idx],probe[group2.idx],conf.int = TRUE)
data.frame("raw_p_value" = result$p.value, "diff_mean_group1_minus_group2" = result$estimate)
}, .progress = "time",.parallel = TRUE,.id = "probe")
#--------------------------------------------
# t-test
#--------------------------------------------
group1.idx <- which(data$group == "group1")
group2.idx <- which(data$group == "group2")
results <- plyr::adply(.data = assay(data),.margins = 1, .fun = function(probe){
result <- stats::t.test(probe[group1.idx],probe[group2.idx],conf.int = TRUE)
data.frame("raw_p_value" = result$p.value,
"mean_group1" = result$estimate[1],
"mean_group2" = result$estimate[2],
"diff_mean_group1_minus_group2" = result$estimate[1] - result$estimate[2])
}, .progress = "time",.parallel = FALSE, .id = "probe")
Thank you very much for your kind and fast reply. I will apply the codes. Have a happy day. Best regards. Chul Kim
2019년 6월 17일 (월) 오후 11:34, Tiago Chedraoui Silva notifications@github.com님이 작성:
Hi Chul,
The code in TCGAbiolinks is extremely old. Below is a code example that should be faster and use less memory. And If you have enough samples, you probably can run a t.test which should be faster than the wilcoxon.
nrows <- 200; ncols <- 400 met <- matrix(runif(nrows * ncols, 0, 1), nrows) rowRanges <- GenomicRanges::GRanges(rep(c("chr1"),nrows), IRanges::IRanges(floor(runif(nrows, 1e5, 1e6)), width=100), strand=sample(c("+", "-"), nrows, TRUE), featureid=sprintf("ID%03d", 1:nrows)) names(rowRanges) <- paste0("probe",1:nrows) colData <- S4Vectors::DataFrame(row.names = paste0("sample_",1:400), Treatment=rep(c("ChIP", "Input"), 200), group=rep(c("group1","group2"),c(200,200))) data <- SummarizedExperiment::SummarizedExperiment( assays=S4Vectors::SimpleList(met=met), rowRanges=rowRanges, colData=colData)
--------------------------------------------
wilcoxon test
--------------------------------------------
with progress bar and time
group1.idx <- which(data$group == "group1") group2.idx <- which(data$group == "group2") results <- plyr::adply(.data = assay(data),.margins = 1, .fun = function(probe){ result <- stats::wilcox.test(probe[group1.idx],probe[group2.idx],conf.int = TRUE) data.frame("raw_p_value" = result$p.value, "diff_mean_group1_minus_group2" = result$estimate) }, .progress = "time",.parallel = FALSE,.id = "probe")
cores <- parallel::detectCores() doParallel::registerDoParallel(cores)
group1.idx <- which(data$group == "group1") group2.idx <- which(data$group == "group2") results <- plyr::adply(.data = assay(data),.margins = 1, .fun = function(probe){ result <- stats::wilcox.test(probe[group1.idx],probe[group2.idx],conf.int = TRUE) data.frame("raw_p_value" = result$p.value, "diff_mean_group1_minus_group2" = result$estimate) }, .progress = "time",.parallel = TRUE,.id = "probe")
--------------------------------------------
t-test
--------------------------------------------
group1.idx <- which(data$group == "group1") group2.idx <- which(data$group == "group2") results <- plyr::adply(.data = assay(data),.margins = 1, .fun = function(probe){ result <- stats::t.test(probe[group1.idx],probe[group2.idx],conf.int = TRUE) data.frame("raw_p_value" = result$p.value, "mean_group1" = result$estimate[1], "mean_group2" = result$estimate[2], "diff_mean_group1_minus_group2" = result$estimate[1] - result$estimate[2]) }, .progress = "time",.parallel = FALSE, .id = "probe")
— You are receiving this because you authored the thread. Reply to this email directly, view it on GitHub https://github.com/BioinformaticsFMRP/TCGAbiolinks/issues/322?email_source=notifications&email_token=AML65T2P6AHI3JWFYXA6BWTP26OHPA5CNFSM4HYTJO6KYY3PNVWWK3TUL52HS4DFVREXG43VMVBW63LNMVXHJKTDN5WW2ZLOORPWSZGODX3LVWY#issuecomment-502708955, or mute the thread https://github.com/notifications/unsubscribe-auth/AML65TYTUA2JFGVBJBFOJKTP26OHPANCNFSM4HYTJO6A .
Dear TCGAbiolinks team, When analyze differential methylation with large data set (~400 cases), the system memory (128Gbyte) is fully exhausted and the os system work very slowly. Would you like to say any possible suggestion to solve it? Best regards, Chul Kim.