Hi Thom!
Regarding the issue of large memory usage, I managed to implement what you proposed to me in the following pseudocode:
Alternatively, you could break down the whole propr + FDR routine into chunks. propr(Ai, Aj) does not depend on Ak so long as Ai,Aj,Ak are already CLR transformed. The pseudo-code might look like this:
# CLR transform all data right away
# define chunks (see attached propr.dopar script)
# for each chunk:
# myCLR_chunk = subset chunk from CLR transformed data
# pr_chunk = propr(myCLR_chunk, ivar = NA)
# fdr_chunk = updateCutoffs(pr_chunk)
# save whatever information you want
# end for
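As a quick sanity check of that independence claim, on a small toy matrix the rho values between two features should come out the same whether or not the remaining features are included, provided the CLR transform was done on the full data first. A minimal sketch, following the same propr(..., ivar = NA) convention used in the functions below (counts here stands for any small samples-by-features count matrix):

# sanity check: propr on a column subset of CLR data matches the full result
clr <- propr:::proprCLR(counts)                          # CLR transform the full data once
pr_full <- propr(clr, metric = "rho", ivar = NA)         # all features
pr_sub  <- propr(clr[, 1:10], metric = "rho", ivar = NA) # a subset of features
all.equal(pr_full@matrix[1:10, 1:10], pr_sub@matrix, check.attributes = FALSE) # should be TRUE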
So now I am able to use propr on large datasets without running out of memory. Thanks a lot for the help!
I'm posting the functions here:
propr.chunk <- function(counts, metric = c("rho", "phi", "phs", "cor", "vlr"),
                        ivar = NA, symmetrize = FALSE, alpha = NA, p = 100, fdr = 0.05,
                        n = 100, ncores = 1, interval = seq(0.3, 0.95, 0.01),
                        dir = NA){
  # divide the data into chunks
  # I recommend using a large chunk size n, otherwise dividing the data
  # into too many chunks will slow down the computation
  l <- blocks2combs(counts, n)
  combs <- l[[1]]
  split <- l[[2]]
  # parallelize the propr computation over chunks
  doMC::registerDoMC(cores = ncores)
  `%dopar%` <- foreach::`%dopar%`
  RES <- foreach::foreach(i = 1:nrow(combs)) %dopar% {
    # get chunk
    batch1 <- split[[combs[i, 1]]]
    batch2 <- split[[combs[i, 2]]]
    chunk <- subset(counts, select = c(batch1, batch2))
    # compute propr and the per-chunk FDR
    l_propr <- chunk2propr(i, chunk, metric = metric, ivar = ivar, symmetrize = symmetrize,
                           alpha = alpha, p = p, interval = interval, fdr = fdr)
    if(is.na(dir)){
      # return the propr matrix and the FDR table
      l_propr
    }else{
      # save the propr matrix to disk if a directory is given
      file2 <- paste0(dir, "/job-", combs[i, 1], "+", combs[i, 2], ".csv")
      print(paste0("----saving tmp file[", i, "] to ", file2))
      write.csv(l_propr[[1]], file = file2)
      # return only the cutoff and FDR
      list(NULL, l_propr[[2]])
    }
  }
  # collect the matrices saved to disk
  if(!is.na(dir)){
    RES <- file2res(RES, combs, dir)
  }
  # merge the chunks into the full result
  l_propr <- chunk2full(counts, RES, split, combs, fdr)
  return(l_propr)
}
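One portability note: doMC relies on forking, so the registerDoMC() call above only works on Unix-like systems. On Windows (or if you prefer a socket cluster), the same foreach loop should run with doParallel instead; a rough sketch of the alternative registration, not tested here (with a socket cluster you may also need .packages = "propr" in the foreach() call so the workers can see propr and updateCutoffs):

# alternative parallel backend, replacing doMC::registerDoMC(cores = ncores)
cl <- parallel::makeCluster(ncores)
doParallel::registerDoParallel(cl)
# ... run the foreach(...) %dopar% { ... } loop as above ...
parallel::stopCluster(cl)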
blocks2combs <- function(counts, n){
  # define blocks of at most n columns each
  # each chunk pairs two blocks, so a chunk holds at most 2*n columns
  nblocks <- ncol(counts) %/% n
  ngroup <- rep(1:nblocks, each = n)
  leftover <- ncol(counts) - length(ngroup)
  if(leftover > 0) ngroup <- c(ngroup, rep(nblocks + 1, leftover))
  # check size
  # stop the computation if no more than 2 groups are generated (i.e. only 1 chunk);
  # if this happens, choose a smaller n
  if(length(unique(ngroup)) <= 2){
    stop(paste0("ERROR: chunk size ", n, " is too big for a data frame of size [",
                nrow(counts), ", ", ncol(counts), "]"))
  }
  # split the column indices into groups
  # each row in combs defines a chunk (a pair of groups)
  split <- split(1:ncol(counts), ngroup)
  combs <- expand.grid(1:length(split), 1:length(split))
  combs <- t(apply(combs, 1, sort))
  combs <- unique(combs)
  combs <- combs[combs[, 1] != combs[, 2], ]
  print(paste0("----running propr for ", nrow(combs), " chunks (block size ", n, ")"))
  return(list(combs, split))
}
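For choosing n it helps to know how many chunks you are signing up for: with g column groups, propr.chunk runs choose(g, 2) propr calls, each on a chunk of up to 2*n columns. A hypothetical helper that just mirrors the arithmetic in blocks2combs:

# hypothetical helper: number of chunks blocks2combs() will produce for a given n
chunks_for_n <- function(ncols, n){
  ngroups <- ncols %/% n + as.integer(ncols %% n > 0)
  choose(ngroups, 2)
}
chunks_for_n(50000, 5000) # 45 chunks, each with up to 10000 columns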
chunk2propr <- function(i, chunk, metric = "rho", ivar = NA, symmetrize = FALSE,
                        alpha = NA, p = 100, interval = seq(0.4, 0.95, 0.01), fdr = 0.05){
  print(i)
  # compute propr for this chunk
  rho.i <- propr(chunk, metric = metric, ivar = ivar, symmetrize = symmetrize,
                 alpha = alpha, p = p)
  rho.i <- updateCutoffs(rho.i, interval)
  # extract the cutoff | FDR table
  df <- data.frame('cutoff' = rho.i@fdr[, 'cutoff'], 'FDR' = rho.i@fdr[, 'FDR'])
  return(list(rho.i@matrix, df))
}
chunk2full <- function(counts, RES, split, combs, fdr){
  # define variables
  QUILT <- matrix(0, ncol(counts), ncol(counts))
  d <- data.frame('cutoff' = RES[[1]][[2]][, 'cutoff'])
  # collect chunks
  for(i in 1:nrow(combs)){
    # add the FDR column of chunk i
    d[paste0('FDR', i)] <- RES[[i]][[2]][, 'FDR']
    # fill the final matrix with each chunk
    batch1 <- split[[combs[i, 1]]]
    batch2 <- split[[combs[i, 2]]]
    patch.i <- c(batch1, batch2)
    QUILT[patch.i, patch.i] <- RES[[i]][[1]]
  }
  # average the FDR across chunks
  df <- data.frame('cutoff' = d[, 'cutoff'],
                   'FDR' = round(rowSums(d[, 2:ncol(d)]) / nrow(combs), 4))
  # smallest cutoff whose average FDR is below the target
  cutoff <- min(df[df[, "FDR"] < fdr, "cutoff"])
  # rename columns & rows
  mat <- QUILT
  rownames(mat) <- colnames(counts)
  colnames(mat) <- colnames(counts)
  return(list(mat, cutoff, df))
}
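Before trusting the selected cutoff, the chunk-averaged cutoff-vs-FDR table (the third element of the returned list) can be inspected directly; a small sketch, assuming ch is the list returned by propr.chunk as in the usage example further down:

# inspect the averaged FDR curve and the chosen cutoff
df <- ch[[3]]
head(df)
plot(df$cutoff, df$FDR, type = "b", xlab = "cutoff", ylab = "average FDR")
abline(v = ch[[2]], lty = 2) # cutoff selected at the target FDR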
file2res <- function(RES, combs, dir){
  print(paste0("----collecting ", nrow(combs), " chunk files previously saved in ", dir))
  for(i in 1:nrow(combs)){
    file2 <- paste0(dir, "/job-", combs[i, 1], "+", combs[i, 2], ".csv")
    csv <- read.csv(file2, row.names = 1, header = TRUE)
    RES[[i]][[1]] <- data.matrix(csv)
  }
  # remove the tmp directory and its files
  unlink(dir, recursive = TRUE)
  return(RES)
}
So, to run these functions and get the final propr results, you can just do:
# CLR transformation of the raw counts matrix M (samples x features)
clr <- propr:::proprCLR(M)
# run the proportionality analysis per chunk
ch <- propr.chunk(clr, metric = "rho", ivar = NA, alpha = NA, ncores = 4, p = 20, n = 5000,
                  interval = seq(0.3, 0.95, 0.01), dir = "/path/to/tmpdir")
matrix <- ch[[1]]
cutoff <- ch[[2]]
fdr <- ch[[3]]
# organize the results data frame
labels <- propr:::labRcpp(ncol(clr))
# lrv <- propr:::lr2vlr(clr) # somehow this step does not work for me
results <-
  data.frame(
    "Partner" = labels[[1]],
    "Pair" = labels[[2]],
    # "lrv" = propr:::lltRcpp(lrv),
    # "metric" = factor(metric),
    # "alpha" = factor(alpha),
    "propr" = propr:::lltRcpp(matrix)
  )
# keep only the proportional pairs
mypairs <- which(results[, "propr"] >= cutoff)
results <- results[mypairs, ] # this is the final result: the pairs with propr >= cutoff
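If you want feature names rather than the integer indices that labRcpp puts in Partner and Pair, they can be mapped back from the column names of the CLR matrix; a sketch, assuming clr kept the original column names (PartnerName and PairName are just illustrative new columns):

# map the Partner/Pair indices back to feature names (hypothetical column names)
results$PartnerName <- colnames(clr)[results$Partner]
results$PairName <- colnames(clr)[results$Pair]
head(results)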