While looking at qsep.R I stumbled across the following lines:
## mean distance between all pairs of subcellular marker clusters
tmp <- apply(expand.grid(um, um), 1,
function(.um) {
sel1 <- fData(mobj)[, fcol] == .um[1]
sel2 <- fData(mobj)[, fcol] == .um[2]
ans[.um[1], .um[2]] <<- mean(mrkdist[sel1, sel2],
na.rm = TRUE)
})
Sorry, but I can't accept the use of the <<- operator. :smiley:
IMHO the following is much cleaner and avoids a few comparisons (and would be faster):
n <- length(um)
sel <- lapply(um, "==", markers)
ans <- matrix(NA_real_, nrow=n, ncol=n, dimnames=list(um, um))
for (i in 1L:n) {
for (j in 1L:n) {
ans[i, j] <- mean(x[sel[[i]], sel[[j]]], na.rm=TRUE)
}
}
ans
Here a little comparison and benchmark:
load_all()
library(pRolocdata)
data(hyperLOPIT2015)
library(rbenchmark)
QSep1 <- function(object, fcol = "markers") {
objname <- MSnbase:::getVariableName(match.call(), "object")
## only consider markers
mobj <- markerMSnSet(object)
## vector of markers
um <- unique(getMarkers(mobj, fcol = fcol, verbose = FALSE))
## answer is a square matrix
ans <- diag(length(um))
colnames(ans) <- rownames(ans) <- um
## euclidean distance between all markers
mrkdist <- dist(exprs(mobj))
mrkdist <- as.matrix(mrkdist)
diag(mrkdist) <- NA
## mean distance between all pairs of subcellular marker clusters
tmp <- apply(expand.grid(um, um), 1,
function(.um) {
sel1 <- fData(mobj)[, fcol] == .um[1]
sel2 <- fData(mobj)[, fcol] == .um[2]
ans[.um[1], .um[2]] <<- mean(mrkdist[sel1, sel2],
na.rm = TRUE)
})
res <- .QSep(x = ans,
xnorm = ans / diag(ans),
object = objname)
if (validObject(res)) res
}
QSep2 <- function(object, fcol = "markers") {
objname <- MSnbase:::getVariableName(match.call(), "object")
## only consider markers
mobj <- markerMSnSet(object)
## vector of markers
markers <- getMarkers(mobj, fcol = fcol, verbose = FALSE)
## euclidean distance between all markers
mrkdist <- dist(exprs(mobj))
mrkdist <- as.matrix(mrkdist)
diag(mrkdist) <- NA
ans <- .meanMarkerDist(mrkdist, markers)
res <- .QSep(x = ans,
xnorm = ans / diag(ans),
object = objname)
if (validObject(res)) res
}
QSep3 <- function(object, fcol = "markers") {
objname <- MSnbase:::getVariableName(match.call(), "object")
## only consider markers
mobj <- markerMSnSet(object)
## vector of markers
markers <- getMarkers(mobj, fcol = fcol, verbose = FALSE)
## euclidean distance between all markers
mrkdist <- dist(exprs(mobj))
mrkdist <- as.matrix(mrkdist)
diag(mrkdist) <- NA
ans <- .meanMarkerDist2(mrkdist, markers)
res <- .QSep(x = ans,
xnorm = ans / diag(ans),
object = objname)
if (validObject(res)) res
}
.meanMarkerDist <- function(x, markers) {
um <- unique(markers)
n <- length(um)
sel <- lapply(um, "==", markers)
ans <- matrix(NA_real_, nrow=n, ncol=n, dimnames=list(um, um))
for (i in 1L:n) {
for (j in 1L:n) {
ans[i, j] <- mean(x[sel[[i]], sel[[j]]], na.rm=TRUE)
}
}
ans
}
.meanMarkerDist2 <- function(x, markers) {
## calculate squarewise rowsums
sums <- rowsum(t(rowsum(x, group=markers, reorder=FALSE, na.rm=TRUE)),
group=markers, reorder=FALSE, na.rm=TRUE)
## count squarewise !NA to calculate means
x[] <- as.integer(!is.na(x))
nums <- rowsum(t(rowsum(x, group=markers, reorder=FALSE, na.rm=TRUE)),
group=markers, reorder=FALSE, na.rm=TRUE)
t(sums / nums)
}
all.equal(QSep1(hyperLOPIT2015), QSep2(hyperLOPIT2015))
# [1] TRUE
all.equal(QSep1(hyperLOPIT2015), QSep3(hyperLOPIT2015))
# [1] TRUE
benchmark(QSep1(hyperLOPIT2015), QSep2(hyperLOPIT2015), QSep3(hyperLOPIT2015),
order="relative")
# test replications elapsed relative user.self sys.self user.child sys.child
# 3 QSep3(hyperLOPIT2015) 100 17.311 1.000 17.308 0.000 0 0
# 2 QSep2(hyperLOPIT2015) 100 18.337 1.059 18.324 0.012 0 0
# 1 QSep1(hyperLOPIT2015) 100 21.682 1.252 21.652 0.032 0 0
This PR adds the double for-loop solution. IMHO this is the cleanest and easiest to read solution.
The rowsums approach is a little bit faster but hard to understand. I guess execution time is not important here so I would vote for the for loop approach (if you interested in the rowsums approach you could merge the following branch: https://github.com/sgibb/pRoloc/tree/meanMarkerDistRowsum
While looking at
qsep.R
I stumbled across the following lines:Sorry, but I can't accept the use of the
<<-
operator. :smiley:IMHO the following is much cleaner and avoids a few comparisons (and would be faster):
Here a little comparison and benchmark:
This PR adds the double
for
-loop solution. IMHO this is the cleanest and easiest to read solution. Therowsums
approach is a little bit faster but hard to understand. I guess execution time is not important here so I would vote for thefor
loop approach (if you interested in therowsums
approach you could merge the following branch: https://github.com/sgibb/pRoloc/tree/meanMarkerDistRowsum