lgatto / pRoloc

A unifying bioinformatics framework for organelle proteomics
http://lgatto.github.io/pRoloc/
15 stars 14 forks source link

add for loop based calculation for mean distances #95

Closed sgibb closed 7 years ago

sgibb commented 7 years ago

While looking at qsep.R I stumbled across the following lines:

    ## mean distance between all pairs of subcellular marker clusters
    tmp <- apply(expand.grid(um, um), 1,
                 function(.um) {
                     sel1 <- fData(mobj)[, fcol] == .um[1]
                     sel2 <- fData(mobj)[, fcol] == .um[2]
                     ans[.um[1], .um[2]] <<- mean(mrkdist[sel1, sel2],
                                                  na.rm = TRUE)
                 })

Sorry, but I can't accept the use of the <<- operator. :smiley:

IMHO the following is much cleaner and avoids a few comparisons (and would be faster):

n <- length(um)
sel <- lapply(um, "==", markers)
ans <- matrix(NA_real_, nrow=n, ncol=n, dimnames=list(um, um))

for (i in 1L:n) {
  for (j in 1L:n) {
    ans[i, j] <- mean(x[sel[[i]], sel[[j]]], na.rm=TRUE)
  }
}
ans

Here a little comparison and benchmark:

load_all()
library(pRolocdata)
data(hyperLOPIT2015)

library(rbenchmark)

QSep1 <- function(object, fcol = "markers") {
    objname <- MSnbase:::getVariableName(match.call(), "object")
    ## only consider markers
    mobj <- markerMSnSet(object)
    ## vector of markers
    um <- unique(getMarkers(mobj, fcol = fcol, verbose = FALSE))
    ## answer is a square matrix
    ans <- diag(length(um))
    colnames(ans) <- rownames(ans) <- um
    ## euclidean distance between all markers
    mrkdist <- dist(exprs(mobj))
    mrkdist <- as.matrix(mrkdist)
    diag(mrkdist) <- NA
    ## mean distance between all pairs of subcellular marker clusters
    tmp <- apply(expand.grid(um, um), 1,
                 function(.um) {
                     sel1 <- fData(mobj)[, fcol] == .um[1]
                     sel2 <- fData(mobj)[, fcol] == .um[2]
                     ans[.um[1], .um[2]] <<- mean(mrkdist[sel1, sel2],
                                                  na.rm = TRUE)
                 })

    res <- .QSep(x = ans,
                 xnorm = ans / diag(ans),
                 object = objname)
    if (validObject(res)) res
}

QSep2 <- function(object, fcol = "markers") {
    objname <- MSnbase:::getVariableName(match.call(), "object")
    ## only consider markers
    mobj <- markerMSnSet(object)
    ## vector of markers
    markers <- getMarkers(mobj, fcol = fcol, verbose = FALSE)
    ## euclidean distance between all markers
    mrkdist <- dist(exprs(mobj))
    mrkdist <- as.matrix(mrkdist)
    diag(mrkdist) <- NA

    ans <- .meanMarkerDist(mrkdist, markers)

    res <- .QSep(x = ans,
                 xnorm = ans / diag(ans),
                 object = objname)
    if (validObject(res)) res
}

QSep3 <- function(object, fcol = "markers") {
    objname <- MSnbase:::getVariableName(match.call(), "object")
    ## only consider markers
    mobj <- markerMSnSet(object)
    ## vector of markers
    markers <- getMarkers(mobj, fcol = fcol, verbose = FALSE)
    ## euclidean distance between all markers
    mrkdist <- dist(exprs(mobj))
    mrkdist <- as.matrix(mrkdist)
    diag(mrkdist) <- NA

    ans <- .meanMarkerDist2(mrkdist, markers)

    res <- .QSep(x = ans,
                 xnorm = ans / diag(ans),
                 object = objname)
    if (validObject(res)) res
}

.meanMarkerDist <- function(x, markers) {
  um <- unique(markers)
  n <- length(um)
  sel <- lapply(um, "==", markers)
  ans <- matrix(NA_real_, nrow=n, ncol=n, dimnames=list(um, um))

  for (i in 1L:n) {
    for (j in 1L:n) {
      ans[i, j] <- mean(x[sel[[i]], sel[[j]]], na.rm=TRUE)
    }
  }
  ans
}

.meanMarkerDist2 <- function(x, markers) {
  ## calculate squarewise rowsums
  sums <- rowsum(t(rowsum(x, group=markers, reorder=FALSE, na.rm=TRUE)),
                 group=markers, reorder=FALSE, na.rm=TRUE)
  ## count squarewise !NA to calculate means
  x[] <- as.integer(!is.na(x))
  nums <- rowsum(t(rowsum(x, group=markers, reorder=FALSE, na.rm=TRUE)),
                 group=markers, reorder=FALSE, na.rm=TRUE)
  t(sums / nums)
}

all.equal(QSep1(hyperLOPIT2015), QSep2(hyperLOPIT2015))
# [1] TRUE
all.equal(QSep1(hyperLOPIT2015), QSep3(hyperLOPIT2015))
# [1] TRUE
benchmark(QSep1(hyperLOPIT2015), QSep2(hyperLOPIT2015), QSep3(hyperLOPIT2015),
          order="relative")
#                    test replications elapsed relative user.self sys.self user.child sys.child
# 3 QSep3(hyperLOPIT2015)          100  17.311    1.000    17.308    0.000          0         0
# 2 QSep2(hyperLOPIT2015)          100  18.337    1.059    18.324    0.012          0         0
# 1 QSep1(hyperLOPIT2015)          100  21.682    1.252    21.652    0.032          0         0

This PR adds the double for-loop solution. IMHO this is the cleanest and easiest to read solution. The rowsums approach is a little bit faster but hard to understand. I guess execution time is not important here so I would vote for the for loop approach (if you interested in the rowsums approach you could merge the following branch: https://github.com/sgibb/pRoloc/tree/meanMarkerDistRowsum

lgatto commented 7 years ago

Alright, fair enough :-)

lmsimp commented 7 years ago

Only taken you 3 days to accept Laurent😀 Very good Sebastian 👍🏻