Open lgatto opened 8 years ago
For completeness, here's some testing/devel code I wrote that made it into MSnbase
as the fastquant_max
and quantify_OnDiskMSnExp_max
functions (ad42601e05672545e1cb6042e98e55781c34deee and 55d3fac1726e4f200aca82b560289a10cc3dcdea).
## quantifies peak pk in regions [pk - wd, pk + wd] at half window
## size hws in spectrum spi in file f using max of peak (irrespective
## if spectrum is centroided or profile mode)
fastquant_1 <- function(f, pk, spi, hws, wd = 1) {
ramp <- new(mzR:::.__C__Rcpp_Ramp)
ramp$open(f, declaredOnly = TRUE)
if (!ramp$OK())
stop("Unable to create valid cRamp object.")
on.exit(ramp$close())
pks <- ramp$getPeakList(spi)$peaks
pks <- pks[pks[, 1] >= pk - wd & pks[, 1] <= pk + wd, ]
mx <- MALDIquant:::.localMaxima(pks[, 2], hws)
pks[mx, 2]
}
fastquant(x2[[372]], reporters = TMT6[1], method = "max")[[1]]
## TMT6.126 TMT6.127 TMT6.128 TMT6.129 TMT6.130 TMT6.131
## X422.1 56807640 60283748 63901644 57897180 54275908 53960332
fastquant_1(f, 126, 422, 20)
fastquant_1(f, 127, 422, 20)
fastquant_1(f, 128, 422, 20)
fastquant_1(f, 129, 422, 20)
fastquant_1(f, 130, 422, 20)
fastquant_1(f, 131, 422, 20)
r1 <- quantify(x2[[372]], reporters = TMT6[1], method = "max")[[1]]
names(r1) <- NULL
r2 <- fastquant_1(f, 126, 422, 20)
identical(r1, r2)
microbenchmark(quantify(x2[[372]], reporters = TMT6, method = "max")[[1]],
fastquant_1(f, 126, 422, 20),
times = 10)
## quantifies peaks pk in regions [pks - wd, pks + wd] at half window
## size hws in spectrum spi in file f using max of peak (irrespective
## if spectrum is centroided or profile mode)
fastquant_n <- function(f, pk, spi, hws, wd = 1) {
ramp <- new(mzR:::.__C__Rcpp_Ramp)
ramp$open(f, declaredOnly = TRUE)
if (!ramp$OK())
stop("Unable to create valid cRamp object.")
on.exit(ramp$close())
pks <- ramp$getPeakList(spi)$peaks
res <- rep(NA_real_, length(pk))
for (ii in seq_along(pk)) {
k <- pks[, 1] >= pk[ii] - wd & pks[, 1] <= pk[ii] + wd
.pks <- pks[k, , drop = FALSE]
mx <- MALDIquant:::.localMaxima(.pks[, 2], hws)
res[ii] <- .pks[mx, 2]
}
res
}
quantify(x2[[372]], reporters = TMT6, method = "max")[[1]]
fastquant_n(f, pk = mz(TMT6), 422, 20)
r1 <- quantify(x2[[372]], reporters = TMT6, method = "max")[[1]]
names(r1) <- NULL
r2 <- fastquant_n(f, mz(TMT6), 422, 20, wd = 0.5)
identical(r1, r2)
microbenchmark(quantify(x2[[372]], reporters = TMT6, method = "max")[[1]],
fastquant_n(f, mz(TMT6), 422, 20, wd = 0.5),
times = 1)
## quantifies peaks pk in regions [pks - wd, pks + wd] at half window
## size hws in spectra spi in file f using max of peak (irrespective
## if spectrum is centroided or profile mode)
fastquant_nn <- function(f, pk, spi, hws, wd = 0.5) {
ramp <- openMSfile(f)
on.exit(close(ramp))
pks <- peaks(ramp, spi)
res <- matrix(NA_real_,
ncol = length(pk),
nrow = length(pks))
for (i in seq_along(pks)) {
for (ii in seq_along(pk)) {
k <- pks[[i]][, 1] >= pk[ii] - wd & pks[[i]][, 1] <= pk[ii] + wd
if (any(k)) {
.pks <- pks[[i]][k, , drop = FALSE]
mx <- MALDIquant:::.localMaxima(.pks[, 2], hws)
res[i, ii] <- .pks[mx, 2]
}
}
}
res
}
library("MSnbase")
x2 <- readMSData2(f, msLevel = 2, centroided = TRUE)
ii <- fData(x2)$spIdx
library("microbenchmark")
microbenchmark(r1 <- quantify(x2, reporters = TMT6, verbose = FALSE,
method = "max", BPPARAM = SerialParam()),
r2 <- fastquant_nn(f, pk = mz(TMT6), spi = ii, hws = 20L, wd = 0.05),
times = 5)
## Unit: milliseconds
## min lq mean median uq max neval
## 18720.0183 18758.3670 22544.4527 19908.1385 27524.8491 27810.891 5
## 468.5377 471.3297 517.8705 472.1369 577.6511 599.697 5
## min lq mean median uq max neval
## 18695.8108 18908.834 19187.2583 18963.9148 19216.6528 20151.0787 5
## 468.3613 469.569 471.0501 471.4552 471.9763 473.8886 5
r1 <- exprs(r1)
attributes(r1) <- attributes(r1)[1]
identical(r1, r2)
## Code to inspect if not identical
sel <- !is.na(r1) & !is.na(r2)
r1[!sel] <- 0
r2[!sel] <- 0
table(r1 == r2)
kk <- arrayInd(which(!r1 == r2), dim(r1))
i <- sort(unique(kk[, 1]))
r1[i, ]
r2[i, ]
ssp <- lapply(spectra(x2[i]), as, "data.frame")
for (j in 1:length(ssp)) {
x <- ssp[[j]]
x <- x[x[, 1] > 120 & x[, 1] < 132, ]
plot(x, type = "h", xlim = c(125, 132), ylim = c(0, max(x, 2)))
grid()
abline(v = mz(TMT6), col = "red", lty = "dotted")
points(mz(TMT6), r1[i[j], ])
points(mz(TMT6), r2[i[j], ], col = "red", pch = 3)
tmp <- rbind(r1[i[j], ], r2[i[j], ])
colnames(tmp) <- mz(TMT6)
print(tmp)
scan()
}
f2 <- "~/Data2/PXD000001-new-files/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML"
library("MSnbase")
x2 <- readMSData2(f, msLevel = 2, centroided = TRUE)
x <- readMSData(f, msLevel = 2, centroided = TRUE)
qe0 <- quantify(x, reporters = TMT6, verbose = FALSE, method = "max")
qe00 <- quantify(x2, reporters = TMT6, verbose = FALSE, method = "max")
all.equal(exprs(qe0), exprs(qe00), check.attributes = FALSE)
x2 <- readMSData2(f2, msLevel = 2, centroided = TRUE)
qe00 <- quantify(x2, reporters = TMT6, method = "max", BPPARAM = MulticoreParam(4L))
qe <- MSnbase:::fastquant_max(f2, pk = mz(TMT6), spi = fData(x2)$spIdx,
hws = 20L, wd = 0.05)
all.equal(qe, exprs(qe00), check.attributes = FALSE)
I will document these benchmarks in the benchmarking
vignette (9902833b91bf63dcd5083e25211b9fb84b6cca0e).
This is for TMT quantitation. So we are just interested in the highest peak in a reporter region (not in all local maxima). While MALDIquant:::.localMaxima
is not needed here it could result in an error if the hws
is too small. Minimal example that gives an error because of 2 local maxima:
## reporter mz
pk <- 10
## stupid spectrum with two local maxima in a reporter region
pks <- list(cbind(mz=c(seq(9.2, 10.8, by=0.1)),
intensity=c(1:5, 4:1, 2:6, 5:3)))
## modified fastquant_max to work without a file
fastquant_max_org <- function(pks, pk, hws=20L, wd = 0.5) {
res <- matrix(NA_real_,
ncol = length(pk),
nrow = length(pks))
for (i in seq_along(pks)) {
for (ii in seq_along(pk)) {
k <- pks[[i]][, 1] >= pk[ii] - wd & pks[[i]][, 1] <= pk[ii] + wd
if (any(k)) {
.pks <- pks[[i]][k, , drop = FALSE]
mx <- MALDIquant:::.localMaxima(.pks[, 2], hws)
res[i, ii] <- .pks[mx, 2]
}
}
}
res
}
## small hws
fastquant_max_org(pks, pk, hws=2)
# Error in res[i, ii] <- .pks[mx, 2] :
# number of items to replace is not a multiple of replacement length
I replaced MALDIquant:::.localMaxima
with max
(should be even faster) in c012ed92235764e7c5cb98b9f37dff4a19e451af.
Thanks @sgibb.
The function currently is used for iTRAQ/TMT quantitation indeed, but I would like it to be more general, or have one available for any peak (MS1 or MS2, centroided or profile). In your example, I would actually want the second max at M/Z 10.5 to be chosen. In that respect, the org function does have a bug, but I don't think using max
would be the right change. Having said that, I think max
is fine/better for the current limited use case (as we know the reporter mz values), and another function/iteration (for profile data) might revert to MALDIquant:::.localMaxima
.
Quantification needs to be reimplemented at the C-level or using on disk access and using basic data types to make full use of the
OnDiskMSnExp
data structure.Quantitation involves peak integration, which will be useful for MS1 and MS2 quantitation. The way this is done will depend if data is centroided (only really relevant for MS2 data) or profile mode. When in profile mode, the peak is a curve; in centroided mode, the peak is a sticky peak.
What follows is a crude description of the needs. I will refine later.
quantify
findPeak(mz, tolerance)
quantify(method = "max")
quantify(method = "trapezoidation")
findPeak(mz, tolerance)
MALDIquant:::.localMaxima
, with appropriatehalfWindowSize
.Summary of
quantify
methods:max
fastquant_max
SI
SI
SI
SAF
SAF
count_MSnSet
tic_MSnSet
Some functions are the same (after minor modifications) than for the
MSnExp
:count_MSnSet
,tic_MSnSet
,SI
andSAF
.[1] this one could actually climb up outide of
mz
+/-wd
to detect to maximum. [2] actually not directly available as amethod
, used bySI
,SIgi
andSIn
.