Closed philouail closed 7 months ago
Actually, I think the issue is the bin
function itself:
> lengths(spectra(data)) |> head()
[1] 1378 1356 1404 1496 1525 1498
> res <- bin(spectra(data), binSize = 0.01)
[1] 91100 91100 91100 91100 91100 91100
So, I would suggest the fix should go into bin
. Maybe with an additional parameter dropZero = FALSE
that, if set to TRUE
drops all 0 intensity peaks. For the name of the parameter, I'm not quite sure if that dropZero
is the best choice. Maybe zero.rm
(similar to na.rm
)?
My reasoning why I think adding a parameter instead of just removing zero-intensity peaks by default: the current default would allow to directly rbind
the intensity values returned by bin,Spectra
which again would allow simple visualizations of multiple spectra at once or to use e.g. standard similarity scoring on such matrices (see here) for an example).
Hi @jorainer so this were my ideas to fix it. I tried to mimic the environment of the .peaks_bin
function but i am honestly not super sure this is accurate. I kind of struggled to get a proper input to test the function. So we can discuss it maybe.
## Define example intensities and m/z values
ints <- abs(rnorm(20, mean = 40))
mz <- seq(1:length(ints)) + rnorm(length(ints), sd = 0.001)
## variable that mimic how it is used in the .peaks_bin function
breaks <- seq(floor(min(ints)), ceiling(max(ints)))
FUN <- .Primitive("sum")
zero.rm = TRUE
mids <- (breaks[-length(breaks)] + breaks[-1L]) / 2
## Bin intensities by m/z bins with a bin size of 1
bins <- bin(ints, mz, size = 1, breaks = breaks, returnMids = FALSE, FUN = FUN)
bins[4:6] <- c(12343,5343,345656) #just making it a bit more complex
#### OPTION 1#####
keep <- which(bins != 0)
cbind(mz = mids[keep], intensity = bins[keep])
#### OPTION 2#####
res <- cbind(mz = mids, intensity = bins)
if(zero.rm)
res <- res[res[ ,"intensity"] != 0, ]
res
For the test data, you could use the sciex_pks
variable defined in testthat.R (i.e. you would have that variable available for your unit test). Your suggested implementations would be:
## Define the test data variable (done in testthat.R)
library(Spectra)
sciex_file <- normalizePath(
dir(system.file("sciex", package = "msdata"), full.names = TRUE))
sciex_mzr <- backendInitialize(MsBackendMzR(), files = sciex_file)
sciex_pks <- peaksData(sciex_mzr)
## Define breaks
breaks <- seq(0, 1000, by = 0.01)
mids <- (breaks[-length(breaks)] + breaks[-1L]) / 2
## Original
.peaks_bin <- function(x, spectrumMsLevel, binSize = 1L,
breaks = seq(floor(min(x[, 1])),
ceiling(max(x[, 1])),
by = binSize),
agg_fun = sum,
mids,
msLevel = spectrumMsLevel, ...) {
if (!(spectrumMsLevel %in% msLevel))
return(x)
bins <- MsCoreUtils::bin(x[, 2], x[, 1], size = binSize, breaks = breaks,
FUN = agg_fun, returnMids = FALSE)
cbind(mz = mids, intensity = bins)
}
## Option 1
.peaks_bin2 <- function(x, spectrumMsLevel, binSize = 1L,
breaks = seq(floor(min(x[, 1])),
ceiling(max(x[, 1])),
by = binSize),
agg_fun = sum,
mids,
msLevel = spectrumMsLevel, zero.rm = FALSE, ...) {
if (!(spectrumMsLevel %in% msLevel))
return(x)
bins <- MsCoreUtils::bin(x[, 2], x[, 1], size = binSize, breaks = breaks,
FUN = agg_fun, returnMids = FALSE)
if (zero.rm) {
keep <- which(bins != 0)
cbind(mz = mids[keep], intensity = bins[keep])
} else cbind(mz = mids, intensity = bins)
}
## Option 2
.peaks_bin3 <- function(x, spectrumMsLevel, binSize = 1L,
breaks = seq(floor(min(x[, 1])),
ceiling(max(x[, 1])),
by = binSize),
agg_fun = sum,
mids,
msLevel = spectrumMsLevel, zero.rm = FALSE, ...) {
if (!(spectrumMsLevel %in% msLevel))
return(x)
bins <- MsCoreUtils::bin(x[, 2], x[, 1], size = binSize, breaks = breaks,
FUN = agg_fun, returnMids = FALSE)
res <- cbind(mz = mids, intensity = bins)
if (zero.rm)
res[bins != 0, , drop = FALSE]
else res
}
Just checking which would be more efficient:
library(microbenchmark)
microbenchmark(
.peaks_bin(sciex_pks[[1]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids),
.peaks_bin2(sciex_pks[[1]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids, zero.rm = FALSE),
.peaks_bin2(sciex_pks[[1]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids, zero.rm = TRUE),
.peaks_bin3(sciex_pks[[1]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids, zero.rm = TRUE)
)
Unit: microseconds
expr
.peaks_bin(sciex_pks[[1]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids)
.peaks_bin2(sciex_pks[[1]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids, zero.rm = FALSE)
.peaks_bin2(sciex_pks[[1]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids, zero.rm = TRUE)
.peaks_bin3(sciex_pks[[1]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids, zero.rm = TRUE)
min lq mean median uq max neval cld
426.687 441.3045 523.0643 445.7690 467.9520 2731.169 100 a
420.282 440.3010 534.4374 448.2640 469.2735 2777.107 100 a
465.947 472.8575 527.9308 479.0605 500.5770 2724.102 100 a
562.940 583.8230 692.7686 593.2280 617.8420 2754.945 100 b
Seems option 1 is faster.
Just testing that on another spectrum with more peaks.
nrow(sciex_pks[[1]])
[1] 578
nrow(sciex_pks[[1705]])
[1] 4088
microbenchmark(
.peaks_bin(sciex_pks[[1705]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids),
.peaks_bin2(sciex_pks[[1705]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids, zero.rm = FALSE),
.peaks_bin2(sciex_pks[[1705]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids, zero.rm = TRUE),
.peaks_bin3(sciex_pks[[1705]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids, zero.rm = TRUE)
)
microseconds
expr
.peaks_bin(sciex_pks[[1705]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids)
.peaks_bin2(sciex_pks[[1705]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids, zero.rm = FALSE)
.peaks_bin2(sciex_pks[[1705]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids, zero.rm = TRUE)
.peaks_bin3(sciex_pks[[1705]], breaks = breaks, spectrumMsLevel = 1, msLevel = 1, mids = mids, zero.rm = TRUE)
min lq mean median uq max neval cld
916.639 948.770 1025.506 966.5760 1004.583 3386.702 100 a
922.674 945.291 1079.206 963.2435 1003.422 3860.321 100 a
952.343 987.816 1049.582 1008.9185 1045.494 3422.146 100 a
1072.307 1099.709 1219.720 1119.6805 1161.226 3389.441 100 b
Thus, I would be OK with your proposed option 1. Also, seems that by adding that additional variable and the related additional if
condition, we are not slowing down the function - which is good.
The
bin
function when used in combination withcombineSpectra
will create peaks of intensity 0. This as for consequence to overload memory on big dataset, and instead of decreasing the complexity of spectra, it will increases it.Here is a reproducible example:
So 8,000 peaks were created
I think the function could be changed by dropping peaks of intensity 0.