Adafede opened this issue 1 year ago
As a side note, here is a benchmark I did regarding the different backends:
#################################### Paths ####################################
file <- "CompDb.Hsapiens.HMDB.4.0"
file_db <- file |>
  paste0(".sqlite")
file_mgf <- file |>
  paste0(".mgf")
################ Taking small HMDB from CompoundDb as example #################
hmdb_file <-
  system.file("sdf/HMDB_sub.sdf.gz", package = "CompoundDb")
cmps <- CompoundDb::compound_tbl_sdf(hmdb_file)
xml_path <- system.file("xml", package = "CompoundDb")
spctra <- CompoundDb::msms_spectra_hmdb(xml_path)
metad <-
  CompoundDb::make_metadata(
    source = "HMDB",
    url = "http://www.hmdb.ca",
    source_version = "4.0",
    source_date = "2017-09",
    organism = "Hsapiens"
  )
################################## Functions ##################################
export_spectra <- function(file,
                           spectra,
                           dir = ".",
                           cmps = NULL,
                           metad = NULL) {
  switch(
    EXPR = gsub(
      pattern = ".*\\.",
      replacement = "",
      x = file
    ),
    "mgf" = {
      Spectra::Spectra(object = spectra) |>
        Spectra::export(
          backend = MsBackendMgf::MsBackendMgf(),
          file = file
        )
    },
    "sqlite" = {
      if (is.null(cmps)) {
        cmps <- data.frame(
          compound_id = if (!is.null(spectra$compound_id)) {
            spectra$compound_id
          } else {
            rep(NA, nrow(spectra))
          },
          name = if (!is.null(spectra$name)) {
            spectra$name
          } else {
            rep(NA, nrow(spectra))
          },
          inchi = if (!is.null(spectra$inchi)) {
            spectra$inchi
          } else {
            rep(NA, nrow(spectra))
          },
          inchikey = if (!is.null(spectra$inchikey)) {
            spectra$inchikey
          } else {
            rep(NA, nrow(spectra))
          },
          exactmass = if (!is.null(spectra$exactmass)) {
            spectra$exactmass
          } else {
            rep(NA_integer_, nrow(spectra))
          },
          formula = if (!is.null(spectra$formula)) {
            spectra$formula
          } else {
            rep(NA, nrow(spectra))
          },
          synonyms = if (!is.null(spectra$synonyms)) {
            spectra$synonyms
          } else {
            rep(NA, nrow(spectra))
          }
        )
      }
      if (is.null(metad)) {
        metad <- data.frame(
          name = c(
            "source",
            "url",
            "source_version",
            "source_date",
            "organism"
          ),
          value = c(
            NA,
            NA,
            0,
            Sys.time() |>
              as.character(),
            NA
          )
        )
      }
      CompoundDb::createCompDb(
        x = cmps,
        metadata = metad,
        msms_spectra = spectra,
        path = dir,
        dbFile = file
      )
    }
  )
}
extract_spectra <- function(object) {
  ## issues
  incoherent_colnames <- c(
    ms_level = "msLevel",
    precursor_intensity = "precursorIntensity"
  )
  incoherent_logical <- c("predicted")
  incoherent_integer <- c("spectrum_id")
  ##
  peaks <- object |>
    Spectra::peaksData() |>
    data.frame() |>
    dplyr::group_by(group) |>
    dplyr::summarize_all(list)
  spectra <- object |>
    Spectra::spectraData() |>
    data.frame()
  spectra$mz <- peaks$mz
  spectra$intensity <- peaks$intensity
  ## synonyms issue
  spectra <- spectra |>
    dplyr::group_by(dplyr::across(c(-dplyr::any_of("synonym")))) |>
    dplyr::summarize(dplyr::across(.cols = where(is.list), .fns = as.character)) |>
    dplyr::ungroup()
  ## column types issue
  spectra <- spectra |>
    dplyr::mutate(dplyr::across(
      .cols = dplyr::any_of(incoherent_logical),
      .fns = as.logical
    )) |>
    dplyr::mutate(dplyr::across(
      .cols = dplyr::any_of(incoherent_integer),
      .fns = as.integer
    ))
  ## column names issue
  spectra <- spectra |>
    dplyr::select(-c(dplyr::any_of(incoherent_colnames))) |>
    dplyr::rename(dplyr::any_of(incoherent_colnames))
  return(spectra)
}
import_spectra <- function(file) {
  switch(
    EXPR = gsub(
      pattern = ".*\\.",
      replacement = "",
      x = file
    ),
    "mgf" = {
      file |>
        MsBackendMgf::readMgf() |>
        Spectra::Spectra()
    },
    "sqlite" = {
      file |>
        CompoundDb::CompDb() |>
        CompoundDb::Spectra()
    }
  )
}
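## As a side note (an aside, not part of the original script): base R's
## tools::file_ext() performs the same extension extraction as the gsub()
## call used in the switch() above, e.g.:
tools::file_ext("CompDb.Hsapiens.HMDB.4.0.sqlite")
## [1] "sqlite"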
################################### Script ###################################
## Exporting
### To SQLite
export_spectra(
  file = file_db,
  spectra = spctra,
  cmps = cmps,
  metad = metad
)
### To MGF
export_spectra(file = file_mgf, spectra = spctra)
## Importing
### From SQLite
sps_from_sqlite <- file_db |>
  import_spectra()
### From MGF
sps_from_mgf <- file_mgf |>
  import_spectra()
################################# Parameters #################################
params_sim <- MetaboAnnotation::CompareSpectraParam(
  ppm = 10,
  tolerance = 0.02,
  FUN = MsCoreUtils::gnps,
  requirePrecursor = TRUE,
  THRESHFUN = function(x) {
    which(x >= 0.6)
  }
)
################################## Benchmark ##################################
mbm_1 <- microbenchmark::microbenchmark(
  matches_mgf_mgf = MetaboAnnotation::matchSpectra(
    query = sps_from_mgf,
    target = sps_from_mgf,
    param = params_sim
  ),
  matches_mgf_sqlite = MetaboAnnotation::matchSpectra(
    query = sps_from_mgf,
    target = sps_from_sqlite,
    param = params_sim
  ),
  matches_sqlite_mgf = MetaboAnnotation::matchSpectra(
    query = sps_from_sqlite,
    target = sps_from_mgf,
    param = params_sim
  ),
  matches_sqlite_sqlite = MetaboAnnotation::matchSpectra(
    query = sps_from_sqlite,
    target = sps_from_sqlite,
    param = params_sim
  ),
  times = 200
)
mbm_2 <- microbenchmark::microbenchmark(
  matches_mgf_mgf_1 = MetaboAnnotation::matchSpectra(
    query = file_mgf |>
      import_spectra(),
    target = sps_from_mgf,
    param = params_sim
  ),
  matches_mgf_mgf_2 = MetaboAnnotation::matchSpectra(
    query = sps_from_mgf,
    target = file_mgf |>
      import_spectra(),
    param = params_sim
  ),
  matches_mgf_sqlite_1 = MetaboAnnotation::matchSpectra(
    query = file_mgf |>
      import_spectra(),
    target = sps_from_sqlite,
    param = params_sim
  ),
  matches_mgf_sqlite_2 = MetaboAnnotation::matchSpectra(
    query = sps_from_mgf,
    target = file_db |>
      import_spectra(),
    param = params_sim
  ),
  matches_sqlite_mgf_1 = MetaboAnnotation::matchSpectra(
    query = file_db |>
      import_spectra(),
    target = sps_from_mgf,
    param = params_sim
  ),
  matches_sqlite_mgf_2 = MetaboAnnotation::matchSpectra(
    query = sps_from_sqlite,
    target = file_mgf |>
      import_spectra(),
    param = params_sim
  ),
  matches_sqlite_sqlite_1 = MetaboAnnotation::matchSpectra(
    query = file_db |>
      import_spectra(),
    target = sps_from_sqlite,
    param = params_sim
  ),
  matches_sqlite_sqlite_2 = MetaboAnnotation::matchSpectra(
    query = sps_from_sqlite,
    target = file_db |>
      import_spectra(),
    param = params_sim
  ),
  times = 200
)
mbm_3 <- microbenchmark::microbenchmark(
  matches_mgf_mgf = MetaboAnnotation::matchSpectra(
    query = file_mgf |>
      import_spectra(),
    target = file_mgf |>
      import_spectra(),
    param = params_sim
  ),
  matches_mgf_sqlite = MetaboAnnotation::matchSpectra(
    query = file_mgf |>
      import_spectra(),
    target = file_db |>
      import_spectra(),
    param = params_sim
  ),
  matches_sqlite_mgf = MetaboAnnotation::matchSpectra(
    query = file_db |>
      import_spectra(),
    target = file_mgf |>
      import_spectra(),
    param = params_sim
  ),
  matches_sqlite_sqlite = MetaboAnnotation::matchSpectra(
    query = file_db |>
      import_spectra(),
    target = file_db |>
      import_spectra(),
    param = params_sim
  ),
  times = 200
)
#################################### Plots ####################################
mbm_1 |>
  ggplot2::autoplot()
mbm_2 |>
  ggplot2::autoplot()
mbm_3 |>
  ggplot2::autoplot()
################################### Cleanup ###################################
file.remove(file_mgf)
file.remove(file_db)
################################# Session Info #################################
sessionInfo()
Hi Adriano,
you are right, it is quite slow. Did you try MsBackendMemory? Should be more efficient, right @jorainer?
Yes, it should be - but it would still be nice to know where exactly the bottleneck is...
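One quick way to find out (just a sketch; sps_from_mgf, sps_from_sqlite and params_sim are the objects from the script above) would be to profile a single call with profvis:
## Sketch: profile one matchSpectra() call to see where the time is spent
library(profvis)
profvis({
  MetaboAnnotation::matchSpectra(
    query = sps_from_mgf,
    target = sps_from_sqlite,
    param = params_sim
  )
})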
Just tried both
MetaboAnnotation::matchSpectra(query = spq_mini,
                               target = spt_mini,
                               param = params_2)
and
MetaboAnnotation::matchSpectra(query = Spectra::setBackend(spq_mini, Spectra::MsBackendMemory()),
                               target = Spectra::setBackend(spt_mini, Spectra::MsBackendMemory()),
                               param = params_2)
and observed no difference.
Just a general observation: you always use :: to call functions - this can also slow things down if the package's namespace is not already loaded (because R then needs to load the namespace first). Ideally, you should load all required libraries first and not use ::.
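A minimal sketch of that per-call overhead (the numbers will be small, but they add up in tight loops; stats::median() is used here just as an example):
library(microbenchmark)
x <- rnorm(1000)
microbenchmark(
  attached = median(x),         ## plain call, namespace already attached
  qualified = stats::median(x), ## `::` resolves the export on every call
  times = 1000
)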
Also, a question: the data that you provide has 7897 query spectra and 289429 target spectra, right? Is this already the subset you mentioned above? I'm just asking because it takes me less than 5 minutes to do the matching with matchSpectra.
I simply find it easier to read; removing it might indeed give some additional performance improvement. For now I just tried to make things as explicit as possible, so mapping where functions come from looked like a good idea.
Yes, this is the number of spectra. Are you matching all of them in < 5 min?
Well, yes, I'm running the 7897 query spectra against all 289429 target spectra in a little less than 5 minutes - that's why I was asking whether that is what you considered slow. How long does it take for you to run the same data set?
Hmmmm interesting.
Well, I guess we should split things:
With such a big MGF, loading it is what takes the most time.
In the example above, loading the target MGF took 5 minutes. For this reason, what I was doing was converting it to SQLite once so I could then always access it faster.
The matching part I just re-did also takes about 3 minutes (with params_1) (12 minutes with params_2). So something around 17 minutes in total.
When doing it with https://github.com/mandelbrot-project/spectral_lib_matcher , the first thing I do is the same: saving the big MGF once as a pickle to access it faster later on, and the whole process (loading + matching) takes 2 minutes (approx. 1 minute loading + cleaning and 1 minute matching).
I somehow got the feeling there is some kind of "sweet spot" between loading slowly with the MGF backend but then matching faster, and loading with the SQL backend but matching slower. I probably was overlooking things, as I did not think of mixing both. Might it be that the most efficient solution is a hybrid: loading from SQL, switching to MsBackendMemory as suggested above (which I did not think about until now), and then matching? Something like this sketch, reusing file_db from above:
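## Sketch of the hybrid idea: load once from the SQLite database,
## then move everything into memory before matching
sps_hybrid <- file_db |>
  CompoundDb::CompDb() |>
  CompoundDb::Spectra() |>
  Spectra::setBackend(Spectra::MsBackendMemory())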
Ah, well, let's not mix things. I would here first focus on the matching itself - once the data is loaded. Yes, importing from MGF takes a long time, but let's fix that later. Also, I would first focus on a single matching parameter setting. The different versions have different performance, but that's expected (e.g. the forward-reverse version needs to run each query twice).
I will try to tune a bit and then post the results here.
OK, first performance check. Below are the data import (query and target were downloaded first with the scripts above) and the settings.
library(MetaboAnnotation)
library(MsCoreUtils)
library(MsBackendMgf)
library(microbenchmark)
query <- Spectra("query.mgf", source = MsBackendMgf())
target <- Spectra("target.mgf", source = MsBackendMgf())
csp <- CompareSpectraParam(
  ppm = 10,
  tolerance = 0.02,
  requirePrecursor = TRUE,
  THRESHFUN = function(x) {
    which(x >= 0.6)
  }
)
query has 7897 spectra, target 289429. With Bioconductor 3.16 (Spectra 1.9.0 and MetaboAnnotation 1.3.0):
microbenchmark(
  matchSpectra(query, target, param = csp),
  times = 5
)
Unit: seconds
                                     expr      min       lq    mean   median       uq      max neval
 matchSpectra(query, target, param = csp) 361.4349 364.0958 374.273 368.2352 388.1526 389.4467     5
It thus takes about 6 minutes - not ideal. The same using Spectra version 1.9.8 and MetaboAnnotation 1.3.1: 2 minutes.
Unit: seconds
                                     expr      min       lq     mean   median       uq      max neval
 matchSpectra(query, target, param = csp) 113.8469 116.7591 116.9893 117.0349 117.1121 120.1934     5
Next I'll check some different backends.
MsBackendMemory
Comparing the Spectra with the MsBackendMgf backend (that extends the old MsBackendDataFrame) against a Spectra that uses the new MsBackendMemory backend.
query_mem <- setBackend(query, MsBackendMemory())
target_mem <- setBackend(target, MsBackendMemory())
microbenchmark(
  matchSpectra(query, target, param = csp),
  matchSpectra(query_mem, target_mem, param = csp),
  times = 5
)
Unit: seconds
                                             expr       min        lq      mean    median       uq       max neval cld
         matchSpectra(query, target, param = csp) 111.91539 112.43313 113.15299 112.96618 113.1283 115.32199     5   b
 matchSpectra(query_mem, target_mem, param = csp)  38.16337  38.20958  38.83991  38.54426  39.1217  40.16063     5   a
40 seconds to match 7897 spectra against 289429 doesn't sound too bad :)
So from 361.4349 to 38.16337... very impressive! More or less the factor 10 that was missing! ❤️
MsBackendSql
The MsBackendSql stores all data (spectra and peak variables) in an SQL database and keeps only the primary keys in memory. It thus has a minimal memory footprint, but any data access requires an SQL call. Backends such as the MsBackendMassbankSql and the backend for CompoundDb use a similar approach. Below we change the backend of query and target to MsBackendSql and use it for the matching.
library(MsBackendSql)
library(RSQLite)
query_sql <- setBackend(query, MsBackendSql(),
                        dbcon = dbConnect(SQLite(), tempfile()))
target_sql <- setBackend(target, MsBackendSql(),
                         dbcon = dbConnect(SQLite(), tempfile()))
microbenchmark(
  matchSpectra(query_mem, target_mem, param = csp),
  matchSpectra(query_sql, target_sql, param = csp),
  matchSpectra(query_mem, target_sql, param = csp),
  times = 5
)
Unit: seconds
                                             expr       min        lq      mean    median        uq       max neval cld
 matchSpectra(query_mem, target_mem, param = csp)  37.92803  37.96435  38.91230  38.85945  39.27129  40.53840     5   a
 matchSpectra(query_sql, target_sql, param = csp) 111.53190 113.52418 114.61208 113.63851 116.74236 117.62345     5   b
 matchSpectra(query_mem, target_sql, param = csp)  80.01782  85.71989  87.47057  86.63616  89.53183  95.44714     5   c
Using a MsBackendSql seems to be slower. We can, however, cache the precursor m/z values locally in the object, which avoids an SQL call each time the filter on the precursor m/z values is performed (since we're using requirePrecursor = TRUE):
target_sql_cache <- target_sql
target_sql_cache$precursorMz <- precursorMz(target_sql)
microbenchmark(
  matchSpectra(query_mem, target_mem, param = csp),
  matchSpectra(query_mem, target_sql, param = csp),
  matchSpectra(query_mem, target_sql_cache, param = csp),
  times = 5
)
Unit: seconds
                                                   expr      min       lq     mean   median       uq      max neval cld
       matchSpectra(query_mem, target_mem, param = csp) 37.87346 38.57227 39.05091 38.84499 39.27651 40.68732     5   a
       matchSpectra(query_mem, target_sql, param = csp) 81.52800 82.24072 85.00327 84.26290 84.39204 92.59269     5   b
 matchSpectra(query_mem, target_sql_cache, param = csp) 26.83169 26.98097 27.85493 27.26821 28.12350 30.07026     5   c
We can gain another 10 seconds with this trick. The reason is that for each query spectrum, target is filtered/subset to those spectra with a matching precursor m/z, and this operation is faster for a MsBackendSql than for a MsBackendMemory because, for the former, only the primary keys and precursor m/z values need to be subset (instead of the full data).
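A quick way to see this in isolation (just a sketch, timing only the subsetting of the objects defined above):
## Sketch: subsetting copies the full in-memory data for MsBackendMemory,
## but only the primary keys (+ cached precursor m/z) for MsBackendSql
idx <- sample(length(target_mem), 100)
microbenchmark(
  target_mem[idx],
  target_sql_cache[idx],
  times = 100
)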
Parallel processing can also help to improve the performance of matchSpectra - but again, it depends on the backend used, as not all backends support parallel processing (the MsBackendSql, for example, currently does not). Below we check the performance for 1, 2, 3 and 4 CPUs.
p2 <- MulticoreParam(2)
p3 <- MulticoreParam(3)
p4 <- MulticoreParam(4)
microbenchmark(
  matchSpectra(query_mem, target_mem, param = csp, BPPARAM = SerialParam()),
  matchSpectra(query_mem, target_mem, param = csp, BPPARAM = p2),
  matchSpectra(query_mem, target_mem, param = csp, BPPARAM = p3),
  matchSpectra(query_mem, target_mem, param = csp, BPPARAM = p4),
  times = 5
)
Unit: seconds
                                                                      expr      min       lq     mean   median       uq      max neval cld
 matchSpectra(query_mem, target_mem, param = csp, BPPARAM = SerialParam()) 37.31365 38.12915 38.62866 39.01555 39.06048 39.62445     5   a
            matchSpectra(query_mem, target_mem, param = csp, BPPARAM = p2) 22.31075 22.33289 22.92925 22.73671 23.46847 23.79745     5   b
            matchSpectra(query_mem, target_mem, param = csp, BPPARAM = p3) 16.65630 16.99816 17.71939 18.04966 18.38344 18.50940     5   c
            matchSpectra(query_mem, target_mem, param = csp, BPPARAM = p4) 13.57029 13.57957 14.35026 14.30086 14.76527 15.53531     5   d
Thus, parallel processing can improve performance.
In addition to the settings used for the spectral matching, using a more efficient MsBackend (such as MsBackendMemory) can considerably improve the performance of matchSpectra. Putting these observations together, a reasonable default (just a sketch, not benchmarked here) would be to keep both objects in memory and parallelize the matching:
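## Sketch: in-memory backends plus parallel processing (BiocParallel)
library(BiocParallel)
query_mem <- setBackend(query, MsBackendMemory())
target_mem <- setBackend(target, MsBackendMemory())
res <- matchSpectra(query_mem, target_mem, param = csp,
                    BPPARAM = MulticoreParam(4))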
I'm keeping this issue open because I think it provides some nice examples of how to improve matching performance.
Hi,
I am trying to regularly match 1000+ query spectra to 100,000+ target spectra. Currently, this is easily doable in minutes using matchms in Python. I never obtained such results using MetaboAnnotation in R. Below is a tentative script describing what I would like to achieve. It currently uses MsBackendMgf, but this is not the issue. I had to subset the data to remain within reasonable times (approx. 5 minutes). Happy to further develop and exchange if needed.
CODE: