rformassspectrometry / MetaboCoreUtils

Core utilities for metabolomics.
https://rformassspectrometry.github.io/MetaboCoreUtils/index.html
9 stars 6 forks source link

refactor: vectorize *Element functions #49

Closed sgibb closed 2 years ago

sgibb commented 2 years ago

This PR is meant for discussion. It changes the behaviour of countElements, especially its input and output values. So it is a backward incompatible change:

While dropping stringr I rewrote countElements using gregexpr which yield the same results in nearly half the time:

library("microbenchmark")
library("stringr")
library("MetaboCoreUtils")

x <- c("C6H12O6", "H2O", "C32H53BrN2O4", "C23H27N5O7S")

countElements.old <- function(x) {
    x <- x[!is.na(x)]
    lx <- length(x)
    if (!lx)
        return(integer())
    if (lx > 1) {
        warning("'countElements' supports only a single character string as ",
                "input. Returning result for 'x[1]'.")
        x <- x[1L]
    }
    ## regex pattern to isolate all elements
    element_pattern <- "([A][cglmrstu]|[B][aehikr]?|[C][adeflmnorsu]?|[D][bsy]|[E][rsu]|[F][elmr]?|[G][ade]|[H][efgos]?|[I][nr]?|[K][r]?|[L][airuv]|[M][cdgnot]|[N][abdehiop]?|[O][gs]?|[P][abdmortu]?|[R][abefghnu]|[S][bcegimnr]?|[T][abcehilms]|[U]|[V]|[W]|[X][e]|[Y][b]?|[Z][nr])([0-9]*)"
    ## extract all matching pattern
    regexMatch <- str_extract_all(x, element_pattern)
    ## get individual elements and their count
    elements <- str_extract(regexMatch[[1]], "[aA-zZ]+")
    numbers <- as.numeric(str_extract(regexMatch[[1]], "[0-9]+"))
    ## replace NAs with 1 for elements which have a count of one
    numbers[is.na(numbers)] <- 1
    ## create named vector for return
    formula_list <- numbers
    names(formula_list) <- elements
    ## remove atoms that might have a count of 0
    formula_list <- formula_list[which(formula_list >= 0)]
    formula_list
}

microbenchmark(old = countElements.old(x[1]), new = countElements(x[1]))
#> Unit: microseconds
#>  expr     min      lq      mean   median       uq      max neval
#>   old 133.605 144.822 352.18756 153.1340 160.7665 19843.27   100
#>   new  70.067  78.597  95.75213  91.0635  98.1655   629.80   100

microbenchmark(old = lapply(x, countElements.old), new = countElements(x))
#> Unit: microseconds
#>  expr     min      lq     mean   median       uq     max neval
#>   old 495.489 534.696 579.4587 565.9105 607.9485 915.496   100
#>   new 135.746 154.443 173.3315 171.2720 184.5505 329.109   100
pasteElements.old <- function(x) {
    if (is.null(names(x))) stop("'x' should be a named vector")
    ## create empty string to append parts of chem_formula
    chem_formula <- ""
    ## first C H N O S P, then elements by alphabetical order
    for (atom in c("C", "H", "N", "O", "S", "P")) {
        if (atom %in% names(x) && x[[atom]] > 0) {
            if (x[[atom]] == 1) {
                chem_formula <- paste0(chem_formula, atom)
            } else {
                chem_formula <- paste0(chem_formula, atom, x[atom])
            }
        }
    }
    ## get all remaining elements
    restElements <- names(x)
    restElements <- restElements[
        !restElements %in% c("C", "H", "N", "O", "S", "P")]
    ## iterate through all remaining elements in alphabetical order
    for (atom in sort(restElements)) {
        if (x[[atom]] > 0) {
            if (x[[atom]] == 1) {
                chem_formula <- paste0(chem_formula, atom)
            } else {
                chem_formula <- paste0(chem_formula, atom, x[atom])
            }
        }
    }
    chem_formula
}

y <- list(c(H = 2, O = 1), c(C = 6, H = 12, O = 6))

microbenchmark(old = pasteElements.old(y[[1]]), new = pasteElements(y[[1]]))
#> Unit: microseconds
#>  expr    min      lq     mean  median     uq       max neval
#>   old 54.858 58.3365 310.2966 59.4800 61.560 24975.309   100
#>   new 52.057 54.7230  59.1818 55.3455 56.215   309.066   100
microbenchmark(old = lapply(y, pasteElements.old), new = pasteElements(y))
#> Unit: microseconds
#>  expr     min       lq     mean   median       uq     max neval
#>   old 133.751 142.9245 155.3385 150.1840 158.8195 318.887   100
#>   new 102.070 109.7620 119.1846 113.3095 120.9060 256.720   100
containsElements.old <- function(x, y) {
    ## parse both formmula
    x <- countElements.old(x)
    y <- countElements.old(y)
    formula_concat <- c(x, y * -1)
    ## subtract formula from each other
    result <- tapply(formula_concat, names(formula_concat), sum)
    all(result >= 0)
}

microbenchmark(
    old = containsElements.old(x[1], "H2O"),
    new = containsElements(x[1], "H2O")
)
#> Unit: microseconds
#>  expr     min       lq     mean   median      uq      max neval
#>   old 419.081 459.6855 583.0982 484.4715 523.565 8104.956   100
#>   new 278.080 303.3190 336.4183 324.5070 350.384  787.938   100
subtractElements.old <- function(x, y) {
    if (length(y) > 1)
        y <- addElements(y)
    ## sanity checks for formulas
    if (!containsElements.old(x, y))
        return(NA_character_)
    ## parse both formmula
    x <- countElements.old(x)
    y <- countElements.old(y)
    formula_concat <- c(x, y * -1)
    ## subtract formula from each other
    result <- tapply(formula_concat, names(formula_concat), sum)
    pasteElements.old(result)
}

microbenchmark(
    old = subtractElements.old(x[1], "H2O"),
    new = subtractElements(x[1], "H2O")
)
#> Unit: microseconds
#>  expr      min      lq     mean   median        uq       max neval
#>   old 1035.892 1105.09 1364.603 1195.932 1341.9515 14167.363   100
#>   new  355.138  393.22  441.596  423.094  459.9205   714.594   100

Created on 2021-12-28 by the reprex package (v2.0.1)

EDIT: please be aware that this PR doesn't include the changes from PR #48

sgibb commented 2 years ago
  • countElements returns now a list (of integer?) because multiple input formulas are supported, right? If that's the case I would be OK with this incompatible change.

Correct. A list of named integers.

  • regarding the y parameter in addElements: if y is of length 1 (e.g. y = "H") this will be added to each formula in x, right? And it would be possible to add/substract a different formula from each x?

Correct. If multiple x and a single y are given, y is add/subtract from each x. If a single x and multiple y are given, the x is recycled/duplicated and each y is add/substract to x. Before multiple y are cumulative added/subtracted to/from a single x. For multiple x and multiple y (given the same length), each x is added/subtracted to/from each corresponding y.