refactor: vectorize *Element functions

This PR is meant for discussion. It changes the behaviour of countElements, especially its input and output values. So it is a backward incompatible change:

countElements accepts now a vector as input but returns a list instead of an integer.
All other element functions are vectorized as well ({count,paste,add,subtract}Elements.)
The meaning of the y argument in addElements/substractElement changed. Before it was added/subtracted in a cumulative manner. Now either x or y is recycled to have the same length. (If required we could restore the old behaviour but this would make vectorization of the arguments hard to understand.)
The dependency stringr is dropped.

While dropping stringr I rewrote countElements using gregexpr which yield the same results in nearly half the time:

library("microbenchmark")
library("stringr")
library("MetaboCoreUtils")

x <- c("C6H12O6", "H2O", "C32H53BrN2O4", "C23H27N5O7S")

countElements.old <- function(x) {
    x <- x[!is.na(x)]
    lx <- length(x)
    if (!lx)
        return(integer())
    if (lx > 1) {
        warning("'countElements' supports only a single character string as ",
                "input. Returning result for 'x[1]'.")
        x <- x[1L]
    }
    ## regex pattern to isolate all elements
    element_pattern <- "([A][cglmrstu]|[B][aehikr]?|[C][adeflmnorsu]?|[D][bsy]|[E][rsu]|[F][elmr]?|[G][ade]|[H][efgos]?|[I][nr]?|[K][r]?|[L][airuv]|[M][cdgnot]|[N][abdehiop]?|[O][gs]?|[P][abdmortu]?|[R][abefghnu]|[S][bcegimnr]?|[T][abcehilms]|[U]|[V]|[W]|[X][e]|[Y][b]?|[Z][nr])([0-9]*)"
    ## extract all matching pattern
    regexMatch <- str_extract_all(x, element_pattern)
    ## get individual elements and their count
    elements <- str_extract(regexMatch[[1]], "[aA-zZ]+")
    numbers <- as.numeric(str_extract(regexMatch[[1]], "[0-9]+"))
    ## replace NAs with 1 for elements which have a count of one
    numbers[is.na(numbers)] <- 1
    ## create named vector for return
    formula_list <- numbers
    names(formula_list) <- elements
    ## remove atoms that might have a count of 0
    formula_list <- formula_list[which(formula_list >= 0)]
    formula_list
}

microbenchmark(old = countElements.old(x[1]), new = countElements(x[1]))
#> Unit: microseconds
#>  expr     min      lq      mean   median       uq      max neval
#>   old 133.605 144.822 352.18756 153.1340 160.7665 19843.27   100
#>   new  70.067  78.597  95.75213  91.0635  98.1655   629.80   100

microbenchmark(old = lapply(x, countElements.old), new = countElements(x))
#> Unit: microseconds
#>  expr     min      lq     mean   median       uq     max neval
#>   old 495.489 534.696 579.4587 565.9105 607.9485 915.496   100
#>   new 135.746 154.443 173.3315 171.2720 184.5505 329.109   100

pasteElements.old <- function(x) {
    if (is.null(names(x))) stop("'x' should be a named vector")
    ## create empty string to append parts of chem_formula
    chem_formula <- ""
    ## first C H N O S P, then elements by alphabetical order
    for (atom in c("C", "H", "N", "O", "S", "P")) {
        if (atom %in% names(x) && x[[atom]] > 0) {
            if (x[[atom]] == 1) {
                chem_formula <- paste0(chem_formula, atom)
            } else {
                chem_formula <- paste0(chem_formula, atom, x[atom])
            }
        }
    }
    ## get all remaining elements
    restElements <- names(x)
    restElements <- restElements[
        !restElements %in% c("C", "H", "N", "O", "S", "P")]
    ## iterate through all remaining elements in alphabetical order
    for (atom in sort(restElements)) {
        if (x[[atom]] > 0) {
            if (x[[atom]] == 1) {
                chem_formula <- paste0(chem_formula, atom)
            } else {
                chem_formula <- paste0(chem_formula, atom, x[atom])
            }
        }
    }
    chem_formula
}

y <- list(c(H = 2, O = 1), c(C = 6, H = 12, O = 6))

microbenchmark(old = pasteElements.old(y[[1]]), new = pasteElements(y[[1]]))
#> Unit: microseconds
#>  expr    min      lq     mean  median     uq       max neval
#>   old 54.858 58.3365 310.2966 59.4800 61.560 24975.309   100
#>   new 52.057 54.7230  59.1818 55.3455 56.215   309.066   100
microbenchmark(old = lapply(y, pasteElements.old), new = pasteElements(y))
#> Unit: microseconds
#>  expr     min       lq     mean   median       uq     max neval
#>   old 133.751 142.9245 155.3385 150.1840 158.8195 318.887   100
#>   new 102.070 109.7620 119.1846 113.3095 120.9060 256.720   100

containsElements.old <- function(x, y) {
    ## parse both formmula
    x <- countElements.old(x)
    y <- countElements.old(y)
    formula_concat <- c(x, y * -1)
    ## subtract formula from each other
    result <- tapply(formula_concat, names(formula_concat), sum)
    all(result >= 0)
}

microbenchmark(
    old = containsElements.old(x[1], "H2O"),
    new = containsElements(x[1], "H2O")
)
#> Unit: microseconds
#>  expr     min       lq     mean   median      uq      max neval
#>   old 419.081 459.6855 583.0982 484.4715 523.565 8104.956   100
#>   new 278.080 303.3190 336.4183 324.5070 350.384  787.938   100

subtractElements.old <- function(x, y) {
    if (length(y) > 1)
        y <- addElements(y)
    ## sanity checks for formulas
    if (!containsElements.old(x, y))
        return(NA_character_)
    ## parse both formmula
    x <- countElements.old(x)
    y <- countElements.old(y)
    formula_concat <- c(x, y * -1)
    ## subtract formula from each other
    result <- tapply(formula_concat, names(formula_concat), sum)
    pasteElements.old(result)
}

microbenchmark(
    old = subtractElements.old(x[1], "H2O"),
    new = subtractElements(x[1], "H2O")
)
#> Unit: microseconds
#>  expr      min      lq     mean   median        uq       max neval
#>   old 1035.892 1105.09 1364.603 1195.932 1341.9515 14167.363   100
#>   new  355.138  393.22  441.596  423.094  459.9205   714.594   100

^{Created on 2021-12-28 by the reprex package (v2.0.1)}

EDIT: please be aware that this PR doesn't include the changes from PR #48

rformassspectrometry / MetaboCoreUtils

refactor: vectorize *Element functions #49