trinker / qdap

Quantitative Discourse Analysis Package: Bridging the gap between qualitative data and quantitative analysis
http://cran.us.r-project.org/web/packages/qdap/index.html
175 stars 44 forks source link

Improve mgsub speed #201

Closed trinker closed 9 years ago

trinker commented 9 years ago

Alexey Ferapontov mentions a memory issue with mgsub http://stackoverflow.com/q/27367914/1000343. He's right in that the code for mgsub should be lighter weight, faster, and less intensive of memory.

I want to test removing unnecessary curly braces, repeated calculations, and replacing the lapply(<<-) with a for loop.

Here is the old and new proposed code:

OLD CODE

multigsub <-
function(pattern, replacement = NULL, text.var, leadspace = FALSE, 
    trailspace = FALSE, fixed = TRUE, trim = TRUE, order.pattern = fixed, ...){

    if (leadspace | trailspace) {
        replacement <- spaste(replacement, trailing = trailspace, 
            leading = leadspace)
    }

    ## replaces the larger n character words first
    if (fixed && order.pattern) {
        if (!is.null(replacement) && length(replacement) > 1) {
            replacement <- replacement[rev(order(nchar(pattern)))]
        }
        pattern <- pattern[rev(order(nchar(pattern)))]
    }

    key <- data.frame(pat=pattern, rep=replacement, 
        stringsAsFactors = FALSE)

    msubs <-function(K, x, trim, ...){
        sapply(seq_len(nrow(K)), function(i){
                x <<- gsub(K[i, 1], K[i, 2], x, fixed = fixed, ...)
            }
        )
        if (trim) x <- gsub(" +", " ", x)
        return(x)
    }

    if (trim) {
        x <- Trim(msubs(K=key, x=text.var, trim = trim, ...))
    } else {    
        x <- msubs(K=key, x=text.var, trim = trim, ...)
    }

    return(x)
}

New Code

mgsub2 <- function (pattern, replacement, text.var, leadspace = FALSE, 
    trailspace = FALSE, fixed = TRUE, trim = TRUE, order.pattern = fixed, 
    ...) {

    if (leadspace | trailspace) replacement <- spaste(replacement, trailing = trailspace, leading = leadspace)

    if (fixed && order.pattern) {
        ord <- rev(order(nchar(pattern)))
        pattern <- pattern[ord]

        if (length(replacement) != 1) replacement <- replacement[ord]

    }
    if (length(replacement) == 1) replacement <- rep(replacement, length(pattern))

    for (i in seq_along(pattern)){
        text.var <- gsub(pattern[i], replacement[i], text.var, fixed = fixed, ...)
    }

    if (trim) text.var <- gsub("\\s+", " ", gsub("^\\s+|\\s+$", "", text.var, perl=TRUE), perl=TRUE)
    text.var
}
trinker commented 9 years ago

Here are the resuts:

Regular expression based

 (op <- microbenchmark( 
     OLD=mgsub("[aeiou][a-z]{1,3}e", "[[FOO]]", raj[["dialogue"]], fixed=FALSE),
     NEW=mgsub2("[aeiou][a-z]{1,3}e", "[[FOO]]", raj[["dialogue"]], fixed=FALSE),
 times=100L))

a <- mgsub("[aeiou][a-z]{1,3}e", "[[FOO]]", raj[["dialogue"]], fixed=FALSE)
b <- mgsub("[aeiou][a-z]{1,3}e", "[[FOO]]", raj[["dialogue"]], fixed=FALSE)

Unit: milliseconds
 expr      min       lq     mean   median       uq      max neval
  OLD 18.33953 18.95063 20.44189 19.69823 20.31836 47.06260   100
  NEW 13.85184 14.08709 14.70118 14.38699 14.87862 22.86252   100

all.equal(a, b)
## > all.equal(a, b)
## [1] TRUE

Fixed gsub

 (op <- microbenchmark( 
     OLD=mgsub("Top100Words", paste0("[[", Top100Words, "]]"), raj[["dialogue"]]),
     NEW=mgsub2("Top100Words", paste0("[[", Top100Words, "]]"), raj[["dialogue"]]),
 times=100L))

Unit: milliseconds
 expr       min        lq      mean    median        uq       max neval
  OLD 11.350383 11.420792 11.608711 11.501875 11.716795 12.856678   100
  NEW  7.054833  7.084803  7.251297  7.147207  7.340574  8.024134   100

a <- mgsub("Top100Words", paste0("[[", Top100Words, "]]"), raj[["dialogue"]])
b <- mgsub2("Top100Words", paste0("[[", Top100Words, "]]"), raj[["dialogue"]])

all.equal(a, b)
## > all.equal(a, b)
## [1] TRUE

Larger Code given by the original post

line <- c("one", "two one", "four phones", "and a capsule", "But here's a caps key")
e <- c("one", "two", "caps")
r <- c("ONE", "TWO", "CAPS")

line <- rep(line, 1700000)

acc.roxygen2::tic()
a <- mgsub(e, r, line)
acc.roxygen2::toc()

## Time difference of 14.6345 secs

acc.roxygen2::tic()
b <- mgsub2(e, r, line)
acc.roxygen2::toc()

## Time difference of 9.509309 secs

all.equal(a, b)
## > all.equal(a, b)
## [1] TRUE

With Compiler

mgsub3 <- compiler::cmpfun(mgsub2)

 (op <- microbenchmark( 
     OLD=mgsub("[aeiou][a-z]{1,3}e", "[[FOO]]", raj[["dialogue"]], fixed=FALSE),
     NEW=mgsub2("[aeiou][a-z]{1,3}e", "[[FOO]]", raj[["dialogue"]], fixed=FALSE),
     COMP=mgsub3("[aeiou][a-z]{1,3}e", "[[FOO]]", raj[["dialogue"]], fixed=FALSE),
 times=100L))

Unit: milliseconds
 expr      min       lq     mean   median       uq      max neval
  OLD 18.25578 18.43621 18.96434 18.82931 19.26634 21.19899   100
  NEW 13.77959 13.91096 14.37786 14.22606 14.66226 16.59799   100
 COMP 13.83542 13.99430 15.28297 14.41593 15.01390 49.30213   100

 (op <- microbenchmark( 
     OLD=mgsub("Top200Words", paste0("[[", Top200Words, "]]"), raj[["dialogue"]]),
     NEW=mgsub2("Top200Words", paste0("[[", Top200Words, "]]"), raj[["dialogue"]]),
     COMP=mgsub3("Top200Words", paste0("[[", Top200Words, "]]"), raj[["dialogue"]]),
 times=100L))

Unit: milliseconds
 expr       min        lq      mean    median        uq       max neval
  OLD 11.250210 11.349151 11.550959 11.414017 11.639818 12.986822   100
  NEW  7.061813  7.108410  7.285446  7.191546  7.351043  8.163309   100
 COMP  7.057708  7.103073  7.333275  7.198114  7.453885  8.577141   100
trinker commented 9 years ago

Definite benefit with the for loop but copiler does not seem to improve speed. The new function it is.