trinker / termco

Regular Expression Counts of Terms and Substrings
Other
25 stars 5 forks source link

add regex_words #44

Closed trinker closed 7 years ago

trinker commented 7 years ago

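Proposed function, roughly an improved version of qdap::term_match: for each regular expression it returns the unique lower-cased words in the text that the expression matches: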

#' Find the unique words in a text that match each regular expression
#'
#' The text is lower-cased and tokenized with `quanteda::tokens()` (punctuation
#' removed); the resulting tokens are de-duplicated and sorted. Every entry of
#' `match.list` is then turned into a single regex and tested against that
#' vocabulary with `stringi::stri_detect_regex()`.
#'
#' @param text.var A character vector of text to search.
#' @param match.list A character vector or list of regular expressions. Each
#'   entry becomes one regex: its patterns are individually wrapped in
#'   parentheses and, when an entry holds several patterns, joined with `|`.
#' @param ... Accepted but not used by the body.
#' @return A list with one element per entry of `match.list`, named by the
#'   wrapped regex (e.g. `"(the)"`), each holding the matching tokens.
regex_words <- function(text.var, match.list, ...) {

    # "(p1)|(p2)|..." -- one alternation per entry of match.list
    as_alternation <- function(patterns) {
        wrapped <- paste0("(", patterns, ")")
        paste(wrapped, collapse = "|")
    }
    regexes <- unlist(lapply(match.list, as_alternation))

    # Sorted vocabulary of distinct lower-cased tokens
    toks <- quanteda::tokens(tolower(text.var), remove_punct = TRUE)
    vocab <- sort(unique(as.character(toks)))

    # Keep, for each regex, only the vocabulary entries it detects
    hits <- lapply(regexes, function(rx) {
        is_hit <- stringi::stri_detect_regex(vocab, rx)
        vocab[is_hit]
    })
    names(hits) <- regexes
    hits
}

# Example 1: match.list supplied as a character vector, so each regex is
# looked up on its own against the unique lower-cased words of text.var.
# The patterns contrast an unanchored 'do' with start-anchored ('^do'),
# word-boundary ('\\bdo') and end-anchored ('do$') variants.
regex_words(
    text.var = c(
        'the dog is todo them funny', 
        'I\'d wait to eat the sandwich.', 
        'the dog ate the sandwiches'
    ), 
    match.list = c('the', 'sandwich', 'do', '^do', '\\bdo', 'do$')
)

# Example 2: the same text, with match.list supplied as a list of
# single-pattern entries plus one extra regex that uses an alternation
# bounded by \\b on both sides.
regex_words(
    text.var = c(
        'the dog is todo them funny', 
        'I\'d wait to eat the sandwich.', 
        'the dog ate the sandwiches'
    ), 
    match.list = list(
        'the', 
        'sandwich', 
        'do', 
        '^do', 
        '\\bdo', 
        'do$',
        '\\b(eat|ate)\\b'
    )
)