#' Find the unique words of a text that match each regular expression
#'
#' Lower-cases `text.var`, tokenises it with `quanteda::tokens()` (dropping
#' punctuation) and reduces the tokens to a sorted vector of unique words.
#' Every entry of `match.list` is converted into a single pattern by wrapping
#' each of its regexes in parentheses and joining them with `|`; the unique
#' words detected by `stringi::stri_detect_regex()` are collected per pattern.
#'
#' @param text.var Text to search; it is passed through `tolower()` and then
#'   to `quanteda::tokens()`.
#' @param match.list A vector or list of regular expressions. An entry that
#'   holds several expressions is treated as a set of alternatives.
#' @param ... Currently unused.
#' @return A list with one element per entry of `match.list`, each holding
#'   the matching unique lower-cased words in sorted order. The list is named
#'   by the combined patterns, e.g. `"(the)"` or `"(eat)|(ate)"`.
regex_words <- function(text.var, match.list, ...) {
  # Wrap each regex in its own group, then OR the groups together.
  as_alternation <- function(regexes) {
    paste0("(", regexes, ")", collapse = "|")
  }
  patterns <- unlist(lapply(match.list, as_alternation))

  # Build the sorted vocabulary of distinct lower-cased tokens.
  lowered <- tolower(text.var)
  toks <- quanteda::tokens(lowered, remove_punct = TRUE)
  vocab <- sort(unique(as.character(toks)))

  # For every pattern keep the vocabulary words it detects.
  matched <- lapply(patterns, function(pattern) {
    hit <- stringi::stri_detect_regex(vocab, pattern)
    vocab[hit]
  })

  stats::setNames(matched, patterns)
}
# Example 1: `match.list` supplied as a plain character vector, one regex per
# entry. The patterns contrast an unanchored "do" with start-anchored,
# word-boundary and end-anchored variants.
regex_words(
  text.var = c(
    "the dog is todo them funny",
    "I'd wait to eat the sandwich.",
    "the dog ate the sandwiches"
  ),
  match.list = c("the", "sandwich", "do", "^do", "\\bdo", "do$")
)
# Example 2: the same text, with `match.list` supplied as a list and one extra
# pattern that uses an alternation between word boundaries.
regex_words(
  text.var = c(
    "the dog is todo them funny",
    "I'd wait to eat the sandwich.",
    "the dog ate the sandwiches"
  ),
  match.list = list(
    "the",
    "sandwich",
    "do",
    "^do",
    "\\bdo",
    "do$",
    "\\b(eat|ate)\\b"
  )
)
Roughly an improved version of qdap::term_match: