Open trinker opened 7 years ago
will affect the plot methods
frequent_terms <- function(text.var, n = 20, grouping.var = NULL,
stopwords = stopwords::stopwords("english"), min.freq = NULL, min.char = 4,
max.char = Inf, stem = FALSE, language = "porter", strip = TRUE,
strip.regex = "[^a-z' ]", alphabetical = FALSE, ...) {
if (is.data.frame(text.var)) stop("`text.var` is a `data.frame`; please pass a vector")
text.var <- stringi::stri_trans_tolower(text.var)
## remove nonascii characters
text.var <- iconv(text.var, "latin1", "ASCII", sub = "")
## regex strip of non-word/space characters
if (isTRUE(strip)) text.var <- gsub(strip.regex, " ", text.var)
if(is.null(grouping.var)) {
G <- "all"
} else {
if (is.list(grouping.var)) {
m <- unlist(as.character(substitute(grouping.var))[-1])
G <- sapply(strsplit(m, "$", fixed=TRUE), function(x) {
x[length(x)]
})
} else {
G <- as.character(substitute(grouping.var))
G <- G[length(G)]
}
}
if(is.null(grouping.var)){
grouping <- rep("all", length(text.var))
} else {
if (isTRUE(grouping.var)) {
grouping <- seq_along(text.var)
} else {
if (is.list(grouping.var) & length(grouping.var)>1) {
grouping <- grouping.var
} else {
grouping <- unlist(grouping.var)
}
}
}
if(!missing(group.names)) {
G <- group.names
}
DF <- data.frame(text.var, check.names = FALSE, stringsAsFactors = FALSE)
DF[G] <- grouping
DF <- data.table::data.table(DF)
DF <- DF[, list(text.var = paste(text.var, collapse = ' ')), by = G]
grp <- DF[, G, with = FALSE]
outs <- lapply(seq_len(nrow(DF)), function(i){
cnts <- frequent_terms_helper(DF[['text.var']][i], n = n, stopwords = stopwords,
min.freq = min.freq, min.char = min.char, max.char = max.char,
stem = stem, language = language, strip = strip,
strip.regex = strip.regex, alphabetical = alphabetical
)
# browser()
out <- as.data.frame(
dplyr::bind_cols(grp[rep(i, nrow(cnts)), ], cnts),
check.names = FALSE,
stringsAsFactors = FALSE
)
n.words <- attributes(out)[["n.words"]]
if (isTRUE(alphabetical)){
out <- out[order(out[["term"]]), ]
}
if (n < 1) {
n <- round(n * nrow(out), 0)
}
if (n > nrow(out)) {
n <- nrow(out)
}
if (is.null(min.freq)) {
out2 <- out[out[["frequency"]] >= out[["frequency"]][n], ]
} else {
out2 <- out[out[["frequency"]] >= min.freq, ]
n <- nrow(out2)
}
class(out2) <- c('frequent_terms', class(out))
attributes(out2)[["n"]] <- n
attributes(out2)[["full"]] <- out
attributes(out2)[["n.words"]] <- n.words
attributes(out2)[["group.var"]] <- G
out2
})
}
text.var <- termco::presidential_debates_2012$dialogue
alphabetical <- FALSE
grouping.var <- termco::presidential_debates_2012$person
language <- "porter"
max.char <- Inf
min.char <- 4
min.freq <- NULL
n <- 20
stem <- FALSE
stopwords <- stopwords::stopwords("english")
strip <- TRUE
strip.regex <- "[^a-z' ]"
frequent_terms_helper <- function(text.var, n = 20,
stopwords = stopwords::stopwords("english"), min.freq = NULL, min.char = 4,
max.char = Inf, stem = FALSE, language = "porter", strip = TRUE,
strip.regex = "[^a-z' ]", alphabetical = FALSE, ...) {
y <- unlist(stringi::stri_extract_all_words(text.var))
n.words <- sum(stringi::stri_count_words(text.var), na.rm = TRUE)
## stemming
if (isTRUE(stem)) {
y <- SnowballC::wordStem(y, language = language)
if (! is.null(stopwords)) stopwords <- SnowballC::wordStem(stopwords, language = language)
}
## exclude less than the min character cut-off
y <- y[nchar(y) > min.char - 1]
## exclude more than the max character cut-off
y <- y[nchar(y) < max.char + 1]
## data frame of counts
y <- sort(table(y), TRUE)
## stopword removal
if (!is.null(stopwords)){
y <- y[!names(y) %in% stopwords]
}
out <- data.frame(term = names(y), frequency = c(unlist(y, use.names=FALSE)),
stringsAsFactors = FALSE, row.names=NULL)
attributes(out)[["n.words"]] <- n.words
out
}
n of each group