trinker / termco

Regular Expression Counts of Terms and Substrings
Other
25 stars 5 forks source link

Give important and frequent terms a grouping.var argument #37

Open trinker opened 7 years ago

trinker commented 7 years ago

n of each group

trinker commented 7 years ago

will affect the plot methods

trinker commented 6 years ago
frequent_terms <- function(text.var, n = 20, grouping.var = NULL, 
    stopwords = stopwords::stopwords("english"), min.freq = NULL, min.char = 4, 
    max.char = Inf, stem = FALSE, language = "porter", strip = TRUE,
    strip.regex = "[^a-z' ]", alphabetical = FALSE, ...) {

    if (is.data.frame(text.var)) stop("`text.var` is a `data.frame`; please pass a vector")

    text.var <- stringi::stri_trans_tolower(text.var)

    ## remove nonascii characters
    text.var <- iconv(text.var, "latin1", "ASCII", sub = "")

    ## regex strip of non-word/space characters
    if (isTRUE(strip)) text.var <- gsub(strip.regex, " ", text.var)

    if(is.null(grouping.var)) {
        G <- "all"
    } else {
        if (is.list(grouping.var)) {
            m <- unlist(as.character(substitute(grouping.var))[-1])
            G <- sapply(strsplit(m, "$", fixed=TRUE), function(x) {
                x[length(x)]
            })
        } else {
            G <- as.character(substitute(grouping.var))
            G <- G[length(G)]
        }
    }

    if(is.null(grouping.var)){
        grouping <- rep("all", length(text.var))
    } else {
        if (isTRUE(grouping.var)) {
            grouping <- seq_along(text.var)
        } else {
            if (is.list(grouping.var) & length(grouping.var)>1) {
                grouping <- grouping.var
            } else {
                grouping <- unlist(grouping.var)
            }
        }
    }

    if(!missing(group.names)) {
        G <- group.names
    }

    DF <- data.frame(text.var, check.names = FALSE, stringsAsFactors = FALSE)
    DF[G] <- grouping

    DF <- data.table::data.table(DF)

    DF <- DF[, list(text.var = paste(text.var, collapse = ' ')), by = G]

    grp <- DF[, G, with = FALSE]

    outs <- lapply(seq_len(nrow(DF)),  function(i){

        cnts <- frequent_terms_helper(DF[['text.var']][i], n = n, stopwords = stopwords, 
            min.freq = min.freq, min.char = min.char, max.char = max.char, 
            stem = stem, language = language, strip = strip, 
            strip.regex = strip.regex, alphabetical = alphabetical
        )
# browser()

        out <- as.data.frame(
            dplyr::bind_cols(grp[rep(i, nrow(cnts)), ], cnts),
            check.names = FALSE,
            stringsAsFactors = FALSE
        )

        n.words <- attributes(out)[["n.words"]]

        if (isTRUE(alphabetical)){
            out <- out[order(out[["term"]]), ]
        }

        if (n < 1) {
            n <- round(n * nrow(out), 0)
        }

        if (n > nrow(out)) {
            n <- nrow(out)
        }

        if (is.null(min.freq)) {
            out2 <- out[out[["frequency"]] >= out[["frequency"]][n], ]
        } else {
            out2 <- out[out[["frequency"]] >= min.freq, ]
            n <- nrow(out2)
        }

        class(out2) <- c('frequent_terms', class(out))
        attributes(out2)[["n"]] <- n
        attributes(out2)[["full"]] <- out
        attributes(out2)[["n.words"]] <- n.words
        attributes(out2)[["group.var"]] <- G
        out2        

    })

}

text.var <- termco::presidential_debates_2012$dialogue
alphabetical <- FALSE
grouping.var <- termco::presidential_debates_2012$person
language <- "porter"
max.char <- Inf
min.char <- 4
min.freq <- NULL
n <- 20
stem <- FALSE
stopwords <- stopwords::stopwords("english")
strip <- TRUE
strip.regex <- "[^a-z' ]"

frequent_terms_helper <- function(text.var, n = 20, 
    stopwords = stopwords::stopwords("english"), min.freq = NULL, min.char = 4, 
    max.char = Inf, stem = FALSE, language = "porter", strip = TRUE,
    strip.regex = "[^a-z' ]", alphabetical = FALSE, ...) {

    y <- unlist(stringi::stri_extract_all_words(text.var))
    n.words <- sum(stringi::stri_count_words(text.var), na.rm = TRUE)

    ## stemming
    if (isTRUE(stem)) {
        y <- SnowballC::wordStem(y, language = language)
        if (! is.null(stopwords)) stopwords <- SnowballC::wordStem(stopwords, language = language)
    }

    ## exclude less than the min character cut-off
    y <- y[nchar(y) > min.char - 1]

    ## exclude more than the max character cut-off
    y <- y[nchar(y) < max.char + 1]

    ## data frame of counts
    y <- sort(table(y), TRUE)

    ## stopword removal
    if (!is.null(stopwords)){
        y <- y[!names(y) %in% stopwords]
    }

    out <- data.frame(term = names(y), frequency = c(unlist(y, use.names=FALSE)),
        stringsAsFactors = FALSE, row.names=NULL)

    attributes(out)[["n.words"]] <- n.words

    out
}