trinker / termco

Regular Expression Counts of Terms and Substrings
Other
25 stars 5 forks source link

Add term before/after to help analyze #30

Closed trinker closed 7 years ago

trinker commented 7 years ago
#' Count the Words That Precede a Term
#'
#' Extracts every word (a run of ASCII letters, apostrophes, or hyphens) that
#' is followed by one whitespace character and then \code{term}, and tabulates
#' the extracted words from most to least frequent.
#'
#' @param text.var A vector of text to search.
#' @param term A regular expression describing the term of interest.  It is
#' pasted into the pattern unescaped and is interpreted by \pkg{stringi}.
#' @param ignore.case logical.  If \code{TRUE} the pattern is matched without
#' regard to case and the extracted words are lower cased before counting.
#' @param \ldots Currently unused.
#' @return The frequency table converted to a tibble; the columns are
#' requested as \code{term} and \code{frequency}.
term_before <- function(text.var, term, ignore.case = TRUE, ...){
    # Previously ignore.case only lower cased the output, so a capitalised
    # occurrence such as "President" never matched term = 'president'.  The
    # inline (?i) flag makes the match itself case insensitive.
    regex <- paste0(
        if (ignore.case) "(?i)" else "",
        '[A-Za-z\'-]+(?=\\s', term, ')'
    )
    trms <- na.omit(unlist(stringi::stri_extract_all_regex(text.var, regex)))
    if (ignore.case) trms <- tolower(trms)
    counts <- sort(table(trms), decreasing = TRUE)
    # Namespaced, non-deprecated replacement for the bare tbl_df() call.
    dplyr::as_tibble(textshape::bind_table(counts, "term", "frequency"))
}

#' Count the Words That Follow a Term
#'
#' Extracts every word (a run of ASCII letters, apostrophes, or hyphens) that
#' is preceded by \code{term} and one whitespace character, and tabulates the
#' extracted words from most to least frequent.
#'
#' @param text.var A vector of text to search.
#' @param term A regular expression describing the term of interest.  It is
#' pasted unescaped into a look-behind and is interpreted by \pkg{stringi}.
#' @param ignore.case logical.  If \code{TRUE} the pattern is matched without
#' regard to case and the extracted words are lower cased before counting.
#' @param \ldots Currently unused.
#' @return The frequency table converted to a tibble; the columns are
#' requested as \code{term} and \code{frequency}.
term_after <- function(text.var, term, ignore.case = TRUE, ...){
    # Previously ignore.case only lower cased the output, so a capitalised
    # occurrence such as "President" never matched term = 'president'.  The
    # inline (?i) flag makes the match itself case insensitive.
    regex <- paste0(
        if (ignore.case) "(?i)" else "",
        '(?<=', term, '\\s)[A-Za-z\'-]+'
    )
    trms <- na.omit(unlist(stringi::stri_extract_all_regex(text.var, regex)))
    if (ignore.case) trms <- tolower(trms)
    counts <- sort(table(trms), decreasing = TRUE)
    # Namespaced, non-deprecated replacement for the bare tbl_df() call.
    dplyr::as_tibble(textshape::bind_table(counts, "term", "frequency"))
}

#' Count the Opening Word of Each Text Element
#'
#' Pulls the run of ASCII letters, apostrophes, or hyphens found at the very
#' start of each element of \code{text.var} and tabulates those words from
#' most to least frequent.
#'
#' @param text.var A vector of text to search.
#' @param ignore.case logical.  If \code{TRUE} the extracted words are lower
#' cased before counting.
#' @param \ldots Currently unused.
#' @return The frequency table converted with \code{tbl_df}; the columns are
#' requested as \code{term} and \code{frequency}.
term_first <- function(text.var, ignore.case = TRUE, ...){
    pattern <- '^[A-Za-z\'-]+'
    hits <- stringi::stri_extract_all_regex(text.var, pattern)

    # Elements that do not start with a word come back as NA; discard them.
    trms <- na.omit(unlist(hits))
    if (ignore.case) {
        trms <- tolower(trms)
    }

    counts <- sort(table(trms), TRUE)
    tallied <- textshape::bind_table(counts, "term", "frequency")
    tbl_df(tallied)
}
trinker commented 7 years ago
#' term_before(presidential_debates_2012$dialogue, 'president')
#' term_after(presidential_debates_2012$dialogue, 'president')
#' term_after(presidential_debates_2012$dialogue, 'oil')
#' term_first(presidential_debates_2012$dialogue)
#' 
#' \dontrun{
#' library(dplyr); library(lexicon)
#'
#' pos_df_pronouns[['pronoun']][1:5] %>%
#'     lapply(function(x){
#'         term_after(presidential_debates_2012$dialogue, paste0("\\b", x, "\\b"))
#'     }) %>% 
#'     setNames(pos_df_pronouns[['pronoun']][1:5])
#'
#' term_first(presidential_debates_2012$dialogue) %>%
#'     filter(!term %in% tolower(sw_dolch) & !grepl("'", term))
#' }
term_before <- function(text.var, term, ignore.case = TRUE, ...){
    # Count the words that precede a term.
    #
    # text.var:    a vector of text to search.
    # term:        a regular expression for the term of interest; it is pasted
    #              into the pattern unescaped and interpreted by stringi.
    # ignore.case: if TRUE the pattern is matched case insensitively and the
    #              extracted words are lower cased before counting.
    # ...:         currently unused.
    #
    # Returns the frequency table (most frequent first) as a tibble; the
    # columns are requested as "term" and "frequency".

    # ignore.case is a scalar, so a plain if/else is used rather than ifelse().
    case_flag <- if (ignore.case) "(?i)" else ""

    # A word (letters, apostrophes, hyphens) followed by whitespace + term.
    regex <- paste0(case_flag, '[A-Za-z\'-]+(?=\\s', term, ')')

    trms <- na.omit(unlist(stringi::stri_extract_all_regex(text.var, regex)))
    if (ignore.case) trms <- tolower(trms)

    counts <- as.table(sort(table(trms), decreasing = TRUE))
    # dplyr::tbl_df() is deprecated; as_tibble() is its documented replacement.
    dplyr::as_tibble(textshape::tidy_table(counts, "term", "frequency"))
}

#' Count the Words That Follow a Term
#'
#' Extracts every word (a run of ASCII letters, apostrophes, or hyphens) that
#' is preceded by \code{term} and one whitespace character, and tabulates the
#' extracted words from most to least frequent.
#'
#' @param text.var A vector of text to search.
#' @param term A regular expression describing the term of interest.  It is
#' pasted unescaped into a look-behind and is interpreted by \pkg{stringi}.
#' @param ignore.case logical.  If \code{TRUE} the pattern is matched without
#' regard to case and the extracted words are lower cased before counting.
#' @param \ldots Currently unused.
#' @return The frequency table converted to a tibble; the columns are
#' requested as \code{term} and \code{frequency}.
term_after <- function(text.var, term, ignore.case = TRUE, ...){
    # ignore.case is a scalar, so a plain if/else is used rather than ifelse().
    case_flag <- if (ignore.case) "(?i)" else ""

    # A word (letters, apostrophes, hyphens) preceded by term + whitespace.
    regex <- paste0(case_flag, '(?<=', term, '\\s)[A-Za-z\'-]+')

    trms <- na.omit(unlist(stringi::stri_extract_all_regex(text.var, regex)))
    if (ignore.case) trms <- tolower(trms)

    counts <- as.table(sort(table(trms), decreasing = TRUE))
    # dplyr::tbl_df() is deprecated; as_tibble() is its documented replacement.
    dplyr::as_tibble(textshape::tidy_table(counts, "term", "frequency"))
}

#' Count the Opening Word of Each Text Element
#'
#' Pulls the run of ASCII letters, apostrophes, or hyphens found at the very
#' start of each element of \code{text.var} and tabulates those words from
#' most to least frequent.
#'
#' @param text.var A vector of text to search.
#' @param ignore.case logical.  If \code{TRUE} the case-insensitive flag is
#' added to the pattern and the extracted words are lower cased before
#' counting.
#' @param \ldots Currently unused.
#' @return The frequency table converted to a tibble; the columns are
#' requested as \code{term} and \code{frequency}.
term_first <- function(text.var, ignore.case = TRUE, ...){
    # ignore.case is a scalar, so a plain if/else is used rather than ifelse().
    case_flag <- if (ignore.case) "(?i)" else ""
    regex <- paste0(case_flag, '^[A-Za-z\'-]+')

    # Elements that do not start with a word come back as NA; discard them.
    trms <- na.omit(unlist(stringi::stri_extract_all_regex(text.var, regex)))
    if (ignore.case) trms <- tolower(trms)

    counts <- as.table(sort(table(trms), decreasing = TRUE))
    # dplyr::tbl_df() is deprecated; as_tibble() is its documented replacement.
    dplyr::as_tibble(textshape::tidy_table(counts, "term", "frequency"))
}