Add dale_chall - Githubissues

Started, but I'm not getting the results I believe I should be:
# Dale-Chall style readability score built from three additive terms.
#
# n.words: number of words in the text.
# p.hard:  proportion (0-1) of words counted as hard; scaled to a percentage
#          inside the formula.
# n.sents: number of sentences in the text.
# ...:     accepted but unused.
#
# Returns the numeric score. The 3.6365 adjustment is added only where
# p.hard is strictly greater than .05. All inputs are used element-wise.
dale_chall_ <- function(n.words, p.hard, n.sents, ...) {
  hard_word_term <- .1579 * (100 * p.hard)
  sentence_length_term <- .0496 * (n.words / n.sents)
  adjustment <- ifelse(p.hard > .05, 3.6365, 0)

  hard_word_term + sentence_length_term + adjustment
}

# Proportion of the words in x that are not found in dale_chall_list.
#
# x: character input. It is lower-cased and every run of digits is replaced
#    by a space before words are extracted with stringi.
#
# Returns a single number: the mean of the "not in dale_chall_list" flags
# over all extracted words pooled together.
perc_hard <- function(x) {
  lowered <- tolower(x)
  without_digits <- gsub("\\d+", " ", lowered)
  tokens <- unlist(stringi::stri_extract_all_words(without_digits))

  is_listed <- tokens %in% dale_chall_list
  mean(!is_listed)
}

# Thin wrapper around SnowballC::wordStem().
#
# x:        words to stem, passed through unchanged.
# language: stemmer name forwarded to wordStem(); defaults to "porter".
stem <- function(x, language = "porter") {
  SnowballC::wordStem(x, language = language)
}

# Tokenise x into words with quanteda (numbers, punctuation, separators,
# Twitter characters and hyphens removed) and lower-case the tokens.
# NOTE(review): in this listing `x` and `dale_chall_list` are first assigned
# further down, so these statements only run once those objects exist.
# NOTE(review): toLower/tokenize/removeFeatures and the remove* argument names
# look like an older quanteda API -- confirm against the installed version.
out <- quanteda::toLower(quanteda::tokenize(x, what="word", removeNumbers = TRUE, removePunct = TRUE,
  removeSeparators = TRUE, removeTwitter = TRUE, removeHyphens = TRUE))

# Ratio for the first document: tokens left after removing dale_chall_list
# entries, divided by all tokens. Not assigned, so the value is only printed.
length(quanteda::removeFeatures(out, dale_chall_list)[[1]])/length(out[[1]])

# Sample passage used as input for the calculations in this script.
# The literal spans several lines and contains typographic quotes,
# apostrophes and an ellipsis character; its last line is the attribution
# "From A Bargain for Frances", which is part of the string.
x <- 'You said you didn’t want it,” said Thelma.
“And anyhow, I don’t want to sell it now.”
“Why not?” said Frances.
“Well,” said Thelma, “it is a very good tea set.
It is plastic that does not break.
It has pretty red flowers on it.
It has all the cups and saucers.
It has the sugar bowl and the cream pitcher
and the teapot.
It is almost new, and I think it cost a lot of
money.”
“I have two dollars and seventeen cents,” said
Frances.
“That’s a lot of money.”
“I don’t know,” said Thelma.
“If I sell you…
From A Bargain for Frances'

# Join the elements of x into one string, separated by single spaces.
x <- paste(x, collapse=" ")

# Sentence count: replace non-ASCII characters, split into sentences with
# textshape, drop NA entries and count what is left.
n.sents <- length(na.omit(unlist(textshape::split_sentence(textclean::replace_non_ascii(x))[[1]])))
# Same count using stringi's sentence-boundary splitter instead. The result is
# not assigned, so it is only printed (presumably for comparison -- confirm).
length(na.omit(unlist(stringi::stri_split_boundaries(textclean::replace_non_ascii(x), type="sentence", locale="@ss=standard")[[1]])))

# Total word count of x (NA counts ignored). Unlike the sentence count, this
# is computed on x without the non-ASCII replacement.
n.words <- sum(stringi::stri_count_words(x), na.rm = TRUE)
# Proportion of words in x that are not in dale_chall_list.
# NOTE(review): perc_hard() reads dale_chall_list, which this listing only
# builds further down -- it must already exist when this line runs.
p.hard <- perc_hard(x)

# Score for the sample passage; printed because it is evaluated at top level.
dale_chall_(n.words, p.hard, n.sents)

pacman::p_load(rvest, xml2, dplyr, stringi)
pacman::p_load_current_gh('trinker/textclean')

# Page the word list is scraped from.
dc_url <- 'http://www.readabilityformulas.com/articles/dale-chall-readability-word-list.php'

# Build dale_chall_list: a lower-cased character vector of every word found
# in the page's table cells. Intermediates are kept inside local() so only
# dale_chall_list is assigned.
dale_chall_list <- local({
  cells <- html_nodes(read_html(dc_url), 'td')

  # Drop the first nine cells, take the text of the rest, then drop entries
  # 145-154 of that text (presumably non-word cells -- confirm on the page).
  cell_text <- html_text(cells[-(1:9)])
  word_cells <- cell_text[-(145:154)]

  tolower(unlist(stringi::stri_extract_all_words(word_cells)))
})

# Store the list as internal package data at a hard-coded local package path,
# overwriting any existing copy.
devtools::use_data(
  dale_chall_list,
  pkg = "C:/Users/Tyler/GitHub/readability",
  overwrite = TRUE,
  internal = TRUE
)

# Second page tried as a source; this overwrites the previous dc_url.
dc_url <- 'http://www.rfp-templates.com/Research-Articles/Dale-Chall-3000-Simple-Word-List#the_dale-chall_word_list'

# Extract the words from the page's table cells. The result is not assigned,
# so it is printed when this runs at top level.
local({
  # Cells are selected by XPath here rather than with a plain 'td' selector.
  cells <- html_nodes(read_html(dc_url), xpath = '//table/tbody/tr/td')

  # Skip the first 44 cells; no further entries are dropped from the text.
  cell_text <- html_text(cells[-(1:44)])

  unlist(stringi::stri_extract_all_words(cell_text))
})

# Third source tried: a Word (.doc) copy of the Dale-Chall word list.
doc_url <- 'http://opi.mt.gov/Pub/RTI/Forms/School/Choteau/The%20Dale-Chall%20Word%20List.doc'

# Download the document, read its text, and extract every run of letters,
# apostrophes or hyphens that immediately follows a "|" character.
# The result is not assigned, so it is only printed.
# NOTE(review): if ex_default() returns a list, the c() and na.omit() steps
# below leave it unchanged (no flattening, no NA removal); unlist() may have
# been intended -- confirm against the actual return type.
doc_url %>%
    textreadr::download() %>%
    textreadr::read_doc() %>%
    qdapRegex::ex_default(pattern="(?<=\\|)[A-Za-z'-]+") %>%
    c() %>%
    na.omit() %>%
    c()
trinker / readability

Add dale_chall #2