findThoughts returning unassociated documents

Hi,

Running findThoughts returns documents that have almost no apparent association with the topic (i.e. the frex words from the topic are not present in any of the documents).

I wonder whether I am missing something in my data, my code, or both.

Many thanks, Geoff

# Read the speeches; the "text" column holds the document text.
TextB <- readtext("Speeches KW MIG.csv", text_field = "text")

# Keep speeches dated 1996 or later.
# NOTE(review): if Date is stored as character this is a string comparison;
# confirm it selects the intended rows.
subtextB <- subset.data.frame(TextB, Date >= "1996")

# Integer code for each distinct date, used as the prevalence covariate.
subtextB$Date_ok <- as.numeric(as.factor(subtextB$Date))

newCorpB <- corpus(subtextB, docid_field = "doc_id")

# Tokenise, drop English stopwords, and stem.
migTokB <- tokens(
  newCorpB,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE
) %>%
  tokens_remove(stopwords("english")) %>%
  tokens_wordstem()

dfm_speech <- dfm(migTokB)

# Keep features that occur in between 7.5% and 90% of the documents.
dfm_speech.trim <- dfm_trim(
  dfm_speech,
  min_docfreq = 0.075,
  max_docfreq = 0.90,
  docfreq_type = "prop"
)

stm_speech <- convert(
  dfm_speech.trim,
  to = "stm",
  docvars = docvars(newCorpB)
)

stmFitted <- stm(
  documents = stm_speech$documents,
  vocab = stm_speech$vocab,
  K = 45,
  prevalence = ~s(Date_ok),
  max.em.its = 75,
  data = stm_speech$meta,
  init.type = "Spectral"
)

plot(stmFitted, type = "labels", labeltype = c("frex"))

# findThoughts() matches texts to modelled documents by position, so the texts
# must be exactly the documents that went into stm(), in the same order.
# convert() omits documents that are empty after dfm_trim(), so index the
# corpus texts by the names of the documents that were kept instead of taking
# every text in the corpus.
# NOTE(review): `newCorpB$documents$texts` relied on the internal layout of
# older quanteda corpus objects; as.character() is the supported accessor.
docs_S <- as.character(newCorpB)[names(stm_speech$documents)]
stopifnot(length(docs_S) == length(stm_speech$documents), !anyNA(docs_S))

# The original stored the result in `thought36` but plotted `thought35` under
# the title "Topic 35"; use one topic consistently.
thought36 <- findThoughts(stmFitted, texts = docs_S, topic = 36, n = 4)

# plotQuote() expects a character vector of texts; $docs[[1]] holds the
# documents found for the single requested topic.
plotQuote(thought36$docs[[1]], width = 90, main = "Topic 36")

newCorpB[["Speeches KW MIG.csv.586"]]
newCorpB[["Speeches KW MIG.csv.666"]]
newCorpB[["Speeches KW MIG.csv.739"]]
newCorpB[["Speeches KW MIG.csv.434"]]

Speeches KW MIG.csv

bstewart / stm

findThoughts returning unassociated documents #275