dselivanov / text2vec

Fast vectorization, topic modeling, distances and GloVe word embeddings in R.
http://text2vec.org

coherence() on TCM created from create_tcm() tends to give Inf as output #295

Closed leungi closed 5 years ago

leungi commented 5 years ago

Hi,

I'm replicating an example from the textmineR vignette, but the same observation is seen using the movie_review data in text2vec.

Observation: coherence() in text2vec tends to give Inf/NaN when passed a TCM created with create_tcm() and the argument skip_grams_window is set to a low value (e.g., 10L for sentence-level); setting skip_grams_window to a high value (e.g., 150L for paragraph-level) overcomes this and yields real numbers.

I'm not sure whether an Inf score should be interpreted as the topic being highly coherent, and a NaN as a signal that the topic should be dropped.

library(text2vec)
library(textmineR)

it = itoken(nih_sample$ABSTRACT_TEXT,
            ids = nih_sample$APPLICATION_ID,
            preprocessor = tolower,
            tokenizer = word_tokenizer,
            progressbar = FALSE)

v = create_vocabulary(it,
                      ngram = c(1, 2),
                      stopwords = c(stopwords::stopwords("en"),
                                    stopwords::stopwords(source = "smart")))

vectorizer = vocab_vectorizer(v)

tv_dtm = create_dtm(it, vectorizer)

lda_model = text2vec::LDA$new(n_topics = 20,
                              doc_topic_prior = 0.1,
                              topic_word_prior = 0.05)
doc_topic_distr = 
  lda_model$fit_transform(x = tv_dtm, n_iter = 1000, 
                          convergence_tol = 0.001, n_check_convergence = 25, 
                          progressbar = FALSE)
#> INFO [2019-01-03 10:01:32] iter 25 loglikelihood = -303860.356
#> INFO [2019-01-03 10:01:33] iter 50 loglikelihood = -303424.694
#> INFO [2019-01-03 10:01:34] iter 75 loglikelihood = -303943.376
#> INFO [2019-01-03 10:01:34] early stopping at 75 iteration

tw = lda_model$get_top_words(n = 10, lambda = 1)

# creating TCM from DTM
# (the construction of tcm_from_dtm was not shown above; one common approach,
#  assumed here, is the cross-product of the binarized DTM)
tcm_from_dtm = Matrix::crossprod(sign(tv_dtm))
coherence(tw, tcm_from_dtm, n_doc_tcm = attr(v, 'document_count'))
#>          mean_logratio    mean_pmi    mean_npmi mean_difference
#> topic_1      -6.044362  -4.6124603  0.009022805      0.15670004
#> topic_2      -2.144657  -0.3046812  0.096358206      0.05506983
#> topic_3      -2.591855  -0.4438325  0.159954673      0.10981851
#> topic_4      -6.077788  -4.6132109 -0.049527630      0.04594639
#> topic_5     -12.808719 -10.9177127 -0.141381148      0.13357065
#> topic_6      -8.043359  -6.1511122 -0.068403030      0.04114088
#> topic_7      -2.152022  -0.3949065  0.066470330      0.07568157
#> topic_8      -1.484239   0.1604846  0.048735739      0.03953954
#> topic_9      -1.475678   0.4586305  0.116674044      0.08826674
#> topic_10     -1.259230   0.2058303  0.080182606      0.04961528
#> topic_11     -5.939040  -4.4798170  0.013898661      0.08715977
#> topic_12     -4.206347  -2.0830747  0.092120217      0.12265602
#> topic_13     -6.765727  -5.3191584 -0.085344509      0.02235498
#> topic_14    -12.690080 -11.8777201 -0.230685985      0.02824136
#> topic_15     -4.784090  -3.3428239 -0.020697690      0.03251698
#> topic_16     -6.402455  -4.2082940  0.112366318      0.14830416
#> topic_17     -1.155303   0.2583149  0.106303275      0.06920084
#> topic_18     -4.864199  -2.6637349  0.073950677      0.12606471
#> topic_19    -11.794832 -10.1049146 -0.147665166      0.05433755
#> topic_20     -7.302448  -5.2255722  0.062244498      0.11148252
#>          mean_npmi_cosim mean_npmi_cosim2
#> topic_1        0.4534164      0.119365206
#> topic_2        0.5035692      0.172794623
#> topic_3        0.6118779      0.306037837
#> topic_4        0.3143060      0.002694933
#> topic_5        0.3062466     -0.003841605
#> topic_6        0.3040448     -0.002185109
#> topic_7        0.4585443      0.122610179
#> topic_8        0.4395624      0.103740845
#> topic_9        0.5802603      0.263204914
#> topic_10       0.4999471      0.167426486
#> topic_11       0.4999107      0.171148461
#> topic_12       0.4937878      0.161357173
#> topic_13       0.4827335      0.151501932
#> topic_14       0.2883755     -0.013906997
#> topic_15       0.3050260     -0.006640314
#> topic_16       0.6578410      0.371244392
#> topic_17       0.5276036      0.200688876
#> topic_18       0.4486975      0.114614293
#> topic_19       0.2704456     -0.026523209
#> topic_20       0.5847184      0.270584152

# creating TCM from tokens
tcm_from_tcm_10 <- create_tcm(it,
                              vectorizer,
                              skip_grams_window = 10L)
coherence(tw, tcm_from_tcm_10, n_doc_tcm = attr(v, 'document_count'))
#>          mean_logratio mean_pmi mean_npmi mean_difference mean_npmi_cosim
#> topic_1      -21.34939      Inf       Inf       0.7014005             NaN
#> topic_2            Inf      Inf       Inf             Inf             NaN
#> topic_3            Inf      Inf       Inf             Inf             NaN
#> topic_4            Inf      Inf       Inf             Inf             NaN
#> topic_5            Inf      Inf       Inf             Inf             NaN
#> topic_6            Inf      Inf       Inf       0.2456639             NaN
#> topic_7            Inf      Inf       Inf             Inf             NaN
#> topic_8            Inf      Inf       Inf             Inf             NaN
#> topic_9            Inf      Inf       Inf       0.2740328             NaN
#> topic_10           Inf      Inf       Inf             Inf             NaN
#> topic_11           Inf      Inf       Inf             Inf             NaN
#> topic_12           Inf      Inf       Inf             Inf             NaN
#> topic_13           Inf      Inf       Inf             Inf             NaN
#> topic_14           Inf      Inf       Inf             Inf             NaN
#> topic_15           Inf      Inf       Inf             Inf             NaN
#> topic_16           Inf      Inf       Inf             Inf             NaN
#> topic_17           Inf      Inf       Inf             Inf             NaN
#> topic_18           Inf      Inf       Inf             Inf             NaN
#> topic_19           Inf      Inf       Inf             Inf             NaN
#> topic_20           Inf      Inf       Inf       0.1950946             NaN
#>          mean_npmi_cosim2
#> topic_1               NaN
#> topic_2               NaN
#> topic_3               NaN
#> topic_4               NaN
#> topic_5               NaN
#> topic_6               NaN
#> topic_7               NaN
#> topic_8               NaN
#> topic_9               NaN
#> topic_10              NaN
#> topic_11              NaN
#> topic_12              NaN
#> topic_13              NaN
#> topic_14              NaN
#> topic_15              NaN
#> topic_16              NaN
#> topic_17              NaN
#> topic_18              NaN
#> topic_19              NaN
#> topic_20              NaN

tcm_from_tcm_150 <- create_tcm(it,
                               vectorizer,
                               skip_grams_window = 150L)
coherence(tw, tcm_from_tcm_150, n_doc_tcm = attr(v, 'document_count'))
#>          mean_logratio   mean_pmi   mean_npmi mean_difference
#> topic_1     -18.639617 -10.024801  0.05052516      0.43611890
#> topic_2            Inf        Inf         Inf             Inf
#> topic_3      -8.962236 -16.740940 -0.16739028      0.72164213
#> topic_4     -13.174236 -18.453051 -0.30118283      0.21445099
#> topic_5     -13.889217 -17.716022 -0.25801591      0.26313682
#> topic_6      -9.277664 -19.952389 -0.42145817      0.06523561
#> topic_7      -7.070865 -16.576543 -0.21142950      0.53571858
#> topic_8      -8.211213 -11.779085  0.04709896      1.01227065
#> topic_9      -6.507963 -21.653536 -0.41553641      0.07872194
#> topic_10     -4.974619 -14.550431 -0.12139277      0.34915143
#> topic_11    -11.088104  -9.774559  0.20416013      4.01391810
#> topic_12     -7.474916 -13.764242 -0.11762530      0.91258881
#> topic_13     -8.008914 -13.840858 -0.10002040      1.56110323
#> topic_14    -20.175818 -11.690007  0.03596561      1.15218575
#> topic_15    -12.221565        Inf         Inf      0.14814857
#> topic_16    -13.771867 -14.940148 -0.01858747      1.93111703
#> topic_17           Inf        Inf         Inf             Inf
#> topic_18     -8.719549 -11.745498 -0.02294986      0.44488613
#> topic_19    -13.590988 -15.366961 -0.21606619      0.91227138
#> topic_20     -7.764232 -20.598502 -0.40124068      0.10434575
#>          mean_npmi_cosim mean_npmi_cosim2
#> topic_1        0.4692614      0.136772221
#> topic_2              NaN              NaN
#> topic_3        0.3892176      0.064127501
#> topic_4        0.3383733      0.016491412
#> topic_5        0.3214119      0.006012276
#> topic_6        0.4571263      0.121711696
#> topic_7        0.3169499      0.013082358
#> topic_8        0.2033123     -0.062068575
#> topic_9        0.4381783      0.102752973
#> topic_10       0.2996645     -0.010853628
#> topic_11       0.3665403      0.050528019
#> topic_12       0.3378262      0.019806544
#> topic_13       0.2705660     -0.022106696
#> topic_14       0.3968352      0.067186664
#> topic_15             NaN              NaN
#> topic_16       0.1750617     -0.057976765
#> topic_17             NaN              NaN
#> topic_18       0.3995846      0.067712461
#> topic_19       0.3970493      0.066755910
#> topic_20       0.5218966      0.191647911

Created on 2019-01-03 by the reprex package (v0.2.1)

manuelbickel commented 5 years ago

Thanks for reporting, I will have a look...

manuelbickel commented 5 years ago

You receive these Inf and NaN values because the TCM you are feeding to coherence has several zero entries on its diagonal, where non-zero entries are needed as denominators. The result is that you divide by zero in several instances.
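
A quick check on the first reprex (a diagnostic sketch, assuming the tcm_from_tcm_10 object created above) confirms this by counting the zero entries on the diagonal:

library(Matrix)

# number and share of zero diagonal entries; these are the terms that end up
# as zero denominators inside coherence()
sum(diag(tcm_from_tcm_10) == 0)
mean(diag(tcm_from_tcm_10) == 0)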

I am not aware of all the details of how create_tcm operates, but it tends to output an upper triangular matrix. In the current version of text2vec it seems that most, though not all, entries on the diagonal are empty. When I created the coherence function, the diagonal of the output was always zero; I am not sure whether that was the case for the test examples just by chance...

Anyway, the solution to your problem is one line in the example section of the coherence documentation that is admittedly not sufficiently prominent. You need to re-assign the marginal probabilities of the individual terms to the diagonal, i.e., their total occurrence, or in other words "the number of times a term co-occurs with itself", as follows: diag(tcm_from_tcm_10) = attributes(tcm_from_tcm_10)$word_count
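
Applied to the first reprex, that fix is just (a sketch, reusing tcm_from_tcm_10, tw, and v from above):

# put the marginal term counts onto the diagonal so coherence() no longer divides by zero
diag(tcm_from_tcm_10) = attributes(tcm_from_tcm_10)$word_count
coherence(tw, tcm_from_tcm_10, n_doc_tcm = attr(v, 'document_count'))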

Please also note that coherence metrics are usually calculated on the basis of binary co-occurrence counts, which means that the reference TCM only stores whether two terms co-occur at all in, e.g., a sentence, but not how often. At least, this is the way coherence scores have been reported in the literature; see, e.g., the paper by Röder mentioned in the documentation of coherence. Therefore, you should turn on the binary co-occurrence option in create_tcm (@dselivanov implemented this extra option specifically to make the coherence metrics available) and, furthermore, set all weights equal, as follows: create_tcm(iterator, vectorizer, skip_grams_window = window_size, weights = rep(1, window_size), binary_cooccurence = TRUE)
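
In terms of the first reprex, building the reference TCM that way would look roughly like this (a sketch; window_size and tcm_binary are illustrative names, the call itself mirrors the one above):

window_size = 10L

tcm_binary = create_tcm(it,
                        vectorizer,
                        skip_grams_window = window_size,
                        weights = rep(1, window_size),
                        binary_cooccurence = TRUE)

# combine with the diagonal re-assignment from above before scoring
diag(tcm_binary) = attributes(tcm_binary)$word_count
coherence(tw, tcm_binary, n_doc_tcm = attr(v, 'document_count'))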

Please let us know if this solves your issue so we can close.

manuelbickel commented 5 years ago

By the way, I have been working on a function that automatically creates TCMs with the standard settings required by individual coherence metrics (each metric is thought to operate with different window sizes; also, there are internal and external metrics). It is not documented very well at the moment (also be aware that it writes files to disk), but it might still help to improve your understanding. Maybe we can integrate an advanced version into text2vec some time... You may find it here: create_ref_tcm.R You will also need this: tcm_specs_standard()

leungi commented 5 years ago

@manuelbickel: thanks for the prompt and detailed explanation.

I did come across diag(TCM) = attributes(TCM)$word_count and wondered what it was for. Upon inspecting the TCM from create_tcm(), I do see that diag(TCM) often contains plenty of zeroes. This solution solves my issue; however, I suspect there may be reasons for the observation other than this one.

In the example below, even though diag(tcm) still contains a bunch of zeroes, coherence() doesn't return Inf/NaN:

library(text2vec)
library(data.table)
library(Matrix)

data(movie_review)

setDT(movie_review)
setkey(movie_review, id)
set.seed(2016L)
all_ids <- movie_review$id
train_ids <- sample(all_ids, 1000)
test_ids <- setdiff(all_ids, train_ids)
train <- movie_review[J(train_ids)]
test <- movie_review[J(test_ids)]
# define preprocessing function and tokenization function
prep_fun <- tolower
tok_fun <- word_tokenizer

it_train <- itoken(
  train$review,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = train$id,
  progressbar = FALSE
)

it_test <- itoken(
  test$review,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = test$id,
  progressbar = FALSE
)

vocab <- create_vocabulary(it_train)
vocab <- prune_vocabulary(vocab, term_count_min = 5L)
# Use our filtered vocabulary
vectorizer <- vocab_vectorizer(vocab)
# use window of 5 for context words
window_size <- 5L
tcm <-
  create_tcm(
    it_train,
    vectorizer,
    skip_grams_window = window_size,
    weights = rep(1, window_size),
    binary_cooccurence = TRUE
  )
dtm <- create_dtm(it_train, vectorizer)

lda_model <- text2vec::LDA$new(
  n_topics = 10,
  doc_topic_prior = 0.1,
  topic_word_prior = 0.05
)

tcm_test <-
  create_tcm(
    it_test,
    vectorizer,
    skip_grams_window = window_size,
    weights = rep(1, window_size),
    binary_cooccurence = TRUE
  )

doc_topic_distr <-
  lda_model$fit_transform(
    x = dtm,
    n_iter = 1000,
    convergence_tol = 0.001,
    n_check_convergence = 25,
    progressbar = FALSE
  )
#> INFO [2019-01-03 15:20:42] iter 25 loglikelihood = -1226432.208
#> INFO [2019-01-03 15:20:43] iter 50 loglikelihood = -1211310.901
#> INFO [2019-01-03 15:20:43] iter 75 loglikelihood = -1207539.934
#> INFO [2019-01-03 15:20:44] iter 100 loglikelihood = -1205680.388
#> INFO [2019-01-03 15:20:44] iter 125 loglikelihood = -1205488.941
#> INFO [2019-01-03 15:20:44] early stopping at 125 iteration

tw <- lda_model$get_top_words(n = 10, lambda = 1)

sum(diag(tcm) == 0) / length(diag(tcm))
#> [1] 0.8720486
sum(diag(tcm_test) == 0) / length(diag(tcm_test))
#> [1] 0.7346526

coherence(tw, tcm, n_doc_tcm = attr(vocab, "document_count"))
## all real values returned
coherence(tw, tcm_test, n_doc_tcm = attr(vocab, "document_count"))
## all real values returned

Created on 2019-01-03 by the reprex package (v0.2.1)

leungi commented 5 years ago

Thanks for the link to create_reference_tcm(); I'll check it out.

manuelbickel commented 5 years ago

Within the coherence function, the "final" reference TCM is created by subsetting it to the top words. In your example, the top words (which define the terms/diagonal of the final TCM used for the calculation) do not intersect with the terms that have zero entries on the diagonal of the TCM. Therefore, your example still works.

intersect(as.vector(tw), colnames(tcm)[which(diag(tcm) == 0)])
# character(0)

leungi commented 5 years ago

Noted; thanks again for your patience in explaining!