vi3k6i5 / GuidedLDA

Semi-supervised guided topic model with custom GuidedLDA
Mozilla Public License 2.0

Seeded topic words are not getting importance #31

Open ImSajeed opened 5 years ago

ImSajeed commented 5 years ago

Hi @vi3k6i5 ,

I'm trying GuidedLDA on a dataset of six reviews, initializing seed_confidence to 0.15, but the seeded words are not moving up the topic lists as expected.

code below:

import string

import pandas as pd
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from unidecode import unidecode

df = pd.DataFrame(corpus, columns=['Review'])

nlp = spacy.load("en_core_web_sm")

punctuations = string.punctuation
stopwords = set(STOP_WORDS) - {'not', 'on'}

parser = English()

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    # keep the lowercased lemma unless spaCy returned a placeholder lemma
    mytokens = [word.lemma_.lower().strip()
                if word.lemma_ not in ("-PRON-", "-X-")
                else word.lower_
                for word in mytokens]
    mytokens = [word for word in mytokens
                if word not in stopwords and word not in punctuations]
    # remPunct, remNumbers and remNonAscii are helpers not shown in the post
    mytokens = remPunct(remNumbers(remNonAscii(mytokens)))
    return " ".join(mytokens)
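remPunct, remNumbers and remNonAscii are not defined anywhere in the post. For anyone trying to reproduce this, here is a minimal sketch of what they presumably do; the names come from the call above, the bodies are my assumption:

import string

from unidecode import unidecode

def remNonAscii(tokens):
    # assumption: transliterate non-ASCII characters to the closest ASCII form
    return [unidecode(t) for t in tokens]

def remNumbers(tokens):
    # assumption: drop tokens that are purely numeric (consistent with the
    # j.isdigit() filter used later in get_corpus)
    return [t for t in tokens if not t.isdigit()]

def remPunct(tokens):
    # assumption: strip surrounding punctuation and drop tokens left empty
    stripped = (t.strip(string.punctuation) for t in tokens)
    return [t for t in stripped if t]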

from tqdm import tqdm

tqdm.pandas()
df["cleaned_review"] = df['Review'].progress_apply(spacy_tokenizer)

all_review_list = [review.split(' ') for review in df['cleaned_review']]

import nltk
from nltk.corpus import stopwords  # note: this rebinds the stopwords set defined above
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def get_bigrams(tokens):
    # rank the document's bigrams by chi-squared score and append the top
    # 500, space-joined, onto the original token list
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
    for bigram_tuple in bigrams:
        tokens.append(' '.join(bigram_tuple))
    return tokens
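One detail that matters for the seed list further down: get_bigrams appends space-joined bigram strings ('late pickup'), whereas the gensim Phraser used next joins detected phrases with an underscore ('late_pickup'), so both spellings can coexist in the vocabulary. A quick illustration on toy tokens:

tokens = ['bus', 'arrive', 'late', 'pickup', 'point']
print(get_bigrams(list(tokens))[-2:])
# the appended entries are space-joined pairs, e.g. 'late pickup', 'pickup point'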

import gensim

def bigrams(words, bi_min=10, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count=bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod
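gensim's Phrases/Phraser joins detected phrases with an underscore by default, so phrase tokens come out as 'late_pickup', not 'late pickup'. A toy check; the explicit threshold is my addition so that the tiny corpus triggers detection, it is not part of the code above:

from gensim.models import Phrases
from gensim.models.phrases import Phraser

docs = [['late', 'pickup']] * 20
mod = Phraser(Phrases(docs, min_count=10, threshold=0.05))
print(mod[['late', 'pickup']])  # -> ['late_pickup'] with the default '_' delimiter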

def get_corpus(words):
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    final_bigram = []
    for gram in bigram:
        try:
            d = get_bigrams(gram)
        except:
            final_bigram.append(gram)
            continue
        final_bigram.append(d)
    filtered_bigram = [[j for j in i
                        if j not in ['not', 'only', 'on'] and not j.isdigit()]
                       for i in final_bigram]
    # note: the dictionary and bow corpus are built from final_bigram, but the
    # filtered token lists are what gets returned for building X later
    id2word = gensim.corpora.Dictionary(final_bigram)
    #id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in final_bigram]
    return corpus, id2word, filtered_bigram

train_corpus, train_id2word, bigram_train = get_corpus(all_review_list)

vocab = [train_id2word[i] for i in range(len(train_id2word))]

import numpy as np
from gensim import matutils

def bow_iterator(docs, dictionary):
    for doc in docs:
        yield dictionary.doc2bow(doc)

def get_term_matrix(msgs, dictionary):
    bow = bow_iterator(msgs, dictionary)
    X = np.transpose(matutils.corpus2csc(bow).astype(np.int64))
    return X

X = get_term_matrix(bigram_train, train_id2word)
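Two things are worth checking here before blaming the seeding. X is a scipy sparse matrix, while the guidedlda README examples pass a dense integer array; and because the dictionary was built from final_bigram while X is built from the filtered bigram_train, the column count may not match len(vocab), in which case every seed index points at the wrong column. A quick sanity check; the dense conversion is a precaution of mine, not something the original code does:

print(type(X), X.shape, len(vocab))
# if X.shape[1] != len(vocab), the seed_topics indices below are misaligned
if hasattr(X, 'toarray'):
    X = X.toarray()  # dense integer doc-term matrix, as in the guidedlda examples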

import guidedlda

model = guidedlda.GuidedLDA(n_topics=2, n_iter=100, random_state=42, refresh=20)
model.fit(X)

topic_word = model.topic_word_
n_top_words = 20
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(i, ','.join(topic_words)))

seed_topic_list = [['late pickup', 'point nearly', 'arrive 1hour', '30min destination'],
                   ['accord time', 'hour not', 'time schedule']]

model = guidedlda.GuidedLDA(n_topics=2, n_iter=100, random_state=7, refresh=20)

word2id = dict((v, idx) for idx, v in enumerate(vocab))

seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[word2id[word]] = t_id
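If any seed string is not in the vocabulary exactly as written (for instance 'late pickup' with a space versus the underscore form the Phraser produces), the loop above raises a KeyError or silently seeds the wrong thing. A defensive variant of the same loop that makes missing seeds visible:

seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        if word in word2id:
            seed_topics[word2id[word]] = t_id
        else:
            print('seed word not in vocab, skipped: {!r}'.format(word))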

model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)

topic_word = model.topic_word_
n_top_words = 20
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(i, ','.join(topic_words)))
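Rather than eyeballing the top-20 lists, printing each seed word's rank inside its target topic shows directly whether the seeding had any effect. A small diagnostic using the word2id and topic_word variables from above:

for t_id, st in enumerate(seed_topic_list):
    order = np.argsort(topic_word[t_id])[::-1]  # word ids, highest weight first
    ranks = {w: int(np.where(order == word2id[w])[0][0])
             for w in st if w in word2id}
    print('Topic {} seed word ranks: {}'.format(t_id, ranks))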