bab2min / tomotopy

Python package of Tomoto, the Topic Modeling Tool
https://bab2min.github.io/tomotopy
MIT License
548 stars 62 forks source link

add a new column to show which topic #179

Closed pariskang closed 1 year ago

pariskang commented 1 year ago

How can I add a new column that shows which topic is assigned to each document? I really need your help. Thank you.

import tomotopy as tp

# A bare expression: it only has a visible effect in an interactive session,
# where the value is echoed. Presumably this reports the instruction set
# tomotopy is running with -- confirm against the tomotopy documentation.
tp.isa
import pandas as pd

# Load only the 'text' column and keep at most the first 100,000 rows.
df = pd.read_csv('/content/test.csv',usecols=['text']).iloc[:100000,:]
# Preview of the first rows (only displayed in an interactive session).
df.head()
import re
import jieba
from cntext import STOPWORDS_en

def segment(text):
    """Tokenize *text* with jieba and drop stop words.

    The string is split with ``jieba.lcut`` and every token that appears in
    ``STOPWORDS_en`` is removed; the remaining tokens keep their original
    order.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the tokens that are not in ``STOPWORDS_en``.
    """
    return [token for token in jieba.lcut(text) if token not in STOPWORDS_en]

# Quick sanity check of segment() on a sample string.
test = "io.netty.handler.codec.http2.Http2FrameLogger."
print(segment(test))
# Tokenize every row of 'text' and store the token lists in a new 'words' column.
df['words'] = df['text'].apply(segment)
# Preview of the first rows (only displayed in an interactive session).
df.head()
def find_k(docs, min_k=1, max_k=20, min_df=2):
    """Plot coherence against the initial topic count of an HDP model.

    For every ``k`` in ``range(min_k, max_k)`` (``max_k`` itself is excluded)
    a fresh ``tp.HDPModel`` is created with ``initial_k=k`` and ``seed=35``,
    fed every truthy entry of *docs*, trained via ``train(20)`` and scored
    with ``tp.coherence.Coherence``. The collected scores are then plotted
    with matplotlib.

    Args:
        docs: Iterable of tokenized documents. It is iterated once per
            candidate ``k``, so a one-shot iterator would already be
            exhausted after the first candidate. Falsy entries are skipped.
        min_k: First candidate value passed as ``initial_k``.
        max_k: Exclusive upper bound of the candidate values.
        min_df: Forwarded to ``tp.HDPModel(min_df=...)``.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    import matplotlib.pyplot as plt

    def coherence_for(initial_k):
        # Build, fill, train and score one model for this candidate.
        model = tp.HDPModel(min_df=min_df, initial_k=initial_k, seed=35)
        for tokens in docs:
            if not tokens:
                continue
            model.add_doc(tokens)
        model.train(20)
        return tp.coherence.Coherence(model).get_score()

    candidates = range(min_k, max_k)
    coherences = [coherence_for(k) for k in candidates]

    plt.plot(candidates, coherences)
    plt.xlabel("number of topics")
    plt.ylabel("coherence")
    plt.show()

# Plot coherence for candidate initial_k values 1..19 (max_k is exclusive in find_k).
find_k(docs=df['words'], min_k=1, max_k=20, min_df=2)
import tomotopy as tp

# Initialize the topic model. An HDP model is used here; the LDA alternative
# is kept below as a commented-out line.
#mdl = tp.LDAModel(k=5, min_df=2, seed=555)
mdl = tp.HDPModel(initial_k=3, eta=0.02, seed=35, corpus=None, transform=None)
# Add every non-empty token list to the model.
# NOTE: rows whose token list is empty are skipped, so positions in mdl.docs
# need not line up with row positions in df.
for words in df['words']:
    if words:
        mdl.add_doc(words=words)

# Train with the library's default settings (no arguments are passed).
mdl.train()

# Print the top 10 words of every topic index in range(mdl.k).
# NOTE(review): for an HDP model mdl.k may include topics that are no longer
# in use -- confirm whether only live topics should be listed.
for k in range(mdl.k):
    print('Top 10 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=10))
    print('\n')

mdl.docs[1].get_topic_dist() # topic distribution of the document at index 1 (the second one added; index 0 is the first)

bab2min commented 1 year ago

Hi @pariskang, you can get the top-n topics assigned to each document by using the get_topics() method of the Document class, as follows:

# Walk the model's documents in order and print the top topics of each one.
# get_topics(top_n=10) presumably yields (topic id, probability) pairs --
# confirm against the tomotopy Document documentation.
for i, doc in enumerate(mdl.docs):
    print("{}th document's top-10 topics: {}".format(i, doc.get_topics(top_n=10)))
pariskang commented 1 year ago

Thank you! Your code works really well for me. I tried it and got it working.