Closed hbeelee closed 1 year ago
Did you also update the internal top_n_words
to a value higher than 5? That might be the reason for the low number of words.
Did you also update the internal
top_n_words
to a value higher than 5? That might be the reason for the low number of words.
I'm not exactly sure I'm following you here. If you are asking whether I have updated the model with a larger value for top_n_words
, yes, I did.
# Refresh the topic representations, requesting 15 words per topic.
model.update_topics(
    docs,
    vectorizer_model=vectorizer,
    top_n_words=15,
)

# Build the per-timestamp topic representations and plot them.
timestamps = range_years
topics_over_time = model.topics_over_time(docs, timestamps)
model.visualize_topics_over_time(topics_over_time)
But still, dynamic topic modelling only shows five keywords.
Could you share your full code? How you instantiated the model and how you adapted the BERTopic code. I might be able to see what is going on if I have the complete picture.
Here it is. Much appreciated.
class BERTopic:
    """Holds a customized copy of the dynamic-topic-modeling routine.

    NOTE(review): this class declares no base class and no other methods, yet
    `topics_over_time` reads `self.topics_`, `self.c_tf_idf_`, `self.verbose`,
    `self._c_tf_idf` and `self._extract_words_per_topic`. If the library's
    `BERTopic` is imported under the same name, this definition shadows it;
    deriving from the library class under a different name avoids that.
    """

    def topics_over_time(self,
                         docs: List[str],
                         timestamps: Union[List[str], List[int]],
                         nr_bins: int = None,
                         datetime_format: str = None,
                         evolution_tuning: bool = True,
                         global_tuning: bool = True,
                         nr_words: int = 11) -> pd.DataFrame:
        """Create a topic representation for every unique timestamp.

        Arguments:
            docs: The documents the model was fitted on.
            timestamps: One timestamp per document. String timestamps are
                        converted with `pd.to_datetime`.
            nr_bins: If set, timestamps are cut into this many bins with
                     `pd.cut` and each document is assigned the left edge
                     of its bin.
            datetime_format: Format passed to `pd.to_datetime` for string
                             timestamps; when omitted the format is inferred.
            evolution_tuning: Average the c-TF-IDF rows of a timestamp with
                              the rows of the same topics at the previous
                              timestamp.
            global_tuning: Average the c-TF-IDF rows of a timestamp with the
                           global c-TF-IDF rows of the same topics.
            nr_words: Maximum number of words joined into the "Words" column
                      for each topic at each timestamp.

        Returns:
            A dataframe with the columns "Topic", "Words", "Frequency" and
            "Timestamp", one row per topic per timestamp.
        """
        check_is_fitted(self)
        check_documents_type(docs)
        documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Timestamps": timestamps})

        # NOTE(review): copy=False asks scikit-learn to normalize in place where
        # it can, so `self.c_tf_idf_` itself may end up L1-normalized — confirm
        # this side effect is intended.
        global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm='l1', copy=False)

        all_topics = sorted(documents.Topic.unique())
        all_topics_indices = {topic: index for index, topic in enumerate(all_topics)}

        if isinstance(timestamps[0], str):
            infer_datetime_format = not datetime_format
            documents["Timestamps"] = pd.to_datetime(documents["Timestamps"],
                                                     infer_datetime_format=infer_datetime_format,
                                                     format=datetime_format)

        if nr_bins:
            documents["Bins"] = pd.cut(documents.Timestamps, bins=nr_bins)
            documents["Timestamps"] = documents.apply(lambda row: row.Bins.left, 1)

        # Sort documents in chronological order
        documents = documents.sort_values("Timestamps")
        timestamps = documents.Timestamps.unique()
        if len(timestamps) > 100:
            warnings.warn(f"There are more than 100 unique timestamps (i.e., {len(timestamps)}) "
                          "which significantly slows down the application. Consider setting `nr_bins` "
                          "to a value lower than 100 to speed up calculation. ")

        # For each unique timestamp, create topic representations
        topics_over_time = []
        previous_topics = []
        previous_c_tf_idf = None
        for index, timestamp in tqdm(enumerate(timestamps), disable=not self.verbose):

            # Calculate c-TF-IDF representation for a specific timestamp
            selection = documents.loc[documents.Timestamps == timestamp, :]
            documents_per_topic = selection.groupby(['Topic'], as_index=False).agg({'Document': ' '.join,
                                                                                    "Timestamps": "count"})
            c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False)

            if global_tuning or evolution_tuning:
                c_tf_idf = normalize(c_tf_idf, axis=1, norm='l1', copy=False)

            # Fine-tune the c-TF-IDF matrix at timestamp t by averaging it with the c-TF-IDF
            # matrix at timestamp t-1
            if evolution_tuning and index != 0:
                current_topics = sorted(documents_per_topic.Topic.values)
                overlapping_topics = sorted(set(previous_topics).intersection(current_topics))

                current_overlap_idx = [current_topics.index(topic) for topic in overlapping_topics]
                previous_overlap_idx = [previous_topics.index(topic) for topic in overlapping_topics]

                if overlapping_topics:
                    averaged = (c_tf_idf[current_overlap_idx] +
                                previous_c_tf_idf[previous_overlap_idx]) / 2.0
                    # `tolil()` returns a new matrix, so the result has to be kept
                    # and assigned back; writing into the temporary would discard
                    # the averaged rows.
                    tuned = c_tf_idf.tolil()
                    tuned[current_overlap_idx] = averaged.tolil()
                    c_tf_idf = tuned.tocsr()

            # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation
            # by simply taking the average of the two
            if global_tuning:
                selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.values]
                c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0

            # Extract the words per topic
            words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)
            topic_frequency = pd.Series(documents_per_topic.Timestamps.values,
                                        index=documents_per_topic.Topic).to_dict()

            # Fill dataframe with results; each entry of `values` is indexed at 0
            # to obtain the word itself.
            topics_at_timestamp = [(topic,
                                    ", ".join([word[0] for word in values][:nr_words]),
                                    topic_frequency[topic],
                                    timestamp) for topic, values in words_per_topic.items()]
            topics_over_time.extend(topics_at_timestamp)

            # Remember this timestamp's result for the next iteration's evolution tuning
            if evolution_tuning:
                previous_topics = sorted(documents_per_topic.Topic.values)
                previous_c_tf_idf = c_tf_idf.copy()

        return pd.DataFrame(topics_over_time, columns=["Topic", "Words", "Frequency", "Timestamp"])
# Load the spreadsheet and turn every entry of the 'speeches' column into a string.
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/docs.xlsx')
docs = list(map(str, df['speeches'].tolist()))
class CustomTokenizer:
    """Callable tokenizer built on a tagger object exposing a `nouns` method."""

    def __init__(self, tagger):
        # The tagger is stored as-is and used on every call.
        self.tagger = tagger

    def __call__(self, sent):
        """Return the tokens from `tagger.nouns` that pass the filters.

        Only the first 1,000,000 characters of `sent` are passed to the
        tagger. A token is kept when it is longer than one character and is
        not contained in the module-level `stopwords`.
        """
        truncated = sent[:1000000]
        kept = []
        for token in self.tagger.nouns(truncated):
            if len(token) <= 1 or token in stopwords:
                continue
            kept.append(token)
        return kept
# Vectorizer that tokenizes through the Mecab-backed CustomTokenizer.
custom_tokenizer = CustomTokenizer(Mecab())
mecab_vectorizer = CountVectorizer(tokenizer=custom_tokenizer, stop_words=stopwords)

# Sentence embeddings are computed once up front and handed to fit_transform.
model_name = "model"
sentence_model = SentenceTransformer(model_name)
embeddings = sentence_model.encode(docs, show_progress_bar=True)

hdbscan_model = HDBSCAN(min_cluster_size=20, min_samples=3, metric='euclidean', prediction_data=True)
representation_model = MaximalMarginalRelevance(diversity=0.3)

# NOTE(review): both `min_topic_size` and a custom `hdbscan_model` are given;
# check the BERTopic documentation for which of the two takes effect.
model = BERTopic(embedding_model=sentence_model,
                 vectorizer_model=mecab_vectorizer,
                 hdbscan_model=hdbscan_model,
                 min_topic_size=10,
                 top_n_words=15,
                 nr_topics=44,
                 calculate_probabilities=True,
                 representation_model=representation_model,
                 verbose=True)
topics, probs = model.fit_transform(docs, embeddings)

# Reassign outlier documents, then rebuild the topic representations for the
# new assignment. The representation model is passed again here; without it
# the refreshed topics are built without the MMR step configured above.
new_topics = model.reduce_outliers(docs, topics, strategy="distributions")
model.update_topics(docs,
                    vectorizer_model=mecab_vectorizer,
                    representation_model=representation_model,
                    topics=new_topics,
                    top_n_words=15)

# Dynamic topic modeling over the 'years' column.
timestamps = df.years.to_list()
topics_over_time = model.topics_over_time(docs, timestamps)
model.visualize_topics_over_time(topics_over_time)
Could you also share the imports? Make the example as complete as possible such that it is clear which functions are used how and in what order.
A few things I noticed. First, you are not passing representation_model
to .update_topics
, so that representation model is not taken into account. Second, it might be worthwhile to define your customized class as a subclass of BERTopic, as follows, so that the original class is not overwritten:
class CustomBERTopic(BERTopic):
def topics_over_time(self,
....
Then, you can use the class as follows, which helps make sure that you are actually using the updated class:
model = CustomBERTopic(embedding_model=sentence_model,
...
)
Second, it might be worthwhile to define your customized class as a subclass of BERTopic, as follows, so that the original class is not overwritten:
class CustomBERTopic(BERTopic): def topics_over_time(self, ....
Then, you can use the class as follows, which helps make sure that you are actually using the updated class:
model = CustomBERTopic(embedding_model=sentence_model, ... )
This worked perfectly. Thank you so much!
Hi,
I was wondering if it is possible to retrieve more than five keywords from dynamic topic modeling. I attempted to change the internal code as follows, but the output continues to show only five keywords.
topics_at_timestamp = [(topic, ", ".join([words[0] for words in values][:11]), topic_frequency[topic], timestamp) for topic, values in words_per_topic.items()]
from