Merging models - Githubissues

ayushjainr commented 11 months ago

Error merging topic models -

mergedModels = BERTopic.merge_models([model1,model2], min_similarity=0.9)

KeyError Traceback (most recent call last) Cell In[20], line 1 ----> 1 mergedModels = BERTopic.merge_models([m1[2],m1[0]], min_similarity=0.98)

File ~/test/lib/python3.10/site-packages/bertopic/_bertopic.py:3150, in BERTopic.merge_models(cls, models, min_similarity, embedding_model) 3147 merged_topics["topic_labels"][str(new_topic_val)] = selected_topics["topic_labels"][str(new_topic)] 3149 if selected_topics["topic_aspects"]: -> 3150 merged_topics["topic_aspects"][str(new_topic_val)] = selected_topics["topic_aspects"][str(new_topic)] 3152 # Add new embeddings 3153 new_tensors = tensors[new_topic - selected_topics["_outliers"]]

KeyError: '12'

One thing to note is that there's no error when I reduce the min_similarity value, but then I see no topics getting added.

MaartenGr commented 11 months ago

Could you share your full code? Without knowing what exactly is run, it is difficult to say what is happening here. It might indeed be related to the minimum similarity, since a value of .98 is quite high and I wonder whether that actually does something.

ayushjainr commented 11 months ago

Thanks @MaartenGr for your quick reply!

I essentially have multiple topic models that I am trying to merge; both use your Llama methodology for representation. I was running into the issue of the merged model being the same as the first model even though the second one has many different topics, so I was incrementally increasing the min_similarity value. If I instead run a topic model on the whole combined text, I do get topics from both models.

from torch import cuda
from torch import bfloat16
import transformers
from huggingface_hub import login
import subprocess as sp
import os
import torch
import re
from random import sample 
import pandas as pd
os.environ["TOKENIZERS_PARALLELISM"] = "false"
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'; print(device)

cuda.empty_cache()

login(token = myToken)
model_id = 'meta-llama/Llama-2-13b-chat-hf'

cuda.empty_cache()
# Quantization to load an LLM with less GPU memory
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)

# Llama 2 Tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id,token=myToken)
cuda.empty_cache()

# Llama 2 Model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
model.eval()
cuda.empty_cache()

# Our text generator
generator = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1
)
cuda.empty_cache()

# Assemble the Llama 2 prompt from three parts.
# NOTE(review): system_prompt, example_prompt and main_prompt are not defined in this
# snippet; per the author's comment below they were left out of the post.
prompt = system_prompt + example_prompt + main_prompt
## This prompt is the same as yours, copying here was causing some issues 

cuda.empty_cache()
# NOTE(review): pandas was already imported at the top of the script; this repeat is harmless.
import pandas as pd

# First corpus: lower-case every entry of the `text` column.
df = pd.read_csv('myText.csv')
docs = [i.lower() for i in df.text]

# Second corpus, prepared the same way; it is used for the second topic model.
df2 = pd.read_csv('myText2.csv')
docs2 = [i.lower() for i in df2.text]

from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
# Only `docs` is encoded here; `docs2` is encoded later, just before the second model is fit.
embedding_model = SentenceTransformer("BAAI/bge-small-en")
embeddings = embedding_model.encode(docs, show_progress_bar=True)
cuda.empty_cache()

from umap import UMAP
from hdbscan import HDBSCAN

# Dimensionality-reduction and clustering sub-models.
# These two instances are later passed to BOTH BERTopic models.
umap_model = UMAP(n_neighbors=100, n_components=5, min_dist=0.2, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', 
                        prediction_data=True,min_samples=10)

from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

# KeyBERT
keybert = KeyBERTInspired()

# MMR
mmr = MaximalMarginalRelevance(diversity=0.3)

# Text generation with Llama 2
llama2 = TextGeneration(generator, prompt=prompt)
cuda.empty_cache()
# All representation models
# Passing a dict (one entry per named aspect) is the multi-aspect setup that the
# failing `topic_aspects` lookup in this thread relates to.
representation_model = {
    "KeyBERT": keybert,
    "Llama2": llama2,
    "MMR": mmr
}

from bertopic import BERTopic

# First topic model, fit on `docs` with the pre-computed `embeddings`.
topic_model = BERTopic(

  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True,
  calculate_probabilities=False
)
cuda.empty_cache()
topic_model.fit(docs, embeddings)

cuda.empty_cache()

# Embed the second corpus with the same sentence-transformer.
embeddings2 = embedding_model.encode(docs2, show_progress_bar=True)
cuda.empty_cache()

# Second topic model, configured identically to the first.
# NOTE(review): it receives the very same umap_model / hdbscan_model /
# representation_model objects as `topic_model`, not fresh copies.
topic_model2 = BERTopic(

  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True,
  calculate_probabilities=False
)

topic_model2.fit(docs2, embeddings2)

cuda.empty_cache()

# Search for the smallest min_similarity at which merging adds topics.
# l1 is the number of rows in the first model's topic info; l2 is initialised from the
# SAME model so that the loop condition (l2 == l1) holds on the first pass.
l1= len(topic_model.get_topic_info())
l2= len(topic_model.get_topic_info())
minS = 0.7
# Re-merge with a slightly higher threshold each time until the merged model's
# topic count differs from the first model's.
# NOTE(review): there is no upper bound on minS, so the loop only ends when the count
# changes or when merge_models raises (the KeyError reported in this thread).
while l2==l1:    
    merged_model = BERTopic.merge_models([topic_model, topic_model2],min_similarity=minS)
    l2= len(merged_model.get_topic_info())
    print('minS: {minS} --> [{l1},{l2}]'.format(minS=minS,l1=l1,l2=l2))
    minS = minS +0.01
# Reached only if the loop exits normally, i.e. a merge produced a different topic count.
merged_model.save('/mnt/ebs1/data/Share/GlobalFilingNLP/topicModels/mergedRisk2')
cuda.empty_cache()

MaartenGr commented 11 months ago

Based on your code, my guess would indeed be the relatively high min_similarity value when you set it to .98. It is interesting, though, since I had tested this functionality before without any issues. It might also be related to the multi-aspect modeling, which seems to be accessed incorrectly. For now, lowering the min_similarity seems to fix it, but I'll do some tests to see if I can resolve the issue.

Anirudh-Munnangi commented 11 months ago

I am facing similar issues with my models. I have a model with 64 topics (based on 4000 text records) and another model with 115 topics (based on roughly 8000 records). When I try to merge them, merge_models either does not add any topics or, when I increase the "min_similarity" value above a certain point, fails with various errors such as KeyError: '40', KeyError: '41', etc.

If this issue could be looked into, it would be of great help.

[UPDATED BELOW with actual values from latest run]

MaartenGr commented 11 months ago

@Anirudh-Munnangi Thanks for sharing this. Could you also share your full code? Without it, it is hard to see what exactly is happening here. Also, could you share your full error log?

Anirudh-Munnangi commented 11 months ago

Thank you @MaartenGr for your quick response. Here is my code:

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import TextGeneration
from transformers import pipeline
from umap import UMAP
from sentence_transformers import SentenceTransformer
import pandas as pd
import openpyxl

# Load the input CSV and take the `text_val` column as a plain list of records.
input_file = pd.read_csv('inp_file.csv')
all_text_records = input_file.text_val.tolist()

# Split the records in two: the first 4000, and everything after them.
part1 = all_text_records[:4000]
part2 = all_text_records[4000:]

# One KeyBERTInspired representation wrapped in a dict (the multi-aspect form).
# Both topic models below receive this same dict object.
keybert_rep_model = KeyBERTInspired()
all_rep_models = {
  'keyBertInspired' : keybert_rep_model
}

# First model, fit on part1.
topic_model1 = BERTopic(language = 'English', calculate_probabilities = True, verbose = True, embedding_model = "gtr-t5-xl", representation_model = all_rep_models)
topics, probs = topic_model1.fit_transform(part1)
print(len(topic_model1.get_topic_info())) # Gets 70 i.e. 70 topics

# Second model, same configuration, fit on part2.
# `topics` and `probs` are overwritten here; neither is used afterwards in this snippet.
topic_model2 = BERTopic(language = 'English', calculate_probabilities = True, verbose = True, embedding_model = "gtr-t5-xl", representation_model = all_rep_models)
topics, probs = topic_model2.fit_transform(part2)
print(len(topic_model2.get_topic_info())) # Gets 112 i.e 112 topics

# Merge the two models. min_similarity is not passed in this call, so the library
# default applies; the error table that follows was produced by varying it.
merged_model = BERTopic.merge_models([topic_model1, topic_model2])
print(len(merged_model.get_topic_info())) # Gets 70 i.e. no topics merged

The code above runs without error for "min_similarity" values up to 0.81, but no topics are merged in that range. Above that value the errors start.

ERRORS

0.82<=min_similarity<=0.87

KeyError: '92'

0.88<=min_similarity<=0.90

KeyError: '74'

0.91<=min_similarity<=0.93

KeyError: '26'

0.94<=min_similarity<=0.95

KeyError: '10'

min_similarity = 0.96

KeyError: '9'

0.97<=min_similarity<=0.994

KeyError: '2'

min_similarity = 0.995

KeyError: '1'

min_similarity = 0.996

KeyError: '0'

min_similarity > 0.996

KeyError: '-1'

I ran the code at different values of "min_similarity" and found these errors.

Thanks, Anirudh

MaartenGr commented 11 months ago

@Anirudh-Munnangi Thanks for the code. Could you also share the full error message?

Anirudh-Munnangi commented 11 months ago

@MaartenGr Here is the full error log.

KeyError: '20'
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File <command-3564234355060601>, line 1
----> 1 merged_model = BERTopic.merge_models([topic_model1, topic_model2], min_similarity=0.95)
      2 print(len(merged_model.get_topic_info()))

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-ddf4dca3-fa75-4398-817a-316d213217af/lib/python3.10/site-packages/bertopic/_bertopic.py:3150, in BERTopic.merge_models(cls, models, min_similarity, embedding_model)
   3147 merged_topics["topic_labels"][str(new_topic_val)] = selected_topics["topic_labels"][str(new_topic)]
   3149 if selected_topics["topic_aspects"]:
-> 3150     merged_topics["topic_aspects"][str(new_topic_val)] = selected_topics["topic_aspects"][str(new_topic)]
   3152 # Add new embeddings
   3153 new_tensors = tensors[new_topic - selected_topics["_outliers"]]

KeyError: '20'

corsilt commented 11 months ago

I'm having the same issue. I noticed it only happens when using a representation model; if I don't use one, I don't get the error. Looking at the source code, I believe the problem is here:

(line 3149 _bertopic.py)

                if selected_topics["topic_aspects"]:
                    merged_topics["topic_aspects"][str(new_topic_val)] = selected_topics["topic_aspects"][str(new_topic)]

It is difficult to debug from my end, but I wonder if topic_aspects is being used properly.

corsilt commented 11 months ago

Also, something I would like clarity on: after merging models, are we updating our Representative_Docs, or at least retaining the information from the base model? What I am seeing is that this field gets converted to null. The same happens to the representation model results. This information shouldn't be lost, or we should at least be able to choose the base version.

I see this explanation in the docs:
First, the representative documents were not added to the model. This is because of privacy reasons, you might want to combine models that were trained on different stations which would allow for a degree of federated learning. Second, the names of the new topics contain topic ids that refer to one of the old models. They were purposefully left this way so that the user can identify which topics were newly added which you could inspect in the original models.

I don't agree with this assumption. I think it overlooks some key functionality and desired control in the merge process. Why can't this behavior be optional? I think there is a lot of value to glean from the merge_models method, but it needs some tweaks (tracking and retaining original information across merges, and possibly updating representations after the merge is complete).

corsilt commented 11 months ago

I had some time to dive more into debugging

The issue is here:

                if selected_topics["topic_aspects"]:
                    merged_topics["topic_aspects"][str(new_topic_val)] = selected_topics["topic_aspects"][str(new_topic)]

'topic_aspects': {'short_label': {'-1': [['Resource Allocation for Perceptual Problems', 1]], '0': [['Feature elimination in supervised learning problems', 1]], '1': [['Non-Bayesian Restless Multi-Armed Bandit Problem', 1]]}}

The dictionary for selected_topics["topic_aspects"] is what is shown above. The data needs to be accessed differently: the topic keys are actually in a nested dictionary for each aspect, i.e. selected_topics["topic_aspects"][aspect][topic] rather than selected_topics["topic_aspects"][topic].

It looks like I am able to get it to do what I wanted by changing the dictionary to this format: {'1': {'short_label': [['Non-Bayesian Restless Multi-Armed Bandit Problem', 1]]}}

aleianno90 commented 10 months ago

I confirm the issue is here. I'm using these representation models:

representation_model = { "KeyBERT": KeyBERTInspired(), "MMR": MaximalMarginalRelevance(diversity=0.3), }

and my topic_aspects dictionary is as follows: {'KeyBERT': {-1: [('una', 0.4110595), ('della', 0.40966922), ('questo', 0.38504183)], 0: [('una', 0.5901735), ('questo', 0.52632904), ('niente', 0.5226551)], 1: [('una', 0.5901735), ('questo', 0.52632904), ('niente', 0.5226551)], ...

which has 2 keys ("KeyBERT" and "MMR"), each one with num_topics subkeys.

MaartenGr commented 10 months ago

@aleianno90 @corsilt @ayushjainr @Anirudh-Munnangi I just created a PR that should fix this issue. You can install it as follows:

pip install git+https://github.com/MaartenGr/BERTopic.git@refs/pull/1762/head

Could you confirm this fix works?

ayushjainr commented 9 months ago

> @aleianno90 @corsilt @ayushjainr @Anirudh-Munnangi I just created a PR that should fix this issue. You can install it as follows:
> pip install git+https://github.com/MaartenGr/BERTopic.git@refs/pull/1762/head
> Could you confirm this fix works?

> @aleianno90 @corsilt @ayushjainr @Anirudh-Munnangi I just created a PR that should fix this issue. You can install it as follows:
> pip install git+https://github.com/MaartenGr/BERTopic.git@refs/pull/1762/head
> Could you confirm this fix works?

Still fails- Cell In[7], line 1 ----> 1 merged_model = BERTopic.merge_models([topic_model, topic_model2],min_similarity=0.98) 2 merged_model.get_topic_info()

File ~/test/lib/python3.10/site-packages/bertopic/_bertopic.py:3150, in BERTopic.merge_models(cls, models, min_similarity, embedding_model) 3147 merged_topics["topic_labels"][str(new_topic_val)] = selected_topics["topic_labels"][str(new_topic)] 3149 if selected_topics["topic_aspects"]: -> 3150 merged_topics["topic_aspects"][str(new_topic_val)] = selected_topics["topic_aspects"][str(new_topic)] 3152 # Add new embeddings 3153 new_tensors = tensors[new_topic - selected_topics["_outliers"]]

Separately, after installing the fix I am not able to save the topic model:

{ "name": "PicklingError", "message": "Can't pickle <function add_hook_to_module..new_forward at 0x7f41a8421d80>: it's not found as accelerate.hooks.add_hook_to_module..new_forward", "stack": "--------------------------------------------------------------------------- PicklingError Traceback (most recent call last) Cell In[8], line 1 ----> 1 topic_model.save('/mnt/ebs1/data/Share/GlobalFilingNLP/topicModels/Risk_2008')

File ~/test/lib/python3.10/site-packages/bertopic/_bertopic.py:2987, in BERTopic.save(self, path, serialization, save_embedding_model, save_ctfidf) 2985 self.embedding_model = embedding_model 2986 else: -> 2987 joblib.dump(self, file) 2988 elif serialization == \"safetensors\" or serialization == \"pytorch\": 2989 2990 # Directory 2991 save_directory = Path(path)

File ~/test/lib/python3.10/site-packages/joblib/numpy_pickle.py:555, in dump(value, filename, compress, protocol, cache_size) 553 NumpyPickler(f, protocol=protocol).dump(value) 554 else: --> 555 NumpyPickler(filename, protocol=protocol).dump(value) 557 # If the target container is a file object, nothing is returned. 558 if is_fileobj:

File /usr/lib/python3.10/pickle.py:487, in _Pickler.dump(self, obj) 485 if self.proto >= 4: 486 self.framer.start_framing() --> 487 self.save(obj) 488 self.write(STOP) 489 self.framer.end_framing()

File ~/test/lib/python3.10/site-packages/joblib/numpy_pickle.py:355, in NumpyPickler.save(self, obj) 352 wrapper.write_array(obj, self) 353 return --> 355 return Pickler.save(self, obj)

File /usr/lib/python3.10/pickle.py:603, in _Pickler.save(self, obj, save_persistent_id) 599 raise PicklingError(\"Tuple returned by %s must have \" 600 \"two to six elements\" % reduce) 602 # Save the reduce() output and finally memoize the object --> 603 self.save_reduce(obj=obj, *rv)

File /usr/lib/python3.10/pickle.py:717, in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj) 715 if state is not None: 716 if state_setter is None: --> 717 save(state) 718 write(BUILD) 719 else: 720 # If a state_setter is specified, call it instead of load_build 721 # to update obj's with its previous state. 722 # First, push state_setter and its tuple of expected arguments 723 # (obj, state) onto the stack.

File ~/test/lib/python3.10/site-packages/joblib/numpy_pickle.py:355, in NumpyPickler.save(self, obj) 352 wrapper.write_array(obj, self) 353 return --> 355 return Pickler.save(self, obj)

File /usr/lib/python3.10/pickle.py:560, in _Pickler.save(self, obj, save_persistent_id) 558 f = self.dispatch.get(t) 559 if f is not None: --> 560 f(self, obj) # Call unbound method with explicit self 561 return 563 # Check private dispatch table if any, or else 564 # copyreg.dispatch_table

File /usr/lib/python3.10/pickle.py:972, in _Pickler.save_dict(self, obj) 969 self.write(MARK + DICT) 971 self.memoize(obj) --> 972 self._batch_setitems(obj.items())

File /usr/lib/python3.10/pickle.py:998, in _Pickler._batch_setitems(self, items) 996 for k, v in tmp: 997 save(k) --> 998 save(v) 999 write(SETITEMS) 1000 elif n:

[... skipping similar frames: NumpyPickler.save at line 355 (1 times)]

File /usr/lib/python3.10/pickle.py:560, in _Pickler.save(self, obj, save_persistent_id) 558 f = self.dispatch.get(t) 559 if f is not None: --> 560 f(self, obj) # Call unbound method with explicit self 561 return 563 # Check private dispatch table if any, or else 564 # copyreg.dispatch_table

File /usr/lib/python3.10/pickle.py:972, in _Pickler.save_dict(self, obj) 969 self.write(MARK + DICT) 971 self.memoize(obj) --> 972 self._batch_setitems(obj.items())

File /usr/lib/python3.10/pickle.py:998, in _Pickler._batch_setitems(self, items) 996 for k, v in tmp: 997 save(k) --> 998 save(v) 999 write(SETITEMS) 1000 elif n:

[... skipping similar frames: NumpyPickler.save at line 355 (1 times)]

File /usr/lib/python3.10/pickle.py:603, in _Pickler.save(self, obj, save_persistent_id) 599 raise PicklingError(\"Tuple returned by %s must have \" 600 \"two to six elements\" % reduce) 602 # Save the reduce() output and finally memoize the object --> 603 self.save_reduce(obj=obj, *rv)

File /usr/lib/python3.10/pickle.py:717, in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj) 715 if state is not None: 716 if state_setter is None: --> 717 save(state) 718 write(BUILD) 719 else: 720 # If a state_setter is specified, call it instead of load_build 721 # to update obj's with its previous state. 722 # First, push state_setter and its tuple of expected arguments 723 # (obj, state) onto the stack.

[... skipping similar frames: NumpyPickler.save at line 355 (4 times), _Pickler._batch_setitems at line 998 (2 times), _Pickler.save at line 560 (2 times), _Pickler.save at line 603 (2 times), _Pickler.save_dict at line 972 (2 times), _Pickler.save_reduce at line 717 (1 times)]

File /usr/lib/python3.10/pickle.py:717, in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj) 715 if state is not None: 716 if state_setter is None: --> 717 save(state) 718 write(BUILD) 719 else: 720 # If a state_setter is specified, call it instead of load_build 721 # to update obj's with its previous state. 722 # First, push state_setter and its tuple of expected arguments 723 # (obj, state) onto the stack.

[... skipping similar frames: NumpyPickler.save at line 355 (1 times)]

File /usr/lib/python3.10/pickle.py:560, in _Pickler.save(self, obj, save_persistent_id) 558 f = self.dispatch.get(t) 559 if f is not None: --> 560 f(self, obj) # Call unbound method with explicit self 561 return 563 # Check private dispatch table if any, or else 564 # copyreg.dispatch_table

File /usr/lib/python3.10/pickle.py:972, in _Pickler.save_dict(self, obj) 969 self.write(MARK + DICT) 971 self.memoize(obj) --> 972 self._batch_setitems(obj.items())

File /usr/lib/python3.10/pickle.py:998, in _Pickler._batch_setitems(self, items) 996 for k, v in tmp: 997 save(k) --> 998 save(v) 999 write(SETITEMS) 1000 elif n:

File ~/test/lib/python3.10/site-packages/joblib/numpy_pickle.py:355, in NumpyPickler.save(self, obj) 352 wrapper.write_array(obj, self) 353 return --> 355 return Pickler.save(self, obj)

File /usr/lib/python3.10/pickle.py:603, in _Pickler.save(self, obj, save_persistent_id) 599 raise PicklingError(\"Tuple returned by %s must have \" 600 \"two to six elements\" % reduce) 602 # Save the reduce() output and finally memoize the object --> 603 self.save_reduce(obj=obj, *rv)

File /usr/lib/python3.10/pickle.py:713, in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj) 710 self._batch_appends(listitems) 712 if dictitems is not None: --> 713 self._batch_setitems(dictitems) 715 if state is not None: 716 if state_setter is None: