MaartenGr / BERTopic

Leveraging BERT and c-TF-IDF to create easily interpretable topics.
https://maartengr.github.io/BERTopic/

Errors processing 500k Customer Service text-based data-log #1664

Closed CharlesOkwuagwu closed 11 months ago

CharlesOkwuagwu commented 11 months ago

Hi, I have high hopes for topic modeling with BERTopic.

However, my first trial on live data resulted in errors. Following your recommendation, I did not apply any filtering to the data.

The data sample is as follows:

import pandas as pd

# Load the tab-separated customer-service log
doc = pd.read_csv('cs-data.txt', encoding='utf8', sep='\t')
doc

data:

SOURCE | CHANNEL | ID | MESSAGE
-- | -- | -- | --
LISA | 1 | 2625 | notification_template
LISA | 1 | 47215 | Good morning
LISA | 1 | 40993 | Hi
LISA | 1 | 41061 | Hi
LISA | 1 | 50805 | Yes
... | ... | ... | ...
PADI | 1 | 388829 | Hi
PADI | 1 | 388878 | I am no longer in a paid employment how do I c...
PADI | 1 | 388916 | Good morning
PADI | 1 | 388931 | How do I start adding money on it
PADI | 1 | 389003 | Payment

568109 rows × 4 columns

My code:

# Fit BERTopic with default settings on the raw MESSAGE column
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(doc['MESSAGE'])
topic_model.get_topic_info()

Errors:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[27], line 2
      1 topic_model = BERTopic()
----> 2 topics, probs = topic_model.fit_transform(doc['MESSAGE'])
      3 topic_model.get_topic_info()

File C:\ProgramData\miniconda3\Lib\site-packages\bertopic\_bertopic.py:433, in BERTopic.fit_transform(self, documents, embeddings, images, y)
    430     self._save_representative_docs(custom_documents)
    431 else:
    432     # Extract topics by calculating c-TF-IDF
--> 433     self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose)
    435     # Reduce topics
    436     if self.nr_topics:

File C:\ProgramData\miniconda3\Lib\site-packages\bertopic\_bertopic.py:3635, in BERTopic._extract_topics(self, documents, embeddings, mappings, verbose)
   3633 if verbose:
   3634     logger.info("Representation - Extracting topics from clusters using representation models.")
-> 3635 documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
   3636 self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)
   3637 self.topic_representations_ = self._extract_words_per_topic(words, documents)

File C:\ProgramData\miniconda3\Lib\site-packages\pandas\core\groupby\generic.py:1445, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
   1442     kwargs["engine_kwargs"] = engine_kwargs
   1444 op = GroupByApply(self, func, args=args, kwargs=kwargs)
-> 1445 result = op.agg()
   1446 if not is_dict_like(func) and result is not None:
   1447     # GH #52849
   1448     if not self.as_index and is_list_like(func):

File C:\ProgramData\miniconda3\Lib\site-packages\pandas\core\apply.py:175, in Apply.agg(self)
    172     return self.apply_str()
    174 if is_dict_like(func):
--> 175     return self.agg_dict_like()
    176 elif is_list_like(func):
    177     # we require a list, but not a 'str'
    178     return self.agg_list_like()

File C:\ProgramData\miniconda3\Lib\site-packages\pandas\core\apply.py:406, in Apply.agg_dict_like(self)
    398 def agg_dict_like(self) -> DataFrame | Series:
    399     """
    400     Compute aggregation in the case of a dict-like argument.
    401 
   (...)
    404     Result of aggregation.
    405     """
--> 406     return self.agg_or_apply_dict_like(op_name="agg")

File C:\ProgramData\miniconda3\Lib\site-packages\pandas\core\apply.py:1388, in GroupByApply.agg_or_apply_dict_like(self, op_name)
   1383     kwargs.update({"engine": engine, "engine_kwargs": engine_kwargs})
   1385 with com.temp_setattr(
   1386     obj, "as_index", True, condition=hasattr(obj, "as_index")
   1387 ):
-> 1388     result_index, result_data = self.compute_dict_like(
   1389         op_name, selected_obj, selection, kwargs
   1390     )
   1391 result = self.wrap_results_dict_like(selected_obj, result_index, result_data)
   1392 return result

File C:\ProgramData\miniconda3\Lib\site-packages\pandas\core\apply.py:479, in Apply.compute_dict_like(self, op_name, selected_obj, selection, kwargs)
    476         results += key_data
    477 else:
    478     # key used for column selection and output
--> 479     results = [
    480         getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs)
    481         for key, how in func.items()
    482     ]
    483     keys = list(func.keys())
    485 return keys, results

File C:\ProgramData\miniconda3\Lib\site-packages\pandas\core\apply.py:480, in <listcomp>(.0)
    476         results += key_data
    477 else:
    478     # key used for column selection and output
    479     results = [
--> 480         getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs)
    481         for key, how in func.items()
    482     ]
    483     keys = list(func.keys())
    485 return keys, results

File C:\ProgramData\miniconda3\Lib\site-packages\pandas\core\groupby\generic.py:292, in SeriesGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
    289     return self._python_agg_general(func, *args, **kwargs)
    291 try:
--> 292     return self._python_agg_general(func, *args, **kwargs)
    293 except KeyError:
    294     # KeyError raised in test_groupby.test_basic is bc the func does
    295     #  a dictionary lookup on group.name, but group name is not
    296     #  pinned in _python_agg_general, only in _aggregate_named
    297     result = self._aggregate_named(func, *args, **kwargs)

File C:\ProgramData\miniconda3\Lib\site-packages\pandas\core\groupby\generic.py:325, in SeriesGroupBy._python_agg_general(self, func, *args, **kwargs)
    322 f = lambda x: func(x, *args, **kwargs)
    324 obj = self._obj_with_exclusions
--> 325 result = self.grouper.agg_series(obj, f)
    326 res = obj._constructor(result, name=obj.name)
    327 return self._wrap_aggregated_output(res)

File C:\ProgramData\miniconda3\Lib\site-packages\pandas\core\groupby\ops.py:850, in BaseGrouper.agg_series(self, obj, func, preserve_dtype)
    843 if len(obj) > 0 and not isinstance(obj._values, np.ndarray):
    844     # we can preserve a little bit more aggressively with EA dtype
    845     #  because maybe_cast_pointwise_result will do a try/except
    846     #  with _from_sequence.  NB we are assuming here that _from_sequence
    847     #  is sufficiently strict that it casts appropriately.
    848     preserve_dtype = True
--> 850 result = self._aggregate_series_pure_python(obj, func)
    852 npvalues = lib.maybe_convert_objects(result, try_float=False)
    853 if preserve_dtype:

File C:\ProgramData\miniconda3\Lib\site-packages\pandas\core\groupby\ops.py:871, in BaseGrouper._aggregate_series_pure_python(self, obj, func)
    868 splitter = self._get_splitter(obj, axis=0)
    870 for i, group in enumerate(splitter):
--> 871     res = func(group)
    872     res = extract_result(res)
    874     if not initialized:
    875         # We only do this validation on the first iteration

File C:\ProgramData\miniconda3\Lib\site-packages\pandas\core\groupby\generic.py:322, in SeriesGroupBy._python_agg_general.<locals>.<lambda>(x)
    320     alias = com._builtin_table_alias[func]
    321     warn_alias_replacement(self, orig_func, alias)
--> 322 f = lambda x: func(x, *args, **kwargs)
    324 obj = self._obj_with_exclusions
    325 result = self.grouper.agg_series(obj, f)

TypeError: sequence item 0: expected str instance, float found

Given this, do you still recommend not cleaning the data? It is all text-based entries from customers.

Also, I have an NVIDIA 3800Ti, and the run is taking over 2 hours; I have no idea if my GPU is being used. I'm running this locally.

Thanks!

MaartenGr commented 11 months ago

To start off, you should pass the documents as a list of strings, not a pandas Series. Second, are you absolutely sure that the documents are all strings? BERTopic expects strings, since we are doing topic modeling on textual data. Any non-string values, such as floats, should be removed; that is what the TypeError in your traceback points to, as pandas reads empty MESSAGE cells as NaN, which is a float.
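
For example, something along these lines (a minimal sketch based on the dataframe in your snippet) should hand fit_transform a clean list of strings:

# Drop missing values first: pandas reads empty cells as NaN floats,
# which is exactly what the ' '.join in your traceback chokes on
docs = doc['MESSAGE'].dropna().astype(str).tolist()

topics, probs = topic_model.fit_transform(docs)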

Lastly, I highly recommend taking a look at the best practices guide and the tips on running BERTopic on large datasets.
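
Regarding the GPU: with the default UMAP and HDBSCAN, the GPU is only used for the embedding step, so the quickest check is whether PyTorch sees CUDA at all. As a rough sketch of what the large-datasets tips boil down to (assuming the default sentence-transformers backend and a CUDA-enabled PyTorch install), you can verify the device and pre-compute the embeddings once:

import torch
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

# If this prints False, everything runs on the CPU, which would
# explain a multi-hour runtime on 500k+ documents
print(torch.cuda.is_available())

# "all-MiniLM-L6-v2" is the default English model BERTopic uses
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# Passing pre-computed embeddings lets you re-run fit_transform with
# different settings without paying for the embedding step again
topic_model = BERTopic(embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs, embeddings)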