When using a tokenizer that does not remove all special characters in your vectorizer model, the PartOfSpeech representation model produces an error, because the topic keywords are compiled as regex patterns when they are matched against the documents.
Fixed by #2138.
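For context, the failure can be reproduced outside BERTopic in a few lines. The keyword below is only illustrative (the actual offending keyword depends on the corpus), but it shows how a plain `str.split` tokenizer leaves regex metacharacters in tokens and how pandas then fails on them:

```python
import pandas as pd

# A plain str.split tokenizer keeps punctuation attached to tokens,
# so a topic keyword can end up containing "(" or ")".
"posted to sci.space) yesterday".split()
# -> ['posted', 'to', 'sci.space)', 'yesterday']

# PartOfSpeech matches keywords against documents with Series.str.contains,
# which interprets the keyword as a regular expression by default:
pd.Series(["posted to sci.space) yesterday"]).str.contains("sci.space)")
# -> re.error: unbalanced parenthesis at position 9
```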
Reproduction
```python
from bertopic import BERTopic
from bertopic.representation import PartOfSpeech
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

dataset = fetch_20newsgroups(subset="all", categories=["sci.space"])["data"]

model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    representation_model=PartOfSpeech(),
    # str.split keeps punctuation attached to tokens, which is what triggers the error
    vectorizer_model=CountVectorizer(tokenizer=str.split, ngram_range=(1, 2)),
    umap_model=UMAP(random_state=42),
)
model.fit_transform(dataset)
```
Running this fails with `error: unbalanced parenthesis at position 9`.
Traceback
```
---------------------------------------------------------------------------
error Traceback (most recent call last)
<ipython-input> in <cell line: 15>()
13 umap_model=UMAP(random_state=42),
14 )
---> 15 model.fit_transform(dataset)
/usr/local/lib/python3.10/dist-packages/bertopic/_bertopic.py in fit_transform(self, documents, embeddings, images, y)
490 else:
491 # Extract topics by calculating c-TF-IDF
--> 492 self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose)
493
494 # Reduce topics
/usr/local/lib/python3.10/dist-packages/bertopic/_bertopic.py in _extract_topics(self, documents, embeddings, mappings, verbose)
3982 documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join})
3983 self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)
-> 3984 self.topic_representations_ = self._extract_words_per_topic(words, documents)
3985 self._create_topic_vectors(documents=documents, embeddings=embeddings, mappings=mappings)
3986 if verbose:
/usr/local/lib/python3.10/dist-packages/bertopic/_bertopic.py in _extract_words_per_topic(self, words, documents, c_tf_idf, calculate_aspects)
4289 topics = tuner.extract_topics(self, documents, c_tf_idf, topics)
4290 elif isinstance(self.representation_model, BaseRepresentation):
-> 4291 topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics)
4292 elif isinstance(self.representation_model, dict):
4293 if self.representation_model.get("Main"):
/usr/local/lib/python3.10/dist-packages/bertopic/representation/_pos.py in extract_topics(self, topic_model, documents, c_tf_idf, topics)
121 for keyword in keywords:
122 selection = documents.loc[documents.Topic == topic, :]
--> 123 selection = selection.loc[selection.Document.str.contains(keyword), "Document"]
124 if len(selection) > 0:
125 for document in selection[:2]:
/usr/local/lib/python3.10/dist-packages/pandas/core/strings/accessor.py in wrapper(self, *args, **kwargs)
134 )
135 raise TypeError(msg)
--> 136 return func(self, *args, **kwargs)
137
138 wrapper.__name__ = func_name
/usr/local/lib/python3.10/dist-packages/pandas/core/strings/accessor.py in contains(self, pat, case, flags, na, regex)
1301 dtype: bool
1302 """
-> 1303 if regex and re.compile(pat).groups:
1304 warnings.warn(
1305 "This pattern is interpreted as a regular expression, and has "
/usr/lib/python3.10/re.py in compile(pattern, flags)
249 def compile(pattern, flags=0):
250 "Compile a regular expression pattern, returning a Pattern object."
--> 251 return _compile(pattern, flags)
252
253 def purge():
/usr/lib/python3.10/re.py in _compile(pattern, flags)
301 if not sre_compile.isstring(pattern):
302 raise TypeError("first argument must be string or compiled pattern")
--> 303 p = sre_compile.compile(pattern, flags)
304 if not (flags & DEBUG):
305 if len(_cache) >= _MAXCACHE:
/usr/lib/python3.10/sre_compile.py in compile(p, flags)
786 if isstring(p):
787 pattern = p
--> 788 p = sre_parse.parse(p, flags)
789 else:
790 pattern = None
/usr/lib/python3.10/sre_parse.py in parse(str, flags, state)
967 if source.next is not None:
968 assert source.next == ")"
--> 969 raise source.error("unbalanced parenthesis")
970
971 for g in p.state.grouprefpos:
error: unbalanced parenthesis at position 9
```
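Until the fix from #2138 is available, one possible workaround (a sketch on my side, not the library's solution) is to keep custom tokenization but strip regex metacharacters from the text first, so no keyword can be misread as a pattern; `split_without_punctuation` below is a hypothetical helper, not part of BERTopic:

```python
import re
from sklearn.feature_extraction.text import CountVectorizer

def split_without_punctuation(text):
    # Replace everything that is not a word character or whitespace,
    # then split on whitespace, so tokens never contain "(" or ")".
    return re.sub(r"[^\w\s]", " ", text).split()

vectorizer_model = CountVectorizer(tokenizer=split_without_punctuation, ngram_range=(1, 2))
```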
BERTopic Version
0.16.3