MaartenGr / BERTopic

Leveraging BERT and c-TF-IDF to create easily interpretable topics.
https://maartengr.github.io/BERTopic/
MIT License

Invalid request using OpenAI backend. #1425

Open zhimin-z opened 1 year ago

zhimin-z commented 1 year ago

It always returns an invalid request error when using the OpenAI model. I am not sure why this happens. Any idea? @MaartenGr

InvalidRequestError(message="'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", param=None, code=None, http_status=400, request_id=None)

My code is:

import gensim.corpora as corpora
import pandas as pd
import openai
import wandb
import os

from gensim.parsing.preprocessing import strip_punctuation
# from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.coherencemodel import CoherenceModel
# from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.backend import OpenAIBackend

# from bertopic.representation import KeyBERTInspired
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP

path_output = os.path.join(os.getcwd(), 'Result', 'RQ1', 'Special Topics')
path_model = os.path.join(os.getcwd(), 'Code', 'RQ1', 'Special Topic Modeling', 'Model')
if not os.path.exists(path_model):
    os.makedirs(path_model)

wandb_project = 'asset-management-topic-modeling'
openai.api_key = os.getenv('OPENAI_API_KEY')

os.environ["WANDB_API_KEY"] = 'xxxxxx'
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["WANDB__SERVICE_WAIT"] = "100"

# set default sweep configuration
config_defaults = {
    # Refer to https://platform.openai.com/docs/models/embeddings
    'model_name': 'text-embedding-ada-002',
    'metric_distance': 'cosine',
    'calculate_probabilities': True,
    'reduce_frequent_words': True,
    'prediction_data': True,
    'low_memory': False,
    'random_state': 42,
    'ngram_range': 2,
}

config_sweep = {
    'method': 'grid',
    'metric': {
        'name': 'Coherence CV',
        'goal': 'maximize'
    },
    'parameters': {
        'n_components': {
            'values': list(range(3,11)),
        },
    }
}

class TopicModeling:
    def __init__(self, topic_type, min_cluster_size=20):
        # Initialize an empty list to store top models
        self.top_models = []
        self.path_model = path_model

        df = pd.read_json(os.path.join(path_output, 'labels.json'))
        if topic_type == 'anomaly':
            df = df[df['Challenge_type'] == 'anomaly']
            self.docs = df[df['Challenge_summary'] != 'na']['Challenge_summary'].tolist() + df[df['Challenge_root_cause'] != 'na']['Challenge_root_cause'].tolist()
        elif topic_type == 'solution':
            docs = df[df['Solution'] != 'na']['Solution'].tolist()
            self.docs = [strip_punctuation(doc) for doc in docs]

        config_defaults['min_cluster_size'] = min_cluster_size
        config_sweep['name'] = topic_type
        config_sweep['parameters']['min_samples'] = {
            'values': list(range(1, config_defaults['min_cluster_size'] + 1))
        }

    def __train(self):
        # Initialize a new wandb run
        with wandb.init() as run:
            # update any values not set by sweep
            run.config.setdefaults(config_defaults)

            # Step 1 - Extract embeddings
            embedding_model = OpenAIBackend("text-embedding-ada-002")

            # Step 2 - Reduce dimensionality
            umap_model = UMAP(n_components=wandb.config.n_components, metric=run.config.metric_distance,
                              random_state=run.config.random_state, low_memory=run.config.low_memory)

            # Step 3 - Cluster reduced embeddings
            hdbscan_model = HDBSCAN(min_cluster_size=run.config.min_cluster_size,
                                    min_samples=wandb.config.min_samples, prediction_data=run.config.prediction_data)

            # Step 4 - Tokenize topics
            # vectorizer_model = TfidfVectorizer(ngram_range=(1, run.config.ngram_range))

            # Step 5 - Create topic representation
            # ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=run.config.reduce_frequent_words)

            # # Step 6 - Fine-tune topic representation
            # representation_model = KeyBERTInspired()

            # All steps together
            topic_model = BERTopic(
                embedding_model=embedding_model,
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                # vectorizer_model=vectorizer_model,
                # ctfidf_model=ctfidf_model,
                # representation_model=representation_model,
                calculate_probabilities=run.config.calculate_probabilities
            )

            topic_model = topic_model.fit(self.docs)
            # topic_model.reduce_topics(self.docs, nr_topics='auto')

            # Preprocess Documents
            documents = pd.DataFrame({"Document": self.docs,
                                      "ID": range(len(self.docs)),
                                      "Topic": topic_model.topics_})
            documents_per_topic = documents.groupby(
                ['Topic'], as_index=False).agg({'Document': ' '.join})
            cleaned_docs = topic_model._preprocess_text(
                documents_per_topic.Document.values)

            # Extract vectorizer and analyzer from BERTopic
            vectorizer = topic_model.vectorizer_model
            analyzer = vectorizer.build_analyzer()

            # Extract features for Topic Coherence evaluation
            tokens = [analyzer(doc) for doc in cleaned_docs]
            dictionary = corpora.Dictionary(tokens)
            corpus = [dictionary.doc2bow(token) for token in tokens]
            topic_words = [[words for words, _ in topic_model.get_topic(
                topic)] for topic in range(len(set(topic_model.topics_))-1)]

            coherence_cv = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                corpus=corpus,
                dictionary=dictionary,
                coherence='c_v'
            )

            coherence_umass = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                corpus=corpus,
                dictionary=dictionary,
                coherence='u_mass'
            )

            coherence_cuci = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                corpus=corpus,
                dictionary=dictionary,
                coherence='c_uci'
            )

            coherence_cnpmi = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                corpus=corpus,
                dictionary=dictionary,
                coherence='c_npmi'
            )

            wandb.log({'Coherence CV': coherence_cv.get_coherence()})
            wandb.log({'Coherence UMASS': coherence_umass.get_coherence()})
            wandb.log({'Coherence UCI': coherence_cuci.get_coherence()})
            wandb.log({'Coherence NPMI': coherence_cnpmi.get_coherence()})
            wandb.log({'Topic Number': topic_model.get_topic_info().shape[0] - 1})
            wandb.log({'Uncategorized Post Number': topic_model.get_topic_info().at[0, 'Count']})

            model_name = f'{config_sweep["name"]}_{run.id}'
            topic_model.save(os.path.join(self.path_model, model_name))

    def sweep(self):
        wandb.login()
        sweep_id = wandb.sweep(config_sweep, project=wandb_project)
        wandb.agent(sweep_id, function=self.__train)
zhimin-z commented 1 year ago

Is this related? I can confirm that adding a batch size does not help: I used the starter code below and faced the same issue with and without it.

import os

import openai
from bertopic.backend import OpenAIBackend

openai.api_key = os.getenv('OPENAI_API_KEY')
embedding_model = OpenAIBackend("text-embedding-ada-002", batch_size=10)

from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data'][:1000]

topic_model = BERTopic(embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs)

Log trace:

---------------------------------------------------------------------------
InvalidRequestError                       Traceback (most recent call last)
Cell In[240], line 14
     11 docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data'][:1000]
     13 topic_model = BERTopic(embedding_model=embedding_model)
---> 14 topics, probs = topic_model.fit_transform(docs)

File ~/Asset-Management-Topic-Modeling/.venv/lib/python3.10/site-packages/bertopic/_bertopic.py:373, in BERTopic.fit_transform(self, documents, embeddings, images, y)
    370 if embeddings is None:
    371     self.embedding_model = select_backend(self.embedding_model,
    372                                           language=self.language)
--> 373     embeddings = self._extract_embeddings(documents.Document.values.tolist(),
    374                                           images=images,
    375                                           method="document",
    376                                           verbose=self.verbose)
    377     logger.info("Transformed documents to Embeddings")
    378 else:

File ~/Asset-Management-Topic-Modeling/.venv/lib/python3.10/site-packages/bertopic/_bertopic.py:3126, in BERTopic._extract_embeddings(self, documents, images, method, verbose)
   3124     embeddings = self.embedding_model.embed_words(words=documents, verbose=verbose)
   3125 elif method == "document":
-> 3126     embeddings = self.embedding_model.embed_documents(documents, verbose=verbose)
   3127 elif documents[0] is None and images is None:
   3128     raise ValueError("Make sure to use an embedding model that can either embed documents"
   3129                      "or images depending on which you want to embed.")
...
    764         rbody, rcode, resp.data, rheaders, stream_error=stream_error
    765     )
    766 return resp

InvalidRequestError: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.
MaartenGr commented 1 year ago

Hmmm, not sure what is happening here. I might be mistaken but is that the full error log? It does not seem to show the error with respect to the source of the issue, namely what is happening inside embeddings = self.embedding_model.embed_documents(documents, verbose=verbose). If there is more to the error log, could you share it?

Also, is '$.input' perhaps part of one of your documents? If so, it might be worthwhile to check whether special signs like $ is supported by the embedding model.

zhimin-z commented 1 year ago

But even when I use the starter code above, it still gives the same error. I am sure my API key is valid, since it does not throw an authentication error.

My requirements look like this:

bertopic==0.15.0
gensim==4.3.1
wandb==0.15.5
MaartenGr commented 1 year ago

I believe the dataset contains some empty documents which cannot be handled by OpenAI's API, so adding the following should work:

docs = [doc for doc in docs if len(doc) > 0]

Also, the full error log showed that it was indeed a result of the embedding model.

zhimin-z commented 1 year ago

I tried it immediately but this time it gives:

InvalidRequestError: This model's maximum context length is 8191 tokens, however you requested 10266 tokens (10266 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.

Maybe we should add a caution about this to the documentation? It is hard to know in advance how long a string is allowed to be, since token counts are hard to predict. What do you think?

zhimin-z commented 1 year ago

import openai
from bertopic.backend import OpenAIBackend

embedding_model = OpenAIBackend()

from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
docs = [doc for doc in docs if len(doc) > 0 and len(doc) < 500]

topic_model = BERTopic(embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs)

This time I constrained the doc size to 1~500 characters, but it still gives the same InvalidRequestError: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference. error. Is this a bug?

MaartenGr commented 1 year ago

Maybe we should add a caution about this to the documentation? It is hard to know in advance how long a string is allowed to be, since token counts are hard to predict. What do you think?

It depends on which embedding model you use. OpenAI has several embedding models with different token limits, so I believe that their API documentation is most relevant for this. Also, I believe you can use their underlying tokenizer to calculate beforehand how many tokens a piece of text is. You can find more about that here.
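For reference, a minimal sketch of that token counting with OpenAI's tiktoken package (the 8191-token limit applies to text-embedding-ada-002; the filtering step itself is just an illustration):

import tiktoken

# tiktoken knows which tokenizer each OpenAI model uses
encoding = tiktoken.encoding_for_model("text-embedding-ada-002")
max_tokens = 8191  # context limit of text-embedding-ada-002

# Keep only documents that are non-empty and fit the context window
docs = [doc for doc in docs if 0 < len(encoding.encode(doc)) <= max_tokens]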

This time I constrained the doc size to 1~500 characters, but it still gives the same InvalidRequestError: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference. error. Is this a bug?

Not sure; it might be worthwhile to remove all very short documents in case they were preprocessed beforehand. For example, a text that contains only a single space will still remain in docs, which might result in the error. So increasing that threshold (e.g., len(doc) > 3) might be worth experimenting with.
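A minimal sketch of that stricter filter, which also drops whitespace-only documents (the threshold of 3 is the illustrative value mentioned above):

# Remove empty, whitespace-only, and very short documents
docs = [doc for doc in docs if len(doc.strip()) > 3]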

zhimin-z commented 1 year ago

You are right, there are a lot of '\n' characters in the sentences (see the screenshot). Basically, all sentences in a batch are aggregated together as a single input to the OpenAI model. That is the culprit: the token limit is reached easily.

zhimin-z commented 1 year ago

BTW, is there some other reason that causes OpenAI to throw InvalidRequestError: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference?

I used my own dataset and included only non-empty entries (the minimum length is a 4-character word, and there are ~6500 entries), but it still throws the above exception. I do not know what to do now...

import os

import openai
import pandas as pd
from bertopic.backend import OpenAIBackend

# openai.api_key = MY_API_KEY
embedding_model = OpenAIBackend(delay_in_seconds=50, batch_size=3000)

from bertopic import BERTopic

df = pd.read_json(os.path.join(path_special_output, 'labels.json'))

docs = df[df['Challenge_summary'] != 'na']['Challenge_summary'].tolist() + df[df['Challenge_root_cause'] != 'na']['Challenge_root_cause'].tolist()

topic_model = BERTopic(embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()
HeroadZ commented 11 months ago

Any progress here? I have run into the same issue.

MaartenGr commented 11 months ago

@HeroadZ It might be worthwhile to check your documents first and see whether they contain empty documents. Note that documents containing only '\n'-like tokens could theoretically also be considered empty.
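A quick diagnostic sketch for that check (assuming docs is your list of documents):

# Count documents that are empty or contain only whitespace such as '\n'
empty_like = [doc for doc in docs if not doc.strip()]
print(f"{len(empty_like)} documents are empty or whitespace-only")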

HeroadZ commented 11 months ago

@MaartenGr Thank you for the response. I will check that!

chentitus commented 8 months ago

I have encountered the same issue, and I have included only non-empty entries.

import umap
import hdbscan
from umap import UMAP
from hdbscan import HDBSCAN
import openai
from bertopic import BERTopic
from bertopic.backend import OpenAIBackend
from bertopic.representation import KeyBERTInspired
from bertopic.representation import OpenAI

# Embedding model
openai.api_key = 'sk-...'
embedding_model = OpenAIBackend("text-embedding-ada-002", batch_size=15, delay_in_seconds=2)

# Dimension reduction
umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.1, metric='cosine', random_state=42)

# Clustering
hdbscan_model = HDBSCAN(min_cluster_size=7, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# KeyBERT
keybert_model = KeyBERTInspired()
representation_model = {"KeyBERT": keybert_model}

# Initialize BERTopic model
topic_model = BERTopic(embedding_model=embedding_model,
                       language="Chinese",
                       verbose=True,
                       calculate_probabilities=True,
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       representation_model=representation_model)

topics, probs = topic_model.fit_transform(data2a, embeddings)

2024-01-03 10:54:45,662 - BERTopic - Cluster - Completed ✓
2024-01-03 10:54:45,736 - BERTopic - Representation - Extracting topics from clusters using representation models.

---------------------------------------------------------------------------
InvalidRequestError                       Traceback (most recent call last)
Cell In [62], line 1
----> 1 topics, probs = topic_model.fit_transform(data2a, embeddings)

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/bertopic/_bertopic.py:433, in BERTopic.fit_transform(self, documents, embeddings, images, y)
    430     self._save_representative_docs(custom_documents)
    431 else:
    432     # Extract topics by calculating c-TF-IDF
--> 433     self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose)
    435 # Reduce topics
    436 if self.nr_topics:

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/bertopic/_bertopic.py:3637, in BERTopic._extract_topics(self, documents, embeddings, mappings, verbose)
   3635 documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
   3636 self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)
-> 3637 self.topic_representations_ = self._extract_words_per_topic(words, documents)
   3638 self._create_topic_vectors(documents=documents, embeddings=embeddings, mappings=mappings)
   3639 self.topic_labels_ = {key: f"{key}_" + "_".join([word[0] for word in values[:4]])
   3640                       for key, values in
   3641                       self.topic_representations_.items()}

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/bertopic/_bertopic.py:3938, in BERTopic._extract_words_per_topic(self, words, documents, c_tf_idf, calculate_aspects)
   3936     self.topic_aspects_[aspect] = aspects
   3937 elif isinstance(aspect_model, BaseRepresentation):
...
    766         rbody, rcode, resp.data, rheaders, stream_error=stream_error
    767     )
    768 return resp

InvalidRequestError: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.

MaartenGr commented 8 months ago

@chentitus If you google this error, you will notice that there might be several reasons for this happening. One that I saw frequently is the token limit of the input documents. Make sure that the documents are not too large for the embedding model that you choose.
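If some documents might exceed that limit, a minimal sketch of truncating them beforehand with OpenAI's tiktoken tokenizer (this is not a built-in BERTopic feature; the 8191-token limit applies to text-embedding-ada-002):

import tiktoken

encoding = tiktoken.encoding_for_model("text-embedding-ada-002")
max_tokens = 8191  # context limit of text-embedding-ada-002

# Truncate every document to at most max_tokens tokens before embedding
docs = [encoding.decode(encoding.encode(doc)[:max_tokens]) for doc in docs]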

chentitus commented 8 months ago

Thank you @MaartenGr! Will do that!

chentitus commented 8 months ago

@MaartenGr I find that the maximum number of tokens in my data is 393, and there are no empty entries. I don't think my docs exceed the max input tokens for the model (8191 tokens for text-embedding-ada-002).

MaartenGr commented 8 months ago

@chentitus Hmmm, then I'm not sure what exactly is happening. Could you perhaps share the full error log? I have a feeling that nearing the end of the log something is missing. I can't see the exact line the error originates from.

chentitus commented 8 months ago

@MaartenGr sure. See below please.

2024-01-03 10:54:45,662 - BERTopic - Cluster - Completed ✓
2024-01-03 10:54:45,736 - BERTopic - Representation - Extracting topics from clusters using representation models.

---------------------------------------------------------------------------
InvalidRequestError                       Traceback (most recent call last)
Cell In [62], line 1
----> 1 topics, probs = topic_model.fit_transform(data2a, embeddings)

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/bertopic/_bertopic.py:433, in BERTopic.fit_transform(self, documents, embeddings, images, y)
    430     self._save_representative_docs(custom_documents)
    431 else:
    432     # Extract topics by calculating c-TF-IDF
--> 433     self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose)
    435 # Reduce topics
    436 if self.nr_topics:

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/bertopic/_bertopic.py:3637, in BERTopic._extract_topics(self, documents, embeddings, mappings, verbose)
   3635 documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
   3636 self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)
-> 3637 self.topic_representations_ = self._extract_words_per_topic(words, documents)
   3638 self._create_topic_vectors(documents=documents, embeddings=embeddings, mappings=mappings)
   3639 self.topic_labels_ = {key: f"{key}_" + "_".join([word[0] for word in values[:4]])
   3640                       for key, values in
   3641                       self.topic_representations_.items()}

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/bertopic/_bertopic.py:3938, in BERTopic._extract_words_per_topic(self, words, documents, c_tf_idf, calculate_aspects)
   3936     self.topic_aspects_[aspect] = aspects
   3937 elif isinstance(aspect_model, BaseRepresentation):
-> 3938     self.topic_aspects_[aspect] = aspect_model.extract_topics(self, documents, c_tf_idf, aspects)
   3940 return topics

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/bertopic/representation/_keybert.py:91, in KeyBERTInspired.extract_topics(self, topic_model, documents, c_tf_idf, topics)
     87 topics = self._extract_candidate_words(topic_model, c_tf_idf, topics)
     89 # We calculate the similarity between word and document embeddings and create
     90 # topic embeddings from the representative document embeddings
---> 91 sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices)
     93 # Find the best matching words based on the similarity matrix for each topic
     94 updated_topics = self._extract_top_words(words, topics, sim_matrix)

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/bertopic/representation/_keybert.py:168, in KeyBERTInspired._extract_embeddings(self, topic_model, topics, representative_docs, repr_doc_indices)
    166 # Calculate word embeddings and extract best matching with updated topic_embeddings
    167 vocab = list(set([word for words in topics.values() for word in words]))
--> 168 word_embeddings = topic_model._extract_embeddings(vocab, method="document", verbose=False)
    169 sim = cosine_similarity(topic_embeddings, word_embeddings)
    171 return sim, vocab

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/bertopic/_bertopic.py:3293, in BERTopic._extract_embeddings(self, documents, images, method, verbose)
   3291     embeddings = self.embedding_model.embed_words(words=documents, verbose=verbose)
   3292 elif method == "document":
-> 3293     embeddings = self.embedding_model.embed_documents(documents, verbose=verbose)
   3294 elif documents[0] is None and images is None:
   3295     raise ValueError("Make sure to use an embedding model that can either embed documents"
   3296                      "or images depending on which you want to embed.")

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/bertopic/backend/_base.py:69, in BaseEmbedder.embed_documents(self, document, verbose)
     55 def embed_documents(self,
     56                     document: List[str],
     57                     verbose: bool = False) -> np.ndarray:
     58     """ Embed a list of n words into an n-dimensional
     59     matrix of embeddings
     60     (...)
     67     that each have an embeddings size of m
     68     """
---> 69     return self.embed(document, verbose)

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/bertopic/backend/_openai.py:68, in OpenAIBackend.embed(self, documents, verbose)
     66 embeddings = []
     67 for batch in tqdm(self._chunks(documents), disable=not verbose):
---> 68     response = openai.Embedding.create(input=batch, **self.generator_kwargs)
     69     embeddings.extend([r["embedding"] for r in response["data"]])
     71 # Delay subsequent calls

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/openai/api_resources/embedding.py:33, in Embedding.create(cls, *args, **kwargs)
     31 while True:
     32     try:
---> 33         response = super().create(*args, **kwargs)
     35         # If a user specifies base64, we'll just return the encoded string.
     36         # This is only for the default case.
     37         if not user_provided_encoding_format:

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py:153, in EngineAPIResource.create(cls, api_key, api_base, api_type, request_id, api_version, organization, **params)
    127 @classmethod
    128 def create(
    129     cls,
   (...)
    136     **params,
    137 ):
    138     (
    139         deployment_id,
    140         engine,
   (...)
    150         api_key, api_base, api_type, api_version, organization, **params
    151     )
--> 153     response, _, api_key = requestor.request(
    154         "post",
    155         url,
    156         params=params,
    157         headers=headers,
    158         stream=stream,
    159         request_id=request_id,
    160         request_timeout=request_timeout,
    161     )
    163     if stream:
    164         # must be an iterator
    165         assert not isinstance(response, OpenAIResponse)

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/openai/api_requestor.py:298, in APIRequestor.request(self, method, url, params, headers, files, stream, request_id, request_timeout)
    277 def request(
    278     self,
    279     method,
   (...)
    286     request_timeout: Optional[Union[float, Tuple[float, float]]] = None,
    287 ) -> Tuple[Union[OpenAIResponse, Iterator[OpenAIResponse]], bool, str]:
    288     result = self.request_raw(
    289         method.lower(),
    290         url,
   (...)
    296         request_timeout=request_timeout,
    297     )
--> 298     resp, got_stream = self._interpret_response(result, stream)
    299     return resp, got_stream, self.api_key

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/openai/api_requestor.py:700, in APIRequestor._interpret_response(self, result, stream)
    692     return (
    693         self._interpret_response_line(
    694             line, result.status_code, result.headers, stream=True
    695         )
    696         for line in parse_stream(result.iter_lines())
    697     ), True
    698 else:
    699     return (
--> 700         self._interpret_response_line(
    701             result.content.decode("utf-8"),
    702             result.status_code,
    703             result.headers,
    704             stream=False,
    705         ),
    706         False,
    707     )

File ~/miniconda3/envs/pythonProject1/lib/python3.9/site-packages/openai/api_requestor.py:765, in APIRequestor._interpret_response_line(self, rbody, rcode, rheaders, stream)
    763 stream_error = stream and "error" in resp.data
    764 if stream_error or not 200 <= rcode < 300:
--> 765     raise self.handle_error_response(
    766         rbody, rcode, resp.data, rheaders, stream_error=stream_error
    767     )
    768 return resp

InvalidRequestError: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.

MaartenGr commented 8 months ago

Hmmm, I have the feeling there are technically empty "documents" passed to OpenAI. Not your input documents but the vocabulary of the c-TF-IDF representations. At least, a part of it. This line gives it away (I think):

word_embeddings = topic_model._extract_embeddings(vocab, method="document", verbose=False)

The vocab variable is simply a list of words that are passed to OpenAI. However, I can imagine that there are some words not properly processed by OpenAI.

I see two potential solutions. Either pass in a custom vocabulary that only contains actual "words" and not things like "\n", or add a fix that catches the InvalidRequestError and returns an empty embedding. The latter would be a bit tricky, since we do not know beforehand what the size of the embeddings should be.
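A sketch of the first option (assuming the embedding_model and representation_model defined in the code above; the token_pattern is scikit-learn's default, which keeps only tokens of two or more word characters, so lone symbols and "\n" never enter the c-TF-IDF vocabulary):

from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

# Only tokens matching this pattern can end up in the vocab that
# KeyBERTInspired sends to the OpenAI backend
vectorizer_model = CountVectorizer(token_pattern=r"(?u)\b\w\w+\b")

topic_model = BERTopic(embedding_model=embedding_model,
                       vectorizer_model=vectorizer_model,
                       representation_model=representation_model)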

chentitus commented 8 months ago

"there are some words not properly processed by OpenAI." - could it be Chinese symbols or punctuation marks, like ↓ 《 》 , ?

MaartenGr commented 8 months ago

That could be the case, although I believe OpenAI's embedding models are multilingual, but I might be mistaken. What I think is happening is that certain words/characters are preprocessed and removed before the embedding model is applied, or that symbols and punctuation marks given by themselves will indeed not work with the embedding model.

chentitus commented 8 months ago

I removed Chinese punctuation and symbols from the documents, and it worked! It seems that OpenAI cannot properly process them. Thank you @MaartenGr!

liaoelton commented 6 months ago

Hey @MaartenGr, I may have found another cause of this '$.input' is invalid error. It could originate from the way the top n words per topic are selected based on the c-TF-IDF score (the else branch) in both _bertopic.py and _keybert.py, which may inadvertently lead to requests for embeddings of empty words.

https://github.com/MaartenGr/BERTopic/blob/99ee553e3ee00fa7189d3210bdc618a7c7a943c8/bertopic/_bertopic.py#L4025-L4031

Given that SentenceTransformer's "all-MiniLM-L6-v2" model returns the same embeddings for "" and " ", how about preprocessing documents to replace empty strings with a single whitespace for the OpenAI backend? I'm considering submitting a PR to address this and would love to hear your thoughts or any additional insights you might have on this approach.

MaartenGr commented 6 months ago

@liaoelton Yes, that makes sense! I think you would only need to update the OpenAI backend to change any incoming "" to " ". Keeping the change inside that backend is more isolated than changing it across all embedding backends.
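For illustration, a minimal sketch of that isolated change, assuming the pre-1.0 openai package used throughout this thread (the subclass name is hypothetical, not the merged fix):

from bertopic.backend import OpenAIBackend

class SafeOpenAIBackend(OpenAIBackend):
    # Replace empty or whitespace-only strings with a single space so the
    # embeddings API never receives an invalid '$.input'
    def embed(self, documents, verbose=False):
        documents = [doc if doc.strip() else " " for doc in documents]
        return super().embed(documents, verbose)

embedding_model = SafeOpenAIBackend("text-embedding-ada-002")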