Skar0 opened 1 month ago
I just realized that the RunnablePassthrough() in the code sample provided in the documentation is not correct: it results in the whole input (the input_documents key plus the question key) being passed through to the question key, so the prompt ends up containing the full input dictionary wherever {question} appears in the prompt template.
This sample code (slightly modified from the example in the documentation)
from bertopic import BERTopic
from bertopic.representation import LangChain
from langchain.chains.question_answering import load_qa_chain
from langchain_core.documents import Document
from langchain_core.runnables import RunnablePassthrough
representation_llm = ...
representation_prompt = "summarize these documents, here are keywords about them [KEYWORDS]"
chain = (
    {
        "input_documents": (
            lambda inp: [
                Document(page_content=d.page_content.upper())
                for d in inp["input_documents"]
            ]
        ),
        "question": RunnablePassthrough(),
    }
    | load_qa_chain(representation_llm, chain_type="stuff")
    | (lambda output: {"output_text": output["output_text"]})
)
representation_model = LangChain(chain, prompt=representation_prompt, nr_docs=2)
docs = [
    "The sky is blue and the sun is shining.",
    "I love going to the beach on sunny days.",
    "Artificial intelligence is transforming the world.",
    "Machine learning enables computers to learn from data.",
    "It's important to wear sunscreen to avoid sunburns.",
    "Deep learning models require a lot of data and computation.",
    "Today's weather forecast predicts a clear sky.",
    "Neural networks are powerful models in AI.",
    "I need to buy a new pair of sunglasses for summer.",
    "Natural language processing is a subset of AI."
]
topic_model = BERTopic(representation_model=representation_model)
topics, probabilities = topic_model.fit_transform(docs)
results in this prompt being created:
================================ System Message ================================
Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
DEEP LEARNING MODELS REQUIRE A LOT OF DATA AND COMPUTATION.
THE SKY IS BLUE AND THE SUN IS SHINING.
================================ Human Message =================================
{'input_documents': [Document(metadata={}, page_content='Deep learning models require a lot of data and computation.'), Document(metadata={}, page_content='The sky is blue and the sun is shining.')], 'question': 'summarize these documents, here are keywords about them to, is, the, of, learning, ai, data, models, and, sky, networks, neural, new, sunny, wear, on, pair, transforming, powerful, sunscreen, require, sunglasses, sunburns, todays, predicts, processing, subset, sun, summer, shining'}
I'll fix this as well in the PR 😄
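For reference, a minimal sketch of the kind of fix I have in mind, reusing the definitions from the sample above and forwarding only the question key (here via operator.itemgetter, one option among several) instead of passing the whole input through:

```python
from operator import itemgetter

chain = (
    {
        "input_documents": lambda inp: [
            Document(page_content=d.page_content.upper())
            for d in inp["input_documents"]
        ],
        # Forward only the "question" value instead of the whole input dict
        "question": itemgetter("question"),
    }
    | load_qa_chain(representation_llm, chain_type="stuff")
    | (lambda output: {"output_text": output["output_text"]})
)
```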
If you need a typical user to test run this, I'm happy to do so. I'm currently fighting to get it linked up and working in Jupyter with local Ollama models.
Here is a full example script with the new LangChain representation in action (both with basic and advanced usage, and with list output in the case of advanced usage).
import pandas as pd
from typing import List
from pydantic import BaseModel, Field
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import LangChain
from langchain_core.callbacks import BaseCallbackHandler
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import CommaSeparatedListOutputParser
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import AzureChatOpenAI
# Custom callback handler for logging the prompts used internally in the chains after they are formatted
class CustomCallbackHandler(BaseCallbackHandler):
    def on_chat_model_start(self, serialized, messages, **kwargs):
        for message in messages[0]:
            message.pretty_print()

# List of documents
documents = [
    "The sky is blue and the sun is shining.",
    "I love going to the beach on sunny days.",
    "Artificial intelligence is transforming the world.",
    "Machine learning enables computers to learn from data.",
    "It's important to wear sunscreen to avoid sunburns.",
    "Deep learning models require a lot of data and computation.",
    "Today's weather forecast predicts a clear sky.",
    "Neural networks are powerful models in AI.",
    "I need to buy a new pair of sunglasses for summer.",
    "Natural language processing is a subset of AI."
]
# Create a sentence transformer model and compute embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(documents, show_progress_bar=False)
# Create an AzureChatOpenAI model
azure_chat_model = AzureChatOpenAI(
    azure_deployment="gpt-4o",
    temperature=0,
    azure_endpoint="redacted",
    openai_api_key="redacted",
    openai_api_version="2024-08-01-preview",
)

# Create a LangChain representation using the default prompt
# Demonstrates basic usage of LangChain with BERTopic using the default prompt.
langchain_repr_default_prompt = LangChain(
    llm=azure_chat_model,
    chain_config={
        "callbacks": [
            CustomCallbackHandler()
        ]
    }
)
# Create a LangChain representation with a custom prompt
# Demonstrates how to use a custom prompt with LangChain and BERTopic.
custom_prompt = "Here is a list of documents: [DOCUMENTS], output a single keyword in Italian that represents these documents. Keyword:"
langchain_repr_custom_prompt = LangChain(
    llm=azure_chat_model,
    prompt=custom_prompt,
    chain_config={
        "callbacks": [
            CustomCallbackHandler()
        ]
    }
)

# Create a LangChain representation with structured output using a custom chain
# Pydantic model for structured output
class MultiLingualKeywords(BaseModel):
    """A class to contain keywords in multiple languages."""

    keyword_en: str = Field(description='A keyword in English that represents the cluster.')
    keywords_es: List[str] = Field(description='A list of several keywords in Spanish that represents the cluster.')

# Demonstrates how to use structured output with LangChain and BERTopic using Pydantic models.
azure_chat_model_structured_output = azure_chat_model.with_structured_output(MultiLingualKeywords, method="json_schema")
structured_output_prompt = ChatPromptTemplate.from_template(
    "Here is a list of documents: {DOCUMENTS}, output a single keyword in English and a list of keywords in Spanish that represents these documents."
)
structured_output_chain = create_stuff_documents_chain(
    llm=azure_chat_model_structured_output,
    prompt=structured_output_prompt,
    document_variable_name="DOCUMENTS",
    output_parser=lambda x: [x.keyword_en] + x.keywords_es  # Transforming the structured output into a list
)
langchain_repr_structured_output = LangChain(
    chain=structured_output_chain,
    chain_config={
        "callbacks": [
            CustomCallbackHandler()
        ]
    }
)
# Create a LangChain representation that outputs a list of keywords
# Demonstrates how to output a list of keywords using a custom chain and output parser.
list_output_prompt = ChatPromptTemplate.from_template(
    "Here is a list of documents: {DOCUMENTS}, output a comma-separated list of keywords in English that represents these documents."
)
list_output_chain = create_stuff_documents_chain(
    llm=azure_chat_model,
    prompt=list_output_prompt,
    document_variable_name="DOCUMENTS",
    output_parser=CommaSeparatedListOutputParser(),
)
langchain_repr_list_output = LangChain(
    chain=list_output_chain,
    chain_config={
        "callbacks": [
            CustomCallbackHandler()
        ]
    }
)

# Create a custom chain that outputs a single string keyword
# Demonstrates how to create a custom chain that outputs a single string keyword.
single_keyword_prompt = ChatPromptTemplate.from_template(
    "Here is a list of documents: {DOCUMENTS}, output a single keyword in English that represents these documents."
)
single_keyword_chain = create_stuff_documents_chain(
    llm=azure_chat_model,
    prompt=single_keyword_prompt,
    document_variable_name="DOCUMENTS",
)
langchain_repr_single_keyword = LangChain(
    chain=single_keyword_chain,
    chain_config={
        "callbacks": [
            CustomCallbackHandler()
        ]
    }
)

representation_models = {
    "DefaultPrompt": langchain_repr_default_prompt,
    "CustomPrompt": langchain_repr_custom_prompt,
    "StructuredOutput": langchain_repr_structured_output,
    "ListOutput": langchain_repr_list_output,
    "SingleKeyword": langchain_repr_single_keyword
}
# Create and fit the BERTopic model with multiple representations
topic_model = BERTopic(representation_model=representation_models)
topics, probabilities = topic_model.fit_transform(documents, embeddings)
topic_info = topic_model.get_topic_info()
def is_non_empty(value):
    if value is None:
        return False
    if isinstance(value, float) and pd.isnull(value):
        return False
    if isinstance(value, (list, tuple, set, str)):
        return len(value) > 0
    return True

# Access topics from different representation models
for index, row in topic_info.iterrows():
    topic_num = row['Topic']
    print(f"\nTopic {topic_num}:")
    for representation_name in representation_models.keys():
        if representation_name in topic_info.columns:
            value = row[representation_name]
            if is_non_empty(value):
                print(f"Representation '{representation_name}': {value}")
And here are the formatted prompts and the resulting representations:
================================ Human Message =================================
This is a list of texts where each collection of texts describes a topic. After each collection of texts, the name of the topic they represent is mentioned as a short, highly descriptive title.
---
Topic:
Sample texts from this topic:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial-style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the worst food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
Keywords: meat beef eat eating emissions steak food health processed chicken
Topic name: Environmental impacts of eating meat
---
Topic:
Sample texts from this topic:
- I have ordered the product weeks ago but it still has not arrived!
- The website mentions that it only takes a couple of days to deliver but I still have not received mine.
- I got a message stating that I received the monitor but that is not true!
- It took a month longer to deliver than was advised...
Keywords: deliver weeks product shipping long delivery received arrived arrive week
Topic name: Shipping and delivery issues
---
Topic:
Sample texts from this topic:
I love going to the beach on sunny days.
I need to buy a new pair of sunglasses for summer.
Deep learning models require a lot of data and computation.
The sky is blue and the sun is shining.
Keywords: to, the, is, of, data, and, sky, models, learning, ai, artificial, computers, computation, clear, days, for, forecast, deep, enables, going, from, in, blue, buy, avoid, beach, are, learn, language, its
Topic name:
================================ Human Message =================================
Here is a list of documents: I love going to the beach on sunny days.
I need to buy a new pair of sunglasses for summer.
Deep learning models require a lot of data and computation.
The sky is blue and the sun is shining., output a single keyword in Italian that represents these documents. Keyword:
================================ Human Message =================================
Here is a list of documents: I love going to the beach on sunny days.
I need to buy a new pair of sunglasses for summer.
Deep learning models require a lot of data and computation.
The sky is blue and the sun is shining., output a single keyword in English and a list of keywords in Spanish that represents these documents.
================================ Human Message =================================
Here is a list of documents: I love going to the beach on sunny days.
I need to buy a new pair of sunglasses for summer.
Deep learning models require a lot of data and computation.
The sky is blue and the sun is shining., output a comma-separated list of keywords in English that represents these documents.
================================ Human Message =================================
Here is a list of documents: I love going to the beach on sunny days.
I need to buy a new pair of sunglasses for summer.
Deep learning models require a lot of data and computation.
The sky is blue and the sun is shining., output a single keyword in English that represents these documents.
Topic -1:
Representation 'DefaultPrompt': ['Mixed topics: Weather, Summer Activities, and Artificial Intelligence', '', '', '', '', '', '', '', '', '']
Representation 'CustomPrompt': ['Sole.', '', '', '', '', '', '', '', '', '']
Representation 'StructuredOutput': ['summer', 'verano', 'playa', 'sol', 'gafas de sol', '', '', '', '', '']
Representation 'ListOutput': ['beach', 'sunny days', 'sunglasses', 'summer', 'deep learning', 'models', 'data', 'computation', 'sky', 'blue']
Representation 'SingleKeyword': ['Sunshine', '', '', '', '', '', '', '', '', '']
As you can see, this implementation should allow for a very flexible way to create more or less complex representations.
@mepearson
If you need a typical user to test run this, I'm happy to do so. I'm currently fighting to get it linked up and working in Jupyter with local Ollama models.
Awesome, that would be great!
Ran it with success (yay!) on our system, but got some weird results. Our data input is a set of transcripts of presentations on subsidence. Many of the labels were the type of one-sentence topic representation we would expect, but some of them seem to include prompting information as well.
Here is an example of a row that seems weird (I saved the model output as CSV and I no longer have access to the generating model, though I can re-run it):
These columns are as expected:
Name:
75_study_applications_broader_completed
Representation:
['study', 'applications', 'broader', 'completed', 'neat', 'manuscript', 'piecing', 'pieces', 'trueshot', 'brush']
Representative_Docs:
['It was really neat to see the different applications from the really detailed to the broader.', 'Like Dr. Tissot talked about those different pieces, kind of piecing them together to really understand the broader picture.', 'Shortly after we completed this study, they asked me to come over and talk to their board about the results of the study.']
And then there is this output in the 'DefaultPrompt' column:
Study Results Presentation
---
Topic:
Sample texts from this topic:
- I'm looking for a new phone case that can protect my phone from drops and has a good grip.
- I need a phone case that is durable and can withstand daily use.
- I want a phone case that looks stylish but also provides good protection.
Keywords: phone, case, drop, protection, durable, grip, style, screen, cover, protect, shockproof, slim, design, bumper, wallet, clear, rubber, hard, plastic, silicone, leather
Topic name: Phone Case Selection
---
Topic:
Sample texts from this topic:
- I've been feeling really down lately and don't know how to get out of this funk.
- I feel like I'm stuck in a rut and can't seem to find my motivation.
- I'm struggling with anxiety and depression, and it's hard for me to enjoy things anymore.
Keywords: down, feeling, depressed, anxious, rut, funk, motivation, struggle, mental, health, therapy, support, coping, mechanisms, self, care, exercise, mindfulness, meditation, counseling, treatment
Topic name: Overcoming Mental Health Struggles
Edit: info on the model run.
Text preparation:
# Sentencize the transcripts and track their titles
docs = []
files = []
for file, transcript in zip(list(whisper_df['file']), list(whisper_df['transcript'])):
    sentences = sent_tokenize(transcript)
    docs.extend(sentences)
    files.extend([file] * len(sentences))

from sentence_transformers import SentenceTransformer

# Create embeddings from the documents
sentence_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
embeddings = sentence_model.encode(docs)
LLM Model Representation:
ollama_chat_model = ChatOllama(
    model="mixtral",
    temperature=0,
    # other params...
)

langchain_repr_default_prompt = langchain_rep.LangChain(
    llm=ollama_chat_model,
    chain_config={
        "callbacks": [
            # CustomCallbackHandler()
        ]
    }
)

representation_models["DefaultPrompt"] = langchain_repr_default_prompt
BERTopic topic model:
topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    representation_model=representation_models
).fit(docs, embeddings)
Ran it with success (yay!) on our system, but got some weird results. Our data input is a set of transcripts of presentations on subsidence. Many of the labels were the type of one-sentence topic representation we would expect, but some of them seem to include prompting information as well.
Thank you for trying this out! 😄
Am I right to understand from your comment that the representation ran fine with the default prompt, but that some of the generated representations actually look like the prompt (like the example you shared)? If so, could it be a hallucination from the model (because it's not used to being prompted like this)? Is the data in the representation fake data, or is it related to your documents?
If this happens a lot, I can give you a piece of code that splits the examples into several messages instead of a single prompt with examples. This could improve things, but requires using a custom chain as it is not really compatible with the approach of providing a single string for a prompt in the basic usage.
@mepearson To me, it actually seems like the stop token is not correctly initialized/chosen, since generation should have stopped after "Study Results Presentation", which seems like a decent topic label considering the documents and keywords. I don't think this is hallucination but merely the model continuing to generate tokens.
@Skar0 Could that be related to the PR? I'm guessing it likely relates to the underlying LLM+token settings but want to make sure.
Full disclosure: I am running this within BERTopic because I'm still figuring out how to configure / piece together the LLM pipeline. I would be perfectly happy to run the BERTopic model and THEN run the outputs through the LLM separately (probably just the top X topics at first, as we will likely want to use those to guide the iterations on the model).
All of which means I don't actually understand where in the code the stop tokens are initialized, and why it would vary between the assorted runs. I have commented out the CustomCallbackHandler() because it was just too much in the notebook, with the intent to convert that to a log file, but I haven't completed that yet.
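(For what it's worth, a minimal untested sketch of that log-file conversion, using Python's standard logging module and the same on_chat_model_start hook shown earlier in the thread:)

```python
import logging
from langchain_core.callbacks import BaseCallbackHandler

logging.basicConfig(filename="prompts.log", level=logging.INFO)

class FileLoggingCallbackHandler(BaseCallbackHandler):
    def on_chat_model_start(self, serialized, messages, **kwargs):
        for message in messages[0]:
            # pretty_repr() returns the string that pretty_print() would print
            logging.info(message.pretty_repr())
```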
To answer the question - the sentences listed in the prompt section of the representation don't come from our data. I don't know if they are created whole cloth from the pattern or if they exist somewhere as example prompts.
I'm back on the system today, so am re-running it using llama2:7b this time rather than mixtral.
EDIT: I'll add that I'm using nltk.tokenize and the paraphrase-multilingual-mpnet-base-v2 sentence model, as these were in the examples I'm following and I try to change as few variables as possible. But I'm not familiar with either and don't know if this may be coming from there.
So this may have something to do with how the prompts are formatted, or the integration with Ollama models? I ran it again but also did an OpenAI representation model at the same time, which gives the results I'm expecting.
These are the top couple topics for this run. It seems the Default Prompt gives 3 topic names / keywords, but the first 2 are repeated for each entry.
EDIT: looking at the langchain_rep.py I copied in order to try and use this, the example topics in items 1 and 2 come straight from its default prompt, so I'm guessing there's some integration piece I'm missing?
Representation: ['gps', 'stations', 'station', 'grid', 'data', 'insar', 'reference', 'constrain', 'insert', 'calibration']
OpenAI Representation: GPS station distribution and spatial uniformity
Default Prompt Representation:
It seems like you have provided a list of topics and sample texts for each topic. Here are the topic names and their corresponding keywords:
1. Environmental impacts of eating meat
* Keywords: meat, beef, eat, eating, emissions, steak, food, health, processed, chicken
2. Shipping and delivery issues
* Keywords: deliver, weeks, product, shipping, long, delivery, received, arrived, arrive, week
3. GPS installation and data analysis
* Keywords: gps, stations, station, grid, data, insar, reference, constrain, insert, calibration, spatial, south, continuous, operating, network, uh, validation, north, permanent, dense, extensometer, inner, surface, accuracy, need, calibrated, installation, using, points, use
Please let me know if you have any questions or if there's anything else I can help you with!
-------------------------------------
Representation: ['appreciate', 'thank', 'today', 'thanks', 'presentation', 'dr', 'mike', 'joining', 'time', 'informative']
OpenAI Representation: Appreciation and Thanks in Today's Presentation
Default Prompt Representation:
It seems like you have provided a list of topics and associated texts. Here are the topic names and corresponding keywords for each topic:
1. Environmental impacts of eating meat
* Keywords: meat, beef, eat, emissions, steak, food, health, processed, chicken
2. Shipping and delivery issues
* Keywords: deliver, weeks, product, shipping, long, delivery, received, arrived, week
3. Appreciation and thanks
* Keywords: appreciate, thank, today, thanks, presentation, dr, mike, joining, time, informative, mukesh, coming, presenting, really, talk, khan, attending, listening, sharing, august, forward, present, great, listen, participating, inviting, john, doing, fantastic, gibbo
-------------------------------------
Representation: ['subsidence', 'kind', 'studies', 'areas', 'pretty', 'created', 'think', 'workshop', 'focus', 'prediction']
OpenAI Representation: Subsidence in Active Fault Zones and its Impact on Living Areas
Default Prompt Representation:
The topics in this list are:
1. Environmental impacts of eating meat
2. Shipping and delivery issues
3. Subsidence in certain areas
Each topic has a set of sample texts that describe the topic in question. The keywords for each topic are also provided, along with a short, highly descriptive title for each topic.
-------------------------------------
Representation: ['regulatory', 'regional', 'transmission', 'state', 'northeast', 'public', 'east', 'efforts', 'plan', 'centralize']
OpenAI Representation: Regulatory Plan Review and Joint Efforts in Northeast Transmission Projects
Default Prompt Representation:
It seems like you have provided a list of topics and sample texts for each topic. Here are the topic names and their corresponding keywords:
1. Environmental impacts of eating meat
* Keywords: meat, beef, eat, emissions, steak, food, health, processed, chicken
2. Shipping and delivery issues
* Keywords: deliver, weeks, product, shipping, long, delivery, received, arrived, arrive, week
3. Regulatory plan review
* Keywords: regulatory, regional, transmission, state, northeast, public, east, efforts, plan, centralize, underway, agencies, uplift, example, international, stakeholders, growing, area, local, statewide, north, things, tasks, sec, canada, joint, context, projects, review, city
Please let me know if you have any further questions or if there's anything else I can help you with!
-------------------------------------
All of which means I don't actually understand where in the code the stop tokens are initialized, and why it would vary between the assorted runs. I have commented out the CustomCallbackHandler() because it was just too much in the notebook, with the intent to convert that to a log file, but I haven't completed that yet.
Typically, stop tokens are set either when you initialize the model or when you run a given generation. I'm not sure, though, where that is done in LangChain. This might give you an idea of how to do it when you load the model.
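For example, a minimal sketch (assuming your version of langchain-ollama exposes a `stop` parameter; the stop sequences below are illustrative, not tested values):

```python
from langchain_ollama.chat_models import ChatOllama

# Sketch: pass stop sequences when the model is initialized so that
# generation halts instead of continuing past the topic label.
ollama_chat_model = ChatOllama(
    model="mixtral",
    temperature=0,
    stop=["\n", "---"],  # illustrative; choose sequences that end a label
)
```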
These are the top couple topics for this run. It seems the Default Prompt gives 3 topic names / keywords, but the first 2 are repeated for each entry.
Representation: ['gps', 'stations', 'station', 'grid', 'data', 'insar', 'reference', 'constrain', 'insert', 'calibration'] OpenAI Representation: GPS station distribution and spatial uniformity
Default Prompt Representation: It seems like you have provided a list of topics and sample texts for each topic. Here are the topic names and their corresponding keywords:
Please let me know if you have any questions or if there's anything else I can help you with!
Representation: ['appreciate', 'thank', 'today', 'thanks', 'presentation', 'dr', 'mike', 'joining', 'time', 'informative'] OpenAI Representation: Appreciation and Thanks in Today's Presentation
Default Prompt Representation: It seems like you have provided a list of topics and associated texts. Here are the topic names and corresponding keywords for each topic:
Representation: ['subsidence', 'kind', 'studies', 'areas', 'pretty', 'created', 'think', 'workshop', 'focus', 'prediction'] OpenAI Representation: Subsidence in Active Fault Zones and its Impact on Living Areas
Default Prompt Representation: The topics in this list are:
Each topic has a set of sample texts that describe the topic in question. The keywords for each topic are also provided, along with a short, highly descriptive title for each topic.
Representation: ['regulatory', 'regional', 'transmission', 'state', 'northeast', 'public', 'east', 'efforts', 'plan', 'centralize'] OpenAI Representation: Regulatory Plan Review and Joint Efforts in Northeast Transmission Projects
Default Prompt Representation: It seems like you have provided a list of topics and sample texts for each topic. Here are the topic names and their corresponding keywords:
Please let me know if you have any further questions or if there's anything else I can help you with!
Thanks for sharing this but I'm not entirely sure what I'm looking at. Is this the output of one topic or perhaps more topics? Or is this the prompt for each topic? Could you describe what all these sections mean?
Sorry for the confusion - that was just a couple of columns of output from rows 1:5 of the topic_model.get_topic_info() dataframe, presented in a more readable form.
So it's the output from the 'Representation' and 'OpenAI' columns, plus the first item in the 'DefaultPrompt' list.
It may also be that I need to grab more code from the branch. As of now I'm just using the langchain_rep.py file, which runs but may not run correctly.
langchain_rep.py file:
import pandas as pd
from langchain_core.documents import Document
from scipy.sparse import csr_matrix
from typing import Callable, Mapping, List, Tuple, Union
from langchain_core.language_models import LanguageModelLike
from langchain_core.runnables import Runnable
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from bertopic.representation._base import BaseRepresentation
from bertopic.representation._utils import truncate_document
DEFAULT_PROMPT = """
This is a list of texts where each collection of texts describes a topic. After each collection of texts, the name of the topic they represent is mentioned as a short, highly descriptive title.
---
Topic:
Sample texts from this topic:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial-style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the worst food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
Keywords: meat beef eat eating emissions steak food health processed chicken
Topic name: Environmental impacts of eating meat
---
Topic:
Sample texts from this topic:
- I have ordered the product weeks ago but it still has not arrived!
- The website mentions that it only takes a couple of days to deliver but I still have not received mine.
- I got a message stating that I received the monitor but that is not true!
- It took a month longer to deliver than was advised...
Keywords: deliver weeks product shipping long delivery received arrived arrive week
Topic name: Shipping and delivery issues
---
Topic:
Sample texts from this topic:
[DOCUMENTS]
Keywords: [KEYWORDS]
Topic name:"""
class LangChain(BaseRepresentation):
"""This representation model uses LangChain to generate descriptive topic labels.
It supports two main usage patterns:
1. Basic usage with a language model and optional custom prompt
2. Advanced usage with a custom LangChain chain for full control over the generation process
Arguments:
llm: A LangChain text model or chat model used to generate representations, only needed for basic usage.
Examples include ChatOpenAI or ChatAnthropic. Ignored if a custom chain is provided.
prompt: A string template containing the placeholder [DOCUMENTS] and optionally [KEYWORDS], only needed for basic usage.
Defaults to a pre-defined prompt defined in DEFAULT_PROMPT. Ignored if a custom chain is provided.
chain: A custom LangChain chain to generate representations, only needed for advanced usage.
The chain must be a LangChain Runnable that implements the batch method and accepts these input keys:
- DOCUMENTS: (required) A list of LangChain Document objects
- KEYWORDS: (optional) A list of topic keywords
The chain must directly output either a string label or a list of strings.
If provided, llm and prompt are ignored.
nr_docs: The number of documents to pass to LangChain
diversity: The diversity of documents to pass to LangChain.
Accepts values between 0 and 1. A higher value results in passing more diverse documents,
whereas lower values pass more similar documents.
doc_length: The maximum length of each document. If a document is longer,
it will be truncated. If None, the entire document is passed.
tokenizer: The tokenizer used to split the document into segments
that are counted to determine the length of a document.
* If tokenizer is 'char', then the document is split up
into characters which are counted to adhere to `doc_length`
* If tokenizer is 'whitespace', the document is split up
into words separated by whitespaces. These words are counted
and truncated depending on `doc_length`
* If tokenizer is 'vectorizer', then the internal CountVectorizer
is used to tokenize the document. These tokens are counted
and truncated depending on `doc_length`. They are decoded with
whitespaces.
* If tokenizer is a callable, then that callable is used to tokenize
the document. These tokens are counted and truncated depending
on `doc_length`
chain_config: The configuration for the LangChain chain. Can be used to set options like max_concurrency to avoid rate limiting errors.
Usage:
To use this representation, you will need to install the LangChain package first.
`pip install langchain`
There are two ways to use the LangChain representation:
1. Use a default LangChain chain that is created using an underlying language model and a prompt.
You will first need to install the package for the underlying model. For example, if you want to use OpenAI:
`pip install langchain_openai`
```python
from bertopic.representation import LangChain
from langchain_openai import ChatOpenAI
chat_model = ChatOpenAI(temperature=0, openai_api_key=my_openai_api_key)
# Create your representation model with the pre-defined prompt
representation_model = LangChain(llm=chat_model)
# Create your representation model with a custom prompt
prompt = "What are these documents about? [DOCUMENTS] Here are keywords related to them [KEYWORDS]."
representation_model = LangChain(llm=chat_model, prompt=prompt)
# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
```

2. Use a custom LangChain chain for full control over the generation process:
Remember that the chain will receive two inputs: `DOCUMENTS` and `KEYWORDS` and that it must return directly a string label
or a list of strings.
```python
from bertopic.representation import LangChain
from langchain_anthropic import ChatAnthropic
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer
prompt = ...
chat_model = ...
# We will construct a special privacy-preserving chain using Microsoft Presidio
pii_handler = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"])
chain = (
    {
        "DOCUMENTS": (
            lambda inp: [
                Document(
                    page_content=pii_handler.anonymize(
                        d.page_content,
                        language="en",
                    ),
                )
                for d in inp["DOCUMENTS"]
            ]
        ),
        "KEYWORDS": lambda keywords: keywords["KEYWORDS"],
    }
    | create_stuff_documents_chain(chat_model, prompt, document_variable_name="DOCUMENTS")
)
representation_model = LangChain(chain=chain)
```
"""
    def __init__(
        self,
        llm: LanguageModelLike = None,
        prompt: str = DEFAULT_PROMPT,
        chain: Runnable = None,
        nr_docs: int = 4,
        diversity: float = None,
        doc_length: int = None,
        tokenizer: Union[str, Callable] = None,
        chain_config: dict = None,
    ):
        self.prompt = prompt

        if chain is not None:
            self.chain = chain
        elif llm is not None:
            # Check that the prompt contains the necessary placeholder
            if "[DOCUMENTS]" not in prompt:
                raise ValueError("The prompt must contain the placeholder [DOCUMENTS]")
            # Convert prompt placeholders to the LangChain format
            langchain_prompt = prompt.replace("[DOCUMENTS]", "{DOCUMENTS}").replace("[KEYWORDS]", "{KEYWORDS}")
            # Create ChatPromptTemplate
            chat_prompt = ChatPromptTemplate.from_template(langchain_prompt)
            # Create a basic LangChain chain using create_stuff_documents_chain
            self.chain = create_stuff_documents_chain(llm, chat_prompt, document_variable_name="DOCUMENTS")
        else:
            raise ValueError("Either `llm` or `chain` must be provided")

        self.nr_docs = nr_docs
        self.diversity = diversity
        self.doc_length = doc_length
        self.tokenizer = tokenizer
        self.chain_config = chain_config
    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, int]]]:
        """Extract topics.

        Arguments:
            topic_model: A BERTopic model
            documents: All input documents
            c_tf_idf: The topic c-TF-IDF representation
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations
        """
        # Extract the top 4 representative documents per topic
        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(
            c_tf_idf=c_tf_idf,
            documents=documents,
            topics=topics,
            nr_samples=500,
            nr_repr_docs=self.nr_docs,
            diversity=self.diversity,
        )

        # Generate label using langchain's batch functionality
        chain_docs: List[List[Document]] = [
            [
                Document(page_content=truncate_document(topic_model, self.doc_length, self.tokenizer, doc))
                for doc in docs
            ]
            for docs in repr_docs_mappings.values()
        ]

        # Extract keywords from the topics and format them as a string
        formatted_keywords_list = []
        for topic in topics:
            keywords = list(zip(*topics[topic]))[0]
            formatted_keywords_list.append(", ".join(keywords))

        # self.chain must accept DOCUMENTS as a mandatory input key and KEYWORDS as an optional input key
        # We always pass both keys to the chain, and the chain can choose to use them or not
        # Documents are passed as a list of LangChain Document objects, it is up to the chain to format them into a string
        inputs = [
            {"DOCUMENTS": docs, "KEYWORDS": formatted_keywords}
            for docs, formatted_keywords in zip(chain_docs, formatted_keywords_list)
        ]

        # self.chain must return a string label or a list of string labels for each input
        outputs = self.chain.batch(inputs=inputs, config=self.chain_config)

        # Process outputs from the chain - can be either strings or lists of strings
        updated_topics = {}
        for topic, output in zip(repr_docs_mappings.keys(), outputs):
            # Each output can be either:
            # - A single string representing the main topic label
            # - A list of strings representing multiple related labels
            if isinstance(output, str):
                # For string output: use it as the main label (weight=1)
                # and pad with 9 empty strings (weight=0)
                labels = [(output.strip(), 1)] + [("", 0) for _ in range(9)]
            else:
                # For list output:
                # 1. Convert all elements to stripped strings
                # 2. Take up to 10 elements
                # 3. Assign decreasing weights from 1.0 to 0.1
                # 4. Pad with empty strings if needed to always have 10 elements
                clean_outputs = [str(label).strip() for label in output]
                top_labels = clean_outputs[:10]

                # Create (label, weight) pairs with decreasing weights
                labels = [(label, 1.0 - (i * 0.1)) for i, label in enumerate(top_labels)]

                # Pad with empty strings if we have less than 10 labels
                if len(labels) < 10:
                    labels.extend([("", 0.0) for _ in range(10 - len(labels))])

            updated_topics[topic] = labels

        return updated_topics
@mepearson I might be missing something here, but in the examples you showed, the OpenAI Representation seems like a good label, right?
@mepearson Also, do you perhaps have a reproducible example? That would make it much easier to see what is happening.
@MaartenGr - yes, OpenAI is working great and is what I expected to get. It's when I use our internal Ollama models with the new langchain connection that things get wonky.
While I think the transcripts I was using are from public videos, they aren't mine so I don't want to share them. But I'll rerun the model with a news dataset or something and share that.
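(For instance, a sketch of one possible public stand-in, using scikit-learn's 20 Newsgroups dataset:)

```python
# Sketch: a public dataset as a shareable stand-in for the private transcripts
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes")).data[:1000]
```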
I might be missing something here, but in the examples you showed, the OpenAI Representation seems like a good label, right?
I think that in the provided output examples, several representations were configured: one with OpenAI (I guess the existing OpenAI representation from BERTopic) and one with the representation from this PR, using a model through LangChain's Ollama integration. It does indeed seem like the OpenAI representation yields good labels, while the LangChain representation sometimes yields a label that looks like a prompt, or that contains garbage (like the model saying "It seems like you have provided a list of topics ..."). To be fair, this looks to me like a problem with the LLM you are using, or with LangChain's Ollama integration, rather than an issue with the implementation of the LangChain representation. I think this can be confirmed by using the OpenAI model in the LangChain representation instead of the Ollama model - unless that is already what is meant by your OpenAI Representation.
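Something like this (a sketch; the model name is illustrative) would route an OpenAI model through the new LangChain representation and make the comparison direct:

```python
from bertopic.representation import LangChain
from langchain_openai import ChatOpenAI

# Run an OpenAI chat model through the LangChain representation to check
# whether the odd outputs come from the Ollama model or from the
# representation itself.
chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)  # illustrative model name
langchain_repr_openai = LangChain(llm=chat_model)
```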
yes, OpenAI is working great and is what I expected to get. It's when I use our internal Ollama models with the new langchain connection that things get wonky.
I wonder if it's related to how the messages are passed to your Ollama model by LangChain. At the moment, with the default prompt that I set up as a single string in the LangChain representation, a single LangChain message is provided to the model. That message contains formatted examples, which may confuse the model. Could you try using the LangChain representation with another prompt, or with the same prompt formatted as a list of messages (to split prompt and completion in the examples)? I provide examples of how to do this below.
from bertopic import BERTopic
from bertopic.representation import LangChain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_ollama.chat_models import ChatOllama
# Your data preparation ...
# Setting-up your connection to the Ollama model
ollama_chat_model = ChatOllama(...)
# Create a LangChain representation with a custom prompt that **does not contain examples**
custom_prompt = """I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short topic label in the following format:
topic: <topic label>
"""
langchain_repr_custom_prompt = LangChain(
    llm=ollama_chat_model,
    prompt=custom_prompt,
    chain_config={
        "callbacks": [
            # CustomCallbackHandler()
        ]
    }
)
# Create a LangChain representation with the same prompt as the hard-coded one used in the basic usage,
# **where examples are split into several messages**
split_examples_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "This is a list of texts where each collection of texts describes a topic. "
     "After each collection of texts, the name of the topic they represent is mentioned as a short, highly descriptive title."),
    ("human",
     "Topic:\n"
     "Sample texts from this topic:\n"
     "Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial-style meat production and factory farming, meat has become a staple food.\n"
     "Meat, but especially beef, is the worst food in terms of emissions.\n"
     "Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.\n"
     "Keywords: meat, beef, eat, eating, emissions, steak, food, health, processed, chicken\n"
     "Topic name:"),
    ("ai", "Environmental impacts of eating meat"),
    ("human",
     "Topic:\n"
     "Sample texts from this topic:\n"
     "I have ordered the product weeks ago but it still has not arrived!\n"
     "The website mentions that it only takes a couple of days to deliver but I still have not received mine.\n"
     "I got a message stating that I received the monitor but that is not true!\n"
     "It took a month longer to deliver than was advised...\n"
     "Keywords: deliver, weeks, product, shipping, long, delivery, received, arrived, arrive, week\n"
     "Topic name:"),
    ("ai", "Shipping and delivery issues"),
    ("human",
     "Topic:\n"
     "Sample texts from this topic:\n"
     "{DOCUMENTS}\n"
     "Keywords: {KEYWORDS}\n"
     "Topic name:")
])
split_examples_chain = create_stuff_documents_chain(
    llm=ollama_chat_model,
    prompt=split_examples_prompt,
    document_variable_name="DOCUMENTS",
)
langchain_repr_split_examples = LangChain(
    chain=split_examples_chain,
    chain_config={
        "callbacks": [
            # CustomCallbackHandler()
        ]
    }
)

representation_models = {
    "prompt-no-examples": langchain_repr_custom_prompt,
    "custom-chain-split-examples": langchain_repr_split_examples,
}
# Rest of your BERTopic setup ...
What does this PR do?
WIP, fixes https://github.com/MaartenGr/BERTopic/issues/2187