[X] I added a very descriptive title to this issue.
[X] I searched the LangChain documentation with the integrated search.
[X] I used the GitHub search to find a similar question and didn't find it.
[X] I am sure that this is a bug in LangChain rather than my code.
[X] The bug is not resolved by updating to the latest stable version of LangChain (or the specific integration package).
Example Code
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from chromadb import PersistentClient
import os
CHROMA_PATH = "Chroma"
# get HuggingFace embedding model
text_embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
collection_name = "test"
embeddings_model=HuggingFaceEmbeddings(model_name=text_embedding_model_name, model_kwargs={'device': 'cpu'})
client = PersistentClient(path=CHROMA_PATH)
print(client.get_max_batch_size())
if not collection_name in [c.name for c in client.list_collections()]:
DOC_PATHS = ["Novel.txt"]
loaders = [TextLoader(path) for path in DOC_PATHS]
pages = []
for loader in loaders:
pages.extend(loader.load())
# split the doc into smaller chunks i.e. chunk_size=500
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
chunks = text_splitter.split_documents(pages)
Chroma.from_documents(chunks,
embeddings_model,
collection_name=collection_name,
collection_metadata ={"dimensionality": 384},
persist_directory=CHROMA_PATH,
client = client)
Error Message and Stack Trace (if applicable)
Traceback (most recent call last):
File "D:\github\llm\app.py", line 32, in
db_chroma = Chroma.from_documents(chunks,
File "D:\github\llm\venv\lib\site-packages\langchain_community\vectorstores\chroma.py", line 878, in from_documents
return cls.from_texts(
File "D:\github\llm\venv\lib\site-packages\langchain_community\vectorstores\chroma.py", line 842, in from_texts
chroma_collection.add_texts(texts=texts, metadatas=metadatas, ids=ids)
File "D:\github\llm\venv\lib\site-packages\langchain_community\vectorstores\chroma.py", line 313, in add_texts
raise e
File "D:\github\llm\venv\lib\site-packages\langchain_community\vectorstores\chroma.py", line 299, in add_texts
self._collection.upsert(
File "D:\github\llm\venv\lib\site-packages\chromadb\api\models\Collection.py", line 300, in upsert
self._client._upsert(
File "D:\github\llm\venv\lib\site-packages\chromadb\telemetry\opentelemetry__init__.py", line 146, in wrapper
return f(*args, **kwargs)
File "D:\github\llm\venv\lib\site-packages\chromadb\api\segment.py", line 429, in _upsert
validate_batch(
File "D:\github\llm\venv\lib\site-packages\chromadb\api\types.py", line 541, in validate_batch
raise ValueError(
ValueError: Batch size 10337 exceeds maximum batch size 5461
Description
Embeddings should be stored in the Chroma DB in batches.
I was able to fix this issue with a little tweak in the following file:
venv\Lib\site-packages\langchain_community\vectorstores\chroma.py
line number: 825, 826, 827
current
if hasattr(
chroma_collection._client, "max_batch_size"
): # for Chroma 0.4.10 and above
changed lines
if hasattr(
chroma_collection._client, "get_max_batch_size"
): # for Chroma 0.4.10 and above
System Info
platform: windows
python version: 3.10.11
langchain version: 0.2.14
Checked other resources
Example Code
Error Message and Stack Trace (if applicable)
Traceback (most recent call last): File "D:\github\llm\app.py", line 32, in
db_chroma = Chroma.from_documents(chunks,
File "D:\github\llm\venv\lib\site-packages\langchain_community\vectorstores\chroma.py", line 878, in from_documents
return cls.from_texts(
File "D:\github\llm\venv\lib\site-packages\langchain_community\vectorstores\chroma.py", line 842, in from_texts
chroma_collection.add_texts(texts=texts, metadatas=metadatas, ids=ids)
File "D:\github\llm\venv\lib\site-packages\langchain_community\vectorstores\chroma.py", line 313, in add_texts
raise e
File "D:\github\llm\venv\lib\site-packages\langchain_community\vectorstores\chroma.py", line 299, in add_texts
self._collection.upsert(
File "D:\github\llm\venv\lib\site-packages\chromadb\api\models\Collection.py", line 300, in upsert
self._client._upsert(
File "D:\github\llm\venv\lib\site-packages\chromadb\telemetry\opentelemetry__init__.py", line 146, in wrapper
return f(*args, **kwargs)
File "D:\github\llm\venv\lib\site-packages\chromadb\api\segment.py", line 429, in _upsert
validate_batch(
File "D:\github\llm\venv\lib\site-packages\chromadb\api\types.py", line 541, in validate_batch
raise ValueError(
ValueError: Batch size 10337 exceeds maximum batch size 5461
Description
Embeddings should be stored in the Chroma DB in batches.
I was able to fix this issue with a little tweak in the following file: venv\Lib\site-packages\langchain_community\vectorstores\chroma.py
line number: 825, 826, 827 current if hasattr( chroma_collection._client, "max_batch_size" ): # for Chroma 0.4.10 and above
changed lines if hasattr( chroma_collection._client, "get_max_batch_size" ): # for Chroma 0.4.10 and above
System Info
platform: windows python version: 3.10.11 langchain version: 0.2.14