Closed springrain closed 3 months ago
Hello! I ran the example and could not reproduce the error.
Can you add a reproducible code example that triggers the error?
Hello! I ran the example and could not reproduce the error.
Can you add a reproducible code example that triggers the error?
import os
from haystack.components.writers import DocumentWriter
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack import Pipeline
from pathlib import Path
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack.document_stores.types import DuplicatePolicy
# Reproduction script: route text and PDF files through conversion, cleaning,
# splitting and embedding, then write the result into a persisted Chroma store.

# Placeholder key exported through the environment; the embedder below is given
# an explicit api_base_url instead of relying on the default endpoint.
OPENAI_API_KEY = "sk-dummytoken1234567890abcdef"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# --- Components -------------------------------------------------------------
document_store = ChromaDocumentStore(persist_path="./chroma")
file_type_router = FileTypeRouter(mime_types=["text/plain", "application/pdf"])
text_file_converter = TextFileToDocument()
pdf_converter = PyPDFToDocument()
document_joiner = DocumentJoiner()
document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter(
    split_by="word",
    split_length=150,
    split_overlap=30,
)
document_embedder = OpenAIDocumentEmbedder(
    model="bge-large-zh-v1.5",
    api_base_url="http://192.168.1.10:9998/v1",
)
document_writer = DocumentWriter(
    document_store=document_store,
    policy=DuplicatePolicy.SKIP,
)

# --- Pipeline wiring --------------------------------------------------------
index_pipeline = Pipeline()

# Registration order is kept exactly as in the original statement sequence.
for _name, _component in (
    ("file_type_router", file_type_router),
    ("text_file_converter", text_file_converter),
    ("pypdf_converter", pdf_converter),
    ("document_joiner", document_joiner),
    ("document_cleaner", document_cleaner),
    ("document_splitter", document_splitter),
    ("document_embedder", document_embedder),
    ("document_writer", document_writer),
):
    index_pipeline.add_component(instance=_component, name=_name)

# (sender, receiver) pairs; the router fans out by MIME type and both
# converters feed the joiner before the linear clean/split/embed/write tail.
for _sender, _receiver in (
    ("file_type_router.text/plain", "text_file_converter.sources"),
    ("file_type_router.application/pdf", "pypdf_converter.sources"),
    ("text_file_converter", "document_joiner"),
    ("pypdf_converter", "document_joiner"),
    ("document_joiner", "document_cleaner"),
    ("document_cleaner", "document_splitter"),
    ("document_splitter", "document_embedder"),
    ("document_embedder", "document_writer"),
):
    index_pipeline.connect(_sender, _receiver)

# --- Run --------------------------------------------------------------------
# Every path found recursively under ./testdata is handed to the router.
_sources = list(Path("./testdata").glob("**/*"))
result = index_pipeline.run(data={"file_type_router": {"sources": _sources}})
print(result)
Ok, you are right. Thanks for reporting the bug!
from haystack.components.preprocessors import DocumentSplitter
from haystack import Document
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
# Minimal reproduction: split a single document with overlap, then write the
# resulting splits to a Chroma document store.
document_store = ChromaDocumentStore()
# One document whose content is the same short sentence repeated 10 times.
documents = [Document(content = "This is a test document to split "*10)]
# Word-based splitting with a non-zero split_overlap; per the printed meta
# below, the splitter then attaches a '_split_overlap' list to each split.
document_splitter = DocumentSplitter(split_by="word", split_length=5, split_overlap=2)
splitted_docs=document_splitter.run(documents=documents)["documents"]
# Show the metadata of the first split; the observed output follows as a comment.
print(splitted_docs[0].meta)
# {'source_id': '7c1703594787e30800683d64673880811611051d4444f08d8619f8fba6ab1480', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0,
# '_split_overlap': [{'doc_id': '187258e53ee90d2cf6f67f2e63e17d390c195b929b8721572e61c9e389a23e8d', 'range': (0, 13)}]}
# This write is the call reported to fail: the list-valued '_split_overlap'
# entry is rejected with the ValueError quoted right after this snippet.
document_store.write_documents(splitted_docs)
ValueError: Expected metadata value to be a str, int, float or bool, got [{'doc_id': '187258e53ee90d2cf6f67f2e63e17d390c195b929b8721572e61c9e389a23e8d', 'range': (0, 13)}] which is a list
In https://github.com/deepset-ai/haystack/pull/7933, we added the `_split_overlap` list to `meta`, but Chroma cannot handle lists in metadata.
One solution in `ChromaDocumentStore` might be to check the type of each entry in `meta` before writing to Chroma and to discard the entry if it is not valid.
Okay, thanks, I'm currently using Haystack 2.2.4
I reopened the issue because this is a bug we should fix.
@springrain fixed and released a new version of `chroma-haystack`: https://pypi.org/project/chroma-haystack/0.21.1/
**Describe the bug**: chroma-haystack 0.20.1 is not compatible with haystack 2.3.0.
Describe your environment (please complete the following information):