Closed SuperBobRob closed 1 week ago
Hey there, @SuperBobRob! I'm here to help you with any issues you're facing. Let's work together to solve this bug!
It looks like the issue arises because the AutoMergingRetriever
cannot find the parent nodes in the document store when querying the VectorStoreIndex
loaded from ChromaDB. This could be due to the parent nodes not being properly stored or retrieved from ChromaDB.
Here are a few steps to ensure that the parent nodes are correctly stored and retrieved:
Ensure Parent Nodes are Stored: Make sure that both the leaf nodes and parent nodes are added to the document store in the initialization script. You are already adding the leaf nodes and parent nodes, but double-check that all nodes are correctly added.
storage_context.docstore.add_documents(leaf_nodes)
storage_context.docstore.add_documents(nodes)
Persist the Document Store: Ensure that the document store is persisted after adding the nodes. This step might be missing in your initialization script.
storage_context.docstore.persist()
Load the Document Store Correctly:
When loading the VectorStoreIndex
in the query script, ensure that the document store is correctly loaded and associated with the StorageContext
.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
Check for Missing Parent Nodes: If the error persists, you can add some debugging code to check if the parent nodes are present in the document store before querying.
# Debug helper: verify that every parent node referenced by `nodes` is
# actually present in the docstore before querying.
# (Indentation restored — the pasted snippet had lost it.)
for node in nodes:
    if node.parent_node:
        parent_node_id = node.parent_node.node_id
        try:
            parent_node = storage_context.docstore.get_document(parent_node_id)
            print(f"Parent node {parent_node_id} found.")
        except ValueError:
            print(f"Parent node {parent_node_id} not found.")
Initialization and Query Script Adjustments:
Ensure that the initialization and query scripts are consistent in how they handle the StorageContext
and VectorStoreIndex
.
Here is an updated version of your initialization script with the persistence step added:
# Initialization Script
#
# Builds the hierarchical nodes, indexes the leaf nodes into ChromaDB, and
# persists the docstore so a separate query script can reload the parent
# nodes that AutoMergingRetriever needs.
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.retrievers.auto_merging_retriever import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.prompts import PromptTemplate
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import StorageContext
from pathlib import Path
import sys

# Make the project root importable for the local lib/ and constants/ packages.
ROOT_PATH = Path(__file__).parents[1]
sys.path.insert(0, str(ROOT_PATH))

from lib.load_models import load_models_GPT4
from lib.automergingRetrivalFuncs import load_docs, get_nodes

print("0")
service_context = load_models_GPT4()
from constants.prompts import DAX_template_str
print("1")

# Load documents and build the node hierarchy (parents + leaves).
orig_docs, merged_docs = load_docs(docs_dir=Path("./documents/DAX Auszug/ShortSample/"))
nodes, leaf_nodes = get_nodes(chunk_sizes=[2048, 512, 128], original_docs=orig_docs, merged_docs=merged_docs)
print("2")

# Persistent Chroma client + collection backing the vector store.
db = chromadb.PersistentClient(path="./chroma_db/ShortSample/")
print("3")
chroma_collection = db.get_or_create_collection("ShortSample")
print("4")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
print("5")

# Register BOTH leaf and parent nodes in the docstore: AutoMergingRetriever
# looks parents up there at query time.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
storage_context.docstore.add_documents(leaf_nodes)
storage_context.docstore.add_documents(nodes)
# Persist the document store so the query script can reload it.
# NOTE(review): with no argument this writes to the library default path
# (presumably ./storage/docstore.json — confirm); the query script must load
# from the same location.
storage_context.docstore.persist()
print("6")

index = VectorStoreIndex(leaf_nodes, storage_context=storage_context, show_progress=True, service_context=service_context, store_nodes_override=True)
base_retriever = index.as_retriever(similarity_top_k=10)
retriever = AutoMergingRetriever(base_retriever, index.storage_context, verbose=True)
print("7")

query_engine = RetrieverQueryEngine.from_args(
    retriever,
    service_context=service_context,
)
print("8")

# Smoke-test query in the same process.
query = "what is the revenue of Brenntag?"
print(query)
response = query_engine.query(query)
print(response)
And here is the query script:
# Query Script
#
# FIX(review): the original script rebuilt the StorageContext with only the
# vector store, which creates a fresh, EMPTY in-memory docstore — so
# AutoMergingRetriever could never find the parent nodes and raised
# "ValueError: doc_id ... not found". The persisted docstore from the
# initialization script must be reloaded and handed to the retriever.
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from pathlib import Path
import sys

# Make the project root importable for the local lib/ and constants/ packages.
ROOT_PATH = Path(__file__).parents[1]
sys.path.insert(0, str(ROOT_PATH))

from constants.prompts import DAX_template_str
from lib.load_models import load_models_GPT4

print("0")
service_context = load_models_GPT4()
print("1")

# Re-open the same persistent Chroma collection created at initialization.
db = chromadb.PersistentClient(path="./chroma_db/ShortSample/")
print("2")
chroma_collection = db.get_or_create_collection("ShortSample")
print("3")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
print("4")

# Reload the docstore persisted by the initialization script's
# `storage_context.docstore.persist()` (default location — confirm it
# matches the path used at initialization), then attach it to the
# StorageContext so parent nodes are resolvable.
docstore = SimpleDocumentStore.from_persist_path("./storage/docstore.json")
storage_context = StorageContext.from_defaults(vector_store=vector_store, docstore=docstore)
print("5")

index = VectorStoreIndex.from_vector_store(vector_store, service_context=service_context)
print("6")

base_retriever = index.as_retriever(similarity_top_k=10)
# Pass the storage context that carries the RELOADED docstore, not the
# index's own (empty) one.
retriever = AutoMergingRetriever(base_retriever, storage_context, verbose=True)
print("7")

query_engine = RetrieverQueryEngine.from_args(
    retriever,
    service_context=service_context,
)
print("8")

response = query_engine.query("what is the revenue of airbus?")
print(response)
response = query_engine.query("what is the revenue of Brenntag?")
print(response)
response = query_engine.query("what is the revenue of Zalando?")
print(response)
By ensuring that the document store is persisted and correctly loaded, the AutoMergingRetriever
should be able to find the parent nodes and function correctly [1][2].
To continue talking to Dosu, mention @dosu.
@dosu: thanks a lot, but it didn't resolve the issue. I've used the provided scripts... any other hint?
To resolve the ValueError: doc_id 778fcb4e-8775-4965-abde-77d558a395f6 not found
error when querying a VectorStoreIndex
loaded from ChromaDB using AutoMergingRetriever
, ensure that the document with the specified doc_id
exists in the ChromaDB collection. This error typically occurs when the document ID you are querying for does not exist in the collection.
Here are the steps to troubleshoot and resolve this issue:
Verify Document Existence: Ensure that the document with the doc_id
778fcb4e-8775-4965-abde-77d558a395f6
exists in the ChromaDB collection. You can do this by querying the collection directly to check if the document is present.
Add Missing Document: If the document is not found, you need to add it to the collection. Use the add
method of the ChromaVectorStore
class to add the document.
Here is an example of how you can add a document to the ChromaDB collection:
# Example: add a single node with a known doc_id to a ChromaDB collection.
# (Indentation of the TextNode(...) call restored — the paste had lost it.)
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.legacy.schema import TextNode

# Create a Chroma client and collection
chroma_client = chromadb.HttpClient(host="your_host", port="your_port", ssl=True, headers={"Authorization": "Bearer your_token"})
chroma_collection = chroma_client.get_or_create_collection(name="your_collection_name")

# Initialize ChromaVectorStore
chroma_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Create a TextNode with the specified doc_id
node = TextNode(
    doc_id="778fcb4e-8775-4965-abde-77d558a395f6",
    text="Your document text here",
    metadata={"key": "value"},  # Add any metadata if needed
)

# Add the node to the ChromaDB collection
chroma_store.add([node])
This should make the document visible to the AutoMergingRetriever.
Additionally, you can refer to the following test cases to understand how to work with VectorStoreIndex
and AutoMergingRetriever
:
from typing import List, cast
from llama_index.core.indices.vector_store.base import VectorStoreIndex
from llama_index.core.schema import (
Document,
NodeRelationship,
QueryBundle,
RelatedNodeInfo,
TextNode,
)
from llama_index.core.service_context import ServiceContext
from llama_index.core.vector_stores.simple import SimpleVectorStore
def test_simple_query(
    documents: List[Document],
    mock_service_context: ServiceContext,
) -> None:
    """Test embedding query: top-1 retrieval returns the expected node."""
    index = VectorStoreIndex.from_documents(
        documents, service_context=mock_service_context
    )
    # test embedding query
    query_str = "What is?"
    retriever = index.as_retriever(similarity_top_k=1)
    nodes = retriever.retrieve(QueryBundle(query_str))
    assert len(nodes) == 1
    assert nodes[0].node.get_content() == "This is another test."
def test_query_and_similarity_scores(
    mock_service_context: ServiceContext,
) -> None:
    """Test that source nodes have similarity scores."""
    doc_text = (
        "Hello world.\n"
        "This is a test.\n"
        "This is another test.\n"
        "This is a test v2."
    )
    document = Document(text=doc_text)
    index = VectorStoreIndex.from_documents(
        [document], service_context=mock_service_context
    )
    # test embedding query
    query_str = "What is?"
    retriever = index.as_retriever()
    nodes = retriever.retrieve(QueryBundle(query_str))
    assert len(nodes) > 0
    assert nodes[0].score is not None
def test_simple_check_ids(
    mock_service_context: ServiceContext,
) -> None:
    """Test build VectorStoreIndex: node and ref-doc ids survive indexing."""
    ref_doc_id = "ref_doc_id_test"
    source_rel = {NodeRelationship.SOURCE: RelatedNodeInfo(node_id=ref_doc_id)}
    all_nodes = [
        TextNode(text="Hello world.", id_="node1", relationships=source_rel),
        TextNode(text="This is a test.", id_="node2", relationships=source_rel),
        TextNode(text="This is another test.", id_="node3", relationships=source_rel),
        TextNode(text="This is a test v2.", id_="node4", relationships=source_rel),
    ]
    index = VectorStoreIndex(all_nodes, service_context=mock_service_context)
    # test query
    query_str = "What is?"
    retriever = index.as_retriever()
    nodes = retriever.retrieve(QueryBundle(query_str))
    assert nodes[0].node.get_content() == "This is another test."
    assert nodes[0].node.ref_doc_id == "ref_doc_id_test"
    assert nodes[0].node.node_id == "node3"
    vector_store = cast(SimpleVectorStore, index._vector_store)
    assert "node3" in vector_store._data.embedding_dict
    assert "node3" in vector_store._data.text_id_to_ref_doc_id
def test_query(mock_service_context: ServiceContext) -> None:
    """Test embedding query: retrieval runs without raising."""
    doc_text = (
        "Hello world.\n"
        "This is a test.\n"
        "This is another test.\n"
        "This is a test v2."
    )
    document = Document(text=doc_text)
    index = VectorStoreIndex.from_documents(
        [document], service_context=mock_service_context
    )
    # test embedding query
    query_str = "What is?"
    retriever = index.as_retriever()
    _ = retriever.retrieve(QueryBundle(query_str))
By following these steps and referring to the test cases, you should be able to resolve the ValueError
and successfully query the VectorStoreIndex
[1][2][3][4].
To continue talking to Dosu, mention @dosu.
@SuperBobRob the auto merging retriever requires a docstore. But most vector dbs disable this to make storage easier.
You can override
index = VectorStoreIndex(...., store_nodes_override=True)
But then you need to take care of saving/loading the docstore.
Hi @logan-markewich,
Thanks a lot for the clarification! How can I achieve the storing/loading of the docstore? I think the storing should already be done by this:
storage_context.docstore.add_documents(leaf_nodes) storage_context.docstore.add_documents(nodes) storage_context.docstore.persist() # Persist the document store
So, I guess I need to change the Query-Script in order to load the docstore, right?
Thanks again and have a nice day, Robert
maybe @dosu can answer this?
To properly store and load the docstore to ensure that the AutoMergingRetriever
can find the parent nodes when querying the VectorStoreIndex
, you need to follow these steps:
Store the nodes in the docstore:
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)
Define the storage context with the docstore:
storage_context = StorageContext.from_defaults(docstore=docstore)
Load the index into the vector index with the storage context:
# Build the vector index over the leaf nodes with the docstore-backed
# storage context. (Call indentation restored — the paste had lost it.)
base_index = VectorStoreIndex(
    leaf_nodes,
    storage_context=storage_context,
)
Define the retriever with the storage context:
base_retriever = base_index.as_retriever(similarity_top_k=6)
retriever = AutoMergingRetriever(base_retriever, storage_context, verbose=True)
To load the docstore correctly in your query script, ensure you initialize the SimpleDocumentStore
and add the nodes to it before creating the StorageContext
. Here is the relevant part of the script:
# Initialize the docstore and add nodes
# NOTE(review): in a separate query script, `nodes` must come from a
# PERSISTED docstore (e.g. SimpleDocumentStore.from_persist_path), not be
# rebuilt from scratch — otherwise node ids will not match the index.
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)

# Define the storage context with the docstore
storage_context = StorageContext.from_defaults(docstore=docstore)

# Load the index into the vector index with the storage context
base_index = VectorStoreIndex(
    leaf_nodes,
    storage_context=storage_context,
)

# Define the retriever with the storage context
base_retriever = base_index.as_retriever(similarity_top_k=6)
retriever = AutoMergingRetriever(base_retriever, storage_context, verbose=True)
This ensures that the AutoMergingRetriever
can access the parent nodes when querying the VectorStoreIndex
[1][2][3][4].
To continue talking to Dosu, mention @dosu.
Bug Description
Hi there,
I want to save a vectorstoreindex to a chromaDB (thats what happens in the initialization script) and then - in another script - i want to read the vectorstoreindex from the chromaDB again and query it via an automergingretriever. The automergingretriever works fine in the initialization script. in the query script however, i receive the error shown in the traceback section. querying the vectorstoreindex without an automergingretriever works fine. but when using the automergingretriever it seems like the parent nodes wouldn't be stored in the chromaDB or so? I've checked the doc_id which is referred to in the error message. in the embedding_metadata table within the chroma.sqlite3 file this doc_id exists (even 5 times).
am i missing anything when setting up the chromaDB-VectorStore or when passing it to the AutoMergingRetriever? any ideas someone?
Thanks in advance, Robert
Version
0.10.43
Steps to Reproduce
Initialization Script :
import chromadb from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore from llama_index.core.retrievers.auto_merging_retriever import AutoMergingRetriever from llama_index.core.query_engine import RetrieverQueryEngine from llama_index.core.prompts import PromptTemplate from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import StorageContext
from pathlib import Path import sys
ROOT_PATH = Path(file).parents[1] sys.path.insert(0, str(ROOT_PATH))
from lib.load_models import load_models_GPT4 from lib.automergingRetrivalFuncs import ( load_docs, get_nodes, )
print("0") service_context = load_models_GPT4() from constants.prompts import DAX_template_str
print("1")
load some documents
orig_docs, merged_docs = load_docs(docs_dir=Path("./documents/DAX Auszug/ShortSample/")) nodes, leaf_nodes = get_nodes(chunk_sizes=[2048, 512, 128], original_docs=orig_docs, merged_docs=merged_docs)
print("2")
initialize client, setting path to save data
db = chromadb.PersistentClient(path="./chroma_db/ShortSample/")
print("3")
create collection
chroma_collection = db.get_or_create_collection("ShortSample")
print("4")
assign chroma as the vector_store to the context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
vector_store.stores_text = True
print("5")
define storage context
storage_context = StorageContext.from_defaults(vector_store=vector_store) storage_context.docstore.add_documents(leaf_nodes) storage_context.docstore.add_documents(nodes)
storage_context.docstore.add_documents(orig_docs)
storage_context.docstore.add_documents(merged_docs)
print("6")
create your index
index = VectorStoreIndex(leaf_nodes, storage_context=storage_context, show_progress=True, service_context=service_context, store_nodes_override=True)
index.vector_store.persist
index.docstore.add_documents(nodes)
index.docstore.persist
base_retriever = index.as_retriever(similarity_top_k=10) # one may experiment with this number retriever = AutoMergingRetriever(base_retriever, index.storage_context, verbose=True)
initializing the reranker model
rerank = SentenceTransformerRerank(top_n=6, model="BAAI/bge-reranker-base") # top_n open for experimentation
print("7")
setup the query engine
query_engine = RetrieverQueryEngine.from_args( retriever,
node_postprocessors=[rerank],
)
print("8")
query
query = "what is the revenue of Brenntag?" print(query) response = query_engine.query(query) print(response)
Query Script:
import chromadb from llama_index.core import VectorStoreIndex from llama_index.vector_stores.chroma import ChromaVectorStore from llama_index.core import StorageContext from llama_index.core.retrievers import AutoMergingRetriever from llama_index.core.query_engine import RetrieverQueryEngine from pathlib import Path import sys
ROOT_PATH = Path(file).parents[1] sys.path.insert(0, str(ROOT_PATH))
from constants.prompts import DAX_template_str from lib.load_models import load_models_GPT4
print("0") service_context = load_models_GPT4()
print("1")
initialize client
db = chromadb.PersistentClient(path="./chroma_db/ShortSample/")
print("2")
get collection
chroma_collection = db.get_or_create_collection("ShortSample")
print("3")
assign chroma as the vector_store to the context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
print("4") storage_context = StorageContext.from_defaults(vector_store=vector_store)
print("5")
load your index from stored vectors
index = VectorStoreIndex.from_vector_store(vector_store, service_context=service_context)
print("6")
create a retriever
base_retriever = index.as_retriever(similarity_top_k=10) # one may experiment with this number
storage_context = StorageContext.from_defaults(index_store=index, docstore=index.docstore)
retriever = AutoMergingRetriever(base_retriever, index.storage_context, verbose=True)
print("7")
setup the query engine
query_engine = RetrieverQueryEngine.from_args( retriever,
node_postprocessors=[rerank],
service_context=service_context,
text_qa_template= PromptTemplate(DAX_template_str),
) query_engine = index.as_query_engine()
print("8")
query
response = query_engine.query("what is the revenue of airbus?") print(response) response = query_engine.query("what is the revenue of Brenntag?") print(response) response = query_engine.query("what is the revenue of Zalando?") print(response)
Relevant Logs/Tracebacks