Chromadb embeddings not working with densex

run-llama / llama_index

LlamaIndex is a data framework for your LLM applications

MIT License

36.85k stars 5.28k forks source link

nodes = text_splitter.get_nodes_from_documents(documents) sub_nodes = self._gen_propositions(nodes) all_nodes = nodes + sub_nodes all_nodes_dict = {n.node_id: n for n in all_nodes} service_context = ServiceContext.from_defaults( llm=query_llm, embed_model=embed_model, num_output=self._proposition_llm.metadata.num_output, ) if os.path.exists('./chroma_db'): chroma_client = chromadb.PersistentClient(path="./chroma_db") chroma_collection = chroma_client.get_or_create_collection("quickstart") vector_store = ChromaVectorStore(chroma_collection=chroma_collection) storage_context = StorageContext.from_defaults(vector_store=vector_store) self.vector_index = VectorStoreIndex.from_vector_store(vector_store,service_context=service_context,storage_context=storage_context) else: chroma_client = chromadb.PersistentClient(path="./chroma_db") chroma_collection = chroma_client.get_or_create_collection("quickstart") vector_store = ChromaVectorStore(chroma_collection=chroma_collection) storage_context = StorageContext.from_defaults(vector_store=vector_store) self.vector_index = VectorStoreIndex( all_nodes, service_context=service_context, show_progress=True,storage_context=storage_context ) self.retriever = RecursiveRetriever( "vector", #root_id="root_retriever_id", retriever_dict={ "vector": self.vector_index.as_retriever( similarity_top_k=similarity_top_k ), #"root_retriever_id": retriever_instance, }, #query_engine_dict=query_engine_dict, node_dict=all_nodes_dict, ) self.query_engine = RetrieverQueryEngine.from_args( self.retriever, service_context=service_context ) async def _aget_proposition(self, node: TextNode) -> List[TextNode]: """Get proposition.""" inital_output = await self._proposition_llm.apredict( PROPOSITIONS_PROMPT, node_text=node.text ) outputs = inital_output.split("\n") all_propositions = [] for output in outputs: if not output.strip(): continue if not output.strip().endswith("]"): if not output.strip().endswith('"') and not output.strip().endswith( "," ): output 
= output + '"' output = output + " ]" if not output.strip().startswith("["): if not output.strip().startswith('"'): output = '"' + output output = "[ " + output try: propositions = json.loads(output) except Exception: # fallback to yaml try: propositions = yaml.safe_load(output) except Exception: # fallback to next output continue if not isinstance(propositions, list): continue all_propositions.extend(propositions) #time.sleep(1) assert isinstance(all_propositions, list) nodes = [TextNode(text=prop) for prop in all_propositions if prop] await asyncio.sleep(1) return [IndexNode.from_text_node(n, node.node_id) for n in nodes] def _gen_propositions(self, nodes: List[TextNode]) -> List[TextNode]: """Get propositions.""" sub_nodes = asyncio.run( run_jobs( [self._aget_proposition(node) for node in nodes], show_progress=True, workers=8, ) ) # Flatten list return [node for sub_node in sub_nodes for node in sub_node How to solve the error

To resolve the `ValueError: Query id node-0 not found in either retriever_dict or query_engine_dict` error that appears when the embeddings stored in ChromaDB are used a second time, ensure that the query id is correctly mapped and available in `retriever_dict` or `query_engine_dict`. This error typically occurs when the query id was never properly registered or has been removed.

Here are some steps to troubleshoot and resolve the issue:

Ensure Query ID Mapping:
- Verify that the query_id (e.g., node-0) is correctly mapped in the retriever_dict or query_engine_dict.
Check Node IDs and Document Store:
- Ensure that the node_ids and doc_ids are correctly set and that the docstore contains the necessary nodes.
Initialization and Usage of ChromaDB:
- Ensure that the ChromaDB collection is properly initialized and that the nodes are correctly added to the collection.

Here is a relevant code snippet that handles the retrieval and node mapping:

def _build_node_list_from_query_result(
    self, query_result: VectorStoreQueryResult
) -> List[NodeWithScore]:
    """Turn a vector store query result into a list of scored nodes.

    If the result carries only ids, each id is translated through the index
    struct's ``nodes_dict`` and the nodes are fetched from the docstore.
    If the result already carries nodes, a node is swapped for the docstore
    entry with the same node id when the vector store does not store text,
    or when the node has a source node whose type is not
    ``ObjectType.TEXT`` -- but only if the docstore reports that the id
    exists.

    ``query_result.nodes`` is updated in place either way.

    Args:
        query_result: The raw result returned by the vector store query.

    Returns:
        One ``NodeWithScore`` per node, in result order. The score is taken
        from ``query_result.similarities`` at the same position, or is
        ``None`` when no similarities were returned.

    Raises:
        ValueError: If the result contains neither nodes nor ids.
    """
    if query_result.nodes is not None:
        for position, candidate in enumerate(query_result.nodes):
            parent = candidate.source_node
            prefer_docstore = (not self._vector_store.stores_text) or (
                parent is not None and parent.node_type != ObjectType.TEXT
            )
            if not prefer_docstore:
                continue
            candidate_id = candidate.node_id
            # Only replace the node when the docstore actually has it.
            if self._docstore.document_exists(candidate_id):
                query_result.nodes[position] = self._docstore.get_node(
                    candidate_id
                )  # type: ignore[index]
    elif query_result.ids is None:
        raise ValueError(
            "Vector store query result should return at "
            "least one of nodes or ids."
        )
    else:
        index_struct = self._index.index_struct
        assert isinstance(index_struct, IndexDict)
        # Map the vector store ids to node ids before hitting the docstore.
        resolved_ids = [index_struct.nodes_dict[idx] for idx in query_result.ids]
        query_result.nodes = self._docstore.get_nodes(resolved_ids)

    log_vector_store_query_result(query_result)

    similarities = query_result.similarities
    return [
        NodeWithScore(
            node=node,
            score=None if similarities is None else similarities[rank],
        )
        for rank, node in enumerate(query_result.nodes)
    ]

Example Initialization and Usage of ChromaDB

from chromadb import HttpClient
from llama_index.legacy.vector_stores.chroma import ChromaVectorStore
# VectorStoreQuery is used to build the query below, so it must be imported too;
# without this line the example stops with a NameError.
from llama_index.legacy.vector_stores.types import VectorStoreQuery

# Initialize ChromaDB client and collection
client = HttpClient(host='localhost', port='8000', ssl=False)
collection = client.get_or_create_collection(name='my_collection')

# Initialize ChromaVectorStore
vector_store = ChromaVectorStore(chroma_collection=collection)

# Add nodes to the collection
nodes = [...]  # Placeholder: replace with a list of BaseNode instances with embeddings
vector_store.add(nodes)

# Query the collection
# Placeholder: replace [...] with the embedding vector of the query
query = VectorStoreQuery(query_embedding=[...], similarity_top_k=10)
result = vector_store.query(query)

Debugging Steps

Verify query_result.ids:
- Ensure query_result.ids is not None and contains valid ids.
Check docstore:
- Ensure the docstore has the nodes corresponding to these ids.
Check index_struct.nodes_dict:
- Ensure index_struct.nodes_dict correctly maps the ids to node ids.

By following these steps, you should be able to resolve the error and ensure that the query ids are correctly registered and available in the retriever_dict or query_engine_dict ^[1]^[2].

To continue talking to Dosu, mention @dosu.

run-llama / llama_index

Chromadb embeddings not working with densex #14141

Question Validation

Question

Example Initialization and Usage of ChromaDB

Debugging Steps