Open robertobalestri opened 3 weeks ago
Ciao @robertobalestri! Sono qui per aiutarti con qualsiasi problema o domanda tu possa avere. Sono un bot che può aiutarti a risolvere bug, rispondere a domande e diventare un collaboratore.
To update your vector store with new PDF documents without recalculating embeddings for the entire document set, you can modify your script to recognize previously added documents and only process new ones. Here's how you can achieve this:
Use a Document Store Strategy: Implement a strategy to handle duplicates and only process new documents by checking the document hash.
Modify the Script: Update your script to include this strategy and ensure that only new documents are processed.
Here's the modified script:
import streamlit as st
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core import SimpleDirectoryReader, Settings, VectorStoreIndex, StorageContext
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.readers.file import PDFReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.ingestion import DocstoreStrategy, IngestionPipeline, IngestionCache
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.core.node_parser import SentenceSplitter
import chromadb
def main():
    """Streamlit entry point.

    Builds (or reopens) a Chroma-backed vector index over the PDFs in
    ./content/Documents, then runs a Redis-backed ingestion pipeline that
    only splits/embeds documents whose hashes are not already recorded —
    so previously ingested files are never re-embedded.
    """
    st.title("Il tuo itinerario in Campania")

    # LLM: Azure OpenAI, credentials pulled from Streamlit secrets.
    llm = AzureOpenAI(
        engine=st.secrets["AZURE_OPENAI_LLM_DEPLOYMENT_NAME"],
        model="gpt-4o",
        temperature=0.0,
        azure_endpoint=st.secrets["AZURE_OPENAI_API_ENDPOINT"],
        api_key=st.secrets["AZURE_OPENAI_API_KEY"],
        api_version=st.secrets["AZURE_OPENAI_API_VERSION"],
    )

    # Embeddings: local Ollama server.
    embed_model = OllamaEmbedding(
        model_name="mxbai-embed-large:335m",
        base_url="http://localhost:11434",
    )

    # Register both models as the library-wide defaults.
    Settings.llm = llm
    Settings.embed_model = embed_model

    # Load all PDFs. filename_as_id=True gives each document a stable id
    # derived from its file name, which the ingestion docstore below uses
    # to recognise documents it has already seen.
    parser = PDFReader()
    file_extractor = {".pdf": parser}
    documents = SimpleDirectoryReader(
        "./content/Documents", file_extractor=file_extractor, filename_as_id=True
    ).load_data()

    # Persistent on-disk ChromaDB client.
    db = chromadb.PersistentClient(path="./chroma_db")

    try:
        # Fast path: the collection already exists — wrap it without
        # re-embedding anything.
        chroma_collection = db.get_collection("quickstart")
        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
        vector_store_index = VectorStoreIndex.from_vector_store(
            vector_store,
            embed_model=embed_model,
        )
        print("Index loaded from storage.")
    except Exception as e:
        # Slow path: first run (collection missing) or a broken store.
        # NOTE(review): this branch also triggers on a plain first run,
        # where the user-facing error message below is misleading —
        # consider distinguishing "not created yet" from real failures.
        print(f"Failed to load index from storage: {e}")
        st.write("Qualcosa è andato storto. Riprova più tardi.")
        chroma_collection = db.get_or_create_collection("quickstart")
        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        # Embed and store the full document set once.
        vector_store_index = VectorStoreIndex.from_documents(
            documents,
            storage_context=storage_context,
            embed_model=embed_model,
            show_progress=True,
        )
        # Plain string: the original used an f-string with no placeholders.
        print("Index created and stored in: ./chroma_db")

    # Ingestion pipeline: the Redis docstore records document hashes, so
    # with DUPLICATES_ONLY only new or changed documents are split,
    # embedded, and upserted into the vector store.
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(),
            embed_model,
        ],
        docstore=RedisDocumentStore.from_host_and_port(
            "localhost", 6379, namespace="document_store"
        ),
        vector_store=vector_store,
        cache=IngestionCache(
            cache=RedisCache.from_host_and_port("localhost", 6379),
            collection="redis_cache",
        ),
        docstore_strategy=DocstoreStrategy.DUPLICATES_ONLY,
    )

    # Returns only the nodes that were actually (re)ingested this run.
    nodes = pipeline.run(documents=documents)
    print(f"Ingested {len(nodes)} Nodes")


if __name__ == "__main__":
    main()
DocstoreStrategy.DUPLICATES_ONLY
to handle duplicates and only process new documents. This setup ensures that previously added documents are recognized, and only new ones are processed, avoiding the need to recalculate embeddings for the entire document set [1][2].
To continue talking to Dosu, mention @dosu.
@dosu
I've reached this point: chroma_collection = db.get_collection("quickstart") vector_store = ChromaVectorStore(chroma_collection=chroma_collection) storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_store_index = VectorStoreIndex.from_vector_store(
vector_store,
embed_model=embed_model,
)
print("Index loaded from storage.")
# Load and add new documents
new_documents = load_new_documents(folder_path)
if new_documents:
# Parse documents into nodes
print("Parsing new documents into nodes...")
parser = SimpleNodeParser()
new_nodes = parser.get_nodes_from_documents(new_documents)
# Add nodes to the existing index
print("Adding new nodes to the existing index...")
vector_store_index.insert_nodes(new_nodes)
print("New documents added to the index.")
The problem is that the new documents' nodes don't have embeddings; they're added as plain document chunks, I think, because the embedding model doesn't even start.
To ensure that the new nodes have embeddings generated by the embedder model, you need to explicitly generate embeddings for the new documents before adding them to the vector store index. Here's how you can modify your script to achieve this:
Here's the updated code snippet:
import streamlit as st
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core import SimpleDirectoryReader, Settings, VectorStoreIndex, StorageContext
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.readers.file import PDFReader
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
def _load_new_documents(folder_path):
    """Load every PDF under *folder_path* as llama_index Documents.

    filename_as_id=True gives each document a stable id derived from its
    file name, so downstream deduplication can recognise already-ingested
    files.  NOTE(review): this loads *all* files in the folder; filtering
    out previously processed ones must still happen downstream (e.g. via
    an IngestionPipeline docstore) — confirm against the caller's intent.
    """
    return SimpleDirectoryReader(
        folder_path,
        file_extractor={".pdf": PDFReader()},
        filename_as_id=True,
    ).load_data()


def main():
    """Streamlit entry point.

    Reopens the existing Chroma-backed index, loads documents from the
    source folder, explicitly embeds their nodes, and inserts them via
    the public ``insert_nodes`` API so they are stored *with* vectors.
    """
    # Local imports: PDFReader is needed by the loader helper;
    # SentenceSplitter is the current name of the default node parser
    # (the original code referenced the undefined/deprecated
    # ``SimpleNodeParser``).
    from llama_index.core.node_parser import SentenceSplitter
    from llama_index.readers.file import PDFReader  # noqa: F811 (local scope)

    st.title("Il tuo itinerario in Campania")

    # LLM: Azure OpenAI, credentials pulled from Streamlit secrets.
    llm = AzureOpenAI(
        engine=st.secrets["AZURE_OPENAI_LLM_DEPLOYMENT_NAME"],
        model="gpt-4o",
        temperature=0.0,
        azure_endpoint=st.secrets["AZURE_OPENAI_API_ENDPOINT"],
        api_key=st.secrets["AZURE_OPENAI_API_KEY"],
        api_version=st.secrets["AZURE_OPENAI_API_VERSION"],
    )

    # Embeddings: local Ollama server.
    embed_model = OllamaEmbedding(
        model_name="mxbai-embed-large:335m",
        base_url="http://localhost:11434",
    )

    # Register both models as the library-wide defaults.
    Settings.llm = llm
    Settings.embed_model = embed_model

    # Reopen the existing persistent collection and wrap it in an index
    # without re-embedding anything.
    db = chromadb.PersistentClient(path="./chroma_db")
    chroma_collection = db.get_collection("quickstart")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    vector_store_index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )
    print("Index loaded from storage.")

    # Load and add new documents.  The original code called an undefined
    # ``load_new_documents(folder_path)``; both the helper and the path
    # are now defined explicitly.
    folder_path = "./content/Documents"
    new_documents = _load_new_documents(folder_path)
    if new_documents:
        # Parse documents into nodes.
        print("Parsing new documents into nodes...")
        parser = SentenceSplitter()
        new_nodes = parser.get_nodes_from_documents(new_documents)

        # Generate embeddings explicitly, using the *public*
        # get_text_embedding API (the original called the private
        # _get_text_embedding).
        print("Generating embeddings for new nodes...")
        for node in new_nodes:
            node.embedding = embed_model.get_text_embedding(node.text)

        # Insert via the public insert_nodes API (the original reached
        # into the private _add_nodes_to_index).
        print("Adding new nodes to the existing index...")
        vector_store_index.insert_nodes(new_nodes)
        print("New documents added to the index.")


if __name__ == "__main__":
    main()
_add_nodes_to_index
method to add nodes with embeddings to the existing index. This ensures that the new documents are processed, embeddings are generated, and the nodes are correctly added to the vector store index [1][2].
To continue talking to Dosu, mention @dosu.
Question Validation
Question
I have this code:
I want to add new documents to the folder and have the script automatically integrate them into the vector database without recalculating embeddings for the entire document set. Additionally, I don't want to delete any documents from the folder; the program should recognize previously added documents and only generate and store embeddings for new ones that haven't been processed yet.
I really can't understand if it's possible.
Thanks to whoever will help me <3