run-llama / llama_index

LlamaIndex is a data framework for your LLM applications
https://docs.llamaindex.ai

[Question]: Update VectorStore if new pdf added to the folder #15626

Open robertobalestri opened 3 weeks ago

robertobalestri commented 3 weeks ago

Question

I have this code:

import streamlit as st
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core import SimpleDirectoryReader, Settings, VectorStoreIndex, StorageContext
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.readers.file import PDFReader
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

def main():

    st.title("Il tuo itinerario in Campania")

    # Configure Azure OpenAI
    llm = AzureOpenAI(
        engine=st.secrets["AZURE_OPENAI_LLM_DEPLOYMENT_NAME"],
        model="gpt-4o",
        temperature=0.0,
        azure_endpoint=st.secrets["AZURE_OPENAI_API_ENDPOINT"],
        api_key=st.secrets["AZURE_OPENAI_API_KEY"],
        api_version=st.secrets["AZURE_OPENAI_API_VERSION"],
    )

    embed_model = OllamaEmbedding(
        model_name="mxbai-embed-large:335m",
        base_url="http://localhost:11434",
    )

    # Set configuration parameters
    Settings.llm = llm
    Settings.embed_model = embed_model

    # Load documents
    parser = PDFReader()
    file_extractor = {".pdf": parser}
    documents = SimpleDirectoryReader(
        "./content/Documents", file_extractor=file_extractor
    ).load_data()

    # Create ChromaDB client and collection
    db = chromadb.PersistentClient(path="./chroma_db")

    # Load or create the vector store index
    try:
        chroma_collection = db.get_collection("quickstart")

        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        vector_store_index = VectorStoreIndex.from_vector_store(
            vector_store,
            embed_model=embed_model,
        )
        print("Index loaded from storage.")

    except Exception as e:
        print(f"Failed to load index from storage: {e}")
        st.write("Qualcosa è andato storto. Riprova più tardi.")

        chroma_collection = db.get_or_create_collection("quickstart")

        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        # Create the index from documents if it doesn't exist
        vector_store_index = VectorStoreIndex.from_documents(
            documents,
            storage_context=storage_context,
            embed_model=embed_model,
            show_progress=True,
        )
        print(f"Index created and stored in: ./chroma_db")

I want to add new documents to the folder and have the script automatically integrate them into the vector database without recalculating embeddings for the entire document set. Additionally, I don't want to delete any documents from the folder; the program should recognize previously added documents and only generate and store embeddings for new ones that haven't been processed yet.

I really can't figure out whether this is possible.

Thanks in advance to whoever helps me <3

dosubot[bot] commented 3 weeks ago

Hi @robertobalestri! I'm here to help you with any problems or questions you may have. I'm a bot that can help you solve bugs, answer questions, and become a contributor.

To update your vector store with new PDF documents without recalculating embeddings for the entire document set, you can modify your script to recognize previously added documents and only process new ones. Here's how you can achieve this:

  1. Use a Document Store Strategy: Implement a strategy that handles duplicates and only processes new documents by checking each document's hash (the available strategies are sketched just after this list).

  2. Modify the Script: Update your script to include this strategy and ensure that only new documents are processed.
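
For reference, here is a minimal sketch of the modes DocstoreStrategy exposes (behavior summarized in the comments; check your installed version, as the enum has evolved across releases):

from llama_index.core.ingestion import DocstoreStrategy

# UPSERTS: re-ingest a document whose id exists but whose hash has changed
# DUPLICATES_ONLY: skip any document whose hash is already in the docstore
# UPSERTS_AND_DELETE: upsert, and also delete docstore entries that are
# missing from the current input
print(list(DocstoreStrategy))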

Here's the modified script:

import streamlit as st
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core import SimpleDirectoryReader, Settings, VectorStoreIndex, StorageContext
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.readers.file import PDFReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.ingestion import DocstoreStrategy, IngestionPipeline, IngestionCache
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.core.node_parser import SentenceSplitter
import chromadb

def main():
    st.title("Il tuo itinerario in Campania")

    # Configure Azure OpenAI
    llm = AzureOpenAI(
        engine=st.secrets["AZURE_OPENAI_LLM_DEPLOYMENT_NAME"],
        model="gpt-4o",
        temperature=0.0,
        azure_endpoint=st.secrets["AZURE_OPENAI_API_ENDPOINT"],
        api_key=st.secrets["AZURE_OPENAI_API_KEY"],
        api_version=st.secrets["AZURE_OPENAI_API_VERSION"],
    )

    embed_model = OllamaEmbedding(
        model_name="mxbai-embed-large:335m",
        base_url="http://localhost:11434",
    )

    # Set configuration parameters
    Settings.llm = llm
    Settings.embed_model = embed_model

    # Load documents; filename_as_id=True gives each document a stable,
    # file-derived id, which the docstore needs to detect duplicates
    parser = PDFReader()
    file_extractor = {".pdf": parser}
    documents = SimpleDirectoryReader(
        "./content/Documents", file_extractor=file_extractor, filename_as_id=True
    ).load_data()

    # Create ChromaDB client and collection
    db = chromadb.PersistentClient(path="./chroma_db")

    # Load or create the vector store index
    try:
        chroma_collection = db.get_collection("quickstart")
        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        vector_store_index = VectorStoreIndex.from_vector_store(
            vector_store,
            embed_model=embed_model,
        )
        print("Index loaded from storage.")

    except Exception as e:
        print(f"Failed to load index from storage: {e}")
        st.write("Qualcosa è andato storto. Riprova più tardi.")

        chroma_collection = db.get_or_create_collection("quickstart")
        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        # Don't build the index from documents here: the ingestion
        # pipeline below embeds and stores them, and calling
        # from_documents as well would embed everything twice.
        # Just wrap the (still empty) collection in an index.
        vector_store_index = VectorStoreIndex.from_vector_store(
            vector_store,
            embed_model=embed_model,
        )
        print("Index created in: ./chroma_db")

    # Ingestion pipeline to handle new documents
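    # NOTE: the docstore and cache below assume a Redis server running on
    # localhost:6379; see the Redis-free variant sketched after this script.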
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(),
            embed_model,
        ],
        docstore=RedisDocumentStore.from_host_and_port(
            "localhost", 6379, namespace="document_store"
        ),
        vector_store=vector_store,
        cache=IngestionCache(
            cache=RedisCache.from_host_and_port("localhost", 6379),
            collection="redis_cache",
        ),
        docstore_strategy=DocstoreStrategy.DUPLICATES_ONLY,
    )

    # Run the pipeline to ingest new documents
    nodes = pipeline.run(documents=documents)
    print(f"Ingested {len(nodes)} Nodes")

if __name__ == "__main__":
    main()

Key Changes:

  1. Document Store Strategy: Added DocstoreStrategy.DUPLICATES_ONLY to handle duplicates and only process new documents.
  2. Ingestion Pipeline: Created an ingestion pipeline to manage the integration of new documents.

This setup ensures that previously added documents are recognized, and only new ones are processed, avoiding the need to recalculate embeddings for the entire document set [1][2].
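
If you'd rather not run Redis, the same hash-based deduplication works with the in-memory SimpleDocumentStore persisted to local disk. A minimal sketch, reusing embed_model, vector_store, and documents from the script above (./pipeline_storage is just a placeholder path):

import os

from llama_index.core.ingestion import DocstoreStrategy, IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore

pipeline = IngestionPipeline(
    transformations=[SentenceSplitter(), embed_model],
    docstore=SimpleDocumentStore(),
    vector_store=vector_store,
    docstore_strategy=DocstoreStrategy.DUPLICATES_ONLY,
)

# Reload previously recorded document hashes so reruns skip old files
if os.path.exists("./pipeline_storage"):
    pipeline.load("./pipeline_storage")

nodes = pipeline.run(documents=documents)
print(f"Ingested {len(nodes)} nodes")

# Persist the docstore (and cache) so the next run remembers what it saw
pipeline.persist("./pipeline_storage")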


robertobalestri commented 3 weeks ago

@dosu

I've reached this point:

    chroma_collection = db.get_collection("quickstart")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    vector_store_index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )
    print("Index loaded from storage.")

    # Load and add new documents
    new_documents = load_new_documents(folder_path)
    if new_documents:
        # Parse documents into nodes
        print("Parsing new documents into nodes...")
        parser = SimpleNodeParser()
        new_nodes = parser.get_nodes_from_documents(new_documents)

        # Add nodes to the existing index
        print("Adding new nodes to the existing index...")
        vector_store_index.insert_nodes(new_nodes)

        print("New documents added to the index.")

The problem is that the new documents' nodes don't have embeddings; I think they're added as plain text chunks, because the embedding model never even starts.

dosubot[bot] commented 3 weeks ago

To ensure that the new nodes have embeddings generated by the embedder model, you need to explicitly generate embeddings for the new documents before adding them to the vector store index. Here's how you can modify your script to achieve this:

  1. Generate Embeddings for New Documents: Use the embedder model to generate embeddings for the new documents.
  2. Add Nodes with Embeddings to the Index: Insert the nodes with embeddings into the existing index.

Here's the updated code snippet:

import streamlit as st
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core import SimpleDirectoryReader, Settings, VectorStoreIndex, StorageContext
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.readers.file import PDFReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.node_parser import SimpleNodeParser
import chromadb

def main():
    st.title("Il tuo itinerario in Campania")

    # Configure Azure OpenAI
    llm = AzureOpenAI(
        engine=st.secrets["AZURE_OPENAI_LLM_DEPLOYMENT_NAME"],
        model="gpt-4o",
        temperature=0.0,
        azure_endpoint=st.secrets["AZURE_OPENAI_API_ENDPOINT"],
        api_key=st.secrets["AZURE_OPENAI_API_KEY"],
        api_version=st.secrets["AZURE_OPENAI_API_VERSION"],
    )

    embed_model = OllamaEmbedding(
        model_name="mxbai-embed-large:335m",
        base_url="http://localhost:11434",
    )

    # Set configuration parameters
    Settings.llm = llm
    Settings.embed_model = embed_model

    # Create ChromaDB client and collection
    db = chromadb.PersistentClient(path="./chroma_db")
    chroma_collection = db.get_collection("quickstart")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    vector_store_index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )
    print("Index loaded from storage.")

    # Load and add new documents (load_new_documents is your helper
    # that returns only documents that haven't been ingested yet)
    new_documents = load_new_documents(folder_path)
    if new_documents:
        # Parse documents into nodes
        print("Parsing new documents into nodes...")
        parser = SimpleNodeParser()
        new_nodes = parser.get_nodes_from_documents(new_documents)

        # Generate embeddings for the new nodes via the public API
        print("Generating embeddings for new nodes...")
        for node in new_nodes:
            node.embedding = embed_model.get_text_embedding(node.get_content())

        # Insert the pre-embedded nodes; insert_nodes only re-embeds
        # nodes whose embedding is still None, so nothing is computed twice
        print("Adding new nodes to the existing index...")
        vector_store_index.insert_nodes(new_nodes)

        print("New documents added to the index.")

if __name__ == "__main__":
    main()

Key Changes:

  1. Generate Embeddings: Added a loop that calls the public get_text_embedding method for each new node before insertion.
  2. Add Nodes with Embeddings: Used the public insert_nodes method, which skips re-embedding any node that already carries an embedding.

This ensures that the new documents are processed, embeddings are generated, and the nodes are correctly added to the vector store index [1][2].
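
To verify the result, you can inspect the Chroma collection directly (reusing chroma_collection and vector_store_index from the script above; the query string is only a placeholder):

# The collection should have grown by the number of inserted nodes
print("Records in collection:", chroma_collection.count())

# Each stored record should carry a non-empty embedding vector
sample = chroma_collection.get(limit=1, include=["embeddings"])
print("Embedding dimensions:", len(sample["embeddings"][0]))

# And do a retrieval round-trip through the updated index
query_engine = vector_store_index.as_query_engine()
print(query_engine.query("What does the newest document cover?"))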
