run-llama / llama_index

LlamaIndex is a data framework for your LLM applications
https://docs.llamaindex.ai

[Question]: Multi-doc RAG using Weaviate vector store #15530

Open farrinfedra opened 3 months ago

farrinfedra commented 3 months ago

Question

Hello, how can I do multi-document RAG using Weaviate as the vector store?

dosubot[bot] commented 3 months ago

To perform multi-document Retrieval-Augmented Generation (RAG) using Weaviate as a vector store, follow these steps:

  1. Install Required Packages:

    %pip install llama-index-readers-github
    %pip install llama-index-vector-stores-weaviate
    %pip install llama-index-llms-openai
  2. Setup and Download Data:

    # Allow nested event loops (needed to run async code inside a notebook).
    import nest_asyncio
    nest_asyncio.apply()
    
    import os
    os.environ["GITHUB_TOKEN"] = "ghp_..."
    os.environ["OPENAI_API_KEY"] = "sk-..."
    
    from llama_index.readers.github import (
        GitHubRepositoryIssuesReader,
        GitHubIssuesClient,
    )
    
    github_client = GitHubIssuesClient()
    loader = GitHubRepositoryIssuesReader(
        github_client,
        owner="run-llama",
        repo="llama_index",
        verbose=True,
    )
    
    orig_docs = loader.load_data()
    
    # Keep only the first `limit` issues, tagging each with its issue id so
    # per-document metadata filtering works later.
    limit = 100
    docs = []
    for idx, doc in enumerate(orig_docs):
        if idx >= limit:
            break
        doc.metadata["index_id"] = int(doc.id_)
        docs.append(doc)
  3. Setup the Vector Store and Index:

    import weaviate
    
    # Replace with your own Weaviate cluster URL and API key. Note that
    # `weaviate.Client` is the weaviate-client v3 API; with the v4 client
    # the connection code differs.
    auth_config = weaviate.AuthApiKey(api_key="<your-weaviate-api-key>")
    client = weaviate.Client(
        "https://<your-cluster>.weaviate.network",
        auth_client_secret=auth_config,
    )
    
    class_name = "LlamaIndex_docs"
    client.schema.delete_class(class_name)  # optional: drop any existing class with this name
    
    from llama_index.vector_stores.weaviate import WeaviateVectorStore
    from llama_index.core import VectorStoreIndex, StorageContext
    
    vector_store = WeaviateVectorStore(
        weaviate_client=client, index_name=class_name
    )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    
    doc_index = VectorStoreIndex.from_documents(docs, storage_context=storage_context)
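
At this point you can sanity-check the per-document index with a plain retrieval call before building anything on top of it (a minimal sketch; the query string is just an example):

    retriever = doc_index.as_retriever(similarity_top_k=2)
    for result in retriever.retrieve("bug in the query engine"):
        print(result.node.metadata.get("index_id"), result.score)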
  4. Create IndexNodes for Retrieval and Filtering:

    from llama_index.core import SummaryIndex
    from llama_index.core.async_utils import run_jobs
    from llama_index.llms.openai import OpenAI
    from llama_index.core.schema import IndexNode
    from llama_index.core.vector_stores import (
        FilterOperator,
        MetadataFilter,
        MetadataFilters,
    )
    
    async def aprocess_doc(doc):
        """Summarize one issue and attach metadata used for auto-retrieval."""
        metadata = doc.metadata
    
        # Parse the ISO timestamp ("YYYY-MM-DDTHH:MM:SSZ") into date parts.
        date_tokens = metadata["created_at"].split("T")[0].split("-")
        year, month, day = (int(t) for t in date_tokens)
    
        assignee = metadata.get("assignee", "")
    
        # Extract the issue size from labels such as "size:XL", if present.
        size = ""
        size_arr = [l for l in metadata.get("labels", []) if "size:" in l]
        if size_arr:
            size = size_arr[0].split(":")[1]
    
        new_metadata = {
            "state": metadata["state"],
            "year": year,
            "month": month,
            "day": day,
            "assignee": assignee,
            "size": size,
        }
    
        # Generate a one-sentence summary of the issue with an LLM.
        summary_index = SummaryIndex.from_documents([doc])
        query_str = "Give a one-sentence concise summary of this issue."
        query_engine = summary_index.as_query_engine(
            llm=OpenAI(model="gpt-3.5-turbo")
        )
        summary_txt = str(await query_engine.aquery(query_str))
    
        # Filters that restrict retrieval to chunks from this document only.
        index_id = metadata["index_id"]
        filters = MetadataFilters(
            filters=[
                MetadataFilter(
                    key="index_id", operator=FilterOperator.EQ, value=int(index_id)
                ),
            ]
        )
    
        # The IndexNode stores the summary text for top-level retrieval and
        # wraps a retriever over the full document index, filtered to this doc.
        index_node = IndexNode(
            text=summary_txt,
            metadata=new_metadata,
            obj=doc_index.as_retriever(filters=filters),
            index_id=doc.id_,
        )
    
        return index_node
    
    async def aprocess_docs(docs):
        # Summarize all documents concurrently, up to 3 at a time.
        tasks = [aprocess_doc(doc) for doc in docs]
        return await run_jobs(tasks, show_progress=True, workers=3)
    
    # Top-level `await` works in a notebook (nest_asyncio was applied above);
    # in a plain script, use `asyncio.run(aprocess_docs(docs))` instead.
    index_nodes = await aprocess_docs(docs)
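
Each resulting IndexNode carries the one-sentence summary as its text and the filtered retriever as its attached object; you can inspect one to verify (a quick check, not part of the pipeline):

    node = index_nodes[0]
    print(node.index_id)       # id of the source document
    print(node.metadata)       # {"state": ..., "year": ..., ...}
    print(node.get_content())  # the one-sentence summary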
  5. Create the Top-Level AutoRetriever:

    # Reuse the Weaviate client from step 3, but store the summary nodes in a
    # separate class so they are indexed apart from the raw document chunks.
    class_name = "LlamaIndex_auto"
    client.schema.delete_class(class_name)  # optional: drop any existing class with this name
    
    vector_store_auto = WeaviateVectorStore(
        weaviate_client=client, index_name=class_name
    )
    storage_context_auto = StorageContext.from_defaults(vector_store=vector_store_auto)
    
    # Index the summary IndexNodes; retrieving one of them triggers recursive
    # retrieval through the per-document retriever it wraps.
    index = VectorStoreIndex(objects=index_nodes, storage_context=storage_context_auto)
  6. Setup Composable Auto-Retriever:

    from llama_index.core.vector_stores import MetadataInfo, VectorStoreInfo
    
    vector_store_info = VectorStoreInfo(
        content_info="Github Issues",
        metadata_info=[
            MetadataInfo(name="state", description="Whether the issue is `open` or `closed`", type="string"),
            MetadataInfo(name="year", description="The year the issue was created", type="integer"),
            MetadataInfo(name="month", description="The month the issue was created", type="integer"),
            MetadataInfo(name="day", description="The day the issue was created", type="integer"),
            MetadataInfo(name="assignee", description="The assignee of the ticket", type="string"),
            MetadataInfo(name="size", description="How big the issue is (XS, S, M, L, XL, XXL)", type="string"),
        ],
    )
    
    from llama_index.core.retrievers import VectorIndexAutoRetriever
    
    retriever = VectorIndexAutoRetriever(
        index,
        vector_store_info=vector_store_info,
        similarity_top_k=2,
        empty_query_top_k=10,  # top-k to use when the inferred query is metadata-only
        verbose=True,
    )
  7. Try It Out:

    from llama_index.core import QueryBundle
    
    nodes = retriever.retrieve(QueryBundle("Tell me about some issues on 01/11"))
    print(f"Number of source nodes: {len(nodes)}")
    print(nodes[0].node.metadata)
    
    from llama_index.core.query_engine import RetrieverQueryEngine
    from llama_index.llms.openai import OpenAI
    
    llm = OpenAI(model="gpt-3.5-turbo")
    query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)
    
    response = query_engine.query("Tell me about some issues on 01/11")
    print(str(response))
    
    response = query_engine.query("Tell me about some open issues related to agents")
    print(str(response))

By following these steps, you can set up a multi-document RAG system with Weaviate as the vector store. Alternatively, the MultiDocAutoRetrieverPack from the LlamaIndex library streamlines the same pattern: it indexes the metadata nodes and document chunks separately and wires up recursive retrieval for more efficient querying [1][2][3][4]; see the sketch below.
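
For reference, here is a rough sketch of the pack-based route. `download_llama_pack` and the pack name are real, but the constructor arguments shown are an assumption based on the pack's documentation and may differ across versions, so verify them against the pack's README:

    from llama_index.core.llama_pack import download_llama_pack
    
    # Download the pack's source code into a local directory.
    MultiDocAutoRetrieverPack = download_llama_pack(
        "MultiDocAutoRetrieverPack", "./multidoc_autoretriever_pack"
    )
    
    # Assumed constructor signature; check the pack's README before relying on it.
    pack = MultiDocAutoRetrieverPack(
        client,  # the Weaviate client from the steps above
        doc_metadata_index_name="LlamaIndex_auto",
        doc_chunks_index_name="LlamaIndex_docs",
        metadata_nodes=index_nodes,
        docs=docs,
        doc_metadata_schema=vector_store_info,
        verbose=True,
    )
    
    response = pack.run("Tell me about some open issues related to agents")
    print(str(response))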

To continue talking to Dosu, mention @dosu.