Open · farrinfedra opened this issue 3 months ago

Question
Hello, how can I do multi-doc RAG using Weaviate as a vector store?
To perform multi-document Retrieval-Augmented Generation (RAG) using Weaviate as a vector store, follow these steps:
Install Required Packages:
%pip install llama-index-readers-github
%pip install llama-index-vector-stores-weaviate
%pip install llama-index-llms-openai
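Note: the connection code below uses the v3 `weaviate.Client` API. If your environment has weaviate-client v4+ installed (an assumption worth checking), that entry point no longer works as written; pinning the older client is one way to keep these snippets runnable as-is:
%pip install "weaviate-client<4"  # assumption: keeps the v3 `weaviate.Client` API used below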
Setup and Download Data:
import nest_asyncio
nest_asyncio.apply()
import os
os.environ["GITHUB_TOKEN"] = "ghp_..."
os.environ["OPENAI_API_KEY"] = "sk-..."
from llama_index.readers.github import (
    GitHubRepositoryIssuesReader,
    GitHubIssuesClient,
)

github_client = GitHubIssuesClient()
loader = GitHubRepositoryIssuesReader(
    github_client,
    owner="run-llama",
    repo="llama_index",
    verbose=True,
)
orig_docs = loader.load_data()
# Tag each issue with its issue number and keep only the first `limit` documents.
limit = 100
docs = []
for idx, doc in enumerate(orig_docs):
    doc.metadata["index_id"] = int(doc.id_)
    if idx >= limit:
        break
    docs.append(doc)
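The processing step further down assumes each issue document carries `state`, `created_at`, and `labels` metadata, plus the `index_id` set above. A quick, optional sanity check (the printed values are illustrative):
# Optional: confirm the metadata fields that later steps rely on.
sample = docs[0]
print(sample.metadata["state"])       # "open" or "closed"
print(sample.metadata["created_at"])  # ISO timestamp, e.g. "2024-01-11T..."
print(sample.metadata["labels"])      # list of label strings, e.g. ["size:XS"]
print(sample.metadata["index_id"])    # issue number, used for per-doc filtering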
Setup the Vector Store and Index:
import weaviate

# Connect to Weaviate Cloud (replace the URL and API key with your own cluster's).
auth_config = weaviate.AuthApiKey(api_key="XRa15cDIkYRT7AkrpqT6jLfE4wropK1c1TGk")
client = weaviate.Client(
    "https://llama-index-test-v0oggsoz.weaviate.network",
    auth_client_secret=auth_config,
)
class_name = "LlamaIndex_docs"
client.schema.delete_class(class_name) # optional: delete schema
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core import VectorStoreIndex, StorageContext
vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name=class_name
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Chunk and index the full issue documents into Weaviate.
doc_index = VectorStoreIndex.from_documents(docs, storage_context=storage_context)
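As an optional sanity check, you can retrieve directly from `doc_index` before building anything on top of it; the query string here is just a placeholder:
# Optional: verify that chunks landed in Weaviate.
test_retriever = doc_index.as_retriever(similarity_top_k=2)
for r in test_retriever.retrieve("bug in the Weaviate integration"):  # placeholder query
    print(r.node.metadata.get("index_id"), r.score)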
Create IndexNodes for Retrieval and Filtering:
from llama_index.core import SummaryIndex
from llama_index.core.async_utils import run_jobs
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import IndexNode
from llama_index.core.vector_stores import (
    FilterOperator,
    MetadataFilter,
    MetadataFilters,
)
async def aprocess_doc(doc, include_summary: bool = True):
    """Process a single issue document into a summary IndexNode."""
    metadata = doc.metadata

    # Split the ISO timestamp into structured year/month/day fields.
    date_tokens = metadata["created_at"].split("T")[0].split("-")
    year = int(date_tokens[0])
    month = int(date_tokens[1])
    day = int(date_tokens[2])

    assignee = "" if "assignee" not in doc.metadata else doc.metadata["assignee"]

    # Pull the value out of a "size:<value>" label, if one exists.
    size = ""
    if len(doc.metadata["labels"]) > 0:
        size_arr = [l for l in doc.metadata["labels"] if "size:" in l]
        size = size_arr[0].split(":")[1] if len(size_arr) > 0 else ""

    new_metadata = {
        "state": metadata["state"],
        "year": year,
        "month": month,
        "day": day,
        "assignee": assignee,
        "size": size,
    }

    # Summarize the issue in one sentence with an LLM.
    summary_index = SummaryIndex.from_documents([doc])
    query_str = "Give a one-sentence concise summary of this issue."
    query_engine = summary_index.as_query_engine(llm=OpenAI(model="gpt-3.5-turbo"))
    summary_txt = await query_engine.aquery(query_str)
    summary_txt = str(summary_txt)

    # Restrict the attached retriever to this one issue's chunks in the doc index.
    index_id = doc.metadata["index_id"]
    filters = MetadataFilters(
        filters=[
            MetadataFilter(
                key="index_id", operator=FilterOperator.EQ, value=int(index_id)
            ),
        ]
    )

    # The IndexNode holds the summary as text, the structured metadata for
    # auto-retrieval, and the filtered per-document retriever as its object.
    index_node = IndexNode(
        text=summary_txt,
        metadata=new_metadata,
        obj=doc_index.as_retriever(filters=filters),
        index_id=doc.id_,
    )
    return index_node
async def aprocess_docs(docs):
    """Process all documents concurrently, with a progress bar."""
    tasks = []
    for doc in docs:
        task = aprocess_doc(doc)
        tasks.append(task)
    index_nodes = await run_jobs(tasks, show_progress=True, workers=3)
    return index_nodes

index_nodes = await aprocess_docs(docs)
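Each resulting IndexNode carries the one-sentence summary as its text, the structured metadata used for auto-retrieval filtering, and (via `obj`) a retriever scoped to that one issue's chunks. A quick spot-check:
# Spot-check one IndexNode before indexing the summaries.
node = index_nodes[0]
print(node.index_id)  # source document id
print(node.metadata)  # {"state": ..., "year": ..., "month": ..., "day": ..., ...}
print(node.text)      # the one-sentence issue summary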
Create the Top-Level AutoRetriever:
auth_config = weaviate.AuthApiKey(api_key="XRa15cDIkYRT7AkrpqT6jLfE4wropK1c1TGk")
client = weaviate.Client(
    "https://llama-index-test-v0oggsoz.weaviate.network",
    auth_client_secret=auth_config,
)

class_name = "LlamaIndex_auto"
client.schema.delete_class(class_name)  # optional: delete schema

vector_store_auto = WeaviateVectorStore(
    weaviate_client=client, index_name=class_name
)
storage_context_auto = StorageContext.from_defaults(vector_store=vector_store_auto)

# Index only the summary IndexNodes. Passing them via `objects=` is what enables
# recursive retrieval: a retrieved IndexNode hands off to its attached retriever.
index = VectorStoreIndex(objects=index_nodes, storage_context=storage_context_auto)
Setup Composable Auto-Retriever:
from llama_index.core.vector_stores import MetadataInfo, VectorStoreInfo
vector_store_info = VectorStoreInfo(
    content_info="GitHub issues",
    metadata_info=[
        MetadataInfo(name="state", description="Whether the issue is `open` or `closed`", type="string"),
        MetadataInfo(name="year", description="The year the issue was created", type="integer"),
        MetadataInfo(name="month", description="The month the issue was created", type="integer"),
        MetadataInfo(name="day", description="The day the issue was created", type="integer"),
        MetadataInfo(name="assignee", description="The assignee of the ticket", type="string"),
        MetadataInfo(name="size", description="How big the issue is (XS, S, M, L, XL, XXL)", type="string"),
    ],
)
from llama_index.core.retrievers import VectorIndexAutoRetriever
retriever = VectorIndexAutoRetriever(
    index,
    vector_store_info=vector_store_info,
    similarity_top_k=2,
    # If the LLM infers only metadata filters (no semantic query), return up to 10 nodes.
    empty_query_top_k=10,
    verbose=True,
)
Try It Out:
from llama_index.core import QueryBundle
nodes = retriever.retrieve(QueryBundle("Tell me about some issues on 01/11"))
print(f"Number of source nodes: {len(nodes)}")
print(nodes[0].node.metadata)
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.openai import OpenAI
llm = OpenAI(model="gpt-3.5-turbo")
query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)
response = query_engine.query("Tell me about some issues on 01/11")
print(str(response))
response = query_engine.query("Tell me about some open issues related to agents")
print(str(response))
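To see which issues the recursive retrieval actually pulled chunks from, inspect the response's source nodes (standard fields on LlamaIndex response objects):
# Provenance: each source node comes from a specific issue's chunks.
for sn in response.source_nodes:
    print(sn.node.metadata.get("index_id"), sn.score)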
By following these steps, you can set up a multi-document RAG system using Weaviate as a vector store. Alternatively, the MultiDocAutoRetrieverPack class from the LlamaIndex library streamlines the same process: it indexes the metadata nodes and the document chunks separately and wires up recursive retrieval for more efficient querying [1][2][3][4]. A sketch of fetching the pack follows below.
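If you would rather use the packaged version, it can be fetched with download_llama_pack; this sketch only covers downloading it, since the constructor arguments (Weaviate client, index names, documents, metadata nodes, etc.) are best checked against the downloaded pack's README:
from llama_index.core.llama_pack import download_llama_pack

# Downloads the pack's source into ./multidoc_autoretriever_pack for inspection.
MultiDocAutoRetrieverPack = download_llama_pack(
    "MultiDocAutoRetrieverPack", "./multidoc_autoretriever_pack"
)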
To continue talking to Dosu, mention @dosu.