Closed jjoaqu7 closed 3 months ago
The issue you're encountering is due to the `load_index_from_storage` function returning a list of indices, while you're trying to use the first index as both a `VectorStoreIndex` and a `KnowledgeGraphIndex`. This causes the `AttributeError`, because a `KnowledgeGraphIndex` does not have a `vector_store` attribute.
Here's an enhanced version of your script with the necessary corrections and improvements: it distinguishes between the `VectorStoreIndex` and the `KnowledgeGraphIndex` when loading from storage.

from flask import Flask, request, jsonify
import os
from llama_index.core import (
VectorStoreIndex,
StorageContext,
Document,
load_index_from_storage,
Settings,
PromptTemplate,
KnowledgeGraphIndex
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.nebula import NebulaGraphStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
from llama_index.core.schema import QueryBundle
from llama_index.core.base.base_retriever import BaseRetriever
from llama_index.core.response_synthesizers import TreeSummarize
import base64, logging, json
app = Flask(__name__)
logging.basicConfig(level=logging.DEBUG)
os.environ["NEBULA_USER"] = "root"
os.environ["NEBULA_PASSWORD"] = "nebula"
os.environ["NEBULA_ADDRESS"] = "127.0.0.1:9669"
persist_dir = './storage_graph'
# Configure settings
Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.embed_model = embed_model
Settings.chunk_size = 512
vector_index = None
kg_index = None
query_engine = None
def load_json_nodes(json_directory):
nodes = []
for filename in os.listdir(json_directory):
if filename.endswith('.json'):
with open(os.path.join(json_directory, filename), 'r') as file:
data = json.load(file)
for node_data in data:
sanitized_metadata = {k: base64.urlsafe_b64encode(v.encode()).decode() if isinstance(v, str) else v for k, v in node_data['metadata'].items()}
node = Document(
text=base64.urlsafe_b64encode(node_data['text'].encode()).decode(),
id_=node_data['id_'],
embedding=node_data['embedding'],
metadata=sanitized_metadata
)
nodes.append(node)
return nodes
def initialize_indices():
global vector_index, kg_index, query_engine
storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
if os.path.exists(persist_dir):
indices = load_index_from_storage(storage_context)
for index in indices:
if isinstance(index, VectorStoreIndex):
vector_index = index
elif isinstance(index, KnowledgeGraphIndex):
kg_index = index
else:
json_nodes = load_json_nodes("JSON_nodes_999_large_syll")
documents = [Document(
text=base64.urlsafe_b64decode(node.text).decode(),
id_=node.id_,
metadata={k: base64.urlsafe_b64decode(v).decode() if isinstance(v, str) else v for k, v in node.metadata.items()},
embedding=node.embedding)
for node in json_nodes
]
vector_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
graph_store = NebulaGraphStore(
space_name="test3",
edge_types=["relationship"],
rel_prop_names=["relationship"],
tags=["entity"]
)
kg_index = KnowledgeGraphIndex.from_documents(documents, storage_context=storage_context, graph_store=graph_store)
storage_context.persist()
if vector_index is None or kg_index is None:
raise ValueError("Failed to load both VectorStoreIndex and KnowledgeGraphIndex.")
vector_retriever = VectorIndexRetriever(index=vector_index, top_k=7)
kg_retriever = KnowledgeGraphRAGRetriever(index=kg_index, top_k=7)
class CustomRetriever(BaseRetriever):
def __init__(self, vector_retriever, kg_retriever, mode="OR"):
self._vector_retriever = vector_retriever
self._kg_retriever = kg_retriever
if mode not in ("AND", "OR"):
raise ValueError("Invalid mode.")
self._mode = mode
def _retrieve(self, query_bundle):
vector_nodes = self._vector_retriever.retrieve(query_bundle)
kg_nodes = self._kg_retriever.retrieve(query_bundle)
vector_ids = {n.node.node_id for n in vector_nodes}
kg_ids = {n.node.node_id for n in kg_nodes}
combined_dict = {n.node.node_id: n for n in vector_nodes}
combined_dict.update({n.node.node_id: n for n in kg_nodes})
if self._mode == "AND":
retrieve_ids = vector_ids.intersection(kg_ids)
else:
retrieve_ids = vector_ids.union(kg_ids)
return [combined_dict[rid] for rid in retrieve_ids]
custom_retriever = CustomRetriever(vector_retriever, kg_retriever)
custom_prompt_template = PromptTemplate(
template="""\
Your Tasks Include:
Provide a concise yet comprehensive response to the user leveraging the content provided to you.
---------------------
{context_str}
---------------------
User Inquiry: {query_str}
Answer:
"""
)
response_synthesizer = TreeSummarize(
verbose=True,
summary_template=custom_prompt_template,
)
query_engine = RetrieverQueryEngine(
retriever=custom_retriever,
response_synthesizer=response_synthesizer,
)
def query_index(query_text):
global query_engine
query_bundle = QueryBundle(query_text)
response = query_engine.query(query_bundle)
return str(response)
initialize_indices()
@app.route("/query", methods=["GET"])
def query_index_endpoint():
query_text = request.args.get("text", None)
if query_text is None:
return "No text found, please include a ?text=blah parameter in the URL", 400
response = query_index(query_text)
return jsonify(response), 200
if __name__ == "__main__":
app.run(host="0.0.0.0", port=3000, debug=True)
The key change is distinguishing between the `VectorStoreIndex` and the `KnowledgeGraphIndex` when loading from storage. This should resolve the `AttributeError` and ensure that both indices are correctly initialized and used in your application.
To continue talking to Dosu, mention @dosu.
Using this script I get the following error:
Traceback (most recent call last):
File "c:\Users\jram1\custom-index\main_complete-emb.py", line 144, in
The error `TypeError: 'KnowledgeGraphIndex' object is not iterable` indicates that the `load_index_from_storage` function is returning a single `KnowledgeGraphIndex` object instead of a list of indices. This happens because the function is designed to return a single `BaseIndex` object when a specific `index_id` is provided or when there is only one index in the storage context. To fix this, load each index separately by specifying its `index_id`. Here is the corrected version of your script:
from flask import Flask, request, jsonify
import os
from llama_index.core import (
VectorStoreIndex,
StorageContext,
Document,
load_index_from_storage,
Settings,
PromptTemplate,
KnowledgeGraphIndex
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.nebula import NebulaGraphStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
from llama_index.core.schema import QueryBundle
from llama_index.core.base.base_retriever import BaseRetriever
from llama_index.core.response_synthesizers import TreeSummarize
import base64, logging, json
app = Flask(__name__)
logging.basicConfig(level=logging.DEBUG)
os.environ["NEBULA_USER"] = "root"
os.environ["NEBULA_PASSWORD"] = "nebula"
os.environ["NEBULA_ADDRESS"] = "127.0.0.1:9669"
persist_dir = './storage_graph'
# Configure settings
Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.embed_model = embed_model
Settings.chunk_size = 512
vector_index = None
kg_index = None
query_engine = None
def load_json_nodes(json_directory):
nodes = []
for filename in os.listdir(json_directory):
if filename.endswith('.json'):
with open(os.path.join(json_directory, filename), 'r') as file:
data = json.load(file)
for node_data in data:
sanitized_metadata = {k: base64.urlsafe_b64encode(v.encode()).decode() if isinstance(v, str) else v for k, v in node_data['metadata'].items()}
node = Document(
text=base64.urlsafe_b64encode(node_data['text'].encode()).decode(),
id_=node_data['id_'],
embedding=node_data['embedding'],
metadata=sanitized_metadata
)
nodes.append(node)
return nodes
def initialize_indices():
global vector_index, kg_index, query_engine
storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
if os.path.exists(persist_dir):
vector_index = load_index_from_storage(storage_context, index_id="vector_index")
kg_index = load_index_from_storage(storage_context, index_id="kg_index")
else:
json_nodes = load_json_nodes("JSON_nodes_999_large_syll")
documents = [Document(
text=base64.urlsafe_b64decode(node.text).decode(),
id_=node.id_,
metadata={k: base64.urlsafe_b64decode(v).decode() if isinstance(v, str) else v for k, v in node.metadata.items()},
embedding=node.embedding)
for node in json_nodes
]
vector_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
graph_store = NebulaGraphStore(
space_name="test3",
edge_types=["relationship"],
rel_prop_names=["relationship"],
tags=["entity"]
)
kg_index = KnowledgeGraphIndex.from_documents(documents, storage_context=storage_context, graph_store=graph_store)
vector_index.set_index_id("vector_index")
kg_index.set_index_id("kg_index")
storage_context.persist()
if vector_index is None or kg_index is None:
raise ValueError("Failed to load both VectorStoreIndex and KnowledgeGraphIndex.")
vector_retriever = VectorIndexRetriever(index=vector_index, top_k=7)
kg_retriever = KnowledgeGraphRAGRetriever(index=kg_index, top_k=7)
class CustomRetriever(BaseRetriever):
def __init__(self, vector_retriever, kg_retriever, mode="OR"):
self._vector_retriever = vector_retriever
self._kg_retriever = kg_retriever
if mode not in ("AND", "OR"):
raise ValueError("Invalid mode.")
self._mode = mode
def _retrieve(self, query_bundle):
vector_nodes = self._vector_retriever.retrieve(query_bundle)
kg_nodes = self._kg_retriever.retrieve(query_bundle)
vector_ids = {n.node.node_id for n in vector_nodes}
kg_ids = {n.node.node_id for n in kg_nodes}
combined_dict = {n.node.node_id: n for n in vector_nodes}
combined_dict.update({n.node.node_id: n for n in kg_nodes})
if self._mode == "AND":
retrieve_ids = vector_ids.intersection(kg_ids)
else:
retrieve_ids = vector_ids.union(kg_ids)
return [combined_dict[rid] for rid in retrieve_ids]
custom_retriever = CustomRetriever(vector_retriever, kg_retriever)
custom_prompt_template = PromptTemplate(
template="""\
Your Tasks Include:
Provide a concise yet comprehensive response to the user leveraging the content provided to you.
---------------------
{context_str}
---------------------
User Inquiry: {query_str}
Answer:
"""
)
response_synthesizer = TreeSummarize(
verbose=True,
summary_template=custom_prompt_template,
)
query_engine = RetrieverQueryEngine(
retriever=custom_retriever,
response_synthesizer=response_synthesizer,
)
def query_index(query_text):
global query_engine
query_bundle = QueryBundle(query_text)
response = query_engine.query(query_bundle)
return str(response)
initialize_indices()
@app.route("/query", methods=["GET"])
def query_index_endpoint():
query_text = request.args.get("text", None)
if query_text is None:
return "No text found, please include a ?text=blah parameter in the URL", 400
response = query_index(query_text)
return jsonify(response), 200
if __name__ == "__main__":
app.run(host="0.0.0.0", port=3000, debug=True)
Key changes: specify an `index_id` when calling `load_index_from_storage`, and set an `index_id` on each index when creating it so it can be correctly identified and loaded later. This should resolve the `TypeError` and ensure that both indices are correctly loaded and used in your application.
To continue talking to Dosu, mention @dosu.
To modify your index-creation script to set the `index_id` for both the `VectorStoreIndex` and the `KnowledgeGraphIndex`, use the `set_index_id` method provided by the `BaseIndex` class. Here is how you can do it:
def create_index():
graph_store = NebulaGraphStore(
space_name=space_name,
edge_types=edge_types,
rel_prop_names=rel_prop_names,
tags=tags
)
storage_context = StorageContext.from_defaults(graph_store=graph_store)
json_nodes = load_json_nodes("JSON_nodes_999_large")
documents = [
Document(
text=decode_string(node.text),
id_=node.id_,
metadata=decode_metadata(node.metadata),
embedding=node.embedding
) for node in json_nodes
]
kg_index = KnowledgeGraphIndex.from_documents(
documents,
storage_context=storage_context,
max_triplets_per_chunk=10,
space_name=space_name,
edge_types=edge_types,
rel_prop_names=rel_prop_names,
tags=tags,
max_knowledge_sequence=15,
include_embeddings=True
)
# Set the index_id for KnowledgeGraphIndex
kg_index.set_index_id("kg_index")
kg_index.storage_context.persist(persist_dir='./storage_graph')
# Create VectorStoreIndex
vector_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
# Set the index_id for VectorStoreIndex
vector_index.set_index_id("vector_index")
# Persist the storage context
storage_context.persist(persist_dir='./storage_graph')
return kg_index, vector_index, storage_context
# Example usage
kg_index, vector_index, storage_context = create_index()
1. Set the `index_id` for the `KnowledgeGraphIndex`: after creating it, call `kg_index.set_index_id("kg_index")`.
2. Set the `index_id` for the `VectorStoreIndex`: similarly, call `vector_index.set_index_id("vector_index")`.
3. Persist the storage context after setting the `index_id` for both indices.

This ensures that both indices are correctly identified and can be loaded later using their respective `index_id`s.
To continue talking to Dosu, mention @dosu.
Question Validation
Question
Hello, I was having some problems trying to load my vector and knowledge-graph indices. The following is my script — perhaps someone can point me in the right direction? Thanks! @dosubot
Are there any script enhancements I should make, or unnecessary parts I should remove, as well?