run-llama / llama_index

LlamaIndex is a data framework for your LLM applications
https://docs.llamaindex.ai
MIT License
35.89k stars 5.09k forks source link

[Question]: load index object attribute #14460

Closed jjoaqu7 closed 3 months ago

jjoaqu7 commented 3 months ago

Question Validation

Question

Hello, I was having some problems trying to load my vector and kg index graphs, the following is my script, perhaps someone can point me in right direction? Thanks! @dosubot

Traceback (most recent call last):
  File "c:\Users\jram1\custom-index\main_complete-emb.py", line 137, in <module>
    initialize_indices()
  File "c:\Users\jram1\custom-index\main_complete-emb.py", line 86, in initialize_indices
    vector_retriever = VectorIndexRetriever(index=vector_index, top_k=7)   
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^   
  File "C:\Users\jram1\custom-index\env\Lib\site-packages\llama_index\core\indices\vector_store\retrievers\retriever.py", line 59, in __init__        
    self._vector_store = self._index.vector_store
                         ^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'KnowledgeGraphIndex' object has no attribute 'vector_store'. Did you mean: '_vector_store'?

Any script enhancements or unnecessary things as well?

dosubot[bot] commented 3 months ago

The issue you're encountering is due to the load_index_from_storage function returning a list of indices, and you're trying to use the first index as both a VectorStoreIndex and a KnowledgeGraphIndex. This causes the AttributeError because the KnowledgeGraphIndex does not have a vector_store attribute.

Here's an enhanced version of your script with the necessary corrections and improvements:

  1. Separate Loading of Indices: Ensure that you correctly differentiate between the VectorStoreIndex and KnowledgeGraphIndex when loading from storage.
  2. Error Handling: Add error handling for better robustness.
  3. Code Cleanup: Remove unnecessary parts and improve readability.
from flask import Flask, request, jsonify
import os
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    Document,
    load_index_from_storage,
    Settings,
    PromptTemplate,
    KnowledgeGraphIndex
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.nebula import NebulaGraphStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
from llama_index.core.schema import QueryBundle
from llama_index.core.base.base_retriever import BaseRetriever
from llama_index.core.response_synthesizers import TreeSummarize
import base64, logging, json

app = Flask(__name__)
logging.basicConfig(level=logging.DEBUG)

os.environ["NEBULA_USER"] = "root"
os.environ["NEBULA_PASSWORD"] = "nebula"
os.environ["NEBULA_ADDRESS"] = "127.0.0.1:9669"

persist_dir = './storage_graph'

# Configure settings
Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.embed_model = embed_model
Settings.chunk_size = 512

vector_index = None
kg_index = None
query_engine = None

def load_json_nodes(json_directory):
    nodes = []
    for filename in os.listdir(json_directory):
        if filename.endswith('.json'):
            with open(os.path.join(json_directory, filename), 'r') as file:
                data = json.load(file)
                for node_data in data:
                    sanitized_metadata = {k: base64.urlsafe_b64encode(v.encode()).decode() if isinstance(v, str) else v for k, v in node_data['metadata'].items()}
                    node = Document(
                        text=base64.urlsafe_b64encode(node_data['text'].encode()).decode(),
                        id_=node_data['id_'],
                        embedding=node_data['embedding'],
                        metadata=sanitized_metadata
                    )
                    nodes.append(node)
    return nodes

def initialize_indices():
    global vector_index, kg_index, query_engine
    storage_context = StorageContext.from_defaults(persist_dir=persist_dir)

    if os.path.exists(persist_dir):
        indices = load_index_from_storage(storage_context)
        for index in indices:
            if isinstance(index, VectorStoreIndex):
                vector_index = index
            elif isinstance(index, KnowledgeGraphIndex):
                kg_index = index
    else:
        json_nodes = load_json_nodes("JSON_nodes_999_large_syll")
        documents = [Document(
            text=base64.urlsafe_b64decode(node.text).decode(), 
            id_=node.id_, 
            metadata={k: base64.urlsafe_b64decode(v).decode() if isinstance(v, str) else v for k, v in node.metadata.items()}, 
            embedding=node.embedding) 
            for node in json_nodes
        ]

        vector_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
        graph_store = NebulaGraphStore(
            space_name="test3",
            edge_types=["relationship"],
            rel_prop_names=["relationship"],
            tags=["entity"]
        )
        kg_index = KnowledgeGraphIndex.from_documents(documents, storage_context=storage_context, graph_store=graph_store)
        storage_context.persist()

    if vector_index is None or kg_index is None:
        raise ValueError("Failed to load both VectorStoreIndex and KnowledgeGraphIndex.")

    vector_retriever = VectorIndexRetriever(index=vector_index, top_k=7)
    kg_retriever = KnowledgeGraphRAGRetriever(index=kg_index, top_k=7)

    class CustomRetriever(BaseRetriever):
        def __init__(self, vector_retriever, kg_retriever, mode="OR"):
            self._vector_retriever = vector_retriever
            self._kg_retriever = kg_retriever
            if mode not in ("AND", "OR"):
                raise ValueError("Invalid mode.")
            self._mode = mode

        def _retrieve(self, query_bundle):
            vector_nodes = self._vector_retriever.retrieve(query_bundle)
            kg_nodes = self._kg_retriever.retrieve(query_bundle)
            vector_ids = {n.node.node_id for n in vector_nodes}
            kg_ids = {n.node.node_id for n in kg_nodes}
            combined_dict = {n.node.node_id: n for n in vector_nodes}
            combined_dict.update({n.node.node_id: n for n in kg_nodes})
            if self._mode == "AND":
                retrieve_ids = vector_ids.intersection(kg_ids)
            else:
                retrieve_ids = vector_ids.union(kg_ids)
            return [combined_dict[rid] for rid in retrieve_ids]

    custom_retriever = CustomRetriever(vector_retriever, kg_retriever)
    custom_prompt_template = PromptTemplate(
        template="""\
        Your Tasks Include:
        Provide a concise yet comprehensive response to the user leveraging the content provided to you.
        ---------------------
        {context_str}
        ---------------------
        User Inquiry: {query_str}
        Answer: 
        """
    )
    response_synthesizer = TreeSummarize(
        verbose=True,
        summary_template=custom_prompt_template,
    )
    query_engine = RetrieverQueryEngine(
        retriever=custom_retriever,
        response_synthesizer=response_synthesizer,
    )

def query_index(query_text):
    global query_engine
    query_bundle = QueryBundle(query_text)
    response = query_engine.query(query_bundle)
    return str(response)

initialize_indices()

@app.route("/query", methods=["GET"])
def query_index_endpoint():
    query_text = request.args.get("text", None)
    if query_text is None:
        return "No text found, please include a ?text=blah parameter in the URL", 400
    response = query_index(query_text)
    return jsonify(response), 200

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=3000, debug=True)

Key Changes:

  1. Separate Loading of Indices: The script now correctly differentiates between VectorStoreIndex and KnowledgeGraphIndex when loading from storage.
  2. Error Handling: Added error handling to ensure both indices are loaded correctly.
  3. Code Cleanup: Improved readability and removed unnecessary parts.

This should resolve the AttributeError and ensure that both indices are correctly initialized and used in your application.

To continue talking to Dosu, mention @dosu.

jjoaqu7 commented 3 months ago

Using this script I get the following error:

Traceback (most recent call last): File "c:\Users\jram1\custom-index\main_complete-emb.py", line 144, in initialize_indices() File "c:\Users\jram1\custom-index\main_complete-emb.py", line 65, in initialize_indices for index in indices: TypeError: 'KnowledgeGraphIndex' object is not iterable @dosu , @dosubot

dosubot[bot] commented 3 months ago

The error TypeError: 'KnowledgeGraphIndex' object is not iterable indicates that the load_index_from_storage function is returning a single KnowledgeGraphIndex object instead of a list of indices. This happens because the function is designed to return a single BaseIndex object when a specific index_id is provided or when there is only one index in the storage context.

To fix this, you need to load each index separately by specifying their index_id. Here is the corrected version of your script:

from flask import Flask, request, jsonify
import os
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    Document,
    load_index_from_storage,
    Settings,
    PromptTemplate,
    KnowledgeGraphIndex
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.nebula import NebulaGraphStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
from llama_index.core.schema import QueryBundle
from llama_index.core.base.base_retriever import BaseRetriever
from llama_index.core.response_synthesizers import TreeSummarize
import base64, logging, json

app = Flask(__name__)
logging.basicConfig(level=logging.DEBUG)

os.environ["NEBULA_USER"] = "root"
os.environ["NEBULA_PASSWORD"] = "nebula"
os.environ["NEBULA_ADDRESS"] = "127.0.0.1:9669"

persist_dir = './storage_graph'

# Configure settings
Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.embed_model = embed_model
Settings.chunk_size = 512

vector_index = None
kg_index = None
query_engine = None

def load_json_nodes(json_directory):
    nodes = []
    for filename in os.listdir(json_directory):
        if filename.endswith('.json'):
            with open(os.path.join(json_directory, filename), 'r') as file:
                data = json.load(file)
                for node_data in data:
                    sanitized_metadata = {k: base64.urlsafe_b64encode(v.encode()).decode() if isinstance(v, str) else v for k, v in node_data['metadata'].items()}
                    node = Document(
                        text=base64.urlsafe_b64encode(node_data['text'].encode()).decode(),
                        id_=node_data['id_'],
                        embedding=node_data['embedding'],
                        metadata=sanitized_metadata
                    )
                    nodes.append(node)
    return nodes

def initialize_indices():
    global vector_index, kg_index, query_engine
    storage_context = StorageContext.from_defaults(persist_dir=persist_dir)

    if os.path.exists(persist_dir):
        vector_index = load_index_from_storage(storage_context, index_id="vector_index")
        kg_index = load_index_from_storage(storage_context, index_id="kg_index")
    else:
        json_nodes = load_json_nodes("JSON_nodes_999_large_syll")
        documents = [Document(
            text=base64.urlsafe_b64decode(node.text).decode(), 
            id_=node.id_, 
            metadata={k: base64.urlsafe_b64decode(v).decode() if isinstance(v, str) else v for k, v in node.metadata.items()}, 
            embedding=node.embedding) 
            for node in json_nodes
        ]

        vector_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
        graph_store = NebulaGraphStore(
            space_name="test3",
            edge_types=["relationship"],
            rel_prop_names=["relationship"],
            tags=["entity"]
        )
        kg_index = KnowledgeGraphIndex.from_documents(documents, storage_context=storage_context, graph_store=graph_store)
        vector_index.set_index_id("vector_index")
        kg_index.set_index_id("kg_index")
        storage_context.persist()

    if vector_index is None or kg_index is None:
        raise ValueError("Failed to load both VectorStoreIndex and KnowledgeGraphIndex.")

    vector_retriever = VectorIndexRetriever(index=vector_index, top_k=7)
    kg_retriever = KnowledgeGraphRAGRetriever(index=kg_index, top_k=7)

    class CustomRetriever(BaseRetriever):
        def __init__(self, vector_retriever, kg_retriever, mode="OR"):
            self._vector_retriever = vector_retriever
            self._kg_retriever = kg_retriever
            if mode not in ("AND", "OR"):
                raise ValueError("Invalid mode.")
            self._mode = mode

        def _retrieve(self, query_bundle):
            vector_nodes = self._vector_retriever.retrieve(query_bundle)
            kg_nodes = self._kg_retriever.retrieve(query_bundle)
            vector_ids = {n.node.node_id for n in vector_nodes}
            kg_ids = {n.node.node_id for n in kg_nodes}
            combined_dict = {n.node.node_id: n for n in vector_nodes}
            combined_dict.update({n.node.node_id: n for n in kg_nodes})
            if self._mode == "AND":
                retrieve_ids = vector_ids.intersection(kg_ids)
            else:
                retrieve_ids = vector_ids.union(kg_ids)
            return [combined_dict[rid] for rid in retrieve_ids]

    custom_retriever = CustomRetriever(vector_retriever, kg_retriever)
    custom_prompt_template = PromptTemplate(
        template="""\
        Your Tasks Include:
        Provide a concise yet comprehensive response to the user leveraging the content provided to you.
        ---------------------
        {context_str}
        ---------------------
        User Inquiry: {query_str}
        Answer: 
        """
    )
    response_synthesizer = TreeSummarize(
        verbose=True,
        summary_template=custom_prompt_template,
    )
    query_engine = RetrieverQueryEngine(
        retriever=custom_retriever,
        response_synthesizer=response_synthesizer,
    )

def query_index(query_text):
    global query_engine
    query_bundle = QueryBundle(query_text)
    response = query_engine.query(query_bundle)
    return str(response)

initialize_indices()

@app.route("/query", methods=["GET"])
def query_index_endpoint():
    query_text = request.args.get("text", None)
    if query_text is None:
        return "No text found, please include a ?text=blah parameter in the URL", 400
    response = query_index(query_text)
    return jsonify(response), 200

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=3000, debug=True)

Key Changes:

  1. Separate Loading of Indices: Load each index separately by specifying their index_id when calling load_index_from_storage.
  2. Set Index IDs: Set the index_id for each index when creating them to ensure they can be correctly identified and loaded later.

This should resolve the TypeError and ensure that both indices are correctly loaded and used in your application.

To continue talking to Dosu, mention @dosu.

dosubot[bot] commented 3 months ago

To modify your index creation script to set the index_id for both the VectorStoreIndex and KnowledgeGraphIndex, you can use the set_index_id method provided in the BaseIndex class. Here is how you can do it:

def create_index():
    graph_store = NebulaGraphStore(
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags
    )

    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    json_nodes = load_json_nodes("JSON_nodes_999_large")
    documents = [
        Document(
            text=decode_string(node.text),
            id_=node.id_,
            metadata=decode_metadata(node.metadata),
            embedding=node.embedding
        ) for node in json_nodes
    ]

    kg_index = KnowledgeGraphIndex.from_documents(
        documents,
        storage_context=storage_context,
        max_triplets_per_chunk=10,
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags,
        max_knowledge_sequence=15,
        include_embeddings=True
    )

    # Set the index_id for KnowledgeGraphIndex
    kg_index.set_index_id("kg_index")

    kg_index.storage_context.persist(persist_dir='./storage_graph')

    # Create VectorStoreIndex
    vector_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

    # Set the index_id for VectorStoreIndex
    vector_index.set_index_id("vector_index")

    # Persist the storage context
    storage_context.persist(persist_dir='./storage_graph')

    return kg_index, vector_index, storage_context

# Example usage
kg_index, vector_index, storage_context = create_index()

Key Changes:

  1. Set index_id for KnowledgeGraphIndex: After creating the KnowledgeGraphIndex, use kg_index.set_index_id("kg_index") to set its index_id.
  2. Set index_id for VectorStoreIndex: Similarly, after creating the VectorStoreIndex, use vector_index.set_index_id("vector_index") to set its index_id.
  3. Persist the Storage Context: Ensure the storage context is persisted after setting the index_id for both indices.

This ensures that both indices are correctly identified and can be loaded later using their respective index_ids.

To continue talking to Dosu, mention @dosu.