run-llama / llama_index

LlamaIndex is a data framework for your LLM applications
https://docs.llamaindex.ai
MIT License
35.88k stars 5.09k forks source link

[Question]: Big problem on saving and retrieve KnowledgeGraphIndex (Neo4j) #14171

Closed robertobalestri closed 3 months ago

robertobalestri commented 3 months ago

Question Validation

Question

I have a big problem, i can't save or retrieve the graph in memory do perform queries on it, so everytime the graph gets recalculated. This is the code that after 2 days Im arrived at, also asking help to chatgpt and reading docs, but it doesn't work.

If anyone know how to do it please help me.

Thank you!

import os
import openai
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex
from dotenv import load_dotenv
from llama_index.core import Settings
from llama_index.graph_stores.neo4j import Neo4jGraphStore
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import StorageContext
from llama_index.core.indices.loading import load_graph_from_storage
from llama_index.core.indices.composability.graph import ComposableGraph

load_dotenv()
print(os.getenv('AZURE_OPENAI_LLM_DEPLOYMENT_NAME'))
print(os.getenv('AZURE_OPENAI_API_ENDPOINT'))
print(os.getenv('AZURE_OPENAI_API_KEY'))
print(os.getenv('AZURE_OPENAI_API_VERSION'))

llm = AzureOpenAI(
    engine=os.getenv('AZURE_OPENAI_LLM_DEPLOYMENT_NAME'),
    model="gpt-4o",
    temperature=0.0,
    azure_endpoint=os.getenv('AZURE_OPENAI_API_ENDPOINT'),
    api_key=os.getenv('AZURE_OPENAI_API_KEY'),
    api_version=os.getenv('AZURE_OPENAI_API_VERSION'),
)

embed_model = OllamaEmbedding(model_name="mxbai-embed-large:335m", embed_batch_size=512)

Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 512

username = "neo4j"
database = "neo4j"
password = "xxx"
url = "bolt://localhost:7687"
print(username, password, url, database)

graph_store = Neo4jGraphStore(
    username=username,
    password=password,
    url=url,
    database=database,
)

# Directory for the serialized graph
storage_dir = './storage'
os.makedirs(storage_dir, exist_ok=True)

# Define a consistent root ID
root_id = 'knowledge_graph_index'

# Check if storage context files exist
docstore_path = os.path.join(storage_dir, 'docstore.json')
index_store_path = os.path.join(storage_dir, 'index_store.json')
graph_store_path = os.path.join(storage_dir, 'graph_store.json')

# Attempt to load the graph
try:
    if os.path.exists(docstore_path) and os.path.exists(index_store_path) and os.path.exists(graph_store_path):
        storage_context = StorageContext.from_defaults(graph_store=graph_store, persist_dir=storage_dir)
        knowledge_graph_index = load_graph_from_storage(storage_context, root_id=root_id)
        print("Loaded graph from storage.")
        print(f"Root ID: {root_id}")
    else:
        raise FileNotFoundError("Required storage files not found, creating new graph.")
except Exception as e:
    print(f"Failed to load graph from storage: {e}")
    # Graph doesn't exist, so create it from documents
    documents = SimpleDirectoryReader("./content/Documents").load_data()
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    # NOTE: can take a while!
    knowledge_graph_index = KnowledgeGraphIndex.from_documents(
        documents,
        storage_context=storage_context,
        max_triplets_per_chunk=3,
        show_progress=True,
        include_embeddings=True,
    )
    # Set the root ID and save the newly created graph
    knowledge_graph_index.set_index_id(root_id)

    storage_context.persist(persist_dir=storage_dir)
    print(f"Persisted graph in directory: {storage_dir}")

# Verify that the graph is correctly loaded
try:
    if knowledge_graph_index is None:
        raise ValueError("Failed to create or load KnowledgeGraphIndex.")
    print("Successfully created or loaded KnowledgeGraphIndex.")

    # Check the contents of all_indices
    print(f"Contents of all_indices: {knowledge_graph_index.all_indices}")
    # Check the root_id
    print(f"Root ID set in graph: {knowledge_graph_index._root_id}")

    # Verify the root_id is in all_indices
    if knowledge_graph_index.root_id not in knowledge_graph_index.all_indices:
        raise KeyError(f"The specified root_id '{knowledge_graph_index.index_id()}' was not found in the graph indices.")
    print("The root ID was found in the graph indices.")
except Exception as e:
    print(f"Error verifying the KnowledgeGraphIndex: {e}")

# Now, whether loaded or created, you can use `knowledge_graph_index` as before
try:
    query_engine = knowledge_graph_index.as_query_engine(
        include_text=True,
        response_mode="tree_summarize",
        embedding_mode="hybrid",
        similarity_top_k=5,
    )
    response = query_engine.query("Quali sono le chiese disegnate da Raffaello?")
    print(response)
except KeyError as e:
    print(f"KeyError: {e} - The specified root_id '{root_id}' was not found in the graph indices.")
except Exception as e:
    print(f"An error occurred while creating the query engine: {e}")
logan-markewich commented 3 months ago

Is there any error? Does this line every print?

print(f"Failed to load graph from storage: {e}") ?

robertobalestri commented 3 months ago

This is my output... but it desn't print your string.

neo4j xxx bolt://localhost:7687 neo4j Failed to load graph from storage: Required storage files not found, creating new graph. Parsing nodes: 100%|████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 124.90it/s] Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████| 3/3 [00:16<00:00, 5.36s/it] Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00, 2.22s/it] Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00, 2.20s/it] Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00, 2.21s/it] Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00, 2.22s/it] Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00, 2.23s/it] Processing nodes: 100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [01:02<00:00, 10.37s/it] Persisted graph in directory: ./storage Successfully created or loaded KnowledgeGraphIndex. Error verifying the KnowledgeGraphIndex: 'KnowledgeGraphIndex' object has no attribute 'all_indices' Una delle chiese disegnate da Raffaello è S. Eligio degli Orefici.

robertobalestri commented 3 months ago

Ok, after days tryng i fount out that the load_graph doesn't work, but load index does.

import os
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex, Settings, StorageContext
from llama_index.graph_stores.neo4j import Neo4jGraphStore
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.indices.loading import load_index_from_storage
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Configure Azure OpenAI
llm = AzureOpenAI(
    engine=os.getenv('AZURE_OPENAI_LLM_DEPLOYMENT_NAME'),
    model="gpt-4o",
    temperature=0.0,
    azure_endpoint=os.getenv('AZURE_OPENAI_API_ENDPOINT'),
    api_key=os.getenv('AZURE_OPENAI_API_KEY'),
    api_version=os.getenv('AZURE_OPENAI_API_VERSION'),
)

# Configure the embedding model
embed_model = OllamaEmbedding(model_name="mxbai-embed-large:335m", embed_batch_size=512)

# Set configuration parameters
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 512

# Configure the Neo4j database connection
username = "neo4j"
database = "neo4j"
password = "password"
url = "bolt://localhost:7687"

graph_store = Neo4jGraphStore(
    username=username,
    password=password,
    url=url,
    database=database,
)

# Directory for storage
storage_dir = './storage'
os.makedirs(storage_dir, exist_ok=True)

# Consistent root ID
root_id = 'knowledge_graph_index'

# Load or create the knowledge graph
try:
    storage_context = StorageContext.from_defaults(graph_store=graph_store, persist_dir=storage_dir)
    knowledge_graph_index = load_index_from_storage(storage_context, index_id=root_id)
    print("Graph loaded from storage.")
except Exception as e:
    print(f"Failed to load graph from storage: {e}")
    # Create the graph from documents if it doesn't exist
    documents = SimpleDirectoryReader("./content/Documents").load_data()
    storage_context = StorageContext.from_defaults(graph_store=graph_store)
    knowledge_graph_index = KnowledgeGraphIndex.from_documents(
        documents,
        storage_context=storage_context,
        max_triplets_per_chunk=3,
        show_progress=True,
        include_embeddings=True,
    )
    knowledge_graph_index.set_index_id(root_id)
    storage_context.persist(persist_dir=storage_dir)
    print(f"Graph created and stored in: {storage_dir}")

# Verify the graph is loaded correctly
try:
    if knowledge_graph_index is None:
        raise ValueError("Failed to create or load KnowledgeGraphIndex.")
    print("KnowledgeGraphIndex created or loaded successfully.")
    root_id_set = knowledge_graph_index.index_id
    if root_id_set != root_id:
        raise KeyError(f"The specified root ID '{root_id}' does not match the loaded root ID '{root_id_set}'.")
    print("The root ID matches and is correct.")
except Exception as e:
    print(f"Error verifying the KnowledgeGraphIndex: {e}")

# Use `knowledge_graph_index` for queries
try:
    query_engine = knowledge_graph_index.as_query_engine(
        include_text=True,
        response_mode="tree_summarize",
        embedding_mode="hybrid",
        similarity_top_k=5,
    )
    response = query_engine.query("Quali chiese ha disegnato Raffaello?")
    print(response)
except KeyError as e:
    print(f"KeyError: {e} - The specified root_id '{root_id}' was not found in the graph indices.")
except Exception as e:
    print(f"An error occurred while creating the query engine: {e}")
logan-markewich commented 3 months ago

@robertobalestri ah good catch load_graph is an old method for something completely unrelated actually.

Load index is the one to use, I didn't notice that in your code