run-llama / llama_index

LlamaIndex is a data framework for your LLM applications
https://docs.llamaindex.ai
MIT License
33.3k stars 4.66k forks source link

[Question]: rel_props not saving to persistent storage #14468

Open jjoaqu7 opened 5 days ago

jjoaqu7 commented 5 days ago

Question Validation

Question

Hello, I am wondering why the rel_props are not being saved to my graph index persistent storage? This could be leading to my issue where, when I query my indices after loading them from the persistent storage, nothing is being returned for my knowledge graph. The following is my script for creating and storing my persistent storage, however, an example of my index_store.json within my persistent directory, storage_graph, is appended after my script. Script:

from flask import Flask, request, jsonify
import os
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    Document,
    Settings,
    PromptTemplate,
    KnowledgeGraphIndex
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.nebula import NebulaGraphStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
from llama_index.core.schema import QueryBundle
from llama_index.core.base.base_retriever import BaseRetriever
from llama_index.core.response_synthesizers import TreeSummarize
from llama_index.core.schema import TextNode
import base64, logging, json

# Configure root logging exactly once. NOTE: logging.basicConfig() is a
# no-op on every call after the first (unless force=True), so the original
# second call with level=INFO never took effect — DEBUG was the effective
# level, and we keep it.
logging.basicConfig(level=logging.DEBUG)

# Global LlamaIndex settings shared by every index built in this script.
Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.embed_model = embed_model
Settings.chunk_size = 512

# NebulaGraph connection credentials, read by NebulaGraphStore at init.
os.environ["NEBULA_USER"] = "root"
os.environ["NEBULA_PASSWORD"] = "nebula"
os.environ["NEBULA_ADDRESS"] = "127.0.0.1:9669"

# Graph-space schema names used by both the graph store and the KG index.
space_name = "test9"
edge_types, rel_prop_names = ["relationship"], ["relationship"]
tags = ["entity"]

def encode_string(s):
    """Return *s* encoded as URL-safe base64 text."""
    raw_bytes = s.encode()
    return base64.urlsafe_b64encode(raw_bytes).decode()

def decode_string(s):
    """Inverse of encode_string: decode URL-safe base64 text back to str."""
    decoded_bytes = base64.urlsafe_b64decode(s.encode())
    return decoded_bytes.decode()

def sanitize_and_encode(data):
    """Return a copy of *data* with every str value base64-encoded.

    Non-string values are passed through untouched.
    """
    return {
        key: encode_string(value) if isinstance(value, str) else value
        for key, value in data.items()
    }

def decode_metadata(metadata):
    """Return a copy of *metadata* with every str value base64-decoded.

    Non-string values are passed through untouched.
    """
    return {
        key: decode_string(value) if isinstance(value, str) else value
        for key, value in metadata.items()
    }

def load_json_nodes(json_directory):
    """Build TextNode objects from every .json file in *json_directory*.

    Each file is expected to hold a list of node dicts with 'text', 'id_',
    'embedding' and 'metadata' keys. Text and string metadata values are
    base64-encoded on the way in.
    """
    nodes = []
    for filename in os.listdir(json_directory):
        if not filename.endswith('.json'):
            continue
        full_path = os.path.join(json_directory, filename)
        with open(full_path, 'r') as file:
            records = json.load(file)
            for node_data in records:
                encoded_meta = sanitize_and_encode(node_data['metadata'])
                node = TextNode(
                    text=encode_string(node_data['text']),
                    id_=node_data['id_'],
                    embedding=node_data['embedding'],
                    metadata=encoded_meta
                )
                nodes.append(node)
                logging.debug(f"Loaded node ID: {node.id_}, text: {node_data['text']}, metadata: {node_data['metadata']}")

    return nodes

def create_index():
    """Build a KnowledgeGraphIndex and a VectorStoreIndex over the JSON
    nodes and persist both to ./storage_graph_test10.

    Returns:
        tuple: (kg_index, vector_index, storage_context).

    NOTE(review): the extracted triplets and their rel_props live in the
    NebulaGraph space itself; only index metadata (including the rel_map,
    which shows up empty in the reported index_store.json) is written by
    persist() — confirm against the NebulaGraphStore docs.
    """
    # Graph store backed by the NebulaGraph space configured at module level.
    graph_store = NebulaGraphStore(
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags
    )

    # Shared storage context: both indices below register themselves here.
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    # Nodes on disk are base64-encoded; decode text/metadata back into
    # plain Documents before indexing.
    json_nodes = load_json_nodes("JSON_nodes_999_large_syll")
    documents = [
        Document(
            text=decode_string(node.text),
            id_=node.id_,
            metadata=decode_metadata(node.metadata),
            embedding=node.embedding
        ) for node in json_nodes
    ]

    # LLM-driven triplet extraction; triplets are upserted into NebulaGraph.
    kg_index = KnowledgeGraphIndex.from_documents(
        documents,
        storage_context=storage_context,
        max_triplets_per_chunk=10,
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags,
        max_knowledge_sequence=15,
        include_embeddings=True
    )

    # Set the index_id for KnowledgeGraphIndex (key inside index_store.json).
    kg_index.set_index_id("kg_index")

    # First persist: writes the KG index metadata (vector index not built yet).
    kg_index.storage_context.persist(persist_dir='./storage_graph_test10')
    logging.debug(f"KG Index created with {len(documents)} documents")

    # Create VectorStoreIndex in the same storage context.
    vector_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

    # Set the index_id for VectorStoreIndex
    vector_index.set_index_id("vector_index")

    # Persist the storage context again, now containing both indices.
    storage_context.persist(persist_dir='./storage_graph_test10')
    logging.debug(f"Vector Index created with {len(documents)} documents")
    return kg_index, vector_index, storage_context

if __name__ == "__main__":
    # Guarded entry point so importing this module does not re-run the
    # expensive (LLM-driven) indexing as a side effect.
    print("Creating Index...")
    kg_index, vector_index, storage_context = create_index()
    print("Index Created...")

index_store.json file:

{"index_store/data": {"kg_index": {"__type__": "kg", "__data__": "{\"index_id\": \"kg_index\", \"summary\": null, \"table\": {\"Mech 3202\": [\"48ae3288-8d75-429f-864d-0ba2054433d9\"], \"Fluid dynamics\": [\"48ae3288-8d75-429f-864d-0ba2054433d9\"], \"York university\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\", \"48ae3288-8d75-429f-864d-0ba2054433d9\"], \"Mech 3202 fluid dynamics\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"], \"Course_name\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"], \"University\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"], \"Text\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"], \"Content_type\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"], \"Pse 317\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"], \"Tutorial location\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"]}, \"rel_map\": {}, \"embedding_dict\": {\"('Mech 3202', 'Is', 'Fluid dynamics')\": [0. ...
dosubot[bot] commented 5 days ago

The rel_props are not being saved to the persistent storage because the refresh_schema method is not being called or executed properly. This method is responsible for updating the structured_schema dictionary, which includes the rel_props.

To resolve this issue, ensure that the refresh_schema method is called after any changes to the relationships or their properties. This can be done by explicitly calling self.refresh_schema() after any operation that modifies the relationships.

Here is the relevant part of the code where refresh_schema is called:

# NOTE(review): snippet quoted from llama_index's Neo4j property-graph
# store, shown only to illustrate where refresh_schema() is invoked.
def __init__(
        self,
        username: str,
        password: str,
        url: str,
        database: Optional[str] = "neo4j",
        refresh_schema: bool = True,
        sanitize_query_output: bool = True,
        enhanced_schema: bool = False,
        **neo4j_kwargs: Any,
    ) -> None:
        # Open both a sync and an async Neo4j driver with the same
        # credentials; extra neo4j_kwargs are forwarded to both drivers.
        self.sanitize_query_output = sanitize_query_output
        self.enhanced_schema = enhanced_schema
        self._driver = neo4j.GraphDatabase.driver(
            url, auth=(username, password), **neo4j_kwargs
        )
        self._async_driver = neo4j.AsyncGraphDatabase.driver(
            url,
            auth=(username, password),
            **neo4j_kwargs,
        )
        self._database = database
        # structured_schema (which holds rel_props) starts empty; it is
        # populated by refresh_schema() below when refresh_schema=True
        # (the default).
        self.structured_schema = {}
        if refresh_schema:
            self.refresh_schema()

Make sure that refresh_schema is set to True when initializing the class, or call self.refresh_schema() manually after any updates to the relationships.

Additionally, ensure that the persist method is correctly implemented to save the graph store to a file. Here is an example of how the persist method can be implemented:

def persist(
    self, persist_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
) -> None:
    """Serialize the in-memory graph as JSON to *persist_path*.

    Falls back to the local filesystem when no fsspec filesystem is given.
    """
    filesystem = fsspec.filesystem("file") if fs is None else fs
    with filesystem.open(persist_path, "w") as out:
        out.write(self.graph.json())

This method ensures that the current state of the graph, including rel_props, is saved to the specified file.

According to the PropertyGraphStore class, the persist method is defined as an abstract method, which means it must be implemented in any subclass. Ensure that your subclass correctly implements this method to handle the persistence of rel_props:

class MyGraphStore(PropertyGraphStore):
    """Concrete property-graph store that persists the graph as one JSON file."""

    def persist(
        self, persist_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
    ) -> None:
        """Write the graph's JSON representation to *persist_path*."""
        target_fs = fs if fs is not None else fsspec.filesystem("file")
        with target_fs.open(persist_path, "w") as handle:
            handle.write(self.graph.json())

By ensuring that refresh_schema is called appropriately and the persist method is correctly implemented, you can resolve the issue of rel_props not being saved to persistent storage [1][2][3].

To continue talking to Dosu, mention @dosu.

jjoaqu7 commented 5 days ago

So it is possible to save my rel_props to the persistent graph storage rel_maps, or do I have to connect to my nebula graph space to retrieve these rel_props?

@dosu @dosubot

dosubot[bot] commented 5 days ago

Yes, it is possible to save your rel_props to the persistent graph storage rel_maps. The NebulaPropertyGraphStore class provides methods to upsert triplets and manage relationships, which implies that you can store your rel_props directly in the graph storage.

Here is the relevant method from the NebulaPropertyGraphStore class:

def upsert_triplet(self, subj: str, rel: str, obj: str) -> None:
    """Add triplet."""
    # NOTE(review): this quoted snippet appears truncated — the executable
    # body is not shown here.
    # Mapping of the (subject, rel, object) triplet onto the property-graph
    # schema: rel is assumed to be the first edge_type's prop_name, and
    # subj the first entity tag_name, so that an existing knowledge graph
    # can be leveraged via an (n:1) edge_type.prop_name --> triplet.rel
    # mapping.
This method allows you to add a triplet (subject, relationship, object) to the graph, which means you can save your rel_props directly to the persistent graph storage without needing to retrieve them from the Nebula graph space [1][2][3].

To continue talking to Dosu, mention @dosu.

jjoaqu7 commented 5 days ago

How can I include this in my previous script that creates both vector and graph indices and saves them to persistent storage? @dosu

jjoaqu7 commented 5 days ago

So how can I update my script to ensure the rel_map is being populated within my index_store.json in my persistent storage? @dosu

dosubot[bot] commented 5 days ago

To ensure that the rel_map is being populated correctly within your index_store.json in the persistent storage, you need to verify that the get_rel_map method is correctly fetching and structuring the relationship data. Here is how you can update your script to include this functionality:

  1. Ensure get_rel_map is correctly implemented: Verify that the get_rel_map method is correctly fetching and structuring the relationship data.

  2. Persist the rel_map to index_store.json: Serialize the rel_map dictionary to JSON and write it to the file.

Here is an updated version of your script with these changes:

from flask import Flask, request, jsonify
import os
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    Document,
    Settings,
    PromptTemplate,
    KnowledgeGraphIndex
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.nebula import NebulaGraphStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
from llama_index.core.schema import QueryBundle
from llama_index.core.base.base_retriever import BaseRetriever
from llama_index.core.response_synthesizers import TreeSummarize
from llama_index.core.schema import TextNode
import base64, logging, json

# Configure root logging exactly once. NOTE: logging.basicConfig() is a
# no-op on every call after the first (unless force=True), so the original
# second call with level=INFO never took effect — DEBUG was the effective
# level, and we keep it.
logging.basicConfig(level=logging.DEBUG)

# Global LlamaIndex settings shared by every index built in this script.
Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.embed_model = embed_model
Settings.chunk_size = 512

# NebulaGraph connection credentials, read by NebulaGraphStore at init.
os.environ["NEBULA_USER"] = "root"
os.environ["NEBULA_PASSWORD"] = "nebula"
os.environ["NEBULA_ADDRESS"] = "127.0.0.1:9669"

# Graph-space schema names used by both the graph store and the KG index.
space_name = "test9"
edge_types, rel_prop_names = ["relationship"], ["relationship"]
tags = ["entity"]

def encode_string(s):
    """URL-safe base64 encoding of the text *s*."""
    return base64.urlsafe_b64encode(bytes(s, "utf-8")).decode()

def decode_string(s):
    """Decode URL-safe base64 text produced by encode_string back to str."""
    as_bytes = s.encode()
    return base64.urlsafe_b64decode(as_bytes).decode()

def sanitize_and_encode(data):
    """Base64-encode every string value in *data*; leave other values as-is."""
    sanitized = {}
    for key in data:
        value = data[key]
        sanitized[key] = encode_string(value) if isinstance(value, str) else value
    return sanitized

def decode_metadata(metadata):
    """Base64-decode every string value in *metadata*; leave others as-is."""
    return {
        key: decode_string(value) if isinstance(value, str) else value
        for key, value in metadata.items()
    }

def load_json_nodes(json_directory):
    """Build TextNode objects from every .json file in *json_directory*.

    Each file must hold a list of node dicts with 'text', 'id_',
    'embedding' and 'metadata' keys; text and string metadata values are
    base64-encoded on the way in.
    """
    nodes = []
    for filename in os.listdir(json_directory):
        if filename.endswith('.json'):
            with open(os.path.join(json_directory, filename), 'r') as file:
                data = json.load(file)
                for node_data in data:
                    sanitized_metadata = sanitize_and_encode(node_data['metadata'])
                    node = TextNode(
                        # BUG FIX: the original line carried an unbalanced
                        # extra closing parenthesis after encode_string(...),
                        # which made the whole listing a SyntaxError.
                        text=encode_string(node_data['text']),
                        id_=node_data['id_'],
                        embedding=node_data['embedding'],
                        metadata=sanitized_metadata
                    )
                    nodes.append(node)
                    logging.debug(f"Loaded node ID: {node.id_}, text: {node_data['text']}, metadata: {node_data['metadata']}")

    return nodes

def save_rel_map_to_json(rel_map, file_path):
    """Write *rel_map* to *file_path* as pretty-printed (indent=4) JSON."""
    with open(file_path, 'w') as json_file:
        json_file.write(json.dumps(rel_map, indent=4))

def create_index():
    """Build and persist a KnowledgeGraphIndex plus a VectorStoreIndex.

    Documents are decoded from the base64-encoded JSON nodes, indexed into
    NebulaGraph and a vector store, and both indices are persisted to
    ./storage_graph_test10. A snapshot of the graph store's rel_map is
    saved alongside them.

    Returns:
        tuple: (kg_index, vector_index, storage_context).
    """
    # Graph store backed by the NebulaGraph space configured at module level.
    graph_store = NebulaGraphStore(
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags
    )

    # Shared storage context: both indices below register themselves here.
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    # Nodes on disk are base64-encoded; decode text/metadata back into
    # plain Documents before indexing.
    json_nodes = load_json_nodes("JSON_nodes_999_large_syll")
    documents = [
        Document(
            text=decode_string(node.text),
            id_=node.id_,
            metadata=decode_metadata(node.metadata),
            embedding=node.embedding
        ) for node in json_nodes
    ]

    # LLM-driven triplet extraction; triplets are upserted into NebulaGraph.
    kg_index = KnowledgeGraphIndex.from_documents(
        documents,
        storage_context=storage_context,
        max_triplets_per_chunk=10,
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags,
        max_knowledge_sequence=15,
        include_embeddings=True
    )

    # Set the index_id for KnowledgeGraphIndex (key inside index_store.json).
    kg_index.set_index_id("kg_index")

    kg_index.storage_context.persist(persist_dir='./storage_graph_test10')
    logging.debug(f"KG Index created with {len(documents)} documents")

    # Create VectorStoreIndex in the same storage context.
    vector_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

    # Set the index_id for VectorStoreIndex
    vector_index.set_index_id("vector_index")

    # Persist the storage context (writes index_store.json and friends).
    storage_context.persist(persist_dir='./storage_graph_test10')
    logging.debug(f"Vector Index created with {len(documents)} documents")

    # BUG FIX: the original snippet wrote the rel_map over
    # ./storage_graph_test10/index_store.json, clobbering the index store
    # that storage_context.persist() had just written and making both
    # indices unloadable. Save the rel_map snapshot to its own file.
    rel_map = graph_store.get_rel_map()
    save_rel_map_to_json(rel_map, './storage_graph_test10/rel_map.json')

    return kg_index, vector_index, storage_context

if __name__ == "__main__":
    # Guarded entry point so importing this module does not re-run the
    # expensive (LLM-driven) indexing as a side effect.
    print("Creating Index...")
    kg_index, vector_index, storage_context = create_index()
    print("Index Created...")

This script ensures that the rel_map is correctly populated and saved to your index_store.json in the persistent storage [1][2].

To continue talking to Dosu, mention @dosu.