run-llama / llama_index

LlamaIndex is a data framework for your LLM applications
https://docs.llamaindex.ai
MIT License
33.62k stars 4.72k forks source link

[Question]: kg index embeddings insertion #14449

Closed jjoaqu7 closed 2 weeks ago

jjoaqu7 commented 2 weeks ago

Question Validation

Question

Hello, I am wondering how I can use embeddings for querying with this KG index in my script. I have used include_embeddings=True and embedding_mode="hybrid", and I am storing my created index in persistent storage. However, when I try to check whether the embeddings were created and are present in my index, I run into problems. Additionally, in the docstore within my persistent storage I can see that the embedding field is null, which makes me even more confused about whether the embeddings are being generated at all. Can anyone help me here?

from nebula3.gclient.net import ConnectionPool
from nebula3.Config import Config
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    KnowledgeGraphIndex,
    Settings,
    StorageContext,
    PromptTemplate,
    load_index_from_storage
)
from llama_index.core import Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.graph_stores.nebula import NebulaGraphStore
from llama_index.core.query_engine import KnowledgeGraphQueryEngine
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
from typing import List
from llama_index.core.schema import NodeWithScore, QueryBundle
import os
import json
import base64
import subprocess

# --- OpenAI model configuration ---
# Deterministic completions (temperature=0) plus the large embedding model;
# both are registered globally so every index/query engine picks them up.
llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 512

# --- NebulaGraph connection settings (read by NebulaGraphStore) ---
# NOTE(review): these are hard-coded demo credentials; load them from real
# environment/config for anything beyond local testing.
os.environ.update({
    "NEBULA_USER": "root",
    "NEBULA_PASSWORD": "nebula",
    "NEBULA_ADDRESS": "127.0.0.1:9669",
})

# Graph-space schema identifiers shared by the store and the index.
space_name = "embtest"
edge_types, rel_prop_names = ["relationship"], ["relationship"]
tags = ["entity"]

# Build the NebulaGraph-backed graph store and wrap it in a storage context
# so the KG index writes its extracted triplets into NebulaGraph.
nebula_store_kwargs = {
    "space_name": space_name,
    "edge_types": edge_types,
    "rel_prop_names": rel_prop_names,
    "tags": tags,
}
graph_store = NebulaGraphStore(**nebula_store_kwargs)

storage_context = StorageContext.from_defaults(graph_store=graph_store)

# Read the corpus from disk and normalize every document to lowercase
# in place before indexing.
documents = SimpleDirectoryReader("./data/paul_graham").load_data()
for document in documents:
    document.text = document.text.lower()

# Build the KnowledgeGraphIndex. With include_embeddings=True the index
# computes and stores one embedding per extracted triplet; those embeddings
# live in the *index store* (kg_index.index_struct.embedding_dict), NOT on
# the documents or in the docstore — which is why the docstore's
# "embedding" field shows up as null.
print("Generating embeddings and creating KnowledgeGraphIndex...")
kg_index = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    max_triplets_per_chunk=10,
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
    max_knowledge_sequence=15,
    include_embeddings=True,
)

# Debug: confirm the embedding model itself works. NOTE: these are freshly
# computed whole-document embeddings for inspection only — they are not the
# per-triplet embeddings the index stored above.
for doc in documents:
    # BUG FIX: OpenAIEmbedding has no `.embed()` method — the BaseEmbedding
    # API is `get_text_embedding()`. Likewise Document exposes `.doc_id`,
    # not `.id`; the original lines raised AttributeError.
    embedding = embed_model.get_text_embedding(doc.text)
    print(f"Document ID: {doc.doc_id}")
    print(f"Embedding: {embedding[:20]}")  # Print first 20 elements of the embedding

# Persist everything (index store, docstore, graph metadata) to disk.
kg_index.storage_context.persist(persist_dir='./storage_graph2')
print("KnowledgeGraphIndex created and persisted.")

# Reload the persisted KnowledgeGraphIndex. BUG FIX: the storage context must
# be rebuilt from the persist dir so the persisted index store (which holds
# the triplet embeddings) is actually read back; the original reused the
# in-memory context, so nothing was loaded from './storage_graph2'. The
# graph store is reattached explicitly since it lives in NebulaGraph, not on
# disk.
print("Loading KnowledgeGraphIndex from persistent storage...")
loaded_storage_context = StorageContext.from_defaults(
    persist_dir='./storage_graph2',
    graph_store=graph_store,
)
kg_index = load_index_from_storage(storage_context=loaded_storage_context)

# BUG FIX: NebulaGraphStore has no `get_nodes()` API — the original loop
# raised AttributeError. The triplet embeddings created by
# include_embeddings=True live in the KG index struct, keyed per triplet.
print("Printing embeddings from the loaded index:")
embedding_dict = getattr(kg_index.index_struct, "embedding_dict", None) or {}
for triplet_key, embedding in embedding_dict.items():
    print(f"Triplet: {triplet_key}")
    print(f"Embedding: {embedding[:20]}")  # Print first 20 elements of the embedding

print("Loaded KnowledgeGraphIndex and printed embeddings.")

# Build a hybrid query engine over the KG index: keyword triplet lookup plus
# embedding similarity (similarity_top_k caps the embedding matches), with
# source text included and answers synthesized via tree_summarize.
query_engine = kg_index.as_query_engine(
    include_text=True,
    response_mode="tree_summarize",
    embedding_mode="hybrid",
    similarity_top_k=5,
)

# Run a sample query and show the synthesized answer.
response = query_engine.query("What is Hacker news")
print(response)
logan-markewich commented 2 weeks ago

The embeddings are in the index store

logan-markewich commented 2 weeks ago

They won't be directly attached to the documents.

jjoaqu7 commented 2 weeks ago

Oh I see now! Thanks for the help!