HKUDS / LightRAG

"LightRAG: Simple and Fast Retrieval-Augmented Generation"
https://arxiv.org/abs/2410.05779
MIT License

Neo4j Instance: Unable to retrieve routing information #222

Open TharunSivamani opened 2 weeks ago

TharunSivamani commented 2 weeks ago

I am trying out the LightRAG setup that uses Neo4j as storage, as stated in the README:

[2024.11.04]🎯📢You can use Neo4J for Storage now.

Steps to reproduce the error

  1. Create a free Neo4j online instance and launch it.
  2. Save the password and connection URI in a .env file (example below), then run the script that follows.
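
For reference, a minimal .env file might look like this (placeholder values, not real credentials; a free Aura instance uses a neo4j+s:// connection URI):

NEO4J_URI=neo4j+s://xxxxxxxx.databases.neo4j.io
NEO4J_USERNAME=neo4j
NEO4J_PASSWORD=your-password-here

The script (neon.py):
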
# At the top of the file, after the imports
import os
from dotenv import load_dotenv

# Load environment variables from .env file if it exists
load_dotenv()

# Set Neo4j environment variables with fallback values
NEO4J_URI = os.getenv("NEO4J_URI", "")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

if not NEO4J_PASSWORD:
    raise ValueError("NEO4J_PASSWORD environment variable must be set")

import asyncio
import aiohttp
import numpy as np
import nest_asyncio
import shutil
from lightrag import LightRAG, QueryParam
from lightrag.llm import openai_complete_if_cache
from lightrag.utils import EmbeddingFunc
from neo4j import AsyncGraphDatabase

# Allow nested event loops for Jupyter or environments that require it
nest_asyncio.apply()

WORKING_DIR = "neo4j-local"

if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)

# Define the LLM model function
async def llm_model_func(
    prompt, system_prompt=None, history_messages=[], **kwargs
) -> str:
    return await openai_complete_if_cache(
        "meta/llama-3.1-8b-instruct",
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        api_key="ollama",
        base_url="http://xxx:yyy/v1",
        **kwargs,
    )

# Fetch embeddings from server function
async def fetch_embeddings_from_server(
    texts: list[str],
    model: str = "nvidia/nv-embedqa-e5-v5",
    base_url: str = "http://xxx:yyy/v1/embeddings",
    api_key: str = None,
    input_type: str = "passage",
    encoding_format: str = "float",
    user: str = "user-identifier",
    truncate: str = "END",
    max_token_size: int = 512
) -> np.ndarray:
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
    payload = {
        "input": texts,
        "model": model,
        "input_type": input_type,
        "encoding_format": encoding_format,
        "user": user,
        "truncate": truncate
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(base_url, headers=headers, json=payload) as response:
            response_data = await response.json()
            if 'data' not in response_data:
                raise ValueError(f"Unexpected response format: {response_data}")
            embeddings = [item["embedding"] for item in response_data["data"]]
            return np.array(embeddings)

# Async function to determine embedding dimension
async def get_embedding_dim():
    test_text = ["This is a test sentence."]
    embedding = await fetch_embeddings_from_server(test_text)
    embedding_dim = embedding.shape[1]
    print(f"Embedding Dimensions: {embedding.shape}")
    return embedding_dim

# Run an async function to set up LightRAG
async def main():

    embedding_dimension = await get_embedding_dim()

    # Initialize the LightRAG instance with Neo4j configuration
    rag = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=llm_model_func,
        embedding_func=EmbeddingFunc(
            embedding_dim=embedding_dimension,
            max_token_size=8192,
            func=fetch_embeddings_from_server,
        ),
        kg="Neo4JStorage",
        log_level="DEBUG"
    )

    try:
        # Load text into the knowledge graph
        with open("./book.txt") as f:
            await rag.ainsert(f.read())  # ainsert is the async counterpart of insert()

        # Perform queries with different search modes
        for mode in ["naive", "local", "global", "hybrid"]:
            try:
                # aquery is the async counterpart of query()
                result = await rag.aquery(
                    "What are the top themes in this story?",
                    param=QueryParam(mode=mode)
                )
                print(f"\nResults for {mode} mode:")
                print(result)
            except Exception as e:
                print(f"Query failed for {mode} mode: {str(e)}")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Run the main async function
asyncio.run(main())
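
Note: the script imports AsyncGraphDatabase but never checks the connection directly. To separate driver-level connectivity from LightRAG, a minimal check like the following (a sketch against the neo4j 5.x Python driver; it reads the same environment variables as the script) should reproduce the error on its own:

import asyncio
import os
from neo4j import AsyncGraphDatabase

async def check_connection():
    # verify_connectivity() needs the routing table for neo4j:// and
    # neo4j+s:// URIs, so a bad scheme or a blocked port surfaces the
    # same "Unable to retrieve routing information" error here
    driver = AsyncGraphDatabase.driver(
        os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"]),
    )
    try:
        await driver.verify_connectivity()
        print("Neo4j connection OK")
    finally:
        await driver.close()

asyncio.run(check_connection())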

Output:

(rag) root@labserver04:~/tharun/Light-RAG# python3 neon.py 
Embedding Dimensions: (1, 1024)
INFO:lightrag:Logger initialized for working directory: neo4j-local
DEBUG:lightrag:LightRAG init with param:
  working_dir = neo4j-local,
  kg = Neo4JStorage,
  log_level = DEBUG,
  chunk_token_size = 1200,
  chunk_overlap_token_size = 100,
  tiktoken_model_name = gpt-4o-mini,
  entity_extract_max_gleaning = 1,
  entity_summary_to_max_tokens = 500,
  node_embedding_algorithm = node2vec,
  node2vec_params = {'dimensions': 1536, 'num_walks': 10, 'walk_length': 40, 'window_size': 2, 'iterations': 3, 'random_seed': 3},
  embedding_func = {'embedding_dim': 1024, 'max_token_size': 8192, 'func': <function fetch_embeddings_from_server at 0x7fc6bbcea680>},
  embedding_batch_num = 32,
  embedding_func_max_async = 16,
  llm_model_func = <function llm_model_func at 0x7fc6bbcea710>,
  llm_model_name = meta-llama/Llama-3.2-1B-Instruct,
  llm_model_max_token_size = 32768,
  llm_model_max_async = 16,
  llm_model_kwargs = {},
  key_string_value_json_storage_cls = <class 'lightrag.storage.JsonKVStorage'>,
  vector_db_storage_cls = <class 'lightrag.storage.NanoVectorDBStorage'>,
  vector_db_storage_cls_kwargs = {},
  enable_llm_cache = True,
  addon_params = {},
  convert_response_to_json_func = <function convert_response_to_json at 0x7fc6c9742d40>

INFO:lightrag:Load KV full_docs with 0 data
INFO:lightrag:Load KV text_chunks with 0 data
INFO:lightrag:Load KV llm_response_cache with 84 data
INFO:nano-vectordb:Load (0, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': 'neo4j-local/vdb_entities.json'} 0 data
INFO:nano-vectordb:Load (0, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': 'neo4j-local/vdb_relationships.json'} 0 data
INFO:nano-vectordb:Load (42, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': 'neo4j-local/vdb_chunks.json'} 42 data
INFO:lightrag:[New Docs] inserting 1 docs
INFO:lightrag:[New Chunks] inserting 42 chunks
INFO:lightrag:Inserting 42 vectors to chunks
INFO:lightrag:[Entity Extraction]...
⠹ Processed 42 chunks, 689 entities(duplicated), 427 relations(duplicated)
ERROR:neo4j.pool:Unable to retrieve routing information
KG successfully indexed.
An error occurred: Unable to retrieve routing information
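
For context: this error is raised by the neo4j driver's routing layer, not by LightRAG itself. One common cause with free Aura instances is a plain neo4j:// or bolt:// URI where the encrypted neo4j+s:// scheme is required. A quick sanity check of the configured scheme (a sketch, assuming the same NEO4J_URI environment variable as above):

import os

uri = os.getenv("NEO4J_URI", "")
if not uri.startswith("neo4j+s://"):
    # Aura endpoints expect the encrypted routing scheme
    print(f"Suspicious URI scheme for an Aura instance: {uri!r}")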