HKUDS / LightRAG

"LightRAG: Simple and Fast Retrieval-Augmented Generation"
https://arxiv.org/abs/2410.05779
MIT License

Model outputs gibberish using Local and Global search. #309

Closed: Tejaswgupta closed this 15 hours ago

Tejaswgupta commented 2 days ago

I got a correct result on my first pass playing around with the Dickens book text. However, when I tried some other long-form texts, the naive query mode worked fine, while the other modes took too long and also output gibberish.

I'm using Qwen 2.5 7B/14B INT4 models with a 32k context length, hosted with vLLM on a VM.

Logs: https://pastebin.com/fvy3DsSH
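
To rule out the serving side, a quick sanity check is to hit the vLLM endpoint directly with a long prompt and see whether the raw completion is already garbled, bypassing LightRAG entirely. A minimal sketch; the model name, base URL, and key are the placeholders from the snippet below, and depending on how vLLM is exposed the base URL may need a trailing /v1:

# Hypothetical sanity check: call the vLLM OpenAI-compatible endpoint
# directly with a deliberately long prompt, no LightRAG involved.
from openai import OpenAI

client = OpenAI(
    api_key="emtp",                # placeholder key from the snippet below
    base_url="https://xx.xx.com",  # may need a trailing /v1 for vLLM
)

# Roughly 10k tokens of filler: well inside the 32k window,
# but far longer than a short test prompt.
long_prompt = (
    "Summarize the following text.\n\n"
    + ("The quick brown fox jumps over the lazy dog. " * 1000)
)

resp = client.chat.completions.create(
    model="self-hosted-small",  # the qwen-2.5-7b-gptq-int4 deployment
    messages=[{"role": "user", "content": long_prompt}],
    max_tokens=256,
)
print(resp.choices[0].message.content)

If the output is already gibberish at this length, the problem is on the serving side (e.g. quantization or max-model-len settings) rather than in LightRAG.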

Code snippet:

import os
import asyncio

import numpy as np

from lightrag import LightRAG, QueryParam
from lightrag.llm import openai_complete_if_cache, openai_embedding
from lightrag.utils import EmbeddingFunc

WORKING_DIR = "./dickens"

if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)

async def llm_model_func(
    prompt, system_prompt=None, history_messages=[], **kwargs
) -> str:
    return await openai_complete_if_cache(
        # "gemini-flash",
        "self-hosted-small", #this is the qwen-2.5-7b-gptq-int4 model
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        api_key="emtp",
        base_url="https://xx.xx.com",
        **kwargs,
    )

async def embedding_func(texts: list[str]) -> np.ndarray:
    return await openai_embedding(
        texts,
        model="embeddings-finetune/gte-base-case-law-v2",
        api_key="emtp",
        base_url="https://xx.xx.com",
    )

async def get_embedding_dim():
    test_text = ["This is a test sentence."]
    embedding = await embedding_func(test_text)
    embedding_dim = embedding.shape[1]
    return embedding_dim

# function test
async def test_funcs():
    result = await llm_model_func("How are you?")
    print("llm_model_func: ", result)

    result = await embedding_func(["How are you?"])
    print("embedding_func: ", result)

# asyncio.run(test_funcs())

async def main():
    try:
        embedding_dimension = await get_embedding_dim()
        print(f"Detected embedding dimension: {embedding_dimension}")

        rag = LightRAG(
            working_dir=WORKING_DIR,
            llm_model_func=llm_model_func,
            embedding_func=EmbeddingFunc(
                embedding_dim=embedding_dimension,
                max_token_size=8192,
                func=embedding_func,
            ),
        )

        with open("./book.txt", "r", encoding="utf-8") as f:
            await rag.ainsert(f.read())

        # Perform naive search
        print(
            await rag.aquery(
                "What are the top themes in this story?", param=QueryParam(mode="naive")
            )
        )
        print('---'*50)

        # Perform local search
        print(
            await rag.aquery(
                "What are the top themes in this story?", param=QueryParam(mode="local")
            )
        )
        print('---'*50)

        # Perform global search
        print(
            await rag.aquery(
                "What are the top themes in this story?",
                param=QueryParam(mode="global"),
            )
        )
        print('---'*50)

        # Perform hybrid search
        print(
            await rag.aquery(
                "What are the top themes in this story?",
                param=QueryParam(mode="hybrid"),
            )
        )
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    asyncio.run(main())
nihirv commented 1 day ago

Can you give an example of what gibberish means? Literal gibberish (i.e. strings of tokens that make no sense), or coherent text that is unrelated to the source documents?

Tejaswgupta commented 1 day ago

@nihirv you can check the logs I've shared. It's literal gibberish.