explodinggradients / ragas

Evaluation framework for your Retrieval Augmented Generation (RAG) pipelines
https://docs.ragas.io
Apache License 2.0

RuntimeError #1212

Open minglong-huang opened 3 weeks ago

minglong-huang commented 3 weeks ago

Here is my result: [screenshot of the RuntimeError]

and my code:

import typing as t
import asyncio
from typing import List
from datasets import load_dataset, load_from_disk
from ragas.metrics import faithfulness, context_recall, context_precision
from ragas.metrics import AnswerRelevancy
from ragas import evaluate
from ragas.llms import BaseRagasLLM
from langchain.schema import LLMResult
from langchain.schema import Generation
from langchain.callbacks.base import Callbacks
from langchain.schema.embeddings import Embeddings
from transformers import AutoModel, AutoTokenizer
from ragas.llms.prompt import PromptValue
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from FlagEmbedding import FlagModel
from ragas.metrics import answer_relevancy
from langchain_core.language_models import BaseLanguageModel
from langchain_core.embeddings import Embeddings
from ragas.llms import BaseRagasLLM
from ragas.embeddings import BaseRagasEmbeddings

class MyLLM(BaseRagasLLM):

    def __init__(self,llm_path):
        self.tokenizer = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True)
        self.base_llm = AutoModel.from_pretrained(llm_path, trust_remote_code=True).cuda()
        self.base_llm = self.base_llm.eval()

    @property
    def llm(self):
        return self.base_llm

    def get_llm_result(self, prompt):
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        text, history = self.base_llm.chat(self.tokenizer, content, history=[])

        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        return LLMResult(generations=generations, llm_output=llm_output)

    def generate_text(
            self,
            prompt: PromptValue,
            n: int = 1,
            temperature: float = 1e-8,
            stop: t.Optional[t.List[str]] = None,
            callbacks: Callbacks = [],
    ):
        result = self.get_llm_result(prompt)
        return result

    async def agenerate_text(
            self,
            prompt: PromptValue,
            n: int = 1,
            temperature: float = 1e-8,
            stop: t.Optional[t.List[str]] = None,
            callbacks: Callbacks = [],
    ) -> LLMResult:
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        text, history = await asyncio.get_event_loop().run_in_executor(None, self.base_llm.chat, self.tokenizer,
                                                                       content, [])

        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        result = LLMResult(generations=generations, llm_output=llm_output)
        return result

class MyEmbedding(Embeddings):

    def __init__(self, path,max_length=8192, batch_size=256):
        # self.model = AutoModel.from_pretrained(llm_path, trust_remote_code=True).cuda()
        self.model = FlagModel(path, query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:")
        self.max_length = max_length
        self.batch_size = batch_size

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode_corpus(texts, self.batch_size, self.max_length).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode_queries(text, self.batch_size, self.max_length).tolist()

# Data
data_path = "/home/nlp/graphrag/eval_dataset/amnesty_qa"
amnesty_qa = load_dataset("/home/kelvin/nlp/graphrag/eval_dataset/amnesty_qa")

MODEL_PATH = '/home/nlp/model/LLM/THUDM/glm-4-9b-chat'
embed_model_path = '/home/nlp/model/Embedding/BAAI/bge-m3'

embedding_model = MyEmbedding(embed_model_path)
my_llm = MyLLM(MODEL_PATH)

ans_relevancy = AnswerRelevancy()

result = evaluate(
    amnesty_qa["eval"],
    metrics=[context_recall, context_precision, ans_relevancy, faithfulness],
    llm=my_llm,
    embeddings=embedding_model
)

df = result.to_pandas()
print(df.head())
df.to_csv("result.csv", index=False)

Dataset source: https://huggingface.co/datasets/explodinggradients/amnesty_qa

dosubot[bot] commented 3 weeks ago

Hello @minglong-huang! I'm here to help you with any bugs, questions, or contributions. Let's tackle this together!

The RuntimeError you're encountering is related to a device-side assertion triggered during evaluation on a CUDA device. This can happen due to various reasons, such as mismatched tensor devices or invalid operations on CUDA tensors.

Here are a few steps to help you debug and resolve this issue:

  1. Ensure All Tensors Are on the Same Device: Make sure that all tensors involved in the computation are on the same CUDA device. You can check and move tensors to the correct device using .to('cuda').

  2. Enable Device-Side Assertions: Set the environment variable TORCH_USE_CUDA_DSA to 1 to enable device-side assertions, which can provide more detailed error messages:

    export TORCH_USE_CUDA_DSA=1
  3. Check for Asynchronous CUDA Errors: CUDA operations are asynchronous, so errors may not be reported immediately. Insert torch.cuda.synchronize() at strategic points in your code to force synchronization and catch errors earlier (see the sketch after this list).

  4. Review Custom LLM and Embedding Implementations: Ensure that your custom LLM and embedding implementations correctly handle CUDA devices. For example, verify that the models are moved to the CUDA device and that all operations are performed on the same device.
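
To make steps 1 and 3 concrete, here is a minimal, hypothetical debugging sketch (the chat_with_sync helper is not part of ragas; it assumes the base_llm.chat interface used in your MyLLM class). Wrapping the model call in explicit synchronization makes a device-side assertion surface at the call that triggered it, rather than at some later, unrelated line:

import torch

def chat_with_sync(base_llm, tokenizer, content):
    # Confirm the model's parameters live on the CUDA device you expect.
    device = next(base_llm.parameters()).device
    assert device.type == "cuda", f"model is on {device}, expected a CUDA device"

    torch.cuda.synchronize()  # flush any previously queued kernels
    text, _history = base_llm.chat(tokenizer, content, history=[])
    torch.cuda.synchronize()  # surface asynchronous CUDA errors from this call
    return text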

Here is a modified version of your MyLLM and MyEmbedding classes to ensure proper device handling:

class MyLLM(BaseRagasLLM):

    def __init__(self, llm_path):
        self.tokenizer = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True)
        self.base_llm = AutoModel.from_pretrained(llm_path, trust_remote_code=True).cuda()
        self.base_llm = self.base_llm.eval()

    @property
    def llm(self):
        return self.base_llm

    def get_llm_result(self, prompt):
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        text, history = self.base_llm.chat(self.tokenizer, content, history=[])

        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        return LLMResult(generations=generations, llm_output=llm_output)

    def generate_text(self, prompt: PromptValue, n: int = 1, temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, callbacks: Callbacks = []):
        result = self.get_llm_result(prompt)
        return result

    async def agenerate_text(self, prompt: PromptValue, n: int = 1, temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, callbacks: Callbacks = []) -> LLMResult:
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        text, history = await asyncio.get_event_loop().run_in_executor(None, self.base_llm.chat, self.tokenizer, content, [])

        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        result = LLMResult(generations=generations, llm_output=llm_output)
        return result

class MyEmbedding(Embeddings):

    def __init__(self, path, max_length=8192, batch_size=256):
        self.model = FlagModel(path, query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:").cuda()
        self.max_length = max_length
        self.batch_size = batch_size

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode_corpus(texts, self.batch_size, self.max_length).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode_queries(text, self.batch_size, self.max_length).tolist()

  5. Error Handling in the evaluate Function: The evaluate function in the ragas library has an error-handling mechanism for CUDA-related runtime errors. You can set the raise_exceptions parameter to True to raise exceptions if any of the metrics fail, which can help you identify the exact cause of the error:
    result = evaluate(
       amnesty_qa["eval"],
       metrics=[context_recall, context_precision, ans_relevancy, faithfulness],
       llm=my_llm,
       embeddings=embedding_model,
       raise_exceptions=True
    )

By following these steps, you should be able to identify and resolve the CUDA RuntimeError during evaluation. If the issue persists, consider simplifying your code to isolate the problem and ensure that all components are correctly configured for CUDA usage.

To continue talking to Dosu, mention @dosu.

minglong-huang commented 3 weeks ago

Thanks for your help, but it didn't work.

I made a change to my code: in the MyLLM class, I removed .cuda():

self.base_llm = AutoModel.from_pretrained(llm_path, trust_remote_code=True).cuda() -> self.base_llm = AutoModel.from_pretrained(llm_path, trust_remote_code=True)

The program no longer reports an error, but it has produced no output after running for about 3 hours. [screenshot] @dosu

dosubot[bot] commented 3 weeks ago

Given that removing .cuda() from your model initialization resolved the error but caused the program to hang, it seems like the issue might be related to the model not being able to utilize the GPU, leading to extremely slow processing on the CPU.

Here are a few steps to address this:

  1. Re-enable CUDA: Re-enable CUDA for your model and embeddings to leverage GPU acceleration. Ensure that all tensors and models are moved to the same CUDA device.

  2. Debugging with a Smaller Dataset: Test your code with a small subset of your dataset to quickly determine whether the issue is related to dataset size or model configuration (see the sketch after this list).

  3. Check for Deadlocks: Ensure there are no deadlocks or infinite loops in your custom LLM and embedding implementations.

  4. Add Logging: Add logging statements to track the progress and identify where the code might be hanging.
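
As a quick check for step 2, here is a minimal, hypothetical sketch (it assumes the Hugging Face datasets API and reuses the names from your script) that evaluates only the first few rows so a single pass finishes quickly:

# Hypothetical quick-debug run: slice the evaluation split down to a handful of rows.
small_eval = amnesty_qa["eval"].select(range(5))  # first 5 examples only

result = evaluate(
    small_eval,
    metrics=[context_recall, context_precision, ans_relevancy, faithfulness],
    llm=my_llm,
    embeddings=embedding_model,
    raise_exceptions=True,
)
print(result)

If this small run completes, the hang is more likely a throughput problem (CPU-only inference) than a logic error; if it also hangs, the logging added below should show which call never returns.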

Here is the updated code with CUDA enabled and added logging:

import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class MyLLM(BaseRagasLLM):

    def __init__(self, llm_path):
        self.tokenizer = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True)
        self.base_llm = AutoModel.from_pretrained(llm_path, trust_remote_code=True).cuda()
        self.base_llm = self.base_llm.eval()

    @property
    def llm(self):
        return self.base_llm

    def get_llm_result(self, prompt):
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        logger.info("Generating text for prompt: %s", content)
        text, history = self.base_llm.chat(self.tokenizer, content, history=[])
        logger.info("Generated text: %s", text)

        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        return LLMResult(generations=generations, llm_output=llm_output)

    def generate_text(self, prompt: PromptValue, n: int = 1, temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, callbacks: Callbacks = []):
        result = self.get_llm_result(prompt)
        return result

    async def agenerate_text(self, prompt: PromptValue, n: int = 1, temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, callbacks: Callbacks = []) -> LLMResult:
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        logger.info("Asynchronously generating text for prompt: %s", content)
        text, history = await asyncio.get_event_loop().run_in_executor(None, self.base_llm.chat, self.tokenizer, content, [])
        logger.info("Asynchronously generated text: %s", text)

        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        result = LLMResult(generations=generations, llm_output=llm_output)
        return result

class MyEmbedding(Embeddings):

    def __init__(self, path, max_length=8192, batch_size=256):
        self.model = FlagModel(path, query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:").cuda()
        self.max_length = max_length
        self.batch_size = batch_size

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        logger.info("Embedding documents")
        embeddings = self.model.encode_corpus(texts, self.batch_size, self.max_length).tolist()
        logger.info("Embedded documents")
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        logger.info("Embedding query")
        embedding = self.model.encode_queries(text, self.batch_size, self.max_length).tolist()
        logger.info("Embedded query")
        return embedding

# Data
data_path = "/home/nlp/graphrag/eval_dataset/amnesty_qa"
amnesty_qa = load_dataset("/home/kelvin/nlp/graphrag/eval_dataset/amnesty_qa")

MODEL_PATH = '/home/nlp/model/LLM/THUDM/glm-4-9b-chat'
embed_model_path = '/home/nlp/model/Embedding/BAAI/bge-m3'

embedding_model = MyEmbedding(embed_model_path)
my_llm = MyLLM(MODEL_PATH)

ans_relevancy = AnswerRelevancy()

result = evaluate(
    amnesty_qa["eval"],
    metrics=[context_recall, context_precision, ans_relevancy, faithfulness],
    llm=my_llm,
    embeddings=embedding_model,
    raise_exceptions=True
)

df = result.to_pandas()
print(df.head())
df.to_csv("result.csv", index=False)

By re-enabling CUDA and adding logging, you should be able to identify where the code is hanging and ensure that the evaluation leverages GPU acceleration for better performance.

To continue talking to Dosu, mention @dosu.