explodinggradients / ragas


Exceeding model token limit issues with open-source models locally hosted via vLLM #1663

Status: Open

YuffieHuang opened this issue 1 week ago

[X] I have checked the documentation and related resources and couldn't resolve my bug.

Hi team, I'm self-hosting open-source LLMs via vLLM and using them as the evaluation model for Ragas. I consistently hit the same bug in my evaluation pipeline regardless of the model (Qwen2.5-32B-Instruct, Mistral-7B, or Mistral-NeMo). The vLLM logs always look fine, but during evaluation the LLM starts to repeat certain sentences or words until it reaches the token limit. The issue always appears after hundreds of question-answer pairs have been sent to the LLM.

When I re-run the evaluation starting from the question-answer pair that caused the failure, the pipeline gives a correct answer without errors, but it fails again at a different pair after another large number of requests. Also, once a query has failed, every request after it fails as well unless I restart the pipeline. I have tried adding time.sleep() between requests, but it doesn't help.

Ragas version: 0.2.4
Python version: 3.12

Code to Reproduce

from datasets import Dataset as ragas_dataset

from langchain_openai import ChatOpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
import pandas as pd
from tqdm import tqdm
import asyncio
from langchain_core.prompts import ChatPromptTemplate

import tempfile
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics._factual_correctness import FactualCorrectness
from ragas.metrics import SemanticSimilarity
from ragas.run_config import RunConfig

run_config = RunConfig(timeout=300, max_retries=15, max_workers=8, log_tenacity=True)  # Set timeout to 300 seconds
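# NOTE: run_config is defined above but not wired into anything else in this snippet.
# If I read the ragas docs correctly, it would normally be passed to the LLM wrapper
# (e.g. LangchainLLMWrapper(chat_model, run_config=run_config)) or to ragas.evaluate()
# so the timeout/retry settings actually take effect; signature assumed, not verified.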

class Evaluator:
    def __init__(self, dataset):
        self.eval_dataset = dataset
        self.embeddings = self.initialize_embeddings()
        self.llm_model = self.initialize_llm()

    def initialize_embeddings(self):
        embedding_function = SentenceTransformerEmbeddings(model_name="BAAI/bge-m3")
        langchain_embeddings = LangchainEmbeddingsWrapper(embedding_function)
        return langchain_embeddings

    def initialize_llm(self):

        model_name = 'Qwen/Qwen2.5-32B-Instruct'
        model_url = 'http://localhost:8000/v1'

        self.model = ChatOpenAI(openai_api_base=model_url,
                                api_key="EMPTY",
                                temperature=0,
                                model_name=model_name)

        # test the connections
        # prompt_template = ChatPromptTemplate.from_messages([
        # ("system", "You are a helpful assistant"),
        # ("user", "Tell me a joke about {topic}")
        # ])
        # print(self.model.invoke(prompt_template.invoke({"topic": "cats"})))

        langchain_llm = LangchainLLMWrapper(self.model)

        return langchain_llm

    async def get_semantic_similarity(self, df):
        print('Evaluating semantic similarity.')
        results = []

        for index, row in tqdm(df.iterrows(), total=len(df)):
            try:
                sample = SingleTurnSample(
                    response=row['Generated_Answer'],
                    reference=row['Expected_Answer']
                )
                scorer = SemanticSimilarity()
                scorer.embeddings = self.embeddings
                similarity_score = await scorer.single_turn_ascore(sample)
                result = {
                    'Question': row['Question'],
                    'Answer_Similarity': similarity_score
                }
                results.append(result)
            except Exception as e:
                print(f'Error handling the question {row["Question"]}: {str(e)}')
                results.append({
                    'Question': row['Question'],
                    'Answer_Similarity': None
                })

        similarity_df = pd.DataFrame(results)
        return similarity_df

    async def get_factual_correctness(self, df):
        print('Evaluating factual correctness.')
        results = []

        for index, row in tqdm(df.iterrows(), total=len(df)):
            try:
                sample = SingleTurnSample(
                    response=row['Generated_Answer'],
                    reference=row['Expected_Answer']
                )
                scorer = FactualCorrectness()
                scorer.llm = self.llm_model
                correctness_score = await scorer.single_turn_ascore(sample)
                result = {
                    'Question': row['Question'],
                    'Factual_Correctness': correctness_score
                }
                results.append(result)
            except Exception as e:
                print(f'Error handling the question {row["Question"]}: {str(e)}')
                results.append({
                    'Question': row['Question'],
                    'Factual_Correctness': None
                })

        correctness_df = pd.DataFrame(results)
        return correctness_df

    async def get_answer_recall(self, df):
        print('Evaluating answer recall.')
        results = []

        for index, row in tqdm(df.iterrows(), total=len(df)):
            try:
                sample = SingleTurnSample(
                    response=row['Generated_Answer'],
                    reference=row['Expected_Answer']
                )
                scorer = FactualCorrectness(mode="recall", atomicity="low", coverage="low")
                scorer.llm = self.llm_model
                recall_score = await scorer.single_turn_ascore(sample)
                result = {
                    'Question': row['Question'],
                    'Answer_Recall': recall_score
                }
                results.append(result)
            except Exception as e:
                print(f'Error handling the question {row["Question"]}: {str(e)}')
                results.append({
                    'Question': row['Question'],
                    'Answer_Recall': None
                })

        recall_df = pd.DataFrame(results)
        return recall_df

    async def get_answer_precision(self, df):
        print('Evaluating answer precision.')
        results = []

        for index, row in tqdm(df.iterrows(), total=len(df)):
            try:
                sample = SingleTurnSample(
                    response=row['Generated_Answer'],
                    reference=row['Expected_Answer']
                )
                scorer = FactualCorrectness(mode="precision", atomicity="low", coverage="low")
                scorer.llm = self.llm_model
                precision_score = await scorer.single_turn_ascore(sample)
                result = {
                    'Question': row['Question'],
                    'Answer_Precision': precision_score
                }
                results.append(result)
            except Exception as e:
                print(f'Error handling the question {row["Question"]}: {str(e)}')
                results.append({
                    'Question': row['Question'],
                    'Answer_Precision': None
                })

        precision_df = pd.DataFrame(results)
        return precision_df
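
For completeness, this is roughly how I drive the class above (illustrative sketch; the CSV path is a placeholder, and the column names match those used in the methods):

# Illustrative driver: load a QA dataframe with 'Question', 'Generated_Answer',
# and 'Expected_Answer' columns, then run one of the metrics end to end.
if __name__ == "__main__":
    df = pd.read_csv("qa_pairs.csv")  # placeholder path
    evaluator = Evaluator(df)
    correctness_df = asyncio.run(evaluator.get_factual_correctness(df))
    correctness_df.to_csv("factual_correctness_results.csv", index=False)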

Error trace: attached as a screenshot in the original issue.

Expected behavior
The evaluation pipeline should be able to complete all evaluations on precision and recall.