[X] I have checked the documentation and related resources and couldn't resolve my bug.
Hi team, I'm self-hosting open-source LLMs via vLLM as the evaluation model for Ragas. I consistently hit a bug in my evaluation pipeline with these self-hosted models, regardless of whether I use Qwen2.5-32B-Instruct, Mistral-7B, or Mistral-Nemo. The vLLM logs always look fine, but during evaluation the LLM starts to repeat certain sentences or words until it reaches the token limit.
The issue only appears after hundreds of question-answer pairs have been sent to the LLM. When I re-run the evaluation starting from the pair that triggered the failure, the pipeline scores it correctly without errors, but it then fails again at a different pair after another large number of requests.
Also, once a query has failed, every subsequent request fails as well until the pipeline is restarted.
I have tried adding time.sleep() between requests, but it doesn't help.
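For triage, the endpoint can also be probed directly, outside of Ragas, with a request like the one below. This is a minimal sketch only: the base URL and model name are the same ones used in the reproduce code, and max_tokens is an arbitrary cap added purely so a repetition loop cannot run all the way to the context limit.

from openai import OpenAI

# Point the standard OpenAI client at the vLLM OpenAI-compatible server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen2.5-32B-Instruct",
    messages=[{"role": "user", "content": "Reply with the single word OK."}],
    temperature=0,
    max_tokens=32,  # bound the completion so runaway repetition is cut off early
)
print(response.choices[0].message.content)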
Ragas version: 0.2.4
Python version: 3.12
Code to Reproduce
from datasets import Dataset as ragas_dataset
from langchain_openai import ChatOpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
import pandas as pd
from tqdm import tqdm
import asyncio
from langchain_core.prompts import ChatPromptTemplate
import tempfile
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics._factual_correctness import FactualCorrectness
from ragas.metrics import SemanticSimilarity
from ragas.run_config import RunConfig
run_config = RunConfig(timeout=300, max_retries=15, max_workers=8, log_tenacity=True) # Set timeout to 300 seconds
class Evaluator:
    def __init__(self, dataset):
        self.eval_dataset = dataset
        self.embeddings = self.initialize_embeddings()
        self.llm_model = self.initialize_llm()

    def initialize_embeddings(self):
        embedding_function = SentenceTransformerEmbeddings(model_name="BAAI/bge-m3")
        langchain_embeddings = LangchainEmbeddingsWrapper(embedding_function)
        return langchain_embeddings

    def initialize_llm(self):
        model_name = 'Qwen/Qwen2.5-32B-Instruct'
        model_url = 'http://localhost:8000/v1'
        self.model = ChatOpenAI(openai_api_base=model_url,
                                api_key="EMPTY",
                                temperature=0,
                                model_name=model_name)
        # test the connection
        # prompt_template = ChatPromptTemplate.from_messages([
        #     ("system", "You are a helpful assistant"),
        #     ("user", "Tell me a joke about {topic}")
        # ])
        # print(self.model.invoke(prompt_template.invoke({"topic": "cats"})))
        langchain_llm = LangchainLLMWrapper(self.model)
        return langchain_llm
    async def get_semantic_similarity(self, df):
        print('Evaluating semantic similarity.')
        results = []
        for index, row in tqdm(df.iterrows(), total=len(df)):
            try:
                sample = SingleTurnSample(
                    response=row['Generated_Answer'],
                    reference=row['Expected_Answer']
                )
                scorer = SemanticSimilarity()
                scorer.embeddings = self.embeddings
                similarity_score = await scorer.single_turn_ascore(sample)
                result = {
                    'Question': row['Question'],
                    'Answer_Similarity': similarity_score
                }
                results.append(result)
            except Exception as e:
                print(f'Error handling the question {row["Question"]}: {str(e)}')
                results.append({
                    'Question': row['Question'],
                    'Answer_Similarity': None
                })
        similarity_df = pd.DataFrame(results)
        return similarity_df
    async def get_factual_correctness(self, df):
        print('Evaluating factual correctness.')
        results = []
        for index, row in tqdm(df.iterrows(), total=len(df)):
            try:
                sample = SingleTurnSample(
                    response=row['Generated_Answer'],
                    reference=row['Expected_Answer']
                )
                scorer = FactualCorrectness()
                scorer.llm = self.llm_model
                correctness_score = await scorer.single_turn_ascore(sample)
                result = {
                    'Question': row['Question'],
                    'Factual_Correctness': correctness_score
                }
                results.append(result)
            except Exception as e:
                print(f'Error handling the question {row["Question"]}: {str(e)}')
                results.append({
                    'Question': row['Question'],
                    'Factual_Correctness': None
                })
        correctness_df = pd.DataFrame(results)
        return correctness_df
    async def get_answer_recall(self, df):
        print('Evaluating answer recall.')
        results = []
        for index, row in tqdm(df.iterrows(), total=len(df)):
            try:
                sample = SingleTurnSample(
                    response=row['Generated_Answer'],
                    reference=row['Expected_Answer']
                )
                scorer = FactualCorrectness(mode="recall", atomicity="low", coverage="low")
                scorer.llm = self.llm_model
                recall_score = await scorer.single_turn_ascore(sample)
                result = {
                    'Question': row['Question'],
                    'Answer_Recall': recall_score
                }
                results.append(result)
            except Exception as e:
                print(f'Error handling the question {row["Question"]}: {str(e)}')
                results.append({
                    'Question': row['Question'],
                    'Answer_Recall': None
                })
        recall_df = pd.DataFrame(results)
        return recall_df
    async def get_answer_precision(self, df):
        print('Evaluating answer precision.')
        results = []
        for index, row in tqdm(df.iterrows(), total=len(df)):
            try:
                sample = SingleTurnSample(
                    response=row['Generated_Answer'],
                    reference=row['Expected_Answer']
                )
                scorer = FactualCorrectness(mode="precision", atomicity="low", coverage="low")
                scorer.llm = self.llm_model
                precision_score = await scorer.single_turn_ascore(sample)
                result = {
                    'Question': row['Question'],
                    'Answer_Precision': precision_score
                }
                results.append(result)
            except Exception as e:
                print(f'Error handling the question {row["Question"]}: {str(e)}')
                results.append({
                    'Question': row['Question'],
                    'Answer_Precision': None
                })
        precision_df = pd.DataFrame(results)
        return precision_df
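The listing above defines the Evaluator but not the driver that calls it. The failure shows up while iterating through the metric methods roughly as sketched below; this is an illustration only, the CSV path is a placeholder, and the DataFrame is assumed to have the columns the methods expect ('Question', 'Generated_Answer', 'Expected_Answer').

# Hypothetical driver; the real dataset loading is not shown in this issue.
async def main():
    df = pd.read_csv("eval_pairs.csv")  # placeholder path
    evaluator = Evaluator(df)
    similarity_df = await evaluator.get_semantic_similarity(df)
    correctness_df = await evaluator.get_factual_correctness(df)
    recall_df = await evaluator.get_answer_recall(df)
    precision_df = await evaluator.get_answer_precision(df)
    print(similarity_df.head(), correctness_df.head(), recall_df.head(), precision_df.head())

if __name__ == "__main__":
    asyncio.run(main())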
Error trace
Expected behavior
The evaluation pipeline should be able to complete all evaluations on precision and recall.
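For reference, the evaluator LLM could also be constructed with an explicit cap on completion length, so that a repetition loop is cut off early instead of running until the context window is full. This is a sketch only; max_tokens=512 is an arbitrary value for illustration, the pipeline above does not set it, and whether it avoids the underlying issue is unverified.

# Same endpoint and model as initialize_llm() above, with an explicit completion cap.
capped_model = ChatOpenAI(
    openai_api_base="http://localhost:8000/v1",
    api_key="EMPTY",
    model_name="Qwen/Qwen2.5-32B-Instruct",
    temperature=0,
    max_tokens=512,  # arbitrary cap for illustration
)
capped_llm = LangchainLLMWrapper(capped_model)

Separately, the RunConfig defined at the top of the reproduce code is never passed anywhere, because the metrics are scored per row via single_turn_ascore rather than through ragas.evaluate; if evaluate were used instead, the config could presumably be supplied via its run_config argument.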