Open minglong-huang opened 3 weeks ago
To use local LLMs and embeddings for Ragas, you need to ensure that your custom LLM and embeddings are correctly integrated and wrapped. Here are the steps to follow, along with the necessary code adjustments:
Initialize your custom LLM and Embeddings:
Replace the default OpenAI models with your local models — for example, m2-bert-80M-8k-retrieval for embeddings and NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT as the LLM.
Wrap the custom LLM and Embeddings:
Use LangchainLLMWrapper and LangchainEmbeddingsWrapper to wrap your custom models.
Pass the wrapped LLM and Embeddings to the evaluate function:
Ensure that the wrapped models are passed correctly to the evaluation function.
Here is the updated code based on your provided script:
import typing as t
import asyncio
from typing import List
from datasets import Dataset
from ragas.metrics import faithfulness, context_recall, context_precision, answer_relevancy
from ragas import evaluate
from ragas.llms import BaseRagasLLM, LangchainLLMWrapper
from langchain.schema import LLMResult, Generation
from langchain.callbacks.base import Callbacks
from langchain.schema.embeddings import Embeddings
from transformers import AutoModel, AutoTokenizer
from ragas.llms.prompt import PromptValue
from ragas.embeddings import LangchainEmbeddingsWrapper
class MyLLM(BaseRagasLLM):
    """Ragas LLM adapter around a locally stored Hugging Face chat model.

    The model is loaded with ``AutoModel`` and ``trust_remote_code=True`` and
    is called through its ``chat(tokenizer, query, history=...)`` method, which
    is expected to return a ``(text, history)`` pair.
    """

    # Seconds to wait for a single asynchronous generation before giving up.
    _ASYNC_TIMEOUT_S = 150

    def __init__(self, llm_path):
        """Load the tokenizer and model from ``llm_path`` and switch to eval mode."""
        self.tokenizer = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True)
        self.base_llm = AutoModel.from_pretrained(llm_path, trust_remote_code=True)
        self.base_llm = self.base_llm.eval()

    @property
    def llm(self):
        """Return the underlying Hugging Face model."""
        return self.base_llm

    def _chat(self, content):
        """Run one blocking chat turn with an empty history and return the reply text."""
        text, _history = self.base_llm.chat(self.tokenizer, content, history=[])
        return text

    @staticmethod
    def _to_llm_result(text):
        """Wrap a single reply string in the ``LLMResult`` structure."""
        # NOTE: ``token_total`` is the character length of the reply, not a
        # real token count; the key is kept unchanged for existing consumers.
        llm_output = {'token_total': len(text)}
        return LLMResult(generations=[[Generation(text=text)]], llm_output=llm_output)

    def get_llm_result(self, prompt):
        """Generate one reply for ``prompt`` synchronously.

        Args:
            prompt: Object exposing ``to_string()`` (a ``PromptValue``).

        Returns:
            An ``LLMResult`` holding exactly one generation.
        """
        return self._to_llm_result(self._chat(prompt.to_string()))

    def generate_text(
        self,
        prompt: PromptValue,
        n: int = 1,
        temperature: float = 1e-8,
        stop: t.Optional[t.List[str]] = None,
        callbacks: Callbacks = None,
    ) -> LLMResult:
        """Synchronous generation entry point.

        ``n``, ``temperature``, ``stop`` and ``callbacks`` are accepted for
        interface compatibility but are not forwarded to the model; a single
        generation is always returned.
        """
        return self.get_llm_result(prompt)

    async def agenerate_text(
        self,
        prompt: PromptValue,
        n: int = 1,
        temperature: float = 1e-8,
        stop: t.Optional[t.List[str]] = None,
        callbacks: Callbacks = None,
    ) -> LLMResult:
        """Asynchronous generation entry point.

        The blocking ``chat`` call runs in the default executor so the event
        loop stays responsive. ``n``, ``temperature``, ``stop`` and
        ``callbacks`` are accepted for interface compatibility but ignored.

        Raises:
            asyncio.TimeoutError: No reply arrived within the timeout.
            asyncio.CancelledError: The surrounding task was cancelled.
            Exception: Any error raised by the model call, re-raised after
                being reported.
        """
        content = prompt.to_string()
        loop = asyncio.get_running_loop()
        try:
            text = await asyncio.wait_for(
                loop.run_in_executor(None, self._chat, content),
                timeout=self._ASYNC_TIMEOUT_S,
            )
        # Each handler reports and re-raises: falling through would reach the
        # result construction with ``text`` unbound and mask the real failure
        # behind an UnboundLocalError.
        except asyncio.TimeoutError:
            print("Operation timed out, please check the code or increase the timeout")
            raise
        except asyncio.CancelledError:
            print("Task was cancelled, please check the code")
            raise
        except Exception as e:
            print(f"An unknown error occurred: {e}")
            raise
        return self._to_llm_result(text)
class MyEmbedding(Embeddings):
    """LangChain ``Embeddings`` implementation that delegates to a ``FlagModel``."""

    # NOTE(review): ``FlagModel`` is not imported anywhere in the visible
    # snippet — presumably ``from FlagEmbedding import FlagModel``; confirm and
    # add the import, otherwise instantiation fails with NameError.

    def __init__(self, path, max_length=8192, batch_size=256):
        """Load the embedding model from ``path`` and store the encoding limits."""
        self.batch_size = batch_size
        self.max_length = max_length
        self.model = FlagModel(
            path,
            query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
        )

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Encode ``texts`` as corpus passages and return them as nested lists."""
        corpus_vectors = self.model.encode_corpus(texts, self.batch_size, self.max_length)
        return corpus_vectors.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Encode a single query string and return it as a list."""
        query_vector = self.model.encode_queries(text, self.batch_size, self.max_length)
        return query_vector.tolist()
# Data: two sample questions with generated answers, retrieved contexts and
# reference answers, in the column layout passed to ragas.evaluate below.
data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}
dataset = Dataset.from_dict(data_samples)

# Model paths
MODEL_PATH = '/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat'
embed_model_path = '/home/kelvin/nlp/model/Embedding/BAAI/bge-m3'

# Initialize custom LLM and Embeddings
embedding_model = MyEmbedding(embed_model_path)
my_llm = MyLLM(MODEL_PATH)

# MyLLM subclasses BaseRagasLLM, so it is handed to evaluate() as-is.
# LangchainLLMWrapper is meant for LangChain language models: it forwards to
# the wrapped object's LangChain generation methods, which MyLLM does not have.
# MyEmbedding is a LangChain Embeddings subclass, so it is wrapped.
wrapped_embeddings = LangchainEmbeddingsWrapper(embedding_model)

# Evaluate
result = evaluate(
    dataset,
    metrics=[context_recall, context_precision, answer_relevancy, faithfulness],
    llm=my_llm,
    embeddings=wrapped_embeddings,
    raise_exceptions=True,
)

# Show the first rows and persist the per-sample scores.
df = result.to_pandas()
print(df.head())
df.to_csv("result.csv", index=False)
This code ensures that your custom LLM and embeddings are passed to the evaluate function in the form it expects. The LangchainLLMWrapper and LangchainEmbeddingsWrapper classes adapt LangChain-compatible models to the interfaces Ragas expects [1][2].
To continue talking to Dosu, mention @dosu.
Sorry, but it didn't work after I ran your code.
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.chains import RetrievalQA
import os
import openai
import time
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, LlamaTokenizerFast
from typing import Any, List, Optional
import torch
# Expose only the GPU with index 6 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
class Qwen_LLM(LLM):
    """Custom LangChain LLM backed by a locally stored Qwen model."""

    # Declared as class-level fields so they can be assigned in __init__.
    tokenizer: AutoTokenizer = None
    model: AutoModelForCausalLM = None

    def __init__(self, mode_name_or_path: str):
        """Load tokenizer, model and generation config from ``mode_name_or_path``."""
        super().__init__()
        print("正在从本地加载模型...")
        self.tokenizer = AutoTokenizer.from_pretrained(mode_name_or_path, use_fast=False)
        self.model = AutoModelForCausalLM.from_pretrained(mode_name_or_path, torch_dtype=torch.bfloat16, device_map="auto")
        self.model.generation_config = GenerationConfig.from_pretrained(mode_name_or_path)
        print("完成本地模型的加载")

    def _call(self, prompt: str, stop: Optional[List[str]] = None,
              run_manager: Optional[CallbackManagerForLLMRun] = None,
              **kwargs: Any) -> str:
        """Generate a reply to ``prompt`` as a single-turn user message.

        ``stop``, ``run_manager`` and ``kwargs`` are accepted for interface
        compatibility but are not used. At most 512 new tokens are generated.

        Returns:
            The decoded completion, without the prompt and special tokens.
        """
        messages = [{"role": "user", "content": prompt}]
        chat_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # Follow the model's own device instead of a hard-coded 'cuda', since
        # the model is placed by device_map="auto".
        model_inputs = self.tokenizer([chat_text], return_tensors="pt").to(self.model.device)
        # Pass the attention mask explicitly so generate() does not have to
        # guess it from the pad token.
        generated_ids = self.model.generate(
            model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            max_new_tokens=512,
        )
        # generate() returns prompt + completion; drop the prompt prefix.
        completion_ids = [
            output_ids[len(prompt_ids):]
            for prompt_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = self.tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]
        return response

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses for this LLM type."""
        return "Qwen_LLM"
# Location of the local LLM weights (left empty in the post).
mode_path = ""
llm = Qwen_LLM(mode_name_or_path=mode_path)

# Location and settings of the local BGE embedding model (path left empty in the post).
embedding_model_dir = ""
embedding_model_kwargs = {'device': 'cuda'}
# Normalized embeddings, so similarity scores are cosine similarities.
encode_kwargs = {'normalize_embeddings': True}
embedding_model = HuggingFaceBgeEmbeddings(
    model_name=embedding_model_dir,
    model_kwargs=embedding_model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="为这个句子生成表示以用于检索相关文章:",
)

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Rebind both names to their Ragas-wrapped versions.
llm = LangchainLLMWrapper(llm)
embedding_model = LangchainEmbeddingsWrapper(embedding_model)

from datasets import Dataset
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from ragas import evaluate

# Earlier experiments (disabled in the post): assigning faithfulness.llm and
# faithfulness.embeddings directly, using a dataset without 'ground_truth',
# and evaluating with metrics=[faithfulness] only.

# Sample data: questions, generated answers, retrieved contexts, references.
data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts': [
        ['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
        ['The Green Bay Packers...Green Bay, Wisconsin.', 'The Packers compete...Football Conference'],
    ],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times'],
}
dataset = Dataset.from_dict(data_samples)

# Run all four metrics with the local LLM and embeddings.
score = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
    llm=llm,
    embeddings=embedding_model,
)
score.to_pandas()
print(score)
After integrating a local model with Langchain and using LangchainLLMWrapper and LangchainEmbeddingsWrapper for wrapping, the code was able to run. However, the results displayed 'faithfulness': nan, 'answer_relevancy': 0.7429, 'context_precision': 0.5000, 'context_recall': 0.5000, with 'nan' present in the 'faithfulness' metric.
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.chains import RetrievalQA
import os
import openai
import time
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, LlamaTokenizerFast
from typing import Any, List, Optional
import torch

# Expose only the GPU with index 6 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"


class Qwen_LLM(LLM):
    """Custom LangChain LLM backed by a locally stored Qwen model."""

    # Declared as class-level fields so they can be assigned in __init__.
    tokenizer: AutoTokenizer = None
    model: AutoModelForCausalLM = None

    def __init__(self, mode_name_or_path: str):
        """Load tokenizer, model and generation config from ``mode_name_or_path``."""
        super().__init__()
        print("正在从本地加载模型...")
        self.tokenizer = AutoTokenizer.from_pretrained(mode_name_or_path, use_fast=False)
        self.model = AutoModelForCausalLM.from_pretrained(mode_name_or_path, torch_dtype=torch.bfloat16, device_map="auto")
        self.model.generation_config = GenerationConfig.from_pretrained(mode_name_or_path)
        print("完成本地模型的加载")

    def _call(self, prompt: str, stop: Optional[List[str]] = None,
              run_manager: Optional[CallbackManagerForLLMRun] = None,
              **kwargs: Any) -> str:
        """Generate a reply to ``prompt`` as a single-turn user message.

        ``stop``, ``run_manager`` and ``kwargs`` are accepted for interface
        compatibility but are not used. At most 512 new tokens are generated.
        """
        messages = [{"role": "user", "content": prompt}]
        chat_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # Follow the model's own device instead of a hard-coded 'cuda', since
        # the model is placed by device_map="auto".
        model_inputs = self.tokenizer([chat_text], return_tensors="pt").to(self.model.device)
        generated_ids = self.model.generate(
            model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            max_new_tokens=512,
        )
        # generate() returns prompt + completion; drop the prompt prefix.
        completion_ids = [
            output_ids[len(prompt_ids):]
            for prompt_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = self.tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]
        return response

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses for this LLM type."""
        return "Qwen_LLM"


mode_path = ""
llm = Qwen_LLM(mode_name_or_path = mode_path)
embedding_model_dir = ""
embedding_model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
embedding_model = HuggingFaceBgeEmbeddings(
    model_name=embedding_model_dir,
    model_kwargs=embedding_model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="为这个句子生成表示以用于检索相关文章:"
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
llm = LangchainLLMWrapper(llm)
embedding_model = LangchainEmbeddingsWrapper(embedding_model)
from datasets import Dataset
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from ragas import evaluate
# faithfulness.llm = llm
# faithfulness.embeddings = embedding_model
# data_samples = {
#     'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
#     'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
#     'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
#     ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
# }
data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}
dataset = Dataset.from_dict(data_samples)
# score = evaluate(dataset,metrics=[faithfulness])
score = evaluate(dataset,metrics=[faithfulness, answer_relevancy, context_precision, context_recall],llm=llm,embeddings=embedding_model)
score.to_pandas()
print(score)
After integrating a local model with Langchain and using LangchainLLMWrapper and LangchainEmbeddingsWrapper for wrapping, the code was able to run. However, the results displayed 'faithfulness': nan, 'answer_relevancy': 0.7429, 'context_precision': 0.5000, 'context_recall': 0.5000, with 'nan' present in the 'faithfulness' metric.
我也遇到这个问题了,老哥,解决了吗
I have downloaded the model weights to my computer, but I don't know how to use local LLMs and embeddings for Ragas according to <Ragas / 🛠️ How-to Guides / Customizations / Bring Your Own LLMs and Embeddings>.
Here is my code, but it didn't work.
Has anyone successfully tested a local model? I hope to receive everyone's help. Thanks!