explodinggradients / ragas

Evaluation framework for your Retrieval Augmented Generation (RAG) pipelines
https://docs.ragas.io
Apache License 2.0
6.55k stars · 643 forks

How to use local LLMs and embeddings for ragas? #1214

Open minglong-huang opened 3 weeks ago

minglong-huang commented 3 weeks ago

I have downloaded the model weights to my computer, but I don't know how to use local LLMs and embeddings for ragas according to <Ragas / 🛠️ How-to Guides / Customizations / Bring Your Own LLMs and Embeddings>.

Here is my code, but it didn't work:

import typing as t
import asyncio
import traceback
from typing import List
from datasets import Dataset, load_dataset, load_from_disk
from ragas import evaluate
from ragas.metrics import faithfulness, context_recall, context_precision, answer_relevancy
from ragas.metrics import AnswerRelevancy
from ragas.llms import BaseRagasLLM
from ragas.llms.prompt import PromptValue
from ragas.embeddings import BaseRagasEmbeddings
from langchain.schema import LLMResult, Generation
from langchain.schema.embeddings import Embeddings
from langchain.callbacks.base import Callbacks
from langchain_core.language_models import BaseLanguageModel
from transformers import AutoModel, AutoTokenizer
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from FlagEmbedding import FlagModel, BGEM3FlagModel
from Langchain_my_llm import GLM_4

class MyLLM(BaseRagasLLM):

    def __init__(self,llm_path):
        self.tokenizer = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True)
        self.base_llm = AutoModel.from_pretrained(llm_path, trust_remote_code=True)
        self.base_llm = self.base_llm.eval()

    @property
    def llm(self):
        return self.base_llm

    def get_llm_result(self, prompt):
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        # NOTE: generate_config is defined here but not passed to the model call below
        generate_config = {
            'max_tokens': 8192,
            'stream': True,
        }
        print(content)
        text, history = self.base_llm.chat(self.tokenizer, content, history=[])
        print(f"Generated text: {text}")
        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        return LLMResult(generations=generations, llm_output=llm_output)

    def generate_text(
            self,
            prompt: PromptValue,
            n: int = 1,
            temperature: float = 1e-8,
            stop: t.Optional[t.List[str]] = None,
            callbacks: Callbacks = [],
    ):
        result = self.get_llm_result(prompt)
        return result

    async def agenerate_text(
            self,
            prompt: PromptValue,
            n: int = 1,
            temperature: float = 1e-8,
            stop: t.Optional[t.List[str]] = None,
            callbacks: Callbacks = [],
    ) -> LLMResult:
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        print('')
        print('*' * 20)
        print(f'content={content}')
        print('*'*20)
        print('')
        text = ""  # ensure text is defined even if the call below fails
        try:
            # text, history = await asyncio.get_event_loop().run_in_executor(None, self.base_llm.chat, self.tokenizer,
            #                                                            content, [])
            # Use a more reasonable timeout
            text, history = await asyncio.wait_for(
                asyncio.get_event_loop().run_in_executor(None, self.base_llm.chat, self.tokenizer, content, []),
                timeout=150  # e.g. a 150-second timeout
            )
        except asyncio.TimeoutError:
            print("Operation timed out; check the code or increase the timeout")
        except asyncio.CancelledError:
            print("Task was cancelled; check the code")
            info = traceback.format_exc()
            print(f"info = {info}")
        except Exception as e:
            print(f"An unknown error occurred: {e}")

        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        result = LLMResult(generations=generations, llm_output=llm_output)
        return result

class MyEmbedding(Embeddings):

    def __init__(self, path,max_length=8192, batch_size=256):
        #self.model = AutoModel.from_pretrained(path, trust_remote_code=True).cuda()
        self.model = FlagModel(path, query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:")
        #self.model = BGEM3FlagModel(path, map_location='cuda')
        self.max_length = max_length
        self.batch_size = batch_size

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode_corpus(texts, self.batch_size, self.max_length).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode_queries(text, self.batch_size, self.max_length).tolist()

# Data
data_path = "/home/kelvin/nlp/graphrag/eval_dataset/amnesty_qa"
amnesty_qa = load_dataset("/home/kelvin/nlp/graphrag/eval_dataset/amnesty_qa")

# MODEL_PATH = '/home/kelvin/nlp/model/LLM/Qwen/Qwen1.5-32B'
MODEL_PATH = '/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat'
embed_model_path = '/home/kelvin/nlp/model/Embedding/BAAI/bge-m3'
embedding_model = MyEmbedding(embed_model_path)
my_llm = MyLLM(MODEL_PATH)

ans_relevancy = AnswerRelevancy()

data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}

dataset = Dataset.from_dict(data_samples)
# amnesty_qa["eval"],
result = evaluate(
    dataset,
    metrics=[context_recall, context_precision, ans_relevancy, faithfulness],
    llm=my_llm,
    embeddings=embedding_model,
    raise_exceptions=True
)

df = result.to_pandas()
print(df.head())
df.to_csv("result.csv", index=False)

Has anyone successfully tested a local model? I would appreciate any help. Thanks!

dosubot[bot] commented 3 weeks ago

To use local LLMs and embeddings for Ragas, you need to ensure that your custom LLM and embeddings are correctly integrated and wrapped. Here are the steps to follow, along with the necessary code adjustments:

  1. Initialize your custom LLM and Embeddings: Replace the default OpenAI models with your local models, for example m2-bert-80M-8k-retrieval for embeddings and NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT for the LLM.

  2. Wrap the custom LLM and Embeddings: Use the LangchainLLMWrapper and LangchainEmbeddingsWrapper to wrap your custom models.

  3. Pass the wrapped LLM and Embeddings to the evaluate function: Ensure that the wrapped models are passed correctly to the evaluation function.

Here is the updated code based on your provided script:

import typing as t
import asyncio
from typing import List
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, context_recall, context_precision, answer_relevancy
from ragas.llms import BaseRagasLLM, LangchainLLMWrapper
from ragas.llms.prompt import PromptValue
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain.schema import LLMResult, Generation
from langchain.callbacks.base import Callbacks
from langchain.schema.embeddings import Embeddings
from transformers import AutoModel, AutoTokenizer
from FlagEmbedding import FlagModel

class MyLLM(BaseRagasLLM):

    def __init__(self, llm_path):
        self.tokenizer = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True)
        self.base_llm = AutoModel.from_pretrained(llm_path, trust_remote_code=True)
        self.base_llm = self.base_llm.eval()

    @property
    def llm(self):
        return self.base_llm

    def get_llm_result(self, prompt):
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        text, history = self.base_llm.chat(self.tokenizer, content, history=[])
        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        return LLMResult(generations=generations, llm_output=llm_output)

    def generate_text(
            self,
            prompt: PromptValue,
            n: int = 1,
            temperature: float = 1e-8,
            stop: t.Optional[t.List[str]] = None,
            callbacks: Callbacks = None,
    ) -> LLMResult:
        return self.get_llm_result(prompt)

    async def agenerate_text(
            self,
            prompt: PromptValue,
            n: int = 1,
            temperature: float = 1e-8,
            stop: t.Optional[t.List[str]] = None,
            callbacks: Callbacks = None,
    ) -> LLMResult:
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        text = ""  # ensure text is defined even if the call below fails
        try:
            text, history = await asyncio.wait_for(
                asyncio.get_event_loop().run_in_executor(None, self.base_llm.chat, self.tokenizer, content, []),
                timeout=150
            )
        except asyncio.TimeoutError:
            print("Operation timed out, please check the code or increase the timeout")
        except asyncio.CancelledError:
            print("Task was cancelled, please check the code")
        except Exception as e:
            print(f"An unknown error occurred: {e}")

        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        return LLMResult(generations=generations, llm_output=llm_output)

class MyEmbedding(Embeddings):

    def __init__(self, path, max_length=8192, batch_size=256):
        self.model = FlagModel(path, query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:")
        self.max_length = max_length
        self.batch_size = batch_size

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode_corpus(texts, self.batch_size, self.max_length).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode_queries(text, self.batch_size, self.max_length).tolist()

# Data
data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}

dataset = Dataset.from_dict(data_samples)

# Model paths
MODEL_PATH = '/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat'
embed_model_path = '/home/kelvin/nlp/model/Embedding/BAAI/bge-m3'

# Initialize custom LLM and Embeddings
embedding_model = MyEmbedding(embed_model_path)
my_llm = MyLLM(MODEL_PATH)

# Wrap the custom LLM and Embeddings
wrapped_llm = LangchainLLMWrapper(my_llm)
wrapped_embeddings = LangchainEmbeddingsWrapper(embedding_model)

# Evaluate
result = evaluate(
    dataset,
    metrics=[context_recall, context_precision, answer_relevancy, faithfulness],
    llm=wrapped_llm,
    embeddings=wrapped_embeddings,
    raise_exceptions=True
)

df = result.to_pandas()
print(df.head())
df.to_csv("result.csv", index=False)

This code ensures that your custom LLM and embeddings are correctly wrapped and used within the evaluate function. The LangchainLLMWrapper and LangchainEmbeddingsWrapper classes are used to adapt your custom models to the expected interfaces [1][2].
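
For reference, a minimal sketch of the same wrapping pattern applied to LangChain-native local models (rather than a BaseRagasLLM subclass) could look like the following; ChatOllama and the model/path names below are assumptions for illustration only, so substitute whatever local backend you actually run:

from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Hypothetical local model name and embedding path -- replace with your own.
local_llm = ChatOllama(model="qwen:32b")
local_embeddings = HuggingFaceBgeEmbeddings(model_name="/path/to/bge-m3")

wrapped_llm = LangchainLLMWrapper(local_llm)
wrapped_embeddings = LangchainEmbeddingsWrapper(local_embeddings)

The wrapped objects can then be passed to evaluate() exactly as in the script above.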

To continue talking to Dosu, mention @dosu.

minglong-huang commented 3 weeks ago

Sorry, but it didn't work after I ran your code.

(screenshot of the error attached)

19-Xiang commented 2 weeks ago
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.chains import RetrievalQA
import os
import openai
import time

from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, LlamaTokenizerFast
from typing import Any, List, Optional
import torch

os.environ["CUDA_VISIBLE_DEVICES"] = "6"

class Qwen_LLM(LLM):
    # Custom LLM class based on a local Qwen model
    tokenizer: AutoTokenizer = None
    model: AutoModelForCausalLM = None
    def __init__(self, mode_name_or_path: str):
        super().__init__()
        print("Loading the model from local disk...")
        self.tokenizer = AutoTokenizer.from_pretrained(mode_name_or_path, use_fast=False)
        self.model = AutoModelForCausalLM.from_pretrained(mode_name_or_path, torch_dtype=torch.bfloat16, device_map="auto")
        self.model.generation_config = GenerationConfig.from_pretrained(mode_name_or_path)
        print("Finished loading the local model")

    def _call(self, prompt : str, stop: Optional[List[str]] = None,
                run_manager: Optional[CallbackManagerForLLMRun] = None,
                **kwargs: Any):

        messages = [{"role": "user", "content": prompt }]
        input_ids = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        model_inputs = self.tokenizer([input_ids], return_tensors="pt").to('cuda')
        generated_ids = self.model.generate(model_inputs.input_ids,max_new_tokens=512)
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        return response

    @property
    def _llm_type(self) -> str:
        return "Qwen_LLM"

mode_path = ""
llm = Qwen_LLM(mode_name_or_path = mode_path)

embedding_model_dir = ""
embedding_model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
embedding_model = HuggingFaceBgeEmbeddings(
    model_name=embedding_model_dir,
    model_kwargs=embedding_model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="为这个句子生成表示以用于检索相关文章:"
)

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

llm = LangchainLLMWrapper(llm)
embedding_model = LangchainEmbeddingsWrapper(embedding_model)

from datasets import Dataset 
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from ragas import evaluate

# faithfulness.llm = llm
# faithfulness.embeddings = embedding_model

# data_samples = {
#     'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
#     'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
#     'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'], 
#     ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
# }
data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'], 
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}
dataset = Dataset.from_dict(data_samples)
# score = evaluate(dataset,metrics=[faithfulness])
score = evaluate(dataset, metrics=[faithfulness, answer_relevancy, context_precision, context_recall], llm=llm, embeddings=embedding_model)
score.to_pandas()
print(score)

After integrating the local model with LangChain and wrapping it with LangchainLLMWrapper and LangchainEmbeddingsWrapper, the code was able to run. However, the results were 'faithfulness': nan, 'answer_relevancy': 0.7429, 'context_precision': 0.5000, 'context_recall': 0.5000, i.e. the 'faithfulness' metric came back as nan.
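
One way to investigate the nan is to rerun just the failing metric with raise_exceptions=True (the parameter already used earlier in this thread), so the underlying error is raised instead of being converted to nan. A minimal sketch, continuing from the code above:

# Rerun only the faithfulness metric and let ragas surface the real error
# instead of silently reporting nan.
score = evaluate(
    dataset,
    metrics=[faithfulness],
    llm=llm,
    embeddings=embedding_model,
    raise_exceptions=True,
)
print(score)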

JUN-ZZ commented 1 week ago
(quotes 19-Xiang's code and results above)

I ran into this problem too. Has it been solved?