explodinggradients / ragas

Supercharge Your LLM Application Evaluations 🚀
https://docs.ragas.io
Apache License 2.0
7.33k stars 747 forks source link

faithfulness.adapt(language="chinese") is no useful #1298

Open beatG123 opened 2 months ago

beatG123 commented 2 months ago

[ ] I have checked the documentation and related resources and couldn't resolve my bug.

Describe the bug 【faithfulness.adapt(language="chinese") is no useful】

Ragas version:0.1.0 Python version:

Code to Reproduce Share code to reproduce the issue from future import annotations

import logging import typing as t from dataclasses import dataclass, field

import numpy as np

from ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM

if t.TYPE_CHECKING: from langchain_core.callbacks import Callbacks

from ragas.llms.prompt import PromptValue

logger = logging.getLogger(name)

LONG_FORM_ANSWER_PROMPT = Prompt( name="long_form_answer", instruction="Create one or more statements from each sentence in the given answer.", examples=[ { "question": "Who was Albert Einstein and what is he best known for?", "answer": "He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.", "statements": { "statements": [ "Albert Einstein, a German-born theoretical physicist, is renowned for being one of the most influential physicists in history.", "Albert Einstein was best known for his theory of relativity.", "Einstein's contributions significantly advanced the field of quantum mechanics", "Recognized globally, Einstein's work has profoundly impacted the scientific community", "Einstein's groundbreaking theories continue to shape our understanding of physics today.", ] }, }, { "question": "Cadmium Chloride is slightly soluble in this chemical, it is also called what?", "answer": "alcohol", "statements": { "statements": ["Cadmium Chloride is slightly soluble in alcohol."] }, }, { "question": "Were Hitler and Benito Mussolini of the same nationality?", "answer": "Sorry, I can't provide answer to that question.", "statements": {"statements": []}, }, ], input_keys=["question", "answer"], output_key="statements", output_type="JSON", ) # noqa: E501

NLI_STATEMENTS_MESSAGE = Prompt( name="nli_statements", instruction="Natural language inference. Use only 'Yes' (1), 'No' (0) and 'Null' (-1) as verdict.", examples=[ { "context": """John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.""", "statements": """ statement_1: John is majoring in Biology. statement_2: John is taking a course on Artificial Intelligence. statement_3: John is a dedicated student. statement_4: John has a part-time job. """, "answer": [ { "statement_1": "John is majoring in Biology.", "reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.", "verdict": "0", }, { "statement_2": "John is taking a course on Artificial Intelligence.", "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.", "verdict": "0", }, { "statement_3": "John is a dedicated student.", "reason": "The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.", "verdict": "1", }, { "statement_4": "John has a part-time job.", "reason": "There is no information given in the context about John having a part-time job.", "verdict": "0", }, ], }, { "context": """Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy.""", "statements": """statement_1: Albert Einstein was a genius.""", "answer": { "statement_1": "Albert Einstein was a genius.", "reason": "The context and statement are unrelated", "verdict": "0", }, }, { "context": """Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.""", "statements": """statement_1: Nil""", "answer": { "statement_1": "Nil", "reason": "The statement is invalid", "verdict": "-1", }, }, ], input_keys=["context", "statements"], output_key="answer", output_type="JSON", ) # noqa: E501

@dataclass class Faithfulness(MetricWithLLM): name: str = "faithfulness" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore long_form_answer_prompt: Prompt = field( default_factory=lambda: LONG_FORM_ANSWER_PROMPT ) nli_statements_message: Prompt = field( default_factory=lambda: NLI_STATEMENTS_MESSAGE )

def _create_answer_prompt(self, row: t.Dict) -> PromptValue:
    question, answer = row["question"], row["answer"]

    # extract statements from answer given the question
    prompt_value = self.long_form_answer_prompt.format(
        question=question, answer=answer
    )
    return prompt_value

def _create_nli_prompt(self, row: t.Dict, statements: t.Any) -> PromptValue:
    assert self.llm is not None, "llm must be set to compute score"

    contexts = row["contexts"]
    # check if the statements are support in the contexts
    contexts_str: str = "\n".join(contexts)
    statements = statements if statements != [] else ["Nil"]
    statements_str: str = "\n".join(
        [f"statement_{i+1}: {st}" for i, st in enumerate(statements)]
    )
    prompt_value = self.nli_statements_message.format(
        context=contexts_str, statements=statements_str
    )
    return prompt_value

def _compute_score(self, output: t.Any):
    # check the verdicts and compute the score
    verdict_score_map = {"1": 1, "0": 0, "null": np.nan}
    output = output if isinstance(output, list) else [output]
    faithful_statements = sum(
        verdict_score_map.get(
            statement_with_validation.get("verdict", "").lower(), np.nan
        )
        for statement_with_validation in output
    )
    num_statements = len(output)
    if num_statements:
        score = faithful_statements / num_statements
    else:
        logger.warning(
            "Invalid JSON response. Expected dictionary with key 'verdict'"
        )
        score = np.nan

    return score

async def _ascore(
    self: t.Self, row: t.Dict, callbacks: Callbacks, is_async: bool
) -> float:
    """
    returns the NLI score for each (q, c, a) pair
    """
    assert self.llm is not None, "LLM is not set"
    p = self._create_answer_prompt(row)
    answer_result = await self.llm.generate(
        p, callbacks=callbacks, is_async=is_async
    )
    statements = await json_loader.safe_load(
        text=answer_result.generations[0][0].text,
        llm=self.llm,
        callbacks=callbacks,
        is_async=is_async,
    )

    p = self._create_nli_prompt(row, statements.get("statements", []))
    nli_result = await self.llm.generate(p, callbacks=callbacks, is_async=is_async)
    json_output = await json_loader.safe_load(
        text=nli_result.generations[0][0].text,
        llm=self.llm,
        callbacks=callbacks,
        is_async=is_async,
    )
    return self._compute_score(json_output)

def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None:
    assert self.llm is not None, "LLM is not set"

    logger.info(f"Adapting Faithfulness metric to {language}")
    self.long_form_answer_prompt = self.long_form_answer_prompt.adapt(
        language, self.llm, cache_dir
    )
    self.nli_statements_message = self.nli_statements_message.adapt(
        language, self.llm, cache_dir
    )

def save(self, cache_dir: t.Optional[str] = None) -> None:
    self.long_form_answer_prompt.save(cache_dir)
    self.nli_statements_message.save(cache_dir)

faithfulness = Faithfulness()

Error trace Traceback (most recent call last): File "D:.study\myProject\ragas_eval_2.py", line 65, in faithfulness.adapt(language="chinese") File "F:\Application2022\Anaconda\an\lib\site-packages\ragas\metrics_faithfulness.py", line 204, in adapt self.long_form_answer_prompt = self.long_form_answer_prompt.adapt( File "F:\Application2022\Anaconda\an\lib\site-packages\ragas\llms\prompt.py", line 166, in adapt return self._load(language, self.name, cache_dir) File "F:\Application2022\Anaconda\an\lib\site-packages\ragas\llms\prompt.py", line 242, in _load return cls(**json.load(open(path))) File "F:\Application2022\Anaconda\an\lib\site-packages\pydantic\v1\main.py", line 341, in init raise validation_error pydantic.v1.error_wrappers.ValidationError: 1 validation error for Prompt root analysis in example 1 is not in valid json format: Expecting value: line 1 column 1 (char 0) (type=value_error)

Expected behavior I'm trying to use ragas to test my RAG (Retriever-Augmented Generation) program with a Chinese dataset. Initially, I encountered an error saying "no statements were generated from the answer." Later, I added the following code: from ragas.llms import LangchainLLMWrapper
llm_instance = LangchainLLMWrapper(ChatOpenAI())
faithfulness.llm = llm_instance
faithfulness.adapt(language="chinese") However, this didn't work, and I'm still getting the same or similar error.

The key points are:

I'm using a Chinese dataset. I'm trying to set up the language model wrapper for Chinese. The faithfulness.adapt(language="chinese") call does not seem to be effective in resolving the issue.

Additional context Add any other context about the problem here.

dosubot[bot] commented 2 months ago

The faithfulness.adapt(language="chinese") method is causing a validation error because the Prompt class does not support dynamic language adaptation directly. The Prompt class is instantiated with a language parameter set to "english", but it does not have a mechanism to change or validate the language attribute dynamically.

The adapt method in the Prompt class attempts to load a cached version of the prompt in the target language. If it cannot find one, it uses a language model to translate the prompt and then validates the translated prompt. If the translation is not accurate or the validation fails, it results in a validation error.

Here is the relevant part of the adapt method:

def adapt(
    self, language: str, llm: BaseRagasLLM, cache_dir: t.Optional[str] = None
) -> Prompt:
    if self.language == language:
        return self

    cache_dir = cache_dir if cache_dir else get_cache_dir()
    if os.path.exists(os.path.join(cache_dir, language, f"{self.name}.json")):
        self_cp = self._load(language, self.name, cache_dir)

        self.language = self_cp.language
        self.examples = self_cp.examples

        return self_cp

    # Translation logic using the language model
    # ...

    self.language = language
    self.save(cache_dir=cache_dir)

    return self

To resolve this issue, ensure that the prompt is correctly translated and validated, or provide a pre-translated version of the prompt in the target language. This can be done by manually translating the prompt and saving it in the cache directory before calling the adapt method.

Additionally, you can check the Prompt class's validate_prompt method to ensure that the translated prompt meets all validation criteria:

@root_validator
def validate_prompt(cls, values: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]:
    if values.get("instruction") is None or values.get("instruction") == "":
        raise ValueError("instruction cannot be empty")
    if values.get("input_keys") is None or values.get("instruction") == []:
        raise ValueError("input_keys cannot be empty")
    if values.get("output_key") is None or values.get("output_key") == "":
        raise ValueError("output_key cannot be empty")
    if values.get("examples"):
        output_key = values["output_key"]
        for no, example in enumerate(values["examples"]):
            for inp_key in values["input_keys"]:
                if inp_key not in example:
                    raise ValueError(
                        f"example {no+1} does not have the variable {inp_key} in the definition"
                    )
            if output_key not in example:
                raise ValueError(
                    f"example {no+1} does not have the variable {output_key} in the definition"
                )
            if values["output_type"].lower() == "json":
                try:
                    if output_key in example:
                        if isinstance(example[output_key], str):
                            json.loads(example[output_key])
                except ValueError as e:
                    raise ValueError(
                        f"{output_key} in example {no+1} is not in valid json format: {e}"
                    )

    return values

Ensure that the translated prompt adheres to these validation rules to avoid errors during the adaptation process [1][2].

To continue talking to Dosu, mention @dosu.

beatG123 commented 2 months ago

The underlying code is difficult to modify. Is there any other way?

dosubot[bot] commented 2 months ago

Yes, there is an alternative way to resolve the validation error in faithfulness.adapt(language="chinese") without modifying the underlying code. You can set the language attribute of the nli_statements_message and statement_prompt objects to "chinese" before calling the adapt method.

Here is an example of how you can do this:

faithfulness.nli_statements_message.language = "chinese"
faithfulness.statement_prompt.language = "chinese"
faithfulness.adapt(language="chinese")

This approach ensures that the language is set correctly before the adapt method is invoked, thereby avoiding the validation error. Additionally, the adapt method itself sets the language for nli_statements_message and statement_prompt and initializes the sentence_segmenter with the specified language, ensuring that all components are correctly adapted to the new language setting [1][2].

To continue talking to Dosu, mention @dosu.

beatG123 commented 2 months ago

Traceback (most recent call last): File "D:.study\myProject\ragas_eval_2.py", line 71, in result = evaluate( File "F:\Application2022\Anaconda\an\lib\site-packages\ragas_analytics.py", line 129, in wrapper result = func(*args, **kwargs) File "F:\Application2022\Anaconda\an\lib\site-packages\ragas\evaluation.py", line 177, in evaluate validate_required_columns(dataset, metrics) File "F:\Application2022\Anaconda\an\lib\site-packages\ragas\validation.py", line 62, in validate_required_columns raise ValueError( ValueError: The metric [context_precision] that that is used requires the following additional columns ['reference'] to be present in the dataset.

After making changes according to your method, an error occurred. How can I further modify it to resolve the issue?

dosubot[bot] commented 2 months ago

To resolve the ValueError related to the missing 'reference' column in the dataset when using the evaluate function, you can use the column_map parameter to map the existing column names in your dataset to the expected column names.

Here is an example of how to use the column_map parameter to map the columns correctly:

from ragas import evaluate
from ragas.datasets import Dataset

# Assuming your dataset has a column named 'reference' that needs to be mapped
column_map = {
    "reference": "ground_truth"  # Map 'reference' to 'ground_truth'
}

# Your dataset
dataset = Dataset({
    'question': [...],
    'reference': [...],  # This is the column that needs to be mapped
    'answer': [...],
    'contexts': [...]
})

# Evaluate the dataset with the column mapping
result = evaluate(dataset, column_map=column_map)
print(result)

By providing the column_map parameter, you can map the 'reference' column in your dataset to the 'ground_truth' column expected by the evaluate function, thus resolving the ValueError [1].

To continue talking to Dosu, mention @dosu.

beatG123 commented 2 months ago

{'question': ['在青岛市供应链创新与应用示范创建行动计划中,如何增强供应链的弹性?'], 'answer': ['在青岛市供应链创新与应用示范创建行动计划(2021—2025年)中,增强供应链的弹性主要通过以下几个方面:\n\n1. 系统筹划与整体推进:采取系统方法进行筹划,确保各环节之间的有效联动,以应对市场变化和不确定性,从而提升整体的供应链弹性。\n\n2. 数字驱动与创新发展:利用数字技术提升供应链的透明度和响应速度。通过信息化手段实现对供应链的实时监控,及时调整供应链策略以应对突发情况。\n\n3. 绿色环保与提质增效:在推动经济效益的同时,注重环境保护与资源的高效利用,从而建立更为可持续的供应链体系。这不仅降低了运营风险,也提升了整体弹性。\n\n4. 内外联动与高效协同:加强与外部合作伙伴的协作,形成资源共享和信息流通的机制,从而增强对于外部环境变化的适应能力。\n\n这些策略共同作用,旨在将青岛市的供应链建设成为更加高效、创新、和可持续的系统,以应对未来可能的挑战。\n\n数据来源:关于印发青岛市供应链创新与应用示范创建行动计划(2021—2025年)的通知。'], 'contexts': [['应用示范创建工作,按照《国务院办公厅关于积极推进供应链创新与应用的指导意见》\n(国办发〔2017〕84 号)和商务部等部门关于开展供应链创新与应用示范创建工作的有\n关要求,结合我市实际,制定本计划。一、总体要求强化新发展理念,围绕打造全国供应\n链创新与应用示范城市和全球供应链重要节点城市目标,坚持“系统筹划、整体推进,示\n范引领、全面提升,内外联动、高效协同,数字驱动、创新发展,绿色环保、提质增效”\n的原则,深化供应链创新与应用,构建形成现代供应链体系。通过 5 年的示范创建,创新\n形成 10 种供应链商业模式,青岛市进入全国供应链创新与应用示范城市第一方阵,10 家', 'qd.shandong.cn)。市商务局将会同相关单位按照《创建工作规范》要求,组织专家对申\n报企业的申报材料进行评估,按程序择优确定青岛市全国供应链创新与应用示范企业申报\n推荐名单。\n三、有关要求\n(一)进一步提高认识。供应链创新与应用示范创建工作,既是深化供应链创新与应用的\n具体举措,也是 2025 年之前商务部每年重点推进的商贸工作。各区、市要高度重视供应\n链创新与应用示范创建工作,组织本区、市优秀供应链企业申报。\n(二)严格审核把关。各区、市要按照《创建工作规范》要求认真组织申报工作,严格审\n核企业的申报材料,确保数据真实、材料完整,并督促按规定时限报送。申报企业要如实', '关于印发青岛市供应链创新与应用示范创建行动计划(2021—2025\n年)的通知\n发布日期: 2022-01-20\nword 下载\npdf 下载\n各区、市人民政府,青岛西海岸新区管委,市政府有关部门,市直有关单位:《青岛市供\n应链创新与应用示范创建行动计划(2021—2025 年)》已经市政府研究同意,现印发给\n你们,请认真贯彻执行。\n青岛市人民政府办公厅 2022 年 1 月 19 日(此件公开发布)\n青岛市供应链创新与应用示范创建行动计划(2021—2025 年)为深入开展供应链创新与\n应用示范创建工作,按照《国务院办公厅关于积极推进供应链创新与应用的指导意见》']], 'ground_truths': [['通过加强供应链风险监测与预警分析,完善现代应急物流体系和市场联保联供机制,注重流程管控和应急储备,提高关键产品和过程的质量保障与风险抵御能力。']]}

我的数据集格式是这样的,并没有所谓的‘reference’

dosubot[bot] commented 2 months ago

To resolve the ValueError related to the missing 'reference' column in your dataset, you need to ensure that your dataset includes the required columns for the context_precision metric. The required columns for this metric are user_input, retrieved_contexts, and reference [1].

Since your dataset does not have a 'reference' column, you can use the column_map parameter to map your existing column names to the required ones. Here is how you can do it:

  1. Map your ground_truths column to reference.
  2. Ensure that your dataset includes the user_input and retrieved_contexts columns.

Here is an example of how to modify your dataset and use the column_map parameter:

from ragas import evaluate
from ragas.datasets import Dataset

# Your dataset
dataset = Dataset({
    'question': ['在青岛市供应链创新与应用示范创建行动计划中,如何增强供应链的弹性?'],
    'answer': ['在青岛市供应链创新与应用示范创建行动计划(2021—2025年)中,增强供应链的弹性主要通过以下几个方面:\n\n1. **系统筹划与整体推进**:采取系统方法进行筹划,确保各环节之间的有效联动,以应对市场变化和不确定性,从而提升整体的供应链弹性。\n\n2. **数字驱动与创新发展**:利用数字技术提升供应链的透明度和响应速度。通过信息化手段实现对供应链的实时监控,及时调整供应链策略以应对突发情况。\n\n3. **绿色环保与提质增效**:在推动经济效益的同时,注重环境保护与资源的高效利用,从而建立更为可持续的供应链体系。这不仅降低了运营风险,也提升了整体弹性。\n\n4. **内外联动与高效协同**:加强与外部合作伙伴的协作,形成资源共享和信息流通的机制,从而增强对于外部环境变化的适应能力。\n\n这些策略共同作用,旨在将青岛市的供应链建设成为更加高效、创新、和可持续的系统,以应对未来可能的挑战。\n\n数据来源:关于印发青岛市供应链创新与应用示范创建行动计划(2021—2025年)的通知。'],
    'contexts': [['应用示范创建工作,按照《国务院办公厅关于积极推进供应链创新与应用的指导意见》\n(国办发〔2017〕84 号)和商务部等部门关于开展供应链创新与应用示范创建工作的有\n关要求,结合我市实际,制定本计划。一、总体要求强化新发展理念,围绕打造全国供应\n链创新与应用示范城市和全球供应链重要节点城市目标,坚持“系统筹划、整体推进,示\n范引领、全面提升,内外联动、高效协同,数字驱动、创新发展,绿色环保、提质增效”\n的原则,深化供应链创新与应用,构建形成现代供应链体系。通过 5 年的示范创建,创新\n形成 10 种供应链商业模式,青岛市进入全国供应链创新与应用示范城市第一方阵,10 家', 'qd.shandong.cn)。市商务局将会同相关单位按照《创建工作规范》要求,组织专家对申\n报企业的申报材料进行评估,按程序择优确定青岛市全国供应链创新与应用示范企业申报\n推荐名单。\n三、有关要求\n(一)进一步提高认识。供应链创新与应用示范创建工作,既是深化供应链创新与应用的\n具体举措,也是 2025 年之前商务部每年重点推进的商贸工作。各区、市要高度重视供应\n链创新与应用示范创建工作,组织本区、市优秀供应链企业申报。\n(二)严格审核把关。各区、市要按照《创建工作规范》要求认真组织申报工作,严格审\n核企业的申报材料,确保数据真实、材料完整,并督促按规定时限报送。申报企业要如实', '关于印发青岛市供应链创新与应用示范创建行动计划(2021—2025\n年)的通知\n发布日期: 2022-01-20\nword 下载\npdf 下载\n各区、市人民政府,青岛西海岸新区管委,市政府有关部门,市直有关单位:《青岛市供\n应链创新与应用示范创建行动计划(2021—2025 年)》已经市政府研究同意,现印发给\n你们,请认真贯彻执行。\n青岛市人民政府办公厅 2022 年 1 月 19 日(此件公开发布)\n青岛市供应链创新与应用示范创建行动计划(2021—2025 年)为深入开展供应链创新与\n应用示范创建工作,按照《国务院办公厅关于积极推进供应链创新与应用的指导意见》']],
    'ground_truths': [['通过加强供应链风险监测与预警分析,完善现代应急物流体系和市场联保联供机制,注重流程管控和应急储备,提高关键产品和过程的质量保障与风险抵御能力。']]
})

# Column mapping
column_map = {
    "reference": "ground_truths"
}

# Evaluate the dataset with the column mapping
result = evaluate(dataset, column_map=column_map)
print(result)

This approach maps the ground_truths column in your dataset to the reference column required by the context_precision metric, thus resolving the ValueError [2].

To continue talking to Dosu, mention @dosu.

beatG123 commented 2 months ago

Traceback (most recent call last): File "D:.study\myProject\ragas_eval_2.py", line 75, in result = evaluate( File "F:\Application2022\Anaconda\an\lib\site-packages\ragas_analytics.py", line 129, in wrapper result = func(*args, kwargs) File "F:\Application2022\Anaconda\an\lib\site-packages\ragas\evaluation.py", line 174, in evaluate dataset = EvaluationDataset.from_list(dataset.to_list()) File "F:\Application2022\Anaconda\an\lib\site-packages\ragas\dataset_schema.py", line 106, in from_list samples.extend(SingleTurnSample(sample) for sample in mapping) File "F:\Application2022\Anaconda\an\lib\site-packages\ragas\dataset_schema.py", line 106, in samples.extend(SingleTurnSample(**sample) for sample in mapping) File "F:\Application2022\Anaconda\an\lib\site-packages\pydantic\v1\main.py", line 341, in init raise validation_error pydantic.v1.error_wrappers.ValidationError: 1 validation error for SingleTurnSample reference str type expected (type=type_error.str)

dosubot[bot] commented 2 months ago

To resolve the pydantic.v1.error_wrappers.ValidationError: 1 validation error for SingleTurnSample reference str type expected (type=type_error.str) error, ensure that the user_input and response fields in the SingleTurnSample are of type str.

Here is an example of how to correctly instantiate a SingleTurnSample:

from ragas.dataset_schema import SingleTurnSample

single_turn_sample = SingleTurnSample(user_input="What is X", response="Y")

Make sure that both user_input and response are strings. If either of these fields is not a string, you will encounter the type_error.str validation error [1].

To continue talking to Dosu, mention @dosu.

beatG123 commented 2 months ago

faithfulness.nli_statements_message.language = "chinese" faithfulness.statement_prompt.language = "chinese"

Why didn't this error occur before this operation, and how did the 'reference' cause the error? Please provide a better solution considering the previous questions!

dosubot[bot] commented 2 months ago

The 'reference' error did not occur before setting faithfulness.nli_statements_message.language = "chinese" and faithfulness.statement_prompt.language = "chinese" because the sentence_segmenter was not being initialized correctly. The error was caused by the sentence_segmenter not being set properly when the language was changed, leading to a failure in segmenting sentences.

The sentence_segmenter is initialized in the __post_init__ method of the Faithfulness class, which relies on the language attribute of nli_statements_message. When the language is changed, the sentence_segmenter needs to be re-initialized to handle the new language.

A better solution is to ensure that the sentence_segmenter is re-initialized whenever the language is changed. This can be done by adding a method to update the language and re-initialize the sentence_segmenter accordingly.

Here is the improved solution:

@dataclass
class FaithfulnessExperimental(MetricWithLLM, SingleTurnMetric):
    name: str = "faithfulness_experimental"  # type: ignore
    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
        default_factory=lambda: {
            MetricType.SINGLE_TURN: {"user_input", "response", "retreived_contexts"}
        }
    )
    sentence_segmenter: t.Optional[HasSegmentMethod] = None
    max_retries: int = 1
    _reproducibility: int = 1

    @property
    def reproducibility(self):
        return self._reproducibility

    @reproducibility.setter
    def reproducibility(self, value):
        if value < 1:
            logger.warning("reproducibility cannot be less than 1, setting to 1")
            value = 1
        elif value % 2 == 0:
            logger.warning(
                "reproducibility level cannot be set to even number, setting to odd"
            )
            value += 1
        self._reproducibility = value

    def __post_init__(self):
        self.long_form_answer_prompt = LongFormAnswerPrompt()
        self.nli_statement_prompt = NLIStatementPrompt()
        self._initialize_segmenter()

    def _initialize_segmenter(self):
        if self.sentence_segmenter is None:
            language = self.nli_statement_prompt.input_model.__fields__['context'].default
            self.sentence_segmenter = get_segmenter(language=language, clean=False)

    def set_language(self, language: str):
        self.nli_statement_prompt.input_model.__fields__['context'].default = language
        self.long_form_answer_prompt.input_model.__fields__['context'].default = language
        self._initialize_segmenter()

    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
        assert self.llm is not None, "LLM is not set"

        answer, question, contexts = (
            row["response"],
            row["user_input"],
            row["retrieved_contexts"],
        )
        if self.sentence_segmenter is None:
            raise ValueError("Sentence segmenter is not set")
        sentences = self.sentence_segmenter.segment(answer)
        sentences = [
            sentence for sentence in sentences if sentence.strip().endswith(".")
        ]
        sentence_components = await self.long_form_answer_prompt.generate(
            FaithfulnessStatements(
                question=question,
                answer=answer,
                sentences={i: sentence for i, sentence in enumerate(sentences)},
            ),
            llm=self.llm,
            callbacks=callbacks,
        )

        statements = [
            statement
            for component in sentence_components.sentences
            for statement in component.simpler_statements
        ]
        verdicts = await self.nli_statement_prompt.generate(
            NLIStatementInput(
                context="\n".join(contexts),
                statements=statements,
            ),
            llm=self.llm,
            callbacks=callbacks,
        )

        num_faithful_statements = sum(
            verdict.verdict for verdict in verdicts.statements
        )
        if len(statements):
            score = num_faithful_statements / len(statements)
        else:
            score = np.nan
        return score

    async def _single_turn_ascore(
        self: t.Self, sample: SingleTurnSample, callbacks: Callbacks
    ) -> float:
        row = sample.dict()
        return await self._ascore(row, callbacks)

With this solution, you can change the language and ensure the sentence_segmenter is correctly re-initialized:

faithfulness = FaithfulnessExperimental()
faithfulness.set_language("chinese")

This approach ensures that the sentence_segmenter is properly initialized for the new language, preventing the 'reference' error from occurring [1][2].

To continue talking to Dosu, mention @dosu.