explodinggradients / ragas

Supercharge Your LLM Application Evaluations 🚀
https://docs.ragas.io
Apache License 2.0

faithfulness.adapt(language="chinese") has no effect #1296

Open beatG123 opened 2 months ago

beatG123 commented 2 months ago

[ ] I have checked the documentation and related resources and couldn't resolve my bug.

Describe the bug: faithfulness.adapt(language="chinese") has no effect.

Ragas version: 0.1.0
Python version:

Code to Reproduce

from __future__ import annotations

import logging
import typing as t
from dataclasses import dataclass, field

import numpy as np

from ragas.llms.json_load import json_loader
from ragas.llms.prompt import Prompt
from ragas.metrics.base import EvaluationMode, MetricWithLLM

if t.TYPE_CHECKING:
    from langchain_core.callbacks import Callbacks

    from ragas.llms.prompt import PromptValue

logger = logging.getLogger(__name__)

LONG_FORM_ANSWER_PROMPT = Prompt(
    name="long_form_answer",
    instruction="Create one or more statements from each sentence in the given answer.",
    examples=[
        {
            "question": "Who was Albert Einstein and what is he best known for?",
            "answer": "He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.",
            "statements": {
                "statements": [
                    "Albert Einstein, a German-born theoretical physicist, is renowned for being one of the most influential physicists in history.",
                    "Albert Einstein was best known for his theory of relativity.",
                    "Einstein's contributions significantly advanced the field of quantum mechanics",
                    "Recognized globally, Einstein's work has profoundly impacted the scientific community",
                    "Einstein's groundbreaking theories continue to shape our understanding of physics today.",
                ]
            },
        },
        {
            "question": "Cadmium Chloride is slightly soluble in this chemical, it is also called what?",
            "answer": "alcohol",
            "statements": {
                "statements": ["Cadmium Chloride is slightly soluble in alcohol."]
            },
        },
        {
            "question": "Were Hitler and Benito Mussolini of the same nationality?",
            "answer": "Sorry, I can't provide answer to that question.",
            "statements": {"statements": []},
        },
    ],
    input_keys=["question", "answer"],
    output_key="statements",
    output_type="JSON",
)  # noqa: E501

NLI_STATEMENTS_MESSAGE = Prompt(
    name="nli_statements",
    instruction="Natural language inference. Use only 'Yes' (1), 'No' (0) and 'Null' (-1) as verdict.",
    examples=[
        {
            "context": """John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.""",
            "statements": """
statement_1: John is majoring in Biology.
statement_2: John is taking a course on Artificial Intelligence.
statement_3: John is a dedicated student.
statement_4: John has a part-time job.
""",
            "answer": [
                {
                    "statement_1": "John is majoring in Biology.",
                    "reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.",
                    "verdict": "0",
                },
                {
                    "statement_2": "John is taking a course on Artificial Intelligence.",
                    "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.",
                    "verdict": "0",
                },
                {
                    "statement_3": "John is a dedicated student.",
                    "reason": "The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.",
                    "verdict": "1",
                },
                {
                    "statement_4": "John has a part-time job.",
                    "reason": "There is no information given in the context about John having a part-time job.",
                    "verdict": "0",
                },
            ],
        },
        {
            "context": """Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy.""",
            "statements": """statement_1: Albert Einstein was a genius.""",
            "answer": {
                "statement_1": "Albert Einstein was a genius.",
                "reason": "The context and statement are unrelated",
                "verdict": "0",
            },
        },
        {
            "context": """Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.""",
            "statements": """statement_1: Nil""",
            "answer": {
                "statement_1": "Nil",
                "reason": "The statement is invalid",
                "verdict": "-1",
            },
        },
    ],
    input_keys=["context", "statements"],
    output_key="answer",
    output_type="JSON",
)  # noqa: E501

@dataclass
class Faithfulness(MetricWithLLM):
    name: str = "faithfulness"  # type: ignore
    evaluation_mode: EvaluationMode = EvaluationMode.qac  # type: ignore
    long_form_answer_prompt: Prompt = field(
        default_factory=lambda: LONG_FORM_ANSWER_PROMPT
    )
    nli_statements_message: Prompt = field(
        default_factory=lambda: NLI_STATEMENTS_MESSAGE
    )

    def _create_answer_prompt(self, row: t.Dict) -> PromptValue:
        question, answer = row["question"], row["answer"]

        # extract statements from answer given the question
        prompt_value = self.long_form_answer_prompt.format(
            question=question, answer=answer
        )
        return prompt_value

    def _create_nli_prompt(self, row: t.Dict, statements: t.Any) -> PromptValue:
        assert self.llm is not None, "llm must be set to compute score"

        contexts = row["contexts"]
        # check if the statements are supported by the contexts
        contexts_str: str = "\n".join(contexts)
        statements = statements if statements != [] else ["Nil"]
        statements_str: str = "\n".join(
            [f"statement_{i+1}: {st}" for i, st in enumerate(statements)]
        )
        prompt_value = self.nli_statements_message.format(
            context=contexts_str, statements=statements_str
        )
        return prompt_value

    def _compute_score(self, output: t.Any):
        # check the verdicts and compute the score
        verdict_score_map = {"1": 1, "0": 0, "null": np.nan}
        output = output if isinstance(output, list) else [output]
        faithful_statements = sum(
            verdict_score_map.get(
                statement_with_validation.get("verdict", "").lower(), np.nan
            )
            for statement_with_validation in output
        )
        num_statements = len(output)
        if num_statements:
            score = faithful_statements / num_statements
        else:
            logger.warning(
                "Invalid JSON response. Expected dictionary with key 'verdict'"
            )
            score = np.nan

        return score

    async def _ascore(
        self: t.Self, row: t.Dict, callbacks: Callbacks, is_async: bool
    ) -> float:
        """
        returns the NLI score for each (q, c, a) pair
        """
        assert self.llm is not None, "LLM is not set"
        p = self._create_answer_prompt(row)
        answer_result = await self.llm.generate(
            p, callbacks=callbacks, is_async=is_async
        )
        statements = await json_loader.safe_load(
            text=answer_result.generations[0][0].text,
            llm=self.llm,
            callbacks=callbacks,
            is_async=is_async,
        )

        p = self._create_nli_prompt(row, statements.get("statements", []))
        nli_result = await self.llm.generate(p, callbacks=callbacks, is_async=is_async)
        json_output = await json_loader.safe_load(
            text=nli_result.generations[0][0].text,
            llm=self.llm,
            callbacks=callbacks,
            is_async=is_async,
        )
        return self._compute_score(json_output)

    def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None:
        assert self.llm is not None, "LLM is not set"

        logger.info(f"Adapting Faithfulness metric to {language}")
        self.long_form_answer_prompt = self.long_form_answer_prompt.adapt(
            language, self.llm, cache_dir
        )
        self.nli_statements_message = self.nli_statements_message.adapt(
            language, self.llm, cache_dir
        )

    def save(self, cache_dir: t.Optional[str] = None) -> None:
        self.long_form_answer_prompt.save(cache_dir)
        self.nli_statements_message.save(cache_dir)

faithfulness = Faithfulness()

Error trace

Traceback (most recent call last):
  File "D:.study\myProject\ragas_eval_2.py", line 65, in <module>
    faithfulness.adapt(language="chinese")
  File "F:\Application2022\Anaconda\an\lib\site-packages\ragas\metrics\_faithfulness.py", line 204, in adapt
    self.long_form_answer_prompt = self.long_form_answer_prompt.adapt(
  File "F:\Application2022\Anaconda\an\lib\site-packages\ragas\llms\prompt.py", line 166, in adapt
    return self._load(language, self.name, cache_dir)
  File "F:\Application2022\Anaconda\an\lib\site-packages\ragas\llms\prompt.py", line 242, in _load
    return cls(**json.load(open(path)))
  File "F:\Application2022\Anaconda\an\lib\site-packages\pydantic\v1\main.py", line 341, in __init__
    raise validation_error
pydantic.v1.error_wrappers.ValidationError: 1 validation error for Prompt
__root__
  analysis in example 1 is not in valid json format: Expecting value: line 1 column 1 (char 0) (type=value_error)
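
The last frames show adapt() reloading a previously cached translation via Prompt._load (cache_dir/<language>/<prompt name>.json, as in the adapt implementation quoted further down) and a translated example inside that cached file failing validation. Below is a minimal diagnostic sketch for inspecting whether a stale or malformed cached translation is being reloaded; the default cache location is an assumption here (ragas resolves it via get_cache_dir()), so adjust the path or pass cache_dir explicitly when adapting.

# Diagnostic sketch (assumptions: default cache path, prompt names taken from the
# reproduction code above). Printing the cached files lets you spot a garbled
# translated example; deleting a bad file forces adapt() to re-translate instead
# of reloading it.
import os

cache_dir = os.path.expanduser("~/.cache/ragas")  # assumed default; use your own cache_dir if set
for prompt_name in ("long_form_answer", "nli_statements"):
    path = os.path.join(cache_dir, "chinese", f"{prompt_name}.json")
    if os.path.exists(path):
        print(f"--- {path} ---")
        print(open(path, encoding="utf-8").read())
    else:
        print(f"no cached translation at {path}")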

Expected behavior
I'm trying to use ragas to evaluate my RAG (Retrieval-Augmented Generation) program on a Chinese dataset. Initially I got an error saying "no statements were generated from the answer." I then added the following code:

from ragas.llms import LangchainLLMWrapper

llm_instance = LangchainLLMWrapper(ChatOpenAI())
faithfulness.llm = llm_instance
faithfulness.adapt(language="chinese")

However, this didn't work, and I'm still getting the same or a similar error.

The key points are:

- I'm using a Chinese dataset.
- I'm trying to set up the language model wrapper for Chinese.
- The faithfulness.adapt(language="chinese") call does not seem to resolve the issue.
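
For reference, a minimal end-to-end sketch of the setup described above, assuming a ragas 0.1.x-style API; the ChatOpenAI import path and the explicit cache_dir value are illustrative assumptions, not part of the original report:

# Sketch of the adaptation flow: wrap the LangChain chat model, attach it to the
# metric, adapt the prompts to Chinese, and persist the translations so later
# runs reload them from cache_dir instead of re-translating.
from langchain_openai import ChatOpenAI  # assumed import path; older setups use langchain.chat_models

from ragas.llms import LangchainLLMWrapper
from ragas.metrics import faithfulness

llm_instance = LangchainLLMWrapper(ChatOpenAI())
faithfulness.llm = llm_instance

faithfulness.adapt(language="chinese", cache_dir="./ragas_prompt_cache")  # cache_dir is optional
faithfulness.save(cache_dir="./ragas_prompt_cache")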


dosubot[bot] commented 2 months ago

The faithfulness.adapt(language="chinese") method is not working in Ragas version 0.0.22 with a Chinese dataset because the Prompt class did not originally support the adapt method. However, the Prompt class has now been updated to include an adapt method, which allows for adapting the prompt to different languages.

To resolve the validation error in the Prompt class, you need to ensure that the Prompt class has the adapt method implemented correctly. The updated Prompt class with the adapt method is as follows:

class Prompt(BaseModel):
    # ... existing fields (name, instruction, examples, input_keys, output_key,
    # output_type, language) and methods ...
    # The method below also relies on names available in ragas/llms/prompt.py:
    # logger, get_cache_dir, str_translation, json_translatation, json_loader,
    # plus the standard-library modules os, json and ast.

    def adapt(self, language: str, llm, cache_dir: t.Optional[str] = None) -> Prompt:
        def get_all_keys(nested_json):
            keys = set()
            for key, value in nested_json.items():
                keys.add(key)
                if isinstance(value, dict):
                    keys = keys.union(get_all_keys(value))
            return keys

        if self.language == language:
            return self

        cache_dir = cache_dir if cache_dir else get_cache_dir()
        if os.path.exists(os.path.join(cache_dir, language, f"{self.name}.json")):
            self_cp = self._load(language, self.name, cache_dir)

            self.language = self_cp.language
            self.examples = self_cp.examples

            return self_cp

        logger.info("Adapting %s to %s", self.name, language)
        prompts = []
        output_keys = []
        for example in self.examples:
            prompts.extend(
                [
                    str_translation.format(
                        translate_to=language, input=example.get(key)
                    )
                    for key in self.input_keys
                ]
            )
            prompts.append(
                json_translatation.format(
                    translate_to=language, input=example.get(self.output_key)
                )
                if self.output_type.lower() == "json"
                else str_translation.format(
                    translate_to=language, input=example.get(self.output_key)
                )
            )
            if self.output_type.lower() == "json":
                output = example.get(self.output_key)
                if isinstance(output, str):
                    output = json.loads(output)
                if isinstance(output, dict):
                    output_keys.append(get_all_keys(output))
                elif isinstance(output, list) and all(
                    isinstance(item, dict) for item in output
                ):
                    output_keys.append([get_all_keys(item) for item in output])

        results = []
        for p in prompts:
            results.append(llm.generate_text(p).generations[0][0].text)
        per_example_items = len(self.input_keys) + 1
        grouped_results = [
            results[i : i + per_example_items]
            for i in range(0, len(results), per_example_items)
        ]
        assert len(grouped_results) == len(
            self.examples
        ), "examples and adapted examples must be of equal length"
        for i, example in enumerate(grouped_results):
            example_dict = {}
            example_dict.update(
                {k: v for k, v in zip(self.input_keys, example[: len(self.input_keys)])}
            )
            if self.output_type.lower() == "json":
                example_dict[self.output_key] = json_loader._safe_load(example[-1], llm)
                if example_dict[self.output_key] == {}:
                    # Fall back to parsing the raw translation as a Python literal
                    # when _safe_load could not recover a JSON object.
                    dict_str = example[-1].split("(")[0].strip()
                    example_dict[self.output_key] = ast.literal_eval(dict_str)
            else:
                example_dict[self.output_key] = example[-1]
            if self.output_type.lower() == "json":
                output = example_dict[self.output_key]
                if isinstance(output, dict):
                    assert (
                        set(output.keys()) == output_keys[i]
                    ), f"Adapted output keys {set(output.keys())=} do not match with the original output keys: {output_keys[i]=}"
                elif isinstance(output, list) and all(
                    isinstance(item, dict) for item in output
                ):
                    assert all(
                        set(item.keys()) in output_keys[i] for item in output
                    ), "Adapted output keys do not match with the original output keys"

            self.examples[i] = example_dict

        self.language = language
        self.save(cache_dir=cache_dir)

        return self
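
For context, a usage sketch of the adapt method shown above, applied directly to one of the module-level prompts from the reproduction code; llm_instance is the wrapper from the issue report and the cache_dir value is an arbitrary example path:

# Usage sketch: translate the long-form-answer prompt and inspect the result.
# A successful adaptation sets the prompt's language field and rewrites its
# examples in the target language while keeping the original keys.
adapted_prompt = LONG_FORM_ANSWER_PROMPT.adapt(
    "chinese", llm_instance, cache_dir="./ragas_prompt_cache"
)
print(adapted_prompt.language)     # "chinese" once adaptation succeeds
print(adapted_prompt.examples[0])  # translated example with the original keys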

After implementing the adapt method in the Prompt class, you can then call it in the Faithfulness class's adapt method:

@dataclass
class Faithfulness(MetricWithLLM, SingleTurnMetric):
    # ... existing fields and methods ...
    # Note: this snippet follows a newer ragas Faithfulness that has a
    # statement_prompt field and a sentence_segmenter; in the 0.1.x class pasted
    # earlier in this issue the corresponding prompt is long_form_answer_prompt.

    def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None:
        assert self.llm is not None, "LLM is not set"

        logger.info(f"Adapting Faithfulness metric to {language}")

        self.nli_statements_message = self.nli_statements_message.adapt(
            language, self.llm, cache_dir
        )
        self.statement_prompt = self.statement_prompt.adapt(
            language, self.llm, cache_dir
        )

        self.sentence_segmenter = get_segmenter(language=language, clean=False)
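
A quick way to confirm the adaptation actually took effect before re-running an evaluation (a hedged sketch: llm_instance is the wrapper from the issue report and the cache_dir value is an arbitrary example path):

# Verification sketch: after adapt(), each prompt object carries the requested
# language, so checking it distinguishes "adaptation silently did nothing" from
# "adaptation ran but evaluation still fails".
faithfulness.llm = llm_instance
faithfulness.adapt(language="chinese", cache_dir="./ragas_prompt_cache")
print(faithfulness.nli_statements_message.language)  # expected: "chinese"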

By following these steps, you should be able to resolve the issue with the faithfulness.adapt(language="chinese") method not working and the validation error in the Prompt class [1][2][3].
