explodinggradients / ragas

Supercharge Your LLM Application Evaluations 🚀
https://docs.ragas.io
Apache License 2.0

ragas evaluate asking for OPENAI_API_KEY when using locally hosted Langchain TGI LLM #269

Closed jenghub closed 11 months ago

jenghub commented 11 months ago

Trying to piece together a basic evaluation example from the docs with a locally hosted LLM served through LangChain's HuggingFaceTextGenInference, but I'm running into problems in evaluate(). Where am I going wrong?

import os

import pandas as pd
from datasets import Dataset
from langchain.llms import HuggingFaceTextGenInference
from ragas import evaluate
from ragas.llms import LangchainLLM
from ragas.metrics import (answer_relevancy, context_precision, context_recall,
                           faithfulness)
from ragas.metrics.critique import harmfulness

os.environ["OPENAI_API_KEY"] = "not-needed"

# set the LLM
llm = HuggingFaceTextGenInference(
    inference_server_url="http://localhost:8084/",
    max_new_tokens=512,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.01,
    repetition_penalty=1.03,
    streaming=True,
)

# res = llm("What did foo say about bar?")  # sanity check: direct generation against TGI works
ragas_llm = LangchainLLM(llm)  # wrap the LangChain LLM so ragas can use it

# point every metric at the locally hosted model
faithfulness.llm = ragas_llm
answer_relevancy.llm = ragas_llm
context_precision.llm = ragas_llm
context_recall.llm = ragas_llm
harmfulness.llm = ragas_llm

data_samples = {
    "question": ["When was the first super bowl?", "Who won the most super bowls?"],
    "answer": [
        "The first superbowl was held on January 15, 1967",
        "The most super bowls have been won by The New England Patriots",
    ],
    "contexts": [
        ["The Super Bowl....season since 1966,", "replacing the NFL...in February."],
        [
            "The Green Bay Packers...Green Bay, Wisconsin.",
            "The Packers compete...Football Conference",
        ],
    ],
    "ground_truths": [
        ["The first superbowl was held on January 15, 1967"],
        ["The New England Patriots have won the Super Bowl a record six times"],
    ],
}

dataset = Dataset.from_dict(data_samples)
results = evaluate(
    dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
        harmfulness,
    ],
)
df = results.to_pandas()
print(df.head())

Error trace:

Traceback (most recent call last):
  File "/mnt/md0/share/project-name/evaluation/evaltest.py", line 58, in <module>
    results = evaluate(
  File "/home/user-name/.local/share/pdm/venv/lib/python3.10/site-packages/ragas/evaluation.py", line 97, in evaluate
    [m.init_model() for m in metrics]
  File "/home/user-name/.local/share/pdm/venv/lib/python3.10/site-packages/ragas/evaluation.py", line 97, in <listcomp>
    [m.init_model() for m in metrics]
  File "/home/user-name/.local/share/pdm/venv/lib/python3.10/site-packages/ragas/metrics/answer_relevance.py", line 70, in init_model
    raise OpenAIKeyNotFound
ragas.exceptions.OpenAIKeyNotFound: OpenAI API key not found! Seems like your trying to use Ragas metrics with OpenAI endpoints. Please set 'OPENAI_API_KEY' environment variable
Relevant dependencies:

dependencies = [
    "langchain>=0.0.331",
    "text-generation>=0.6.1",
    "ragas>=0.0.19",
    "openai==0.28.1",
]
shahules786 commented 11 months ago

Hi @jenghub , the answer relevancy metric uses OpenAI embeddings by default. If you wish to change that, you can use the ragas Hugging Face embeddings from here and pass them to answer relevancy.

Also note that you will need to install extra dependencies to use local embeddings.
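
For example, a rough sketch using LangChain's HuggingFaceEmbeddings wrapper (requires the sentence-transformers package; the model name is just an example, adjust to taste):

from langchain.embeddings import HuggingFaceEmbeddings

from ragas.metrics import answer_relevancy

# swap the default OpenAI embeddings for a local sentence-transformers model
answer_relevancy.embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"  # example model only
)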

jenghub commented 11 months ago

Thanks, I ended up modifying the above with something like this to avoid the error:

# assuming LangChain's HuggingFaceEmbeddings wrapper (needs sentence-transformers installed)
from langchain.embeddings import HuggingFaceEmbeddings

ragas_llm = LangchainLLM(llm)

faithfulness.llm = ragas_llm
answer_relevancy.llm = ragas_llm
answer_relevancy.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

However, when I now run the script (installed from the latest main branch to avoid the TGI error), I hit this KeyError:

evaluating with [faithfulness]
  0%|                                                                                                     | 0/1 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/home/username/.local/share/pdm/venvs/rag-eval-DXYza8Df-rag-eval/lib/python3.10/site-packages/text_generation/client.py", line 502, in generate_stream
    response = StreamResponse(**json_payload)
  File "pydantic/main.py", line 341, in pydantic.main.BaseModel.__init__
pydantic.error_wrappers.ValidationError: 1 validation error for StreamResponse
token -> logprob
  none is not an allowed value (type=type_error.none.not_allowed)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/username/rag-eval/evaluation/testset/evalexample.py", line 58, in <module>
    results = evaluate(
  File "/home/username/rag-eval/ragas/src/ragas/evaluation.py", line 104, in evaluate
    scores.append(metric.score(dataset).select_columns(metric.name))
  File "/home/username/rag-eval/ragas/src/ragas/metrics/base.py", line 76, in score
    score = self._score_batch(dataset.select(batch), callbacks=group)
  File "/home/username/rag-eval/ragas/src/ragas/metrics/_faithfulness.py", line 87, in _score_batch
    result = self.llm.generate(prompts, callbacks=batch_group)
  File "/home/username/rag-eval/ragas/src/ragas/llms/base.py", line 138, in generate
    list_llmresults = run_async_tasks(
  File "/home/username/rag-eval/ragas/src/ragas/async_utils.py", line 41, in run_async_tasks
    outputs: List[Any] = asyncio.run(_gather())
  File "/usr/lib/python3.10/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/usr/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
    return future.result()
  File "/home/username/rag-eval/ragas/src/ragas/async_utils.py", line 39, in _gather
    return await asyncio.gather(*tasks_to_execute)
  File "/home/username/rag-eval/ragas/src/ragas/llms/base.py", line 114, in generate_completions
    result = await self.llm.agenerate(ps, callbacks=callbacks)
  File "/home/username/.local/share/pdm/venvs/rag-eval-DXYza8Df-rag-eval/lib/python3.10/site-packages/langchain/llms/base.py", line 823, in agenerate
    output = await self._agenerate_helper(
  File "/home/username/.local/share/pdm/venvs/rag-eval-DXYza8Df-rag-eval/lib/python3.10/site-packages/langchain/llms/base.py", line 711, in _agenerate_helper
    raise e
  File "/home/username/.local/share/pdm/venvs/rag-eval-DXYza8Df-rag-eval/lib/python3.10/site-packages/langchain/llms/base.py", line 698, in _agenerate_helper
    await self._agenerate(
  File "/home/username/.local/share/pdm/venvs/rag-eval-DXYza8Df-rag-eval/lib/python3.10/site-packages/langchain/llms/base.py", line 1072, in _agenerate
    await self._acall(prompt, stop=stop, run_manager=run_manager, **kwargs)
  File "/home/username/.local/share/pdm/venvs/rag-eval-DXYza8Df-rag-eval/lib/python3.10/site-packages/langchain/llms/huggingface_text_gen_inference.py", line 217, in _acall
    async for chunk in self._astream(prompt, stop, run_manager, **kwargs):
  File "/home/username/.local/share/pdm/venvs/rag-eval-DXYza8Df-rag-eval/lib/python3.10/site-packages/langchain/llms/huggingface_text_gen_inference.py", line 276, in _astream
    async for res in self.async_client.generate_stream(prompt, **invocation_params):
  File "/home/username/.local/share/pdm/venvs/rag-eval-DXYza8Df-rag-eval/lib/python3.10/site-packages/text_generation/client.py", line 505, in generate_stream
    raise parse_error(resp.status, json_payload)
  File "/home/username/.local/share/pdm/venvs/rag-eval-DXYza8Df-rag-eval/lib/python3.10/site-packages/text_generation/errors.py", line 81, in parse_error
    message = payload["error"]
KeyError: 'error'
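
From the trace, the failure comes from the TGI streaming path (generate_stream, where StreamResponse rejects a None logprob), and the real error then gets masked by the KeyError in parse_error. One workaround I considered (sketch only, not verified here) is to build the client with streaming disabled so requests should go through the regular generate endpoint instead:

# same client as above, but with streaming off to avoid generate_stream / StreamResponse parsing
llm = HuggingFaceTextGenInference(
    inference_server_url="http://localhost:8084/",
    max_new_tokens=512,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.01,
    repetition_penalty=1.03,
    streaming=False,
)
ragas_llm = LangchainLLM(llm)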
jenghub commented 11 months ago

@shahules786 the KeyError occurs on metrics other than answer_relevancy as well when running against TGI.

Versions:

llama-index==0.8.69.post1
ragas==0.0.20
jenghub commented 11 months ago

Was able to get this working with vLLM for now. Closing the issue.
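
For anyone landing here, a minimal sketch of one way to wire vLLM into ragas via LangChain's VLLMOpenAI client (the model name, port, and exact client are assumptions, not necessarily the setup used here):

from langchain.llms import VLLMOpenAI
from ragas.llms import LangchainLLM
from ragas.metrics import faithfulness

# assumes a local OpenAI-compatible vLLM server, e.g. started with:
#   python -m vllm.entrypoints.openai.api_server --model <model-name> --port 8000
vllm_llm = VLLMOpenAI(
    openai_api_key="EMPTY",                      # local server does not check the key
    openai_api_base="http://localhost:8000/v1",  # placeholder host/port
    model_name="<model-name>",                   # placeholder model id
    max_tokens=512,
    temperature=0.01,
)

ragas_llm = LangchainLLM(vllm_llm)
faithfulness.llm = ragas_llm  # and likewise for the other metrics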