explodinggradients / ragas

Supercharge Your LLM Application Evaluations 🚀
https://docs.ragas.io
Apache License 2.0
7.03k stars 710 forks source link

KeyError: 'data' and answer_correctness nan #1587

Open chenboju opened 2 hours ago

chenboju commented 2 hours ago

from datasets import Dataset

# Evaluation questions (Chinese, about dinosaurs) posed to the RAG chain.
questions = [
    "恐龙是怎么被命名的?",
    "恐龙怎么分类的?",
    "体型最大的是哪种恐龙?",
    "体型最长的是哪种恐龙?它在哪里被发现?",
    "恐龙采样什么样的方式繁殖?",
    "恐龙是冷血动物吗?",
    "陨石撞击是导致恐龙灭绝的原因吗?",
    "恐龙是在什么时候灭绝的?",
    "鳄鱼是恐龙的近亲吗?",
    "恐龙在英语中叫什么?",
]

# Reference answers, index-aligned with `questions`; each entry is a
# single-element list of strings.
ground_truths = [
    ["1841年,英国科学家理查德·欧文在研究几块样子像蜥蜴骨头化石时,认为它们是某种史前动物留下来的,并命名为恐龙,意思是“恐怖的蜥蜴”。"],
    ["恐龙可分为鸟类和非鸟恐龙。"],
    ["恐龙整体而言的体型很大。以恐龙作为标准来看,蜥脚下目是其中的巨无霸。"],
    ["最长的恐龙是27米长的梁龙,是在1907年发现于美国怀俄明州。"],
    ["恐龙采样产卵、孵蛋的方式繁殖。"],
    ["恐龙是介于冷血和温血之间的动物"],
    ["科学家最新研究显示,0.65亿年前小行星碰撞地球时间或早或晚都可能不会导致恐龙灭绝,真实灭绝原因是当时恐龙处于较脆弱的生态系统中,环境剧变易导致灭绝。"],
    ["恐龙灭绝的时间是在距今约6500万年前,地质年代为中生代白垩纪末或新生代第三纪初。"],
    ["鳄鱼是另一群恐龙的现代近亲,但两者关系较非鸟恐龙与鸟类远。"],
    ["1842年,英国古生物学家理查德·欧文创建了“dinosaur”这一名词。英文的dinosaur来自希腊文deinos(恐怖的)Saurosc(蜥蜴或爬行动物)。对当时的欧文来说,这“恐怖的蜥蜴”或“恐怖的爬行动物”是指大的灭绝的爬行动物(实则不是)"],
]

# Filled in by the inference loop: one generated answer and one list of
# retrieved context strings per question.
answers = []
contexts = []

# Inference

# For every question, record the chain's answer and the page content of each
# document the retriever returns for that same question.
# NOTE(review): `rag_chain` and `retriever` are not defined in this snippet;
# they must already exist in the session.
for query in questions:
    answers.append(rag_chain.invoke(query))
    contexts.append(
        [doc.page_content for doc in retriever.get_relevant_documents(query)]
    )

# To dict

# Column dict for the evaluation dataset. "reference" collapses each
# ground-truth list into one space-joined string; entries that are not lists
# are passed through unchanged.
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths,
    "reference": [
        truth if not isinstance(truth, list) else " ".join(truth)
        for truth in ground_truths
    ],
}

# Convert dict to dataset

# Build a `Dataset` from the `data` column dict.
dataset = Dataset.from_dict(data)

import os

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    context_entity_recall,
    answer_correctness,
    answer_similarity,
)

# Score the dataset with each of the listed ragas metrics.
result = evaluate(
    dataset=dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
        context_entity_recall,
        answer_correctness,
        answer_similarity,
    ],
)

chenboju commented 2 hours ago

{ "name": "KeyError", "message": "'data'", "stack": "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)\nCell \u001b[1;32mIn[22], line 14\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m evaluate\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mragas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 4\u001b[0m faithfulness,\n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# answer_relevancy,\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[38;5;66;03m# answer_similarity,\u001b[39;00m\n\u001b[0;32m 11\u001b[0m )\n\u001b[1;32m---> 14\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[0;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\n\u001b[0;32m 17\u001b[0m \u001b[43m \u001b[49m\u001b[43mfaithfulness\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# answer_relevancy,\u001b[39;49;00m\n\u001b[0;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# context_precision,\u001b[39;49;00m\n\u001b[0;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# context_recall,\u001b[39;49;00m\n\u001b[0;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# context_entity_recall,\u001b[39;49;00m\n\u001b[0;32m 22\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer_correctness\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 23\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;66;43;03m# answer_similarity, \u001b[39;49;00m\n\u001b[0;32m 24\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 25\u001b[0m \u001b[43m)\u001b[49m\n\nFile \u001b[1;32mD:\RAGAS\ragas\src\ragas\_analytics.py:130\u001b[0m, in \u001b[0;36mtrack_was_completed..wrapper\u001b[1;34m(*args, *kwargs)\u001b[0m\n\u001b[0;32m 127\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[0;32m 128\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m\u001b[39margs: P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m\u001b[39m\u001b[38;5;241m\u001b[39mkwargs: P\u001b[38;5;241m.\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m t\u001b[38;5;241m.\u001b[39mAny:\n\u001b[0;32m 129\u001b[0m track(IsCompleteEvent(event_type\u001b[38;5;241m=\u001b[39mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18mname\u001b[39m, is_completed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m))\n\u001b[1;32m--> 130\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 131\u001b[0m track(IsCompleteEvent(event_type\u001b[38;5;241m=\u001b[39mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18mname\u001b[39m, is_completed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m))\n\u001b[0;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n\nFile \u001b[1;32mD:\RAGAS\ragas\src\ragas\evaluation.py:324\u001b[0m, in \u001b[0;36mevaluate\u001b[1;34m(dataset, metrics, llm, embeddings, callbacks, in_ci, run_config, token_usage_parser, raise_exceptions, column_map, show_progress)\u001b[0m\n\u001b[0;32m 320\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 321\u001b[0m \u001b[38;5;66;03m# evalution run was 
successful\u001b[39;00m\n\u001b[0;32m 322\u001b[0m \u001b[38;5;66;03m# now lets process the results\u001b[39;00m\n\u001b[0;32m 323\u001b[0m cost_cb \u001b[38;5;241m=\u001b[39m ragas_callbacks[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcost_cb\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcost_cb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m ragas_callbacks \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m--> 324\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mEvaluationResult\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 325\u001b[0m \u001b[43m \u001b[49m\u001b[43mscores\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mscores\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 326\u001b[0m \u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 327\u001b[0m \u001b[43m \u001b[49m\u001b[43mbinary_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbinary_metrics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 328\u001b[0m \u001b[43m \u001b[49m\u001b[43mcost_cb\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcast\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 329\u001b[0m \u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mUnion\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mCostCallbackHandler\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 330\u001b[0m \u001b[43m \u001b[49m\u001b[43mcost_cb\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 331\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 332\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mragas_traces\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtracer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraces\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 333\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 334\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m evaluation_group_cm\u001b[38;5;241m.\u001b[39mended:\n\u001b[0;32m 335\u001b[0m evaluation_rm\u001b[38;5;241m.\u001b[39mon_chain_end(result)\n\nFile \u001b[1;32m:9\u001b[0m, in \u001b[0;36minit\u001b[1;34m(self, scores, dataset, binary_columns, cost_cb, traces, ragas_traces)\u001b[0m\n\nFile \u001b[1;32mD:\RAGAS\ragas\src\ragas\dataset_schema.py:381\u001b[0m, in \u001b[0;36mEvaluationResult.__post_init__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 378\u001b[0m values\u001b[38;5;241m.\u001b[39mappend(value \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1e-10\u001b[39m)\n\u001b[0;32m 380\u001b[0m \u001b[38;5;66;03m# parse the traces\u001b[39;00m\n\u001b[1;32m--> 381\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraces \u001b[38;5;241m=\u001b[39m \u001b[43mparse_run_traces\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mragas_traces\u001b[49m\u001b[43m)\u001b[49m\n\nFile \u001b[1;32mD:\RAGAS\ragas\src\ragas\callbacks.py:160\u001b[0m, in \u001b[0;36mparse_run_traces\u001b[1;34m(traces)\u001b[0m\n\u001b[0;32m 157\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, prompt_uuid \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(metric_trace\u001b[38;5;241m.\u001b[39mchildren):\n\u001b[0;32m 158\u001b[0m prompt_trace \u001b[38;5;241m=\u001b[39m traces[prompt_uuid]\n\u001b[0;32m 159\u001b[0m 
prompttraces[\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mprompt_trace\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m--> 160\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[43mprompt_trace\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdata\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m,\n\u001b[0;32m 161\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput\u001b[39m\u001b[38;5;124m\"\u001b[39m: prompt_trace\u001b[38;5;241m.\u001b[39moutputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m 162\u001b[0m }\n\u001b[0;32m 163\u001b[0m metric_traces[\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmetric_trace\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m prompt_traces\n\u001b[0;32m 164\u001b[0m parased_traces\u001b[38;5;241m.\u001b[39mappend(metric_traces)\n\n\u001b[1;31mKeyError\u001b[0m: 'data'" }