zeno-ml / zeno-client

Python client for creating new Zeno projects and uploading data
https://zenoml.com/docs/intro/#creating-a-project
MIT License
8 stars 0 forks source link

upload_system with rag view #53

Closed mikelonestone closed 5 months ago

mikelonestone commented 5 months ago

Is it possible to get an example of using `upload_system` with the RAG view, like the data in this playground example? https://hub.zenoml.com/playground?params=eyJzYW1wbGUiOiJyYWcifQ==

I'm not sure how to pass the output column within the DataFrame.

Sparkier commented 5 months ago

Hi @mikelonestone, the attached example shows how you can use the RAG view and add data. Let me know if you have additional questions. (Attachment: document-qa-results.json)

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from zeno_client import ZenoClient, ZenoMetric\n",
    "import pandas as pd\n",
    "import json\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv(override=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"document-qa-results.json\", \"r\") as f:\n",
    "    data = json.load(f)\n",
    "data_df = pd.DataFrame({\"question\": [d[\"data\"] for d in data]})\n",
    "data_df[\"id\"] = data_df.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "client = ZenoClient(os.environ[\"ZENO_API_KEY\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfully updated project.\n",
      "Access your project at  https://hub.zenoml.com/project/cabreraalex/Document%20QA\n"
     ]
    }
   ],
   "source": [
    "project = client.create_project(\n",
    "    name=\"Document QA\",\n",
    "    view={\n",
    "        \"data\": {\"type\": \"text\"},\n",
    "        \"label\": {\"type\": \"text\"},\n",
    "        \"output\": {\n",
    "            \"type\": \"vstack\",\n",
    "            \"keys\": {\n",
    "                \"answer\": {\"type\": \"text\"},\n",
    "                \"retrieved\": {\n",
    "                    \"type\": \"list\",\n",
    "                    \"elements\": {\n",
    "                        \"type\": \"vstack\",\n",
    "                        \"keys\": {\n",
    "                            \"score\": {\"type\": \"text\", \"label\": \"score: \"},\n",
    "                            \"reference\": {\"type\": \"markdown\"},\n",
    "                            \"text\": {\"type\": \"text\", \"label\": \"text: \"},\n",
    "                        },\n",
    "                    },\n",
    "                    \"border\": True,\n",
    "                },\n",
    "            },\n",
    "        },\n",
    "    },\n",
    "    description=\"Document-grounded question answering with Wikipedia\",\n",
    "    metrics=[\n",
    "        ZenoMetric(name=\"accuracy\", type=\"mean\", columns=[\"accuracy\"]),\n",
    "        ZenoMetric(name=\"exact_match\", type=\"mean\", columns=[\"exact_match\"]),\n",
    "        ZenoMetric(name=\"substring_match\", type=\"mean\", columns=[\"substring_match\"]),\n",
    "        ZenoMetric(name=\"f1\", type=\"mean\", columns=[\"f1\"]),\n",
    "        ZenoMetric(name=\"rougel\", type=\"mean\", columns=[\"rougel\"]),\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "project.upload_dataset(data_df, id_column=\"id\", data_column=\"question\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_df = pd.DataFrame(\n",
    "    {\n",
    "        \"output\": [\n",
    "            json.dumps(\n",
    "                {\n",
    "                    \"answer\": d[\"output\"][0][\"answer\"],\n",
    "                    \"retrieved\": [\n",
    "                        {\n",
    "                            \"reference\": \"[{idx}]({url})\".format(\n",
    "                                idx=d[\"output\"][0][\"retrieved\"][0][\"reference\"],\n",
    "                                url=\"https://en.wikipedia.org/?curid=\"\n",
    "                                + d[\"output\"][0][\"retrieved\"][0][\"reference\"],\n",
    "                            ),\n",
    "                            \"text\": d[\"output\"][0][\"retrieved\"][0][\"text\"],\n",
    "                            \"score\": d[\"output\"][0][\"retrieved\"][0][\"score\"],\n",
    "                        }\n",
    "                    ],\n",
    "                }\n",
    "            )\n",
    "            for d in data\n",
    "        ],\n",
    "        \"accuracy\": [d[\"output\"][0][\"answer_evaluation\"][\"accuracy\"] for d in data],\n",
    "        \"exact_match\": [\n",
    "            d[\"output\"][0][\"answer_evaluation\"][\"exact_match\"] for d in data\n",
    "        ],\n",
    "        \"substring_match\": [\n",
    "            d[\"output\"][0][\"answer_evaluation\"][\"substring_match\"] for d in data\n",
    "        ],\n",
    "        \"f1\": [d[\"output\"][0][\"answer_evaluation\"][\"f1\"] for d in data],\n",
    "        \"rougel\": [d[\"output\"][0][\"answer_evaluation\"][\"rougel\"] for d in data],\n",
    "    }\n",
    ")\n",
    "output_df[\"id\"] = output_df.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "project.upload_system(\n",
    "    output_df, name=\"Llama-2 BM25\", id_column=\"id\", output_column=\"output\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "compare",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
mikelonestone commented 5 months ago

Sorry, I forgot to close the issue. I just had to delete and recreate the project (before that, the data view specification wasn't being used, and the data was displayed as plain text instead of being formatted).