Arize-ai / phoenix

AI Observability & Evaluation
https://docs.arize.com/phoenix

[evals] use function callbacks as a mechanism for the rails snapping if available #1584

Closed · mikeldking closed 11 months ago

axiomofjoy commented 11 months ago

Code snippet for getting started:

import json
import logging
from typing import Any, Dict, List, Optional, Tuple

import openai

from phoenix.experimental.evals import PromptTemplate, download_benchmark_dataset

logger = logging.getLogger(__name__)

def openai_functions_classify(
    record: Dict[str, Any],
    prompt_template: PromptTemplate,
    classes: List[str],
    model_name: str,
    function_name: str,
    function_description: str,
    argument_name: str,
    argument_description: str,
    *,
    system_message: Optional[str] = None,
    require_explanation: bool = False,
) -> Tuple[Optional[str], Optional[str]]:
    if not all(variable_name in record for variable_name in prompt_template.variables):
        raise ValueError(
            "All prompt template variables must be present as keys in record."
        )

    user_message_content = prompt_template.format(
        {
            variable_name: record[variable_name]
            for variable_name in prompt_template.variables
        }
    )
    messages = [{"role": "user", "content": user_message_content}]
    if system_message:
        messages.insert(0, {"role": "system", "content": system_message})
    # JSON Schema for the function's arguments: the enum constrains ("snaps")
    # the model's answer onto the allowed classes, i.e. the rails.
    argument_data = {
        argument_name: {
            "type": "string",
            "description": argument_description,
            "enum": classes,
        },
    }
    if require_explanation:
        # Optionally request a free-text justification alongside the label.
        argument_data["explanation"] = {
            "type": "string",
            "description": "A brief explanation of your reasoning for your answer.",
        }
    functions = [
        {
            "name": function_name,
            "description": function_description,
            "parameters": {
                "type": "object",
                "properties": argument_data,
                "required": [argument_name],
            },
        }
    ]
    # NOTE: this uses the legacy openai<1.0 SDK; ChatCompletion.create was
    # removed in openai>=1.0. Forcing function_call ensures the model replies
    # with a structured call to this function instead of free-form text.
    response = openai.ChatCompletion.create(
        model=model_name,
        messages=messages,
        functions=functions,
        function_call={"name": function_name},
    )
    try:
        response_message = response["choices"][0]["message"]
        assert response_message["function_call"]["name"] == function_name
        # "arguments" arrives as a JSON-encoded string and must be parsed.
        function_arguments = json.loads(response_message["function_call"]["arguments"])
        return function_arguments[argument_name], function_arguments.get("explanation")
    except Exception as err:
        logger.debug(err)

    return None, None

if __name__ == "__main__":
    df = download_benchmark_dataset(
        task="binary-relevance-classification", dataset_name="wiki_qa-train"
    ).rename(columns={"query_text": "query", "document_text": "reference"})
    prompt_template_string = """You are comparing a reference text to a question and trying to determine if the reference text contains information relevant to answering the question. Here is the data:
    [BEGIN DATA]
    ************
    [Question]: {query}
    ************
    [Reference text]: {reference}
    [END DATA]

Compare the question above to the reference text. You must determine whether the reference text contains information that can answer the question. Please focus on whether the very specific question can be answered by the information in the reference text."""
    prompt_template = PromptTemplate(prompt_template_string)

    # Classify the first ten records and compare against the ground truth labels.
    for record in df[:10].to_dict(orient="records"):
        predicted_class, explanation = openai_functions_classify(
            record=record,
            prompt_template=prompt_template,
            classes=["relevant", "irrelevant"],
            model_name="gpt-4-0613",
            function_name="relevance",
            function_description="A function to record whether a reference text is relevant to a question.",
            argument_name="relevant",
            argument_description="A string indicating whether the reference text is relevant to the question.",
            require_explanation=True,
        )

        print("Query")
        print("=====")
        print()
        print(record["query"])
        print()

        print("Reference")
        print("=======")
        print()
        print(record["reference"])
        print()

        print("Predicted Class")
        print("===============")
        print(predicted_class)
        print()

        print("Ground Truth")
        print("============")
        print({True: "relevant", False: "irrelevant"}.get(record["relevant"]))
        print()

        print("Explanation")
        print("===========")
        print(explanation)
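
For reference, the try block above assumes the response shape returned by the pre-1.0 openai SDK when a function call is forced. A minimal sketch of that shape, with illustrative values rather than real output:

# Illustrative only: approximate structure of a forced function-call response
# from the legacy (openai<1.0) ChatCompletion API.
response = {
    "choices": [
        {
            "message": {
                "role": "assistant",
                "content": None,
                "function_call": {
                    "name": "relevance",
                    "arguments": '{"relevant": "relevant", "explanation": "..."}',
                },
            }
        }
    ]
}

Note that "arguments" is a JSON-encoded string, which is why the snippet parses it with json.loads before reading the class label and the optional explanation.
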
mikeldking commented 11 months ago

resolved by #1651

mikeldking commented 11 months ago

@RogerHYang @axiomofjoy - can we add a follow-up ticket for docs on this one? Also, do the notebooks require additional steps to highlight the explanations?