citadel-ai / langcheck

Simple, Pythonic building blocks to evaluate LLM applications.
https://langcheck.readthedocs.io/en/latest/index.html
MIT License

Answer correctness metric #130

Closed · yosukehigashi closed this 2 months ago

yosukehigashi commented 2 months ago

Implemented the answer correctness metric for `en` and `ja`.

Here are some example test cases:

import langcheck

# Create an eval_client as usual
gen_output = [
    "Tokyo is the capital of Japan.",
    "Osaka is the capital of Japan.",
    "Washington, D.C. is the capital of the US.",
    "Many people consider New York City is the capital of the US. But it is technically Washington, D.C.",
    "The capital of BlahBlah land is Blahtropolis.",
    "I am not sure what the capital of BlahBlah land is.",
]
prompt = [
    "what is the capital of Japan?",
    "what is the capital of Japan?",
    "what is the capital of the US?",
    "what is the capital of the US?",
    "what is the capital of BlahBlah land?",
    "what is the capital of BlahBlah land?",
]
ref_outputs = [
    "Tokyo",
    "Tokyo",
    "Washington, D.C.",
    "Washington, D.C.",
    "Blahtropolis",
    "Blahtropolis",
]

langcheck.metrics.answer_correctness(
    gen_output, ref_outputs, prompt, eval_model=eval_client
)
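For intuition, here is a crude keyword-based stand-in for what the metric judges. The real metric delegates the comparison to an LLM evaluator via `eval_client`; the helper below is purely illustrative and is not part of langcheck's API.

```python
def naive_answer_correctness(gen_outputs, ref_outputs):
    """Toy proxy for LLM-judged answer correctness: score 1.0 when the
    reference answer appears verbatim in the generated output, else 0.0.

    (Hypothetical helper for illustration only; the actual metric uses an
    LLM evaluator and produces more nuanced judgments.)
    """
    return [
        1.0 if ref.rstrip(".") in gen else 0.0
        for gen, ref in zip(gen_outputs, ref_outputs)
    ]

scores = naive_answer_correctness(
    ["Tokyo is the capital of Japan.", "Osaka is the capital of Japan."],
    ["Tokyo", "Tokyo"],
)
# scores == [1.0, 0.0]
```

A substring check like this would mis-handle paraphrases and partially-correct answers (e.g. the New York City example above), which is exactly why the metric uses an LLM evaluator instead.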
import langcheck

# Create an eval_client as usual
gen_output = [
    "東京は日本の首都です。",
    "大阪は日本の首都です。",
    "ワシントンD.C.はアメリカの首都です。",
    "多くの人々はニューヨーク市がアメリカの首都だと考えています。しかし、実際はワシントンD.C.です。",
    "ブラブラランドの首都はブラトロポリスです。",
    "ブラブラランドの首都が何かわかりません。",
]
prompt = [
    "日本の首都は何ですか?",
    "日本の首都は何ですか?",
    "アメリカの首都は何ですか?",
    "アメリカの首都は何ですか?",
    "ブラブラランドの首都は何ですか?",
    "ブラブラランドの首都は何ですか?",
]
ref_outputs = [
    "東京",
    "東京",
    "ワシントンD.C.",
    "ワシントンD.C.",
    "ブラトロポリス",
    "ブラトロポリス",
]

langcheck.metrics.ja.answer_correctness(
    gen_output, ref_outputs, prompt, eval_model=eval_client
)
yosukehigashi commented 2 months ago

Also bumped the version to 0.8.0.dev4.