NASA-IMPACT / evalem

An evaluation framework for your large model pipelines

[alpha] Implementation of Semantic metrics and more basic metrics #6

Closed · NISH1001 closed this 1 year ago

NISH1001 commented 1 year ago

Major Changes

Minor Changes


Usage


# Core data structures (shown for reference; not all are used directly below).
from evalem.structures import (
    PredictionDTO,
    ReferenceDTO,
    EvaluationDTO,
    PredictionInstance,
    ReferenceInstance,
)

from evalem.metrics import (
    Metric,
    AccuracyMetric,
    PrecisionMetric,
    RecallMetric,
    F1Metric,
    ConfusionMatrix,
    ExactMatchMetric,
    BertScore,
    BartScore,
)

from typing import Iterable, List, Mapping, Union

# Evaluator is used below; it is assumed to be importable from
# evalem.evaluators in this version of the package.
from evalem.evaluators import Evaluator

from evalem.models import (
    DefaultQAModelWrapper,
    HFPipelineWrapper,
    ModelWrapper
)

from evalem.misc.datasets import get_squad_v2

# Alternatively, wrap a HuggingFace pipeline directly:
# from transformers import pipeline
# wrapped_model = HFPipelineWrapper(
#     pipeline("question-answering"),
# )

# Default question-answering model wrapper, running on CPU.
wrapped_model = DefaultQAModelWrapper(device="cpu")

def run_pipeline(
    model: ModelWrapper,
    evaluators: Union[Evaluator, Iterable[Evaluator]],
    inputs,
    references,
) -> List[Mapping[str, dict]]:
    """Run the wrapped model on the inputs and apply each evaluator to its predictions."""
    predictions = model(inputs)
    # Allow a single evaluator to be passed directly.
    evaluators = [evaluators] if not isinstance(evaluators, Iterable) else evaluators
    return [
        evaluator(predictions=predictions, references=references)
        for evaluator in evaluators
    ]

# Load 100 samples from the SQuAD v2 validation split.
data = get_squad_v2("validation", nsamples=100)

evaluators = [
    # Basic metrics.
    Evaluator(metrics=[
        AccuracyMetric(),
        ConfusionMatrix(),
        ExactMatchMetric(),
    ]),
    # Semantic metrics, run here on Apple's MPS device.
    Evaluator(metrics=[
        BertScore(device="mps", model_type="distilbert-base-uncased"),
        BartScore(device="mps"),
    ]),
]

results = run_pipeline(
    wrapped_model,
    evaluators,
    data["inputs"],
    data["references"]
)
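
The pipeline returns one result per evaluator, so results above is a list with one mapping per evaluator. Below is a minimal sketch of how one might inspect those results; the exact keys and nesting are assumptions here, since the output structure comes from each Evaluator's implementation in this PR.

# Hypothetical inspection of the results above. The structure of each
# result mapping (metric name -> score/details) is an assumption.
for idx, result in enumerate(results):
    print(f"Evaluator {idx}:")
    for metric_name, score in result.items():
        print(f"  {metric_name}: {score}")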
NISH1001 commented 1 year ago

I have merged this after an initial review. I will open another PR sometime this week with a slightly changed implementation of model initialization.