We use the askathon cleaned response dataset to evaluate the nasa-v6 model against a vanilla distilbert model. We're using the ONNX version of the nasa-v6 model.
The dataset loader is tentatively:
```python
import pandas as pd


def load_askathon_clean(path: str) -> pd.DataFrame:
    data = pd.read_csv(path)
    data = data.drop(columns=["Email Address"]).reset_index(drop=True)
    # Normalize raw column names into a fixed schema:
    # context, id, source, topics, then five question/answer pairs.
    data.rename(
        columns={
            data.columns[0]: "context",
            data.columns[1]: "id",
            data.columns[2]: "source",
            data.columns[3]: "topics",
            data.columns[4]: "q1",
            data.columns[5]: "a1",
            data.columns[6]: "q2",
            data.columns[7]: "a2",
            data.columns[8]: "q3",
            data.columns[9]: "a3",
            data.columns[10]: "q4",
            data.columns[11]: "a4",
            data.columns[12]: "q5",
            data.columns[13]: "a5",
        },
        inplace=True,
    )
    data.drop(columns=["source", "topics"], inplace=True)
    return data


def create_qa_dataset(data: pd.DataFrame) -> pd.DataFrame:
    res = []
    q_keys = [f"q{i}" for i in range(1, 6)]
    a_keys = [f"a{i}" for i in range(1, 6)]

    def _index_fn(context: str, answer: str) -> int:
        # Find the answer span in the context (case-insensitive), ignoring
        # trailing punctuation; return -1 when there is no verbatim match.
        try:
            return context.lower().index(answer.rstrip(" ,.!?").lower())
        except ValueError:
            return -1

    for _df in data.itertuples():
        tmp = []
        for qk, ak in zip(q_keys, a_keys):
            q, a = getattr(_df, qk), getattr(_df, ak)
            # Skip empty/NaN answers.
            if not isinstance(a, str):
                continue
            idx = _index_fn(_df.context, a)
            # Keep only QA pairs whose answer is extractable from the context.
            if idx > -1:
                tmp.append(
                    dict(
                        id=str(_df.id),
                        context=_df.context,
                        question=q,
                        answer_text=a,
                        answer_start=idx,
                    )
                )
        res.extend(tmp)
    return pd.DataFrame(res)


data = create_qa_dataset(load_askathon_clean("data/askathon.csv"))
```
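For a quick sanity check of the span-matching logic, here's how a hypothetical row behaves (the row is invented for illustration; it's not from the actual askathon data):

```python
# Hypothetical smoke test; the row below is made up purely for illustration.
sample = pd.DataFrame([{
    "context": "The Aqua satellite was launched in 2002.",
    "id": 0,
    "q1": "When was Aqua launched?", "a1": "launched in 2002.",
    "q2": None, "a2": None, "q3": None, "a3": None,
    "q4": None, "a4": None, "q5": None, "a5": None,
}])
print(create_qa_dataset(sample))
# expected: one row with answer_start == 23 -- the trailing "." on the answer
# is stripped before matching, so the pair is kept rather than dropped
```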
Evaluation is done through evalem with the following tentative pipeline code:
```python
from evalem.nlp.evaluators import QAEvaluator
from evalem.nlp.models import QuestionAnsweringHFPipelineWrapper
from evalem.nlp.metrics import (
    BartScore,
    BertScore,
    BleuMetric,
    MeteorMetric,
    ExactMatchMetric,
    RougeMetric,
)
from evalem import NamedSimpleEvaluationPipeline
from evalem.misc.utils import build_comparison_table

# define models
wrapped_model = QuestionAnsweringHFPipelineWrapper(device="mps")
wrapped_model_2 = QuestionAnsweringHFPipelineWrapper.from_onnx(
    model="tmp/onnx/nasa-v6/",
    tokenizer="tmp/onnx/nasa-v6/",
    device="mps",
)

# define evaluators/metrics
evaluators_common = [
    QAEvaluator(),
    BertScore(device="mps"),
    BartScore(device="mps"),
    RougeMetric(),
    MeteorMetric(),
    BleuMetric(),
]

# build pipelines
eval_pipe = NamedSimpleEvaluationPipeline(
    model=wrapped_model,
    evaluators=evaluators_common,
    name="distilbert",
)
eval_pipe_2 = NamedSimpleEvaluationPipeline(
    model=wrapped_model_2,
    evaluators=evaluators_common,
    name="nasa-v6-onnx",
)

# evaluate and get comparison table
results = build_comparison_table(
    eval_pipe,
    eval_pipe_2,
    inputs=list(data[["context", "question"]].T.to_dict().values()),
    references=data["answer_text"].to_list(),
)
```
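One note on the `inputs` argument: the `.T.to_dict().values()` chain just turns the two columns into a list of `{"context": ..., "question": ...}` records. If that reads as opaque, pandas' records orientation should produce the same thing (a minor style point, not a behavior change):

```python
# Equivalent spelling of the inputs argument using the "records" orientation;
# yields the same list of {"context": ..., "question": ...} dicts.
inputs = data[["context", "question"]].to_dict(orient="records")
```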
cc: @muthukumaranR @xhagrg