import json
import os
import re
from pprint import pprint
from statistics import mean

from datasets import load_metric
from fuzzywuzzy import fuzz
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from tqdm import tqdm

########################
# BLEU
########################
def tokenize(text):
    """Split ``text`` into tokens on whitespace and literal periods.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the non-empty substrings found between separators.
    """
    # The period must be escaped: an unescaped ``.`` matches every character,
    # which turns the whole input into separators and always yields [].
    return [token for token in re.split(r'\s|\.', text) if token]

def bleu_score(reference, hypothesis, gram):
    """Compute sentence-level BLEU with uniform weights over 1..``gram``-grams.

    Both strings are tokenized with ``tokenize`` and scored with
    ``sentence_bleu`` against the single reference.

    Args:
        reference: Ground-truth text.
        hypothesis: Predicted text.
        gram: Highest n-gram order to include (1 gives BLEU-1, 4 gives BLEU-4).

    Returns:
        The value returned by ``sentence_bleu``.

    Raises:
        ValueError: If ``gram`` is not a positive whole number. (The previous
            if/elif chain left the result unbound for such values.)
    """
    order = int(gram)
    if order != gram or order < 1:
        raise ValueError(f"gram must be a positive integer, got {gram!r}")

    reference_tokens = tokenize(reference)
    hypothesis_tokens = tokenize(hypothesis)

    # Uniform weights: (1.,) for BLEU-1, (1/2, 1/2) for BLEU-2, and so on.
    weights = (1. / order,) * order
    return sentence_bleu([reference_tokens], hypothesis_tokens, weights)

def caculate_bleu(results, data, gram):
    """Average BLEU-``gram`` of ``results`` against the aligned ``data``.

    A pair in which either string is empty is not scored, but the sum is
    still divided by ``len(results)``, so such a pair contributes zero.

    Args:
        results: Predicted strings.
        data: Target strings, indexed in parallel with ``results``.
        gram: n-gram order forwarded to ``bleu_score``.

    Returns:
        The average score, or 0 when ``results`` is empty.
    """
    if len(results) == 0:
        return 0

    total = 0
    for idx, prediction in enumerate(results):
        target = data[idx]
        if prediction != "" and target != "":
            total += bleu_score(target, prediction, gram)
    return total / len(results)

########################
# Rouge-L
########################
def score_rouge(str1, str2):
    """Return the ROUGE-L F value that ``Rouge.get_scores`` reports for a pair.

    Args:
        str1: First argument forwarded to ``Rouge.get_scores``.
        str2: Second argument forwarded to ``Rouge.get_scores``.

    Returns:
        The ``'f'`` entry of the averaged ``'rouge-l'`` scores.
    """
    scorer = Rouge(metrics=["rouge-l"])
    averaged = scorer.get_scores(str1, str2, avg=True)
    return averaged['rouge-l']['f']

def caculate_rouge(results, data):
    """Average ROUGE-L of ``results`` against the aligned ``data``.

    A pair in which either string is empty is not scored, but the sum is
    still divided by ``len(results)``, so such a pair contributes zero.

    Args:
        results: Predicted strings.
        data: Target strings, indexed in parallel with ``results``.

    Returns:
        The average score, or 0 when ``results`` is empty.
    """
    if len(results) == 0:
        return 0

    total = 0
    for idx, prediction in enumerate(results):
        target = data[idx]
        if prediction != "" and target != "":
            total += score_rouge(target, prediction)
    return total / len(results)

########################
# Accuracy (EM)
########################
def caculate_accuracy(results, data):
    """Exact-match accuracy of ``results`` against the aligned ``data``.

    A pair in which either string is empty never counts as a match, but it
    still counts in the denominator ``len(results)``.

    Args:
        results: Predicted strings.
        data: Target strings, indexed in parallel with ``results``.

    Returns:
        Matches divided by ``len(results)``, or 0 when ``results`` is empty.
    """
    if len(results) == 0:
        return 0

    matches = 0
    for idx, prediction in enumerate(results):
        target = data[idx]
        if prediction == "" or target == "":
            continue
        if prediction == target:
            matches += 1
    return matches / len(results)

########################
# F1-micro
########################
def f1_score(list1, list2):
    """Compute F1 between two item lists.

    Every item of ``list1`` that equals at least one item of ``list2`` counts
    as one true positive, so duplicates in ``list1`` are counted each time.

    Args:
        list1: First list of items; its length is the precision denominator.
        list2: Second list of items; its length is the recall denominator.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or 0 when either
        list is empty or there is no overlap.
    """
    # TP: item in list1 and list2
    # FP: item in list1 but not in list2
    # TN: item not in list1 and list2
    # FN: item in list2 but not in list1
    # Both lengths must be compared with 0 explicitly: `len(list1) or
    # len(list2) == 0` is true for every non-empty list1 and lets an empty
    # list1 fall through to a division by zero.
    if len(list1) == 0 or len(list2) == 0:
        return 0
    num_TP = sum(
        1 for item1 in list1 if any(item1 == item2 for item2 in list2)
    )
    precision = num_TP / len(list1)
    recall = num_TP / len(list2)
    if precision == 0 or recall == 0:
        return 0
    return 2 * (precision * recall / (precision + recall))

def caculate_f1(results, data):
    """Average ``f1_score`` of ``results`` against the aligned ``data``.

    A pair in which either side has length 0 is not scored, but the sum is
    still divided by ``len(results)``, so such a pair contributes zero.

    Args:
        results: Predictions (each must support ``len``).
        data: Targets, indexed in parallel with ``results``.

    Returns:
        The average score, or 0 when ``results`` is empty.
    """
    if len(results) == 0:
        return 0

    total = 0
    for idx, prediction in enumerate(results):
        target = data[idx]
        if len(prediction) != 0 and len(target) != 0:
            total += f1_score(target, prediction)
    return total / len(results)

########################
# fuzzywuzzy
########################
def caculate_fuzz(results, data):
    """Average ``fuzz.ratio`` of ``results`` against the aligned ``data``.

    A pair in which either string is empty is not scored, but the sum is
    still divided by ``len(results)``, so such a pair contributes zero.

    Args:
        results: Predicted strings.
        data: Target strings, indexed in parallel with ``results``.

    Returns:
        The average ratio, or 0 when ``results`` is empty.
    """
    if len(results) == 0:
        return 0

    total = 0
    for idx, prediction in enumerate(results):
        target = data[idx]
        if prediction != "" and target != "":
            total += fuzz.ratio(prediction, target)
    return total / len(results)

########################
# SARI
########################
def caculate_sari(inputs, results, data):
    """Run the local SARI metric script on sources, predictions and references.

    Args:
        inputs: Source texts, passed as ``sources``.
        results: Predicted texts, passed as ``predictions``.
        data: Reference texts; each one is wrapped in its own single-item list.

    Returns:
        A 1-tuple holding the object returned by the metric's ``compute``.
        Callers index it with ``[0]``, so the tuple wrapper is kept.
    """
    metric = load_metric("/workspace/home/hoangpv4/ts/code/metrics/sari.py")
    references = [[label] for label in data]
    scores = metric.compute(sources=inputs, predictions=results, references=references)
    return (scores,)

def load_pred_and_gold_data(pred_path, gold_path):
    """Load prediction and gold JSON files into three parallel lists.

    One entry is produced per prediction record; the gold record at the same
    index supplies the prompt and the answer.

    Args:
        pred_path: JSON file whose records have a ``'result'`` string.
        gold_path: JSON file whose records have ``'prompt'`` and ``'answer'``.

    Returns:
        A tuple ``(inputs, preds, golds)``; predictions are whitespace-stripped.
    """
    with open(pred_path, "r") as pred_file:
        dataset_pred = json.load(pred_file)
    with open(gold_path, "r") as gold_file:
        dataset_gold = json.load(gold_file)

    inputs = []
    preds = []
    golds = []
    for idx, pred_record in enumerate(dataset_pred):
        gold_record = dataset_gold[idx]
        inputs.append(gold_record['prompt'])
        preds.append(pred_record['result'].strip())
        golds.append(gold_record['answer'])
    return inputs, preds, golds

def get_correct_retrieval_samples(input_sequences, predicted_sequences, ground_truths, correct_ids):
    """Keep only the samples whose index appears in ``correct_ids``.

    Args:
        input_sequences: Inputs, indexed in parallel with the predictions.
        predicted_sequences: Predictions; their indices are the ones tested.
        ground_truths: Targets, indexed in parallel with the predictions.
        correct_ids: Collection of indices to keep.

    Returns:
        A tuple ``(inputs, preds, golds)`` with the kept samples, in order.
    """
    # Build the lookup once; testing membership in a list on every iteration
    # makes the whole filter quadratic.
    try:
        wanted = set(correct_ids)
    except TypeError:
        # Unhashable entries: fall back to the collection as given.
        wanted = correct_ids

    inputs, preds, golds = [], [], []
    for i in range(len(predicted_sequences)):
        if i not in wanted:
            continue
        inputs.append(input_sequences[i])
        preds.append(predicted_sequences[i])
        golds.append(ground_truths[i])
    return inputs, preds, golds

def rescale_result(result, num_correct, num_all):
    """Scale a score, or a dict of scores, by ``num_correct / num_all``.

    A score above 1 is treated as a 0-100 value and divided by 100. That
    check is applied to the raw score before scaling; checking after scaling
    let a percentage with a small ratio (e.g. 60 * 1/100 = 0.6) slip through
    without ever being divided by 100.

    Args:
        result: A float, or a dict mapping metric names to numeric values.
            A dict is updated in place.
        num_correct: Numerator of the scaling ratio.
        num_all: Denominator of the scaling ratio.

    Returns:
        The rescaled float, or the same dict with rescaled float values.
        ``None`` for any other ``result`` type.
    """
    def _rescale(value):
        value = float(value)
        if value > 1:
            value = value / 100
        return value * num_correct / num_all

    if isinstance(result, float):
        return _rescale(result)
    if isinstance(result, dict):
        # Only existing keys are reassigned, so iterating the dict is safe.
        for key in result:
            result[key] = _rescale(result[key])
        return result
    return None

def eval_20minuten(pred_path, gold_path, correct_ids):
    """Evaluate 20Minuten predictions and return the rescaled SARI score.

    BLEU-1, BLEU-4 and ROUGE-L are computed as well, but only the ``'sari'``
    entry is returned.

    Args:
        pred_path: Path to the prediction JSON file.
        gold_path: Path to the gold JSON file.
        correct_ids: Indices of the samples to score.

    Returns:
        The ``'sari'`` value after ``rescale_result`` with
        ``len(correct_ids)`` over the total number of gold answers loaded.
    """
    sources, predictions, references = load_pred_and_gold_data(pred_path, gold_path)
    total_samples = len(references)
    sources, predictions, references = get_correct_retrieval_samples(
        sources, predictions, references, correct_ids
    )

    metrics = {
        "bleu-1": caculate_bleu(predictions, references, 1),
        "bleu-4": caculate_bleu(predictions, references, 4),
        "rouge-L": caculate_rouge(predictions, references),
        "sari": caculate_sari(sources, predictions, references)[0]['sari'],
    }
    rescaled = rescale_result(metrics, len(correct_ids), total_samples)
    return rescaled['sari']

def _eval_exact_match_accuracy(pred_path, gold_path, correct_ids):
    """Shared body of the accuracy-only task evaluators below.

    Loads predictions and gold data, keeps the samples listed in
    ``correct_ids``, computes exact-match accuracy on them and rescales it by
    ``len(correct_ids)`` over the total number of gold answers loaded.
    """
    input_sequences, predicted_sequences, ground_truths = load_pred_and_gold_data(pred_path, gold_path)
    num_of_all_test = len(ground_truths)
    input_sequences, predicted_sequences, ground_truths = get_correct_retrieval_samples(
        input_sequences, predicted_sequences, ground_truths, correct_ids
    )

    accuracy = caculate_accuracy(predicted_sequences, ground_truths)
    evaluation_result = {"accuracy": accuracy}
    return rescale_result(evaluation_result, len(correct_ids), num_of_all_test)['accuracy']


def eval_cstance(pred_path, gold_path, correct_ids):
    """Return the rescaled exact-match accuracy for the C-STANCE evaluator."""
    return _eval_exact_match_accuracy(pred_path, gold_path, correct_ids)


def eval_fomc(pred_path, gold_path, correct_ids):
    """Return the rescaled exact-match accuracy for the FOMC evaluator."""
    return _eval_exact_match_accuracy(pred_path, gold_path, correct_ids)


def eval_num_glue_cm(pred_path, gold_path, correct_ids):
    """Return the rescaled exact-match accuracy for the NumGLUE-cm evaluator."""
    return _eval_exact_match_accuracy(pred_path, gold_path, correct_ids)


def eval_num_glue_ds(pred_path, gold_path, correct_ids):
    """Return the rescaled exact-match accuracy for the NumGLUE-ds evaluator."""
    return _eval_exact_match_accuracy(pred_path, gold_path, correct_ids)

def eval_py150(pred_path, gold_path, correct_ids):
    """Evaluate Py150 predictions and return the rescaled similarity.

    Predictions and targets are normalised with ``postprocess`` and then
    compared with ``caculate_fuzz``.

    Args:
        pred_path: Path to the prediction JSON file.
        gold_path: Path to the gold JSON file.
        correct_ids: Indices of the samples to score.

    Returns:
        The ``'similarity'`` value after ``rescale_result`` with
        ``len(correct_ids)`` over the total number of gold answers loaded.
    """
    import re

    # Compiled once instead of on every postprocess() call.
    literal_pattern = re.compile(r"<(STR|NUM|CHAR)_LIT:(.*?)>", re.S)

    def postprocess(code):
        """Replace literal placeholder tokens with plain text."""
        # Bare placeholders carry no value. They must be spelled out: with an
        # empty search string, str.replace inserts the replacement between
        # every pair of characters.
        code = code.replace("<NUM_LIT>", "0").replace("<STR_LIT>", "").replace("<CHAR_LIT>", "")
        # Valued placeholders such as <STR_LIT:abc> collapse to their value.
        for kind, value in re.findall(literal_pattern, code):
            code = code.replace(f"<{kind}_LIT:{value}>", value)
        return code

    input_sequences, predicted_sequences, ground_truths = load_pred_and_gold_data(pred_path, gold_path)
    num_of_all_test = len(ground_truths)
    input_sequences, predicted_sequences, ground_truths = get_correct_retrieval_samples(
        input_sequences, predicted_sequences, ground_truths, correct_ids
    )

    outputs = [postprocess(output) for output in predicted_sequences]
    gts = [postprocess(gt) for gt in ground_truths]

    # Not named `fuzz`, so the imported fuzzywuzzy module is not shadowed.
    similarity = caculate_fuzz(outputs, gts)
    evaluation_result = {"similarity": similarity}

    return rescale_result(evaluation_result, len(correct_ids), num_of_all_test)['similarity']

def eval_scienceqa(pred_path, gold_path, correct_ids):
    """Evaluate ScienceQA predictions and return the rescaled accuracy.

    Each text is split into an answer (its first character) and a reasoning
    (everything from the third character on). BLEU and ROUGE-L are computed
    on the reasonings, but only the ``'accuracy'`` entry is returned.

    Args:
        pred_path: Path to the prediction JSON file.
        gold_path: Path to the gold JSON file.
        correct_ids: Indices of the samples to score.

    Returns:
        The ``'accuracy'`` value after ``rescale_result`` with
        ``len(correct_ids)`` over the total number of gold answers loaded.
    """
    def resolve(dataset: list):
        """Split every text into its answer and its reasoning."""
        answers = []
        reasonings = []
        for datium in dataset:
            # The first char is the answer, e.g. A, B, ... Slicing instead of
            # indexing gives "" for an empty text rather than an IndexError;
            # the metric helpers treat "" as an unscored pair.
            answers.append(datium[:1])
            # The reasoning starts at the third char (e.g. "A", a separator,
            # then "Because ...").
            reasonings.append(datium[2:])
        return {"answers": answers, "reasonings": reasonings}

    input_sequences, predicted_sequences, ground_truths = load_pred_and_gold_data(pred_path, gold_path)
    num_of_all_test = len(ground_truths)
    input_sequences, predicted_sequences, ground_truths = get_correct_retrieval_samples(
        input_sequences, predicted_sequences, ground_truths, correct_ids
    )

    outputs = resolve(predicted_sequences)
    gts = resolve(ground_truths)

    bleu_1 = caculate_bleu(outputs["reasonings"], gts["reasonings"], 1)
    bleu_4 = caculate_bleu(outputs["reasonings"], gts["reasonings"], 4)
    rouge = caculate_rouge(outputs["reasonings"], gts["reasonings"])
    accuracy = caculate_accuracy(outputs["answers"], gts["answers"])

    evaluation_result = {"bleu-1": bleu_1, "bleu-4": bleu_4, "rouge-L": rouge, "accuracy": accuracy}
    return rescale_result(evaluation_result, len(correct_ids), num_of_all_test)['accuracy']

def eval_meetingbank(pred_path, gold_path, correct_ids):
    """Evaluate MeetingBank predictions and return the rescaled ROUGE-L.

    Predictions of five characters or fewer are replaced by the literal
    ``'wrong'`` before scoring. BLEU-1 and BLEU-4 are computed as well, but
    only the ``'rouge-L'`` entry is returned.

    Args:
        pred_path: Path to the prediction JSON file.
        gold_path: Path to the gold JSON file.
        correct_ids: Indices of the samples to score.

    Returns:
        The ``'rouge-L'`` value after ``rescale_result`` with
        ``len(correct_ids)`` over the total number of gold answers loaded.
    """
    sources, predictions, references = load_pred_and_gold_data(pred_path, gold_path)
    total_samples = len(references)
    sources, predictions, references = get_correct_retrieval_samples(
        sources, predictions, references, correct_ids
    )

    cleaned = []
    for pred in predictions:
        cleaned.append(pred if len(pred) > 5 else 'wrong')

    metrics = {
        "bleu-1": caculate_bleu(cleaned, references, 1),
        "bleu-4": caculate_bleu(cleaned, references, 4),
        "rouge-L": caculate_rouge(cleaned, references),
    }
    rescaled = rescale_result(metrics, len(correct_ids), total_samples)
    return rescaled['rouge-L']

# Evaluation configuration: per-task prediction files, gold test sets and
# evaluator functions, plus the retrieval-results file read by full_eval.
EVAL_CONFIG = {
    # Prediction JSON file for each task.
    'task_preds': {
        "C-STANCE": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/C-STANCE_checkpoint-520.json",
        "FOMC": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/FOMC_checkpoint-416.json",
        "MeetingBank": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/MeetingBank_checkpoint-553.json",
        "Py150": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/Py150_Py150_4096.json",
        "ScienceQA": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/ScienceQA_checkpoint-228.json",
        "NumGLUE-cm": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/NumGLUE-cm_checkpoint-520.json",
        "NumGLUE-ds": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/NumGLUE-ds_checkpoint-468.json",
        "20Minuten": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/20Minuten.json",
    },
    # Gold test JSON file for each task.
    'task_golds': {
        "C-STANCE": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/C-STANCE/test.json",
        "FOMC": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/FOMC/test.json",
        "MeetingBank": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/MeetingBank/test.json",
        "Py150": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/Py150/test.json",
        "ScienceQA": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/ScienceQA/test.json",
        "NumGLUE-cm": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/NumGLUE-cm/test.json",
        "NumGLUE-ds": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/NumGLUE-ds/test.json",
        "20Minuten": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/20Minuten/test.json",
    },
    # Evaluator for each task; called as func(pred_path, gold_path, correct_ids).
    'task_eval_funcs': {
        "C-STANCE": eval_cstance,
        "FOMC": eval_fomc,
        "MeetingBank": eval_meetingbank,
        "Py150": eval_py150,
        "ScienceQA": eval_scienceqa,
        "NumGLUE-cm": eval_num_glue_cm,
        "NumGLUE-ds": eval_num_glue_ds,
        "20Minuten": eval_20minuten,
    },
    'retrieval_results_path': "/workspace/home/hoangpv4/ts/model_checkpoints/model_checkpoints_me5_small_200_40/results_to_task_8.json",
}

def full_eval(config=EVAL_CONFIG):
    """Score every task of every retrieval result and save the outcome.

    Reads the JSON list at ``config['retrieval_results_path']``. For each
    entry, every task in its ``'acc_of_tasks'`` list is scored with the
    matching evaluator from ``config['task_eval_funcs']``; the score is
    stored as ``'task_performance'`` and the task's ``'correct_ids'`` key is
    removed. The mean over the entry's tasks is stored as
    ``'task_performance_mean'``. Each entry is printed, and the whole list is
    written to ``overall_perf.json`` beside the input file.

    Args:
        config: Dict with ``'retrieval_results_path'``, ``'task_preds'``,
            ``'task_golds'`` and ``'task_eval_funcs'``.
    """
    results_path = config['retrieval_results_path']
    with open(results_path, 'r') as f:
        retrieval_results = json.load(f)
    # Passing the list of path components straight to os.path.join raises
    # TypeError; take the directory of the input file instead.
    save_path = os.path.join(os.path.dirname(results_path), "overall_perf.json")

    for retrieval_result in tqdm(retrieval_results[:]):
        for task in retrieval_result['acc_of_tasks']:
            task_name = task['task_name']
            correct_retrieval_ids = task['correct_ids']
            task['task_performance'] = config['task_eval_funcs'][task_name](
                config['task_preds'][task_name],
                config['task_golds'][task_name],
                correct_retrieval_ids,
            )
            del task['correct_ids']
        retrieval_result['task_performance_mean'] = mean(
            [r['task_performance'] for r in retrieval_result['acc_of_tasks']]
        )
        pprint(retrieval_result)
        print("="*40)

    with open(save_path, 'w') as f:
        json.dump(retrieval_results, f)

from glob import glob

# Find every retrieval-results file, print the full list, then evaluate only
# the ones whose path contains 'origin'.
paths = glob("/workspace/home/hoangpv4/ts/model_checkpoints/*/results_to_task_8.json")
print(paths)
paths = list(filter(lambda candidate: 'origin' in candidate, paths))
for path in paths:
    EVAL_CONFIG['retrieval_results_path'] = path
    full_eval(EVAL_CONFIG)

# Evaluation configuration for the results_qlora_mistral2 predictions:
# per-task prediction files, gold test sets and evaluator functions, plus the
# retrieval-results file read by full_eval.
EVAL_CONFIG = {
    # Prediction JSON file for each task.
    'task_preds': {
        "C-STANCE": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/C-STANCE_checkpoint-471.json",
        "FOMC": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/FOMC_checkpoint-314.json",
        "MeetingBank": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/MeetingBank_checkpoint-1200.json",
        "Py150": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/Py150_checkpoint-1495.json",
        "ScienceQA": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/ScienceQA_checkpoint-310.json",
        "NumGLUE-cm": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/NumGLUE-cm_checkpoint-315.json",
        "NumGLUE-ds": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/NumGLUE-ds_checkpoint-315.json",
        "20Minuten": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/20Minuten_checkpoint-1550.json",
    },
    # Gold test JSON file for each task.
    'task_golds': {
        "C-STANCE": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/C-STANCE/test.json",
        "FOMC": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/FOMC/test.json",
        "MeetingBank": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/MeetingBank/test.json",
        "Py150": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/Py150/test.json",
        "ScienceQA": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/ScienceQA/test.json",
        "NumGLUE-cm": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/NumGLUE-cm/test.json",
        "NumGLUE-ds": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/NumGLUE-ds/test.json",
        "20Minuten": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/20Minuten/test.json",
    },
    # Evaluator for each task; called as func(pred_path, gold_path, correct_ids).
    'task_eval_funcs': {
        "C-STANCE": eval_cstance,
        "FOMC": eval_fomc,
        "MeetingBank": eval_meetingbank,
        "Py150": eval_py150,
        "ScienceQA": eval_scienceqa,
        "NumGLUE-cm": eval_num_glue_cm,
        "NumGLUE-ds": eval_num_glue_ds,
        "20Minuten": eval_20minuten,
    },
    'retrieval_results_path': "/workspace/home/hoangpv4/ts/model_checkpoints/model_checkpoints_me5_large_origin/results_to_task_8.json",
}

def full_eval(config=EVAL_CONFIG):
    """Score every task of every retrieval result and save the outcome.

    Reads the JSON list at ``config['retrieval_results_path']``. For each
    entry, every task in its ``'acc_of_tasks'`` list is scored with the
    matching evaluator from ``config['task_eval_funcs']``; the score is
    stored as ``'task_performance'`` and the task's ``'correct_ids'`` key is
    removed. The mean over the entry's tasks is stored as
    ``'task_performance_mean'``. Each entry is printed, and the whole list is
    written to ``overall_perf_qlora_mistral.json`` beside the input file.

    Args:
        config: Dict with ``'retrieval_results_path'``, ``'task_preds'``,
            ``'task_golds'`` and ``'task_eval_funcs'``.
    """
    results_path = config['retrieval_results_path']
    with open(results_path, 'r') as f:
        retrieval_results = json.load(f)
    # Same location as joining "/" with the path's leading components for an
    # absolute path, without the manual split.
    save_path = os.path.join(os.path.dirname(results_path), "overall_perf_qlora_mistral.json")

    # NOTE: disabled skip for already-evaluated checkpoints. It is kept as a
    # comment; a bare `if` header with no body is not valid.
    # if os.path.exists(save_path):
    #     return
    for retrieval_result in tqdm(retrieval_results[:]):
        for task in retrieval_result['acc_of_tasks']:
            task_name = task['task_name']
            correct_retrieval_ids = task['correct_ids']
            task['task_performance'] = config['task_eval_funcs'][task_name](
                config['task_preds'][task_name],
                config['task_golds'][task_name],
                correct_retrieval_ids,
            )
            del task['correct_ids']
        retrieval_result['task_performance_mean'] = mean(
            [r['task_performance'] for r in retrieval_result['acc_of_tasks']]
        )
        pprint(retrieval_result)
        print("=*"*40)

    with open(save_path, 'w') as f:
        json.dump(retrieval_results, f)

from glob import glob

# Alternative input set, kept disabled. Running it as a live statement only
# produced a list that the next assignment overwrote immediately.
# paths = glob("/workspace/home/hoangpv4/ts/code/notebooks/custom_classification_head_checkpoints/1/*/results_to_task_8.json")

# Evaluate every retrieval-results file found under model_checkpoints.
paths = glob("/workspace/home/hoangpv4/ts/model_checkpoints/*/results_to_task_8.json")
print(paths)
for path in paths:
    EVAL_CONFIG['retrieval_results_path'] = path
    full_eval(EVAL_CONFIG)

HoangHoang1408 / temp

final_task_metrics #18

BLEU

Rouge-L

Accuracy (EM)

F1-micro

TP: item in list1 and list2

fuzzywuzzy

SARI

if os.path.exists(save_path):

paths = glob("/workspace/home/hoangpv4/ts/code/notebooks/custom_classification_head_checkpoints/1/*/results_to_task_8.json")