# Issue status: Open - opened by HoangHoang1408 3 months ago
def load_pred_and_gold_data(pred_path, gold_path):
    """Load predictions and gold references from two JSON files.

    Both files are expected to hold a JSON array of objects. Record ``i`` of
    the prediction file is paired with record ``i`` of the gold file.

    Args:
        pred_path: Path to the predictions file; every record needs a
            ``"result"`` string.
        gold_path: Path to the gold file; every record needs ``"prompt"``
            and ``"answer"`` entries.

    Returns:
        A tuple ``(inputs, preds, golds)`` of three lists with one entry per
        prediction record: the gold prompt, the whitespace-stripped
        prediction and the gold answer.

    Raises:
        IndexError: If there are more prediction records than gold records.
    """
    # JSON text is UTF-8; without an explicit encoding open() falls back to
    # the platform locale, which can mis-decode non-ASCII text.
    with open(pred_path, "r", encoding="utf-8") as f:
        dataset_pred = json.load(f)
    with open(gold_path, "r", encoding="utf-8") as f:
        dataset_gold = json.load(f)

    inputs, preds, golds = [], [], []
    for i, pred_record in enumerate(dataset_pred):
        gold_record = dataset_gold[i]
        inputs.append(gold_record['prompt'])
        preds.append(pred_record['result'].strip())
        golds.append(gold_record['answer'])
    return inputs, preds, golds
def get_correct_retrieval_samples(input_sequences, predicted_sequences, ground_truths, correct_ids):
    """Keep only the samples whose index is listed in ``correct_ids``.

    Args:
        input_sequences: Inputs, indexed like ``predicted_sequences``.
        predicted_sequences: Predictions; its length defines the index range.
        ground_truths: References, indexed like ``predicted_sequences``.
        correct_ids: Iterable of (hashable) sample indices to keep.

    Returns:
        A tuple ``(inputs, preds, golds)`` of filtered lists. The original
        sample order is preserved; ids outside the index range and duplicate
        ids have no effect.
    """
    # Set membership is O(1); testing every index against the raw list made
    # the filter quadratic in the number of samples.
    wanted = set(correct_ids)
    kept = [i for i in range(len(predicted_sequences)) if i in wanted]

    inputs = [input_sequences[i] for i in kept]
    preds = [predicted_sequences[i] for i in kept]
    golds = [ground_truths[i] for i in kept]
    return inputs, preds, golds
def rescale_result(result, num_correct, num_all):
    """Scale metric value(s) by the fraction ``num_correct / num_all``.

    Args:
        result: A single number, or a dict mapping metric names to values
            convertible with ``float()``.
        num_correct: Numerator of the scaling fraction.
        num_all: Denominator of the scaling fraction.

    Returns:
        The scaled number, or the same dict object with every value replaced
        by its scaled float (the dict is updated in place). ``None`` for any
        other input type.

    Raises:
        ZeroDivisionError: If ``num_all`` is zero.
    """
    def _scale(value):
        scaled = value * num_correct / num_all
        # NOTE(review): the "> 1 means percent" test runs on the already
        # scaled value, so a 0-100 score multiplied by a small fraction can
        # end up <= 1 and skip the division. Kept as is; confirm the intent.
        if scaled > 1:
            scaled = scaled / 100
        return scaled

    if isinstance(result, dict):
        for key, value in result.items():
            result[key] = _scale(float(value))
        return result
    # isinstance also accepts ints, which the exact-type check rejected.
    if isinstance(result, (int, float)):
        return _scale(result)
    return None
def eval_20minuten(pred_path, gold_path, correct_ids):
    """Evaluate the predictions registered for "20Minuten" and return SARI.

    Loads predictions and references, keeps the samples listed in
    ``correct_ids``, computes BLEU-1, BLEU-4, ROUGE-L and SARI on them and
    rescales all of them by ``len(correct_ids) / <number of test samples>``.

    Returns:
        The rescaled ``"sari"`` value.
    """
    sources, outputs, references = load_pred_and_gold_data(pred_path, gold_path)
    total = len(references)
    sources, outputs, references = get_correct_retrieval_samples(
        sources, outputs, references, correct_ids
    )

    # BLEU and ROUGE are computed and rescaled as well, although only SARI
    # is handed back to the caller.
    bleu_unigram = caculate_bleu(outputs, references, 1)
    bleu_fourgram = caculate_bleu(outputs, references, 4)
    rouge_l = caculate_rouge(outputs, references)
    sari_result = caculate_sari(sources, outputs, references)

    metrics = {
        "bleu-1": bleu_unigram,
        "bleu-4": bleu_fourgram,
        "rouge-L": rouge_l,
        # caculate_sari returns a one-element tuple, hence the [0].
        "sari": sari_result[0]['sari'],
    }
    scaled = rescale_result(metrics, len(correct_ids), total)
    return scaled['sari']
def eval_cstance(pred_path, gold_path, correct_ids):
    """Evaluate the predictions registered for "C-STANCE".

    Exact-match accuracy is computed on the samples listed in
    ``correct_ids`` and rescaled by
    ``len(correct_ids) / <number of test samples>``.

    Returns:
        The rescaled ``"accuracy"`` value.
    """
    prompts, outputs, answers = load_pred_and_gold_data(pred_path, gold_path)
    total = len(answers)
    prompts, outputs, answers = get_correct_retrieval_samples(
        prompts, outputs, answers, correct_ids
    )

    metrics = {"accuracy": caculate_accuracy(outputs, answers)}
    scaled = rescale_result(metrics, len(correct_ids), total)
    return scaled['accuracy']
def eval_fomc(pred_path, gold_path, correct_ids):
    """Evaluate the predictions registered for "FOMC".

    Exact-match accuracy is computed on the samples listed in
    ``correct_ids`` and rescaled by
    ``len(correct_ids) / <number of test samples>``.

    Returns:
        The rescaled ``"accuracy"`` value.
    """
    prompts, outputs, answers = load_pred_and_gold_data(pred_path, gold_path)
    total = len(answers)
    prompts, outputs, answers = get_correct_retrieval_samples(
        prompts, outputs, answers, correct_ids
    )

    metrics = {"accuracy": caculate_accuracy(outputs, answers)}
    scaled = rescale_result(metrics, len(correct_ids), total)
    return scaled['accuracy']
def eval_num_glue_cm(pred_path, gold_path, correct_ids):
    """Evaluate the predictions registered for "NumGLUE-cm".

    Exact-match accuracy is computed on the samples listed in
    ``correct_ids`` and rescaled by
    ``len(correct_ids) / <number of test samples>``.

    Returns:
        The rescaled ``"accuracy"`` value.
    """
    prompts, outputs, answers = load_pred_and_gold_data(pred_path, gold_path)
    total = len(answers)
    prompts, outputs, answers = get_correct_retrieval_samples(
        prompts, outputs, answers, correct_ids
    )

    metrics = {"accuracy": caculate_accuracy(outputs, answers)}
    scaled = rescale_result(metrics, len(correct_ids), total)
    return scaled['accuracy']
def eval_num_glue_ds(pred_path, gold_path, correct_ids):
    """Evaluate the predictions registered for "NumGLUE-ds".

    Exact-match accuracy is computed on the samples listed in
    ``correct_ids`` and rescaled by
    ``len(correct_ids) / <number of test samples>``.

    Returns:
        The rescaled ``"accuracy"`` value.
    """
    prompts, outputs, answers = load_pred_and_gold_data(pred_path, gold_path)
    total = len(answers)
    prompts, outputs, answers = get_correct_retrieval_samples(
        prompts, outputs, answers, correct_ids
    )

    metrics = {"accuracy": caculate_accuracy(outputs, answers)}
    scaled = rescale_result(metrics, len(correct_ids), total)
    return scaled['accuracy']
# Evaluate the predictions registered for "Py150": post-process predictions
# and references, score them with caculate_fuzz, and return the rescaled
# "similarity" value.
def eval_py150(pred_path, gold_path, correct_ids):
import re
# Nested helper applied to every prediction and reference below.
# NOTE(review): the replace(...) call is cut off in this copy (its string
# literal is never closed) and no return statement is visible; the helper's
# body must be restored from the original script before this can run.
def postprocess(code):
code = code.replace("
# Load every sample and remember the full test-set size for rescaling.
input_sequences, predicted_sequences, ground_truths = load_pred_and_gold_data(pred_path, gold_path)
num_of_all_test = len(ground_truths)
# Keep only the samples whose index is listed in correct_ids.
input_sequences, predicted_sequences, ground_truths = get_correct_retrieval_samples(input_sequences, predicted_sequences, ground_truths, correct_ids)
# Post-process predictions and references the same way before comparing.
outputs = []
for output in predicted_sequences:
outputs.append(postprocess(output))
gts = []
for gt in ground_truths:
gts.append(postprocess(gt))
# The local name "fuzz" shadows the imported fuzzywuzzy module inside this
# function only.
fuzz = caculate_fuzz(outputs, gts)
evaluation_result = {"similarity": fuzz}
# Rescale by len(correct_ids) / <number of test samples>.
return rescale_result(evaluation_result,len(correct_ids), num_of_all_test)['similarity']
def eval_scienceqa(pred_path, gold_path, correct_ids):
    """Evaluate the predictions registered for "ScienceQA".

    Each prediction/reference string is split into an answer (its first
    character) and a reasoning text (everything from the third character
    on). BLEU-1, BLEU-4 and ROUGE-L are computed on the reasonings and
    exact-match accuracy on the answers, all on the samples listed in
    ``correct_ids``, then rescaled by
    ``len(correct_ids) / <number of test samples>``.

    Returns:
        The rescaled ``"accuracy"`` value.
    """
    def resolve(dataset: list):
        """Split every string into its answer letter and its reasoning."""
        answers = []
        reasonings = []
        for datium in dataset:
            # Slice rather than index: an empty (e.g. fully stripped)
            # prediction yields "" instead of raising IndexError, and the
            # metric helpers already skip empty strings.
            answers.append(datium[:1])  # the first char is the answer. e.g. A, B,...
            reasonings.append(datium[2:])  # A/nBecause...
        outputs = {"answers": answers, "reasonings": reasonings}
        return outputs

    input_sequences, predicted_sequences, ground_truths = load_pred_and_gold_data(pred_path, gold_path)
    num_of_all_test = len(ground_truths)
    input_sequences, predicted_sequences, ground_truths = get_correct_retrieval_samples(
        input_sequences, predicted_sequences, ground_truths, correct_ids
    )

    outputs = resolve(predicted_sequences)
    gts = resolve(ground_truths)

    # The text metrics are computed and rescaled too, although only the
    # accuracy is returned.
    bleu_1 = caculate_bleu(outputs["reasonings"], gts["reasonings"], 1)
    bleu_4 = caculate_bleu(outputs["reasonings"], gts["reasonings"], 4)
    rouge = caculate_rouge(outputs["reasonings"], gts["reasonings"])
    accuracy = caculate_accuracy(outputs["answers"], gts["answers"])

    evaluation_result = {"bleu-1": bleu_1, "bleu-4": bleu_4, "rouge-L": rouge, "accuracy": accuracy}
    return rescale_result(evaluation_result, len(correct_ids), num_of_all_test)['accuracy']
def eval_meetingbank(pred_path, gold_path, correct_ids):
    """Evaluate the predictions registered for "MeetingBank".

    Predictions of five characters or fewer are replaced by the placeholder
    ``'wrong'``. BLEU-1, BLEU-4 and ROUGE-L are then computed on the samples
    listed in ``correct_ids`` and rescaled by
    ``len(correct_ids) / <number of test samples>``.

    Returns:
        The rescaled ``"rouge-L"`` value.
    """
    prompts, outputs, references = load_pred_and_gold_data(pred_path, gold_path)
    total = len(references)
    prompts, outputs, references = get_correct_retrieval_samples(
        prompts, outputs, references, correct_ids
    )

    # Very short outputs are swapped for a fixed placeholder before scoring.
    outputs = ['wrong' if len(text) <= 5 else text for text in outputs]

    metrics = {
        "bleu-1": caculate_bleu(outputs, references, 1),
        "bleu-4": caculate_bleu(outputs, references, 4),
        "rouge-L": caculate_rouge(outputs, references),
    }
    scaled = rescale_result(metrics, len(correct_ids), total)
    return scaled['rouge-L']
# Evaluation configuration: per-task prediction files, per-task gold files,
# the evaluation function for each task, and the retrieval-results file that
# supplies the ids of correctly retrieved samples.
EVAL_CONFIG = {
    'task_preds': {
        "C-STANCE": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/C-STANCE_checkpoint-520.json",
        "FOMC": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/FOMC_checkpoint-416.json",
        "MeetingBank": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/MeetingBank_checkpoint-553.json",
        "Py150": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/Py150_Py150_4096.json",
        "ScienceQA": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/ScienceQA_checkpoint-228.json",
        "NumGLUE-cm": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/NumGLUE-cm_checkpoint-520.json",
        "NumGLUE-ds": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/NumGLUE-ds_checkpoint-468.json",
        "20Minuten": "/workspace/home/hoangpv4/ts/code/py_scripts/results_new/20Minuten.json",
    },
    'task_golds': {
        "C-STANCE": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/C-STANCE/test.json",
        "FOMC": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/FOMC/test.json",
        "MeetingBank": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/MeetingBank/test.json",
        "Py150": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/Py150/test.json",
        "ScienceQA": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/ScienceQA/test.json",
        "NumGLUE-cm": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/NumGLUE-cm/test.json",
        "NumGLUE-ds": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/NumGLUE-ds/test.json",
        "20Minuten": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/20Minuten/test.json",
    },
    'task_eval_funcs': {
        "C-STANCE": eval_cstance,
        "FOMC": eval_fomc,
        "MeetingBank": eval_meetingbank,
        "Py150": eval_py150,
        "ScienceQA": eval_scienceqa,
        "NumGLUE-cm": eval_num_glue_cm,
        "NumGLUE-ds": eval_num_glue_ds,
        "20Minuten": eval_20minuten,
    },
    'retrieval_results_path': "/workspace/home/hoangpv4/ts/model_checkpoints/model_checkpoints_me5_small_200_40/results_to_task_8.json",
}
def full_eval(config=EVAL_CONFIG):
    """Evaluate every task of every retrieval result and save the outcome.

    For each entry of the retrieval-results file, each task listed under
    ``'acc_of_tasks'`` is scored with its configured evaluation function
    using the task's ``'correct_ids'``. The score is stored as
    ``'task_performance'``, ``'correct_ids'`` is removed, and the mean over
    the entry's tasks is stored as ``'task_performance_mean'``. Each entry is
    pretty-printed as it is finished.

    Args:
        config: Mapping with the keys ``'task_preds'``, ``'task_golds'``,
            ``'task_eval_funcs'`` and ``'retrieval_results_path'``.

    Side effects:
        Writes the updated results to ``overall_perf.json`` in the directory
        of the retrieval-results file.
    """
    results_path = config['retrieval_results_path']
    with open(results_path, 'r', encoding="utf-8") as f:
        retrieval_results = json.load(f)

    # os.path.join() only accepts path strings; the former call handed it the
    # list produced by split("/")[:-1] and raised TypeError.
    save_path = os.path.join(os.path.dirname(results_path), "overall_perf.json")

    for retrieval_result in tqdm(retrieval_results):
        for task in retrieval_result['acc_of_tasks']:
            task_name = task['task_name']
            correct_retrieval_ids = task['correct_ids']
            evaluate = config['task_eval_funcs'][task_name]
            task['task_performance'] = evaluate(
                config['task_preds'][task_name],
                config['task_golds'][task_name],
                correct_retrieval_ids,
            )
            # The id list is not carried over into the saved report.
            del task['correct_ids']
        retrieval_result['task_performance_mean'] = mean(
            [r['task_performance'] for r in retrieval_result['acc_of_tasks']]
        )
        pprint(retrieval_result)
        print("="*40)

    # NOTE(review): indentation was lost in the copy this was restored from;
    # the report is assumed to be written once, after all entries are done.
    with open(save_path, 'w', encoding="utf-8") as f:
        json.dump(retrieval_results, f)
from glob import glob

# Find the retrieval-results file of every checkpoint directory, keep the
# ones whose path contains 'origin', and run the full evaluation on each.
paths = glob("/workspace/home/hoangpv4/ts/model_checkpoints/*/results_to_task_8.json")
print(paths)
paths = [candidate for candidate in paths if 'origin' in candidate]
for path in paths:
    EVAL_CONFIG['retrieval_results_path'] = path
    full_eval(EVAL_CONFIG)
# Evaluation configuration for the "results_qlora_mistral2" predictions:
# per-task prediction files, per-task gold files, the evaluation function for
# each task, and the retrieval-results file that supplies the correct ids.
EVAL_CONFIG = {
    'task_preds': {
        "C-STANCE": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/C-STANCE_checkpoint-471.json",
        "FOMC": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/FOMC_checkpoint-314.json",
        "MeetingBank": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/MeetingBank_checkpoint-1200.json",
        "Py150": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/Py150_checkpoint-1495.json",
        "ScienceQA": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/ScienceQA_checkpoint-310.json",
        "NumGLUE-cm": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/NumGLUE-cm_checkpoint-315.json",
        "NumGLUE-ds": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/NumGLUE-ds_checkpoint-315.json",
        "20Minuten": "/workspace/home/hoangpv4/ts/code/py_scripts/results_qlora_mistral2/20Minuten_checkpoint-1550.json",
    },
    'task_golds': {
        "C-STANCE": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/C-STANCE/test.json",
        "FOMC": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/FOMC/test.json",
        "MeetingBank": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/MeetingBank/test.json",
        "Py150": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/Py150/test.json",
        "ScienceQA": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/ScienceQA/test.json",
        "NumGLUE-cm": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/NumGLUE-cm/test.json",
        "NumGLUE-ds": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/NumGLUE-ds/test.json",
        "20Minuten": "/workspace/home/hoangpv4/ts/dataset/TRACE/LLM-CL-Benchmark_5000/20Minuten/test.json",
    },
    'task_eval_funcs': {
        "C-STANCE": eval_cstance,
        "FOMC": eval_fomc,
        "MeetingBank": eval_meetingbank,
        "Py150": eval_py150,
        "ScienceQA": eval_scienceqa,
        "NumGLUE-cm": eval_num_glue_cm,
        "NumGLUE-ds": eval_num_glue_ds,
        "20Minuten": eval_20minuten,
    },
    'retrieval_results_path': "/workspace/home/hoangpv4/ts/model_checkpoints/model_checkpoints_me5_large_origin/results_to_task_8.json",
}
def full_eval(config=EVAL_CONFIG):
    """Evaluate every task of every retrieval result and save the outcome.

    For each entry of the retrieval-results file, each task listed under
    ``'acc_of_tasks'`` is scored with its configured evaluation function
    using the task's ``'correct_ids'``. The score is stored as
    ``'task_performance'``, ``'correct_ids'`` is removed, and the mean over
    the entry's tasks is stored as ``'task_performance_mean'``. Each entry is
    pretty-printed as it is finished.

    Args:
        config: Mapping with the keys ``'task_preds'``, ``'task_golds'``,
            ``'task_eval_funcs'`` and ``'retrieval_results_path'``.

    Side effects:
        Writes the updated results to ``overall_perf_qlora_mistral.json`` in
        the directory of the retrieval-results file.
    """
    results_path = config['retrieval_results_path']
    with open(results_path, 'r', encoding="utf-8") as f:
        retrieval_results = json.load(f)

    # dirname() keeps relative paths relative; re-joining the split parts
    # onto "/" forced the report under the filesystem root. For absolute
    # paths the resulting location is the same as before.
    save_path = os.path.join(os.path.dirname(results_path), "overall_perf_qlora_mistral.json")

    for retrieval_result in tqdm(retrieval_results):
        for task in retrieval_result['acc_of_tasks']:
            task_name = task['task_name']
            correct_retrieval_ids = task['correct_ids']
            evaluate = config['task_eval_funcs'][task_name]
            task['task_performance'] = evaluate(
                config['task_preds'][task_name],
                config['task_golds'][task_name],
                correct_retrieval_ids,
            )
            # The id list is not carried over into the saved report.
            del task['correct_ids']
        retrieval_result['task_performance_mean'] = mean(
            [r['task_performance'] for r in retrieval_result['acc_of_tasks']]
        )
        pprint(retrieval_result)
        print("=*"*40)

    # NOTE(review): indentation was lost in the copy this was restored from;
    # the report is assumed to be written once, after all entries are done.
    with open(save_path, 'w', encoding="utf-8") as f:
        json.dump(retrieval_results, f)
from glob import glob

# Find the retrieval-results file of every checkpoint directory and run the
# full evaluation on each of them.
paths = glob("/workspace/home/hoangpv4/ts/model_checkpoints/*/results_to_task_8.json")
print(paths)
paths = list(paths)
for path in paths:
    EVAL_CONFIG['retrieval_results_path'] = path
    full_eval(EVAL_CONFIG)
import json
import os
import re
from pprint import pprint
from statistics import mean

from datasets import load_metric
from fuzzywuzzy import fuzz
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from tqdm import tqdm
########################
# BLEU
########################
def tokenize(text):
    """Split ``text`` into tokens on whitespace and full stops.

    Args:
        text: The string to tokenise.

    Returns:
        The non-empty pieces between separators, in order of appearance.
    """
    # The dot has to be escaped: a bare "." matches any character, so the
    # split consumed the whole string and no token ever survived.
    tokens = re.split(r'\s|\.', text)
    return [t for t in tokens if len(t) > 0]
def bleu_score(reference, hypothesis, gram):
    """Return the sentence-level BLEU of ``hypothesis`` against ``reference``.

    Args:
        reference: Reference text.
        hypothesis: Candidate text.
        gram: Highest n-gram order to include (e.g. 1 or 4).

    Returns:
        The score returned by ``nltk``'s ``sentence_bleu``.
    """
    reference_tokens = tokenize(reference)
    hypothesis_tokens = tokenize(hypothesis)

    # NOTE(review): the scoring step was missing from the copy this was
    # restored from, so the function returned None and the averaging in
    # caculate_bleu failed. Uniform weights over the n-gram orders 1..gram
    # are assumed here - confirm against the original script.
    weights = tuple(1.0 / gram for _ in range(gram))
    return sentence_bleu([reference_tokens], hypothesis_tokens, weights)
def caculate_bleu(results, data, gram):
    """Average BLEU of the predictions in ``results`` against ``data``.

    Pairs in which the prediction or the target is the empty string are not
    scored, but they still count in the denominator.

    Returns:
        ``0`` when ``results`` is empty, otherwise the sum of the pair scores
        divided by ``len(results)``.
    """
    pair_scores = []
    for idx, prediction in enumerate(results):
        target = data[idx]
        if prediction == "" or target == "":
            continue
        pair_scores.append(bleu_score(target, prediction, gram))

    if len(results) == 0:
        return 0
    return sum(pair_scores) / len(results)
########################
# Rouge-L
########################
def score_rouge(str1, str2):
    """Return the averaged ROUGE-L F-score for ``str1`` and ``str2``.

    Both arguments are passed, in this order, to ``Rouge.get_scores`` with
    ``avg=True``.
    """
    scorer = Rouge(metrics=["rouge-l"])
    averaged = scorer.get_scores(str1, str2, avg=True)
    return averaged['rouge-l']['f']
def caculate_rouge(results, data):
    """Average ROUGE-L of the predictions in ``results`` against ``data``.

    Pairs in which the prediction or the target is the empty string are not
    scored, but they still count in the denominator.

    Returns:
        ``0`` when ``results`` is empty, otherwise the sum of the pair scores
        divided by ``len(results)``.
    """
    pair_scores = []
    for idx, prediction in enumerate(results):
        target = data[idx]
        if prediction == "" or target == "":
            continue
        pair_scores.append(score_rouge(target, prediction))

    if len(results) == 0:
        return 0
    return sum(pair_scores) / len(results)
########################
# Accuracy (EM)
########################
def caculate_accuracy(results, data):
    """Exact-match accuracy of ``results`` against ``data``.

    A pair counts as a hit when prediction and target are equal and neither
    is the empty string; every pair counts in the denominator.

    Returns:
        ``0`` when ``results`` is empty, otherwise hits / ``len(results)``.
    """
    hits = 0
    for idx, prediction in enumerate(results):
        target = data[idx]
        if prediction != "" and target != "" and prediction == target:
            hits += 1

    if len(results) == 0:
        return 0
    return hits / len(results)
########################
# F1-micro
########################
def f1_score(list1, list2):
    """Return the F1 overlap score of two item lists.

    Args:
        list1: First list of items.
        list2: Second list of items.

    Returns:
        ``2 * P * R / (P + R)`` with ``P = TP / len(list1)`` and
        ``R = TP / len(list2)``; ``0`` when either list is empty or nothing
        overlaps.
    """
    # NOTE(review): only the "TP" remark of the body survived in the copy
    # this was restored from, so nothing was returned. The standard F1
    # formula is assumed - confirm against the original script.
    if len(list1) == 0 or len(list2) == 0:
        return 0

    # TP: item in list1 and list2
    true_positives = sum(1 for item in list1 if item in list2)
    if true_positives == 0:
        return 0

    precision = true_positives / len(list1)
    recall = true_positives / len(list2)
    return 2 * precision * recall / (precision + recall)
def caculate_f1(results, data):
    """Average ``f1_score`` of the predictions in ``results`` against ``data``.

    Pairs in which the prediction or the target has length zero are not
    scored, but they still count in the denominator.

    Returns:
        ``0`` when ``results`` is empty, otherwise the sum of the pair scores
        divided by ``len(results)``.
    """
    pair_scores = []
    for idx, prediction in enumerate(results):
        target = data[idx]
        if len(prediction) == 0 or len(target) == 0:
            continue
        pair_scores.append(f1_score(target, prediction))

    if len(results) == 0:
        return 0
    return sum(pair_scores) / len(results)
########################
# fuzzywuzzy
########################
def caculate_fuzz(results, data):
    """Average ``fuzz.ratio`` of the predictions in ``results`` against ``data``.

    Pairs in which the prediction or the target is the empty string are not
    scored, but they still count in the denominator.

    Returns:
        ``0`` when ``results`` is empty, otherwise the summed ratios divided
        by ``len(results)``.
    """
    total = 0
    for idx, prediction in enumerate(results):
        target = data[idx]
        if prediction == "" or target == "":
            continue
        total += fuzz.ratio(prediction, target)

    if len(results) == 0:
        return 0
    return total / len(results)
########################
# SARI
########################
def caculate_sari(inputs, results, data):
    """Compute SARI with the metric script loaded through ``load_metric``.

    Args:
        inputs: Source texts.
        results: Predicted texts.
        data: Reference texts; each one is wrapped in a single-item list.

    Returns:
        A one-element tuple holding the metric's result. Callers index it
        with ``[0]``.
    """
    metric = load_metric("/workspace/home/hoangpv4/ts/code/metrics/sari.py")
    references = [[label] for label in data]
    scores = metric.compute(sources=inputs, predictions=results, references=references)
    # The original statement ended in a trailing comma, which wraps the
    # result in a tuple; spelled out here because callers rely on it.
    return (scores,)