Thank you for pointing this out. We were also curious about the unusually high performance 😂, and after reviewing it today, I found the issue: we were using an incorrect evaluation metric. We'll update the results ASAP, but I'd like to explain the situation here first.
The official MMVP evaluation code requires both questions in a pair to be answered correctly for the pair to be marked as correct. However, our code scored each question independently, i.e., it reported per-question accuracy rather than per-pair accuracy, which led to the inconsistency.
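To make the difference concrete, here is a minimal sketch of the two scoring rules, separate from our evaluation script below. The function names and the boolean-list input are illustrative only, and it assumes the usual MMVP layout in which consecutive questions form a pair:

```python
# Illustrative sketch only: assumes consecutive questions (2i, 2i+1) form a pair.
def pair_accuracy(per_question_correct):
    """Official MMVP rule: a pair counts only if BOTH of its questions are correct."""
    pairs = [per_question_correct[i] and per_question_correct[i + 1]
             for i in range(0, len(per_question_correct), 2)]
    return sum(pairs) / len(pairs)

def question_accuracy(per_question_correct):
    """What our script effectively reported: each question scored on its own."""
    return sum(per_question_correct) / len(per_question_correct)

# Example: 3 pairs, only the first pair has both questions right.
flags = [True, True, True, False, False, True]
print(pair_accuracy(flags))      # ~0.33
print(question_accuracy(flags))  # ~0.67
```

As the toy example shows, per-question accuracy can be much higher than the official pair-based score, which explains the inflated number we reported.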
Below is the code we used for evaluation:
```python
import argparse
import torch
import os
import json
import math
import shortuuid
import pandas as pd
from tqdm import tqdm
from PIL import Image
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader

# for debug
import sys
sys.path.append(os.getcwd())

from eagle.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from eagle.conversation import conv_templates, SeparatorStyle
from eagle.model.builder import load_pretrained_model
from eagle.utils import disable_torch_init
from eagle.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path

def calculate_score(predictions):
    # Direct (sub)string matching between the ground truth and the model response,
    # scored per question rather than per MMVP pair.
    correct, total = 0, 0
    for idx, prediction in enumerate(predictions):
        gt = prediction["answer"]
        answer = prediction["response"]
        if gt.lower() in answer.lower() or answer.lower() in gt.lower():
            correct += 1
        total += 1
    print(f"Accuracy: {correct / total}")

def eval_model(args):
    # Load the model.
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    # Load the benchmark questions.
    benchmark_dir = os.path.join(args.directory, 'Questions.csv')
    df = pd.read_csv(benchmark_dir)

    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    ans_file = open(answers_file, "w")

    all_answers = []
    for index, row in tqdm(df.iterrows()):
        # Build the multiple-choice prompt.
        cur_prompt = row['Question'] + " " + row['Options']
        qs = cur_prompt
        qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + "\nAnswer with the option's letter from the given choices directly."

        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        # Load the corresponding image.
        photo_id = index + 1
        image_path = os.path.join(args.directory, 'MMVP Images', f"{photo_id}.jpg")
        image = Image.open(image_path)
        image_sizes = [image.size]
        image_tensor = process_images([image], image_processor, model.config)

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
        input_ids = input_ids.to(device='cuda', non_blocking=True).unsqueeze(0)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True),
                image_sizes=image_sizes,
                do_sample=True if args.temperature > 0 else False,
                temperature=args.temperature,
                top_p=args.top_p,
                num_beams=args.num_beams,
                max_new_tokens=64,
                use_cache=True)

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

        ans_id = shortuuid.uuid()
        answer_dict = {"question_id": photo_id,
                       "prompt": cur_prompt,
                       "answer": row["Correct Answer"],
                       "response": outputs,
                       "answer_id": ans_id,
                       "model_id": model_name,
                       }
        all_answers.append(answer_dict)
        ans_file.write(json.dumps(answer_dict) + "\n")
        ans_file.flush()
    ans_file.close()

    calculate_score(all_answers)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="PATH_TO_MLLM")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--directory", type=str, default="PATH_TO_MMVP_DATASET")
    parser.add_argument("--answers-file", type=str, default="playground/data/eval_local_files/mmvp/debug/answers.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    args = parser.parse_args()

    eval_model(args)
```
Aside from the metric, we also used direct string matching to compute performance. We are now using the GPT API to judge whether responses are correct, and I will update you with the latest scores soon.
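We have not finalized the judging script yet, but below is a rough sketch of the kind of GPT-based check we are moving to, assuming the OpenAI Python SDK (v1-style client). The prompt wording, function name, and yes/no protocol are illustrative, not our exact setup:

```python
# Illustrative sketch only: a GPT-based correctness judge (not our exact prompt or setup).
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def judge_response(question, reference_answer, model_response, judge_model="gpt-4-turbo"):
    """Ask the judge model whether the response matches the reference answer (yes/no)."""
    prompt = (
        f"Question: {question}\n"
        f"Reference answer: {reference_answer}\n"
        f"Model response: {model_response}\n"
        "Does the model response select the same answer as the reference? Reply 'yes' or 'no' only."
    )
    completion = client.chat.completions.create(
        model=judge_model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return completion.choices[0].message.content.strip().lower().startswith("yes")
```

The per-question judgments would then be grouped into pairs and scored with the official both-correct rule described above.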
Thank you again for raising this issue. This was our mistake, and we will update the arXiv tech report accordingly.
I tested it, and the accuracy is 53.3% when using GPT-4-Turbo to judge the answers.
Should the conv-mode for Eagle-X4-8B-Plus be 'llava_v1' or 'llama3'?
I am trying to reproduce the MMVP benchmark performance of Eagle-X4-8B-Plus. Could you provide the official code or scripts for this, as I am unable to match the published benchmarks?