NVlabs / EAGLE

EAGLE: Exploring The Design Space for Multimodal LLMs with Mixture of Encoders
https://arxiv.org/pdf/2408.15998
Apache License 2.0

Request for Official Code to Reproduce MMVP Performance on Eagle-X4-8B-Plus #14

Open · KiUngSong opened 2 months ago

KiUngSong commented 2 months ago

I am trying to reproduce the MMVP benchmark performance of Eagle-X4-8B-Plus. Could you provide the official code or scripts for this, as I am unable to match the published benchmarks?

flyinglynx commented 2 months ago

Thank you for pointing this out. We were also curious about the unusually high performance 😂, and after reviewing it today, I found the issue: we were using an incorrect evaluation metric. We'll update the results ASAP, but I'd like to explain the situation here first.

The official MMVP evaluation code requires both questions in a pair to be answered correctly for the pair to be counted as correct. In our code, however, we scored each question independently (per-question accuracy), which led to the inconsistency.
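For reference, the pairing rule boils down to a few lines. The sketch below is only an illustration, assuming predictions follow the Questions.csv order (so entries 2k and 2k+1 form a pair) and using a hypothetical is_correct helper for the per-question judgment:

# Illustrative sketch of the official MMVP pairing rule (not the script below).
# Assumes predictions follow the Questions.csv order, so items 2k and 2k+1 form a pair;
# `is_correct` is a hypothetical per-question judge (string match or GPT-based).
def paired_accuracy(predictions, is_correct):
    correct_pairs, num_pairs = 0, 0
    for i in range(0, len(predictions) - 1, 2):
        num_pairs += 1
        # A pair counts only if BOTH of its questions are answered correctly.
        if is_correct(predictions[i]) and is_correct(predictions[i + 1]):
            correct_pairs += 1
    return correct_pairs / num_pairs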

Below is the code we used for evaluation:

import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid

# for debug
import sys
sys.path.append(os.getcwd())

from eagle.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from eagle.conversation import conv_templates, SeparatorStyle
from eagle.model.builder import load_pretrained_model
from eagle.utils import disable_torch_init
from eagle.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from torch.utils.data import Dataset, DataLoader

import math
import pandas as pd
from PIL import Image
from copy import deepcopy

def calculate_score(predictions):
    # Per-question accuracy with loose substring matching; note this is NOT
    # the official paired MMVP metric described above.
    correct, total = 0, 0

    for prediction in predictions:
        gt = prediction["answer"]
        answer = prediction["response"]
        if gt.lower() in answer.lower() or answer.lower() in gt.lower():
            correct += 1
        total += 1

    print(f"Accuracy: {correct / total}")

def eval_model(args):
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    benchmark_dir = os.path.join(args.directory, 'Questions.csv')
    # Load the benchmark questions (pandas' default comma delimiter)
    df = pd.read_csv(benchmark_dir)

    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    ans_file = open(answers_file, "w")
    all_answers = []

    for index, row in tqdm(df.iterrows()):
        cur_prompt = row['Question'] + " " + row['Options']
        qs = cur_prompt       
        qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + "\nAnswer with the option's letter from the given choices directly."

        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        # Load the corresponding image
        photo_id = index+1
        image_path = os.path.join(args.directory, 'MMVP Images', f"{photo_id}.jpg")
        image = Image.open(image_path)
        image_sizes = [image.size]

        image_tensor = process_images([image], image_processor, model.config)
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
        input_ids = input_ids.to(device='cuda', non_blocking=True).unsqueeze(0)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True),
                image_sizes=image_sizes,
                do_sample=True if args.temperature > 0 else False,
                temperature=args.temperature,
                top_p=args.top_p,
                num_beams=args.num_beams,
                max_new_tokens=64,
                use_cache=True)

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

        ans_id = shortuuid.uuid()

        answer_dict = {"question_id": photo_id,
                                   "prompt": cur_prompt,
                                   "answer": row["Correct Answer"], 
                                   "response": outputs,
                                   "answer_id": ans_id,
                                   "model_id": model_name,
                      }
        all_answers.append(answer_dict)
        ans_file.write(json.dumps(answer_dict) + "\n")
        ans_file.flush()

    ans_file.close()
    calculate_score(all_answers)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="PATH_TO_MLLM")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--directory", type=str, default="PATH_TO_MMVP_DATASET")
    parser.add_argument("--answers-file", type=str, default="playground/data/eval_local_files/mmvp/debug/answers.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    args = parser.parse_args()

    eval_model(args)

Aside from the metric, we also used direct string matching to compute performance. We are now using the GPT API to judge whether responses are correct, and I will update you with the latest scores soon.
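Roughly, the GPT-based judging looks like the sketch below, using the OpenAI Python client; the exact prompt wording and model choice here are placeholders, not our final setup:

# Rough sketch of a GPT judge; prompt wording and model name are placeholders.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def gpt_judge(question, correct_answer, response, model="gpt-4-turbo"):
    prompt = (
        f"Question: {question}\n"
        f"Ground-truth answer: {correct_answer}\n"
        f"Model response: {response}\n"
        "Does the model response match the ground-truth answer? Answer yes or no."
    )
    reply = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return reply.choices[0].message.content.strip().lower().startswith("yes")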

Thank you again for raising this issue; this was our mistake, and we will update the arXiv tech report accordingly.

dingangui commented 2 months ago

I tested it and got an accuracy of 53.3%, using GPT-4-Turbo to judge the answers.

dingangui commented 1 month ago


Should the conv-mode for Eagle-X4-8B-Plus be 'llava_v1' or 'llama3'?