NVlabs / EAGLE

EAGLE: Exploring The Design Space for Multimodal LLMs with Mixture of Encoders
https://arxiv.org/pdf/2408.15998
Apache License 2.0

`eagle_llama` but Transformers does not recognize this architecture #8

Open flehn opened 2 months ago

flehn commented 2 months ago

Hello, I want to run:

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("NVEagle/Eagle-X5-13B-Chat")

But I get: ValueError: The checkpoint you are trying to load has model type eagle_llama but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.

Transformers version: 4.44.2

Nikitala0014 commented 2 months ago

Hello! You probably won't be able to load it that way, because the eagle_llama architecture isn't included in the Transformers library, and I didn't find it in the model card on Hugging Face either.

What you might want to do instead is the following:

from eagle.model import EagleLlamaForCausalLM
from transformers import AutoTokenizer

model_path = "NVEagle/Eagle-X5-13B-Chat"

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = EagleLlamaForCausalLM.from_pretrained(
    model_path,
    low_cpu_mem_usage=True
)

However, this alone isn't sufficient to get started: Eagle is a family of Multimodal Large Language Models, and the checkpoint weights alone are just fine-tuned versions of LLaVA's. The repository also downloads additional models, such as the vision experts and the CLIP encoder. Here's a simplified version of load_pretrained_model for your case:

from transformers import AutoTokenizer, BitsAndBytesConfig
import torch
from eagle.model import *
from eagle.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN

def load_pretrained_model(model_path, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
    kwargs = {"device_map": device_map, **kwargs}

    if device != "cuda":
        kwargs['device_map'] = {"": device}

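    # pick a precision: 8-bit or 4-bit quantization via bitsandbytes, otherwise fp16 weights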
    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
    model = EagleLlamaForCausalLM.from_pretrained(
        model_path,
        low_cpu_mem_usage=True,
        **kwargs
    )

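    # add the special image tokens the multimodal checkpoint expects, then resize the embeddings to match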
    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
    if mm_use_im_patch_token:
        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
    if mm_use_im_start_end:
        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
    model.resize_token_embeddings(len(tokenizer))

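    # load the vision tower (the CLIP encoder and vision experts mentioned above) and grab its image processor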
    vision_tower = model.get_vision_tower()
    if not vision_tower.is_loaded:
        vision_tower.load_model(device_map=device_map)
    if device_map != 'auto':
        vision_tower.to(device=device_map, dtype=torch.float16)
    image_processor = vision_tower.image_processor

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, image_processor, context_len

Given that, if you intend to use it without Gradio, e.g. for terminal testing or to build an endpoint, you'll need to decide how you will receive your images. Save the helper above as eagle/model/custom_builder.py so the import below resolves. Once you have the images and their corresponding prompts, you can proceed as follows:

import argparse
import torch
from PIL import Image

from eagle.model.custom_builder import load_pretrained_model
from eagle.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from eagle.mm_utils import tokenizer_image_token, process_images
from eagle.conversation import conv_templates

argparser = argparse.ArgumentParser()
argparser.add_argument("--model-path", default="NVEagle/Eagle-X5-7B", type=str)
argparser.add_argument("--conv-mode", type=str, default="default")
argparser.add_argument("--temperature", type=float, default=0.2)
argparser.add_argument("--max-new-tokens", type=int, default=512)
argparser.add_argument("--num_frames", type=int, default=16)
argparser.add_argument("--load-8bit", action="store_true")
argparser.add_argument("--load-4bit", action="store_true")

args = argparser.parse_args()
model_path = args.model_path
conv_mode = args.conv_mode

tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.load_8bit, args.load_4bit)

def generate(images, prompt, temperature=0.2, top_p=0.7):
    num_image_tokens = 0

    if images is not None and len(images) > 0:
        if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
            raise ValueError("Number of images does not match number of <image> tokens in prompt")

        # preprocess the PIL images and move them to the model's device in fp16
        image_sizes = [image.size for image in images]
        images = process_images(images, image_processor, model.config)

        if type(images) is list:
            images = [image.to(model.device, dtype=torch.float16) for image in images]
        else:
            images = images.to(model.device, dtype=torch.float16)

        image_args = {"images": images, "image_sizes": image_sizes}
    else:
        images = None
        image_args = {}

    max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
    max_new_tokens = 512
    do_sample = True if temperature > 0.001 else False

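    # tokenize the prompt, replacing each <image> placeholder with IMAGE_TOKEN_INDEX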
    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)

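    # cap max_new_tokens so the prompt plus generation still fits in the context window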
    max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)

    if max_new_tokens < 1:
        # the prompt already fills the context window; nothing left to generate
        print("Exceeds max token length. Please start a new conversation, thanks.")
        return

    output_tensor = model.generate(
        inputs=input_ids,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
        **image_args
    )

    decoded_text = tokenizer.decode(output_tensor[0], skip_special_tokens=True)
    print("decoded_text", decoded_text)

    torch.cuda.empty_cache()

    return decoded_text

if __name__ == '__main__':
    message = "Explain this photo"
    image = Image.open("image_path.jpg").convert('RGB')  # replace "image_path.jpg" with the path to your image
    prompt = DEFAULT_IMAGE_TOKEN + '\n' + message
    state = conv_templates[conv_mode].copy()
    image_process_mode = "Default"
    box = (prompt, image, image_process_mode)

    state.append_message(state.roles[0], box)
    state.append_message(state.roles[1], None)

    prompt = state.get_prompt()
    images = state.get_images(return_pil=True)

    generate(images, prompt)

Use an older version of Transformers, such as: transformers==4.37.2

With more recent versions, you'll likely encounter the following exception: TypeError: LlavaLlamaForCausalLM.forward() got an unexpected keyword argument 'cache_position'
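For reference, a quick check along these lines can catch the mismatch before you load the model (a minimal sketch; the startswith("4.37") test is an assumption based on the suggested pin, not something the repo enforces):

import transformers

# Assumption: any 4.37.x release behaves like the suggested 4.37.2 pin; newer releases
# pass `cache_position` into forward() and trigger the TypeError shown above.
if not transformers.__version__.startswith("4.37"):
    raise RuntimeError(
        f"transformers=={transformers.__version__} detected; "
        "install transformers==4.37.2 to avoid the cache_position error."
    )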