Open flehn opened 2 months ago
Hello! You probably won't be able to load it that way, because the eagle_llama architecture isn't registered in the Transformers library, and I didn't find any instructions for it in the model card on Hugging Face either.
What you might want to do instead is the following:
from eagle.model import EagleLlamaForCausalLM
from transformers import AutoTokenizer

model_path = "NVEagle/Eagle-X5-7B"  # or a local checkpoint directory

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = EagleLlamaForCausalLM.from_pretrained(
    model_path,
    low_cpu_mem_usage=True
)
However, this alone isn't sufficient, because Eagle is a family of multimodal large language models whose language weights are fine-tuned from LLaVA; the repository also downloads additional components such as the vision experts and the CLIP encoder. Here's a simplified version of load_pretrained_model:
from transformers import AutoTokenizer, BitsAndBytesConfig
import torch

from eagle.model import EagleLlamaForCausalLM
from eagle.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN


def load_pretrained_model(model_path, load_8bit=False, load_4bit=False,
                          device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
    kwargs = {"device_map": device_map, **kwargs}
    if device != "cuda":
        kwargs['device_map'] = {"": device}

    # Optional 8-bit / 4-bit quantization; otherwise load in fp16.
    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
    model = EagleLlamaForCausalLM.from_pretrained(
        model_path,
        low_cpu_mem_usage=True,
        **kwargs
    )

    # Register the extra image tokens used by the multimodal prompt format.
    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
    if mm_use_im_patch_token:
        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
    if mm_use_im_start_end:
        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
    model.resize_token_embeddings(len(tokenizer))

    # Load the vision tower (vision experts + CLIP encoder) and grab its image processor.
    vision_tower = model.get_vision_tower()
    if not vision_tower.is_loaded:
        vision_tower.load_model(device_map=device_map)
    if device_map != 'auto':
        vision_tower.to(device=device_map, dtype=torch.float16)
    image_processor = vision_tower.image_processor

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, image_processor, context_len
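If you save this as eagle/model/custom_builder.py (the module the script below imports from), a quick check that the loader works could look like this; it's only a sketch, assuming the Eagle repo is installed and the checkpoint is reachable on the Hugging Face Hub:
# Minimal smoke test of the loader above.
tokenizer, model, image_processor, context_len = load_pretrained_model(
    "NVEagle/Eagle-X5-7B",
    load_4bit=True,  # optional: reduces the GPU memory needed for the 7B model
)
print(type(model).__name__, "context length:", context_len)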
Given that, if you intend to use it without Gradio, either for terminal testing or to build an endpoint (a minimal endpoint sketch follows the script below), you'll need to decide how you'll receive your images. Once you have the images and their corresponding prompts, you can proceed as follows:
import argparse

import torch
from PIL import Image

from eagle.model.custom_builder import load_pretrained_model
from eagle.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from eagle.mm_utils import tokenizer_image_token, process_images
from eagle.conversation import conv_templates

argparser = argparse.ArgumentParser()
argparser.add_argument("--model-path", default="NVEagle/Eagle-X5-7B", type=str)
argparser.add_argument("--conv-mode", type=str, default="default")
argparser.add_argument("--temperature", type=float, default=0.2)
argparser.add_argument("--max-new-tokens", type=int, default=512)
argparser.add_argument("--num_frames", type=int, default=16)
argparser.add_argument("--load-8bit", action="store_true")
argparser.add_argument("--load-4bit", action="store_true")
args = argparser.parse_args()

model_path = args.model_path
conv_mode = args.conv_mode
tokenizer, model, image_processor, context_len = load_pretrained_model(
    args.model_path, args.load_8bit, args.load_4bit
)


def generate(images, prompt, temperature=0.2, top_p=0.7):
    num_image_tokens = 0
    if images is not None and len(images) > 0:
        # Every image must have a matching <image> placeholder in the prompt.
        if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
            raise ValueError("Number of images does not match number of <image> tokens in prompt")
        # images = [load_image_from_base64(image) for image in images]
        image_sizes = [image.size for image in images]
        images = process_images(images, image_processor, model.config)
        if type(images) is list:
            images = [image.to(model.device, dtype=torch.float16) for image in images]
        else:
            images = images.to(model.device, dtype=torch.float16)
        image_args = {"images": images, "image_sizes": image_sizes}
    else:
        images = None
        image_args = {}

    max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
    max_new_tokens = args.max_new_tokens
    do_sample = temperature > 0.001

    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)

    # Never ask for more new tokens than the context window can hold.
    max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)
    if max_new_tokens < 1:
        return "Exceeds max token length. Please start a new conversation, thanks."

    output_tensor = model.generate(
        inputs=input_ids,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
        **image_args
    )
    decoded_text = tokenizer.decode(output_tensor[0], skip_special_tokens=True)
    torch.cuda.empty_cache()
    return decoded_text


if __name__ == '__main__':
    message = "Explain this photo"
    image = Image.open("image_path.jpg").convert('RGB')

    # Build the conversation prompt with the <image> placeholder in front of the message.
    prompt = DEFAULT_IMAGE_TOKEN + '\n' + message
    state = conv_templates[conv_mode].copy()
    image_process_mode = "Default"
    box = (prompt, image, image_process_mode)
    state.append_message(state.roles[0], box)
    state.append_message(state.roles[1], None)
    prompt = state.get_prompt()
    images = state.get_images(return_pil=True)

    print(generate(images, prompt, temperature=args.temperature))
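If you'd rather expose an endpoint than test in the terminal, one option is to replace the if __name__ == '__main__': block above with an HTTP wrapper. The sketch below is my own, not part of the Eagle repo: it assumes FastAPI, pydantic and uvicorn are installed, and the route and JSON field names are illustrative choices.
# Hypothetical HTTP wrapper around generate(); replaces the terminal-test block above.
import base64
import io

import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class GenerateRequest(BaseModel):
    message: str        # user text, without the <image> token
    image_base64: str   # a single image, base64-encoded


@app.post("/generate")
def generate_endpoint(req: GenerateRequest):
    # Decode the image, build the same conversation prompt as the terminal test, and run generate().
    image = Image.open(io.BytesIO(base64.b64decode(req.image_base64))).convert('RGB')
    state = conv_templates[conv_mode].copy()
    state.append_message(state.roles[0], (DEFAULT_IMAGE_TOKEN + '\n' + req.message, image, "Default"))
    state.append_message(state.roles[1], None)
    return {"text": generate(state.get_images(return_pil=True), state.get_prompt())}


if __name__ == '__main__':
    uvicorn.run(app, host="0.0.0.0", port=8000)
Running the script directly still lets argparse pick up the model path and quantization flags before the server starts.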
Finally, use an older version of Transformers, such as:
transformers==4.37.2
With more recent versions, you'll likely encounter the following exception:
TypeError: LlavaLlamaForCausalLM.forward() got an unexpected keyword argument 'cache_position'
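If you'd rather fail fast at startup than hit that error mid-generation, a small guard like the following can help; this is my own addition, using the packaging module that ships as a Transformers dependency:
# Optional sanity check; assumes you pinned transformers==4.37.2 as suggested above.
import transformers
from packaging import version

if version.parse(transformers.__version__) > version.parse("4.37.2"):
    raise RuntimeError(
        "Newer Transformers versions pass cache_position into forward(), "
        "which the Eagle/LLaVA model code does not accept."
    )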
Hello, I want to run:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("NVEagle/Eagle-X5-13B-Chat")
But I get:
ValueError: The checkpoint you are trying to load has model type `eagle_llama` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.
Transformers version: 4.44.2