Open · violet17 opened this issue 6 months ago
ipex-llm currently has some issues with such an unmerged LoRA module during inference. You could try merging the LoRA module first and then applying low-bit quantization. For example, update https://github.com/Vision-CAIR/MiniGPT4-video/blob/main/minigpt4/models/mini_gpt4_llama_v2.py#L138-L149 and https://github.com/Vision-CAIR/MiniGPT4-video/blob/main/minigpt4/models/mini_gpt4_llama_v2.py#L839-L868 to:
if self.low_resource:
    self.llama_model = llm_model.from_pretrained(
        llama_model,
        torch_dtype=torch.float16,
        device_map={"": "cpu"},
    )
@classmethod
def from_config(cls, cfg):
    vit_model = cfg.get("vit_model", "eva_clip_g")
    q_former_model = cfg.get("q_former_model", "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth")
    img_size = cfg.get("image_size")
    num_query_token = cfg.get("num_query_token")
    llama_model = cfg.get("llama_model")
    drop_path_rate = cfg.get("drop_path_rate", 0)
    use_grad_checkpoint = cfg.get("use_grad_checkpoint", False)
    vit_precision = cfg.get("vit_precision", "fp16")
    freeze_vit = cfg.get("freeze_vit", True)
    freeze_qformer = cfg.get("freeze_qformer", True)
    low_resource = cfg.get("low_resource", False)
    prompt_path = cfg.get("prompt_path", "")
    prompt_template = cfg.get("prompt_template", "")
    max_txt_len = cfg.get("max_txt_len", 300)
    end_sym = cfg.get("end_sym", '\n')
    lora_r = cfg.get("lora_r", 64)
    lora_alpha = cfg.get("lora_alpha", 16)
    chat_template = cfg.get("chat_template", False)
    system_prompt = cfg.get("system_prompt", False)
    token_pooling = cfg.get("token_pooling", True)
    use_grad_checkpoint_llm = cfg.get("use_grad_checkpoint_llm", False)
    max_context_len = cfg.get("max_context_len", 3800)
    remove_template = cfg.get("remove_template", False)

    model = cls(
        vit_model=vit_model,
        img_size=img_size,
        drop_path_rate=drop_path_rate,
        use_grad_checkpoint=use_grad_checkpoint,
        vit_precision=vit_precision,
        freeze_vit=freeze_vit,
        llama_model=llama_model,
        prompt_path=prompt_path,
        prompt_template=prompt_template,
        max_txt_len=max_txt_len,
        low_resource=low_resource,
        end_sym=end_sym,
        lora_r=lora_r,
        lora_alpha=lora_alpha,
        chat_template=chat_template,
        system_prompt=system_prompt,
        token_pooling=token_pooling,
        use_grad_checkpoint_llm=use_grad_checkpoint_llm,
        max_context_len=max_context_len,
        remove_template=remove_template,
    )

    ckpt_path = cfg.get("ckpt", "")  # load weights of MiniGPT-4
    if ckpt_path:
        print("Load Minigpt-4-LLM Checkpoint: {}".format(ckpt_path))
        ckpt = torch.load(ckpt_path, map_location="cpu")
        msg = model.load_state_dict(ckpt['model'], strict=False)

    if low_resource:
        import ipex_llm
        # Merge the LoRA weights into the base model first, then apply
        # low-bit optimization and move the LLM to the Intel GPU.
        model.llama_model = model.llama_model.merge_and_unload()
        model.llama_model = ipex_llm.optimize_model(model.llama_model, low_bit="sym_int8",
                                                    optimize_llm=True).to("xpu")

    return model
Note that merge_and_unload() may require a higher peft version (you may need to upgrade to peft==0.5.0).
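If it helps to sanity-check the flow outside of the MiniGPT4-Video codebase, below is a minimal sketch of the merge-then-quantize step under the same idea as the change above. The base_model_path and lora_adapter_path values are placeholders for illustration, not paths from the repo.

```python
# Minimal sketch: merge a LoRA adapter into its base model, then apply
# ipex-llm low-bit optimization. Paths below are placeholders.
import torch
import peft
import ipex_llm
from peft import PeftModel
from transformers import AutoModelForCausalLM

print(peft.__version__)  # merge_and_unload() may need a newer peft, e.g. 0.5.0

base_model_path = "path/to/llama-checkpoint"   # placeholder
lora_adapter_path = "path/to/lora-adapter"     # placeholder

# Load the base model in fp16 on CPU, attach the LoRA adapter, then fold the
# adapter weights back into the base model so every layer is a plain nn.Linear.
base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base, lora_adapter_path)
model = model.merge_and_unload()

# Let ipex-llm replace the merged linear layers with low-bit ones and move
# the model to the Intel GPU.
model = ipex_llm.optimize_model(model, low_bit="sym_int8").to("xpu")
```

The point, as described above, is that the low-bit conversion then runs on plain Linear layers rather than on the peft-wrapped LoRA modules.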
I ran MiniGPT4-Video and got the following error:

Traceback (most recent call last):
  File "C:\Users\mi\miniconda3\lib\site-packages\gradio\queueing.py", line 501, in call_prediction
    output = await route_utils.call_process_api(
  File "C:\Users\mi\miniconda3\lib\site-packages\gradio\route_utils.py", line 253, in call_process_api
    output = await app.get_blocks().process_api(
  File "C:\Users\mi\miniconda3\lib\site-packages\gradio\blocks.py", line 1695, in process_api
    result = await self.call_function(
  File "C:\Users\mi\miniconda3\lib\site-packages\gradio\blocks.py", line 1235, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "C:\Users\mi\miniconda3\lib\site-packages\anyio\to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "C:\Users\mi\miniconda3\lib\site-packages\anyio\_backends\_asyncio.py", line 2144, in run_sync_in_worker_thread
    return await future
  File "C:\Users\mi\miniconda3\lib\site-packages\anyio\_backends\_asyncio.py", line 851, in run
    result = context.run(func, *args)
  File "C:\Users\mi\miniconda3\lib\site-packages\gradio\utils.py", line 692, in wrapper
    response = f(*args, **kwargs)
  File "D:\MiniGPT4-Video\minigpt4_video_demo.py", line 231, in gradio_demo_local
    pred=run(video_path,instruction,model,vis_processor,gen_subtitles=has_sub)
  File "D:\MiniGPT4-Video\minigpt4_video_demo.py", line 149, in run
    answers = model.generate(prepared_images, prompt, max_new_tokens=args.max_new_tokens, do_sample=True, lengths=[length],num_beams=2)
  File "C:\Users\mi\miniconda3\lib\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "D:\MiniGPT4-Video\minigpt4\models\mini_gpt4_llama_v2.py", line 626, in generate
    outputs = self.llama_model.generate(
  File "C:\Users\mi\miniconda3\lib\site-packages\peft\peft_model.py", line 580, in generate
    return self.base_model.generate(**kwargs)
  File "C:\Users\mi\miniconda3\lib\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "C:\Users\mi\miniconda3\lib\site-packages\transformers\generation\utils.py", line 1595, in generate
    return self.beam_sample(
  File "C:\Users\mi\miniconda3\lib\site-packages\transformers\generation\utils.py", line 3276, in beam_sample
    outputs = self(
  File "C:\Users\mi\miniconda3\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\mi\miniconda3\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\mi\miniconda3\lib\site-packages\transformers\models\llama\modeling_llama.py", line 1183, in forward
    outputs = self.model(
  File "C:\Users\mi\miniconda3\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\mi\miniconda3\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\mi\miniconda3\lib\site-packages\bigdl\llm\transformers\models\llama.py", line 114, in llama_model_forward_4_36
    return llama_model_forward_4_36_internal(
  File "C:\Users\mi\miniconda3\lib\site-packages\bigdl\llm\transformers\models\llama.py", line 1722, in llama_model_forward_4_36_internal
    layer_outputs = decoder_layer(
  File "C:\Users\mi\miniconda3\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\mi\miniconda3\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\mi\miniconda3\lib\site-packages\bigdl\llm\transformers\models\llama.py", line 228, in llama_decoder_forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "C:\Users\mi\miniconda3\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\mi\miniconda3\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\mi\miniconda3\lib\site-packages\bigdl\llm\transformers\models\llama.py", line 882, in llama_attention_forward_4_36
    return forward_function(
  File "C:\Users\mi\miniconda3\lib\site-packages\bigdl\llm\transformers\models\llama.py", line 1174, in llama_attention_forward_4_36_original
    if should_use_mm_int4_qkv(self, device):
  File "C:\Users\mi\miniconda3\lib\site-packages\bigdl\llm\transformers\models\llama.py", line 283, in should_use_mm_int4_qkv
    return device.type == "xpu" and self.q_proj.qtype == SYM_INT4 and self.q_proj.enable_xetla
  File "C:\Users\mi\miniconda3\lib\site-packages\torch\nn\modules\module.py", line 1695, in __getattr__
    raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
AttributeError: 'Linear' object has no attribute 'qtype'. Did you mean: 'type'?
Could you please help to support this model?
Thanks.
Model project: https://github.com/Vision-CAIR/MiniGPT4-video/tree/main