salesforce / LAVIS

LAVIS - A One-stop Library for Language-Vision Intelligence

BLIP3 inference error #749

Closed: josephzpng closed this issue 48 minutes ago

josephzpng commented 2 hours ago

Great job! I encountered the following problem when running the inference example. Looking forward to your reply!

Environment:

torch==2.0.1
transformers==4.41.1

Model Response:

The second image is a close-up of a cat's face, focusing on its eyes and nose. The cat appears to be looking directly at the camera, and its eyes are wide open, giving it a curious and alert expression. The fur is predominantly white with black markings around the eyes and nose. The background is blurred, emphasizing the cat's face. The image contains person (in the center), person (to the left of the center), person (to the right of the center), person (to the right of the center), person (to the left of the center), person (in the center), person (in the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), [the pattern "person (to the left of the center), person (to the right of the center)" then repeats for the rest of the output until it is cut off mid-phrase at "person (to the"]

josephzpng commented 2 hours ago

Code

import os
from functools import partial

import torch
from omegaconf import OmegaConf
from PIL import Image

from open_flamingo import create_model_and_transforms
from open_flamingo.train.any_res_data_utils import process_images

model_ckpt = "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/checkpoint/blip3/xgen-mm-phi3-mini-base-r-v1.5.pt"
cfg = dict(
    model_family='xgenmm_v1',
    lm_path='/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/checkpoint/Phi-3-mini-4k-instruct',
    vision_encoder_path='/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/checkpoint/siglip-so400m-patch14-384',
    vision_encoder_pretrained='google',
    num_vision_tokens=128,
    image_aspect_ratio='anyres',
    anyres_patch_sampling=True,
    anyres_grids=[(1, 2), (2, 1), (2, 2), (3, 1), (1, 3)],
    ckpt_pth=model_ckpt,
)
cfg = OmegaConf.create(cfg)

additional_kwargs = {
    "num_vision_tokens": cfg.num_vision_tokens,
    "image_aspect_ratio": cfg.image_aspect_ratio,
    "anyres_patch_sampling": cfg.anyres_patch_sampling,
}

model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path=cfg.vision_encoder_path,
    clip_vision_encoder_pretrained=cfg.vision_encoder_pretrained,
    lang_model_path=cfg.lm_path,
    tokenizer_path=cfg.lm_path,
    model_family=cfg.model_family,
    **additional_kwargs)

ckpt = torch.load(cfg.ckpt_pth)
model.load_state_dict(ckpt, strict=True)
torch.cuda.empty_cache()
model = model.eval().cuda()

base_img_size = model.base_img_size
anyres_grids = []
for (m, n) in cfg.anyres_grids:
    anyres_grids.append([base_img_size * m, base_img_size * n])
model.anyres_grids = anyres_grids
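# Note (illustrative, not in the original script): the SigLIP so400m-patch14-384
# encoder suggests base_img_size == 384 (an assumption read off the encoder name,
# not checked here), so the grid templates above would expand to pixel grids of
#   [384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152].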

image_proc = partial(process_images, image_processor=image_processor, model_cfg=cfg)

def apply_prompt_template(prompt, cfg):
    if 'Phi-3' in cfg.lm_path:
        s = (
            '<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
            "The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
            f'<|user|>\n{prompt}<|end|>\n<|assistant|>\n'
        )
    else:
        raise NotImplementedError
    return s
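# Illustration of what the template produces (the prompt text here is made up):
#   apply_prompt_template("Describe <image>", cfg)
#   -> '<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
#      "The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
#      '<|user|>\nDescribe <image><|end|>\n<|assistant|>\n'
# The model is therefore expected to close its own turn with <|end|>.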

image_path_1 = 'example_images/image-1.jpeg'
image_path_2 = 'example_images/image-2.jpeg'

image_1 = Image.open(image_path_1).convert('RGB')
image_2 = Image.open(image_path_2).convert('RGB')
images = [image_1, image_2]
image_size = [image_1.size, image_2.size]
image_size = [image_size]
vision_x = [image_proc([img]) for img in images]
vision_x = [vision_x]

prompt = "Look at this image and this image . What is in the second image?" prompt = apply_prompt_template(prompt, cfg) lang_x = tokenizer([prompt], return_tensors="pt")

kwargs_default = dict(do_sample=False, temperature=0, max_new_tokens=1024, top_p=None, num_beams=1)

generated_text = model.generate(
    vision_x=vision_x,
    lang_x=lang_x['input_ids'].to(torch.device('cuda:0')),
    image_size=image_size,
    attention_mask=lang_x['attention_mask'].to(torch.device('cuda:0')),
    **kwargs_default)

generated_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)
if 'Phi-3' in cfg.lm_path:
    text = generated_text.split('<|end|>')[0]
else:
    text = generated_text

print(text)
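
For completeness, two sanity checks I would add to the script above (sketches under assumptions, not verified fixes): first, that the prompt carries one <image> placeholder per image in vision_x; second, that generation stops at Phi-3's <|end|> token instead of running all the way to max_new_tokens. The second check assumes the tokenizer knows <|end|> as a token and that model.generate forwards extra keyword arguments such as eos_token_id to the underlying Hugging Face generate call.

# Sketch: one <image> placeholder per image.
n_placeholders = prompt.count('<image>')
assert n_placeholders == len(images), (
    f"{n_placeholders} <image> placeholders vs. {len(images)} images")

# Sketch: ask generation to stop at Phi-3's end-of-turn marker (assumes the
# eos_token_id kwarg is forwarded to Hugging Face generate).
end_id = tokenizer.convert_tokens_to_ids('<|end|>')
generated_text = model.generate(
    vision_x=vision_x,
    lang_x=lang_x['input_ids'].to(torch.device('cuda:0')),
    image_size=image_size,
    attention_mask=lang_x['attention_mask'].to(torch.device('cuda:0')),
    eos_token_id=end_id,
    **kwargs_default)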