Closed josephzpng closed 48 minutes ago
import os
from functools import partial

import torch
from omegaconf import OmegaConf
from PIL import Image

from open_flamingo import create_model_and_transforms
from open_flamingo.train.any_res_data_utils import process_images
# Local checkpoint whose state dict is loaded into the model below.
model_ckpt = "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/checkpoint/blip3/xgen-mm-phi3-mini-base-r-v1.5.pt"

# Plain settings are wrapped in an OmegaConf config so they can be read with
# attribute access (cfg.lm_path, cfg.anyres_grids, ...).
cfg = dict(
    model_family='xgenmm_v1',
    lm_path='/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/checkpoint/Phi-3-mini-4k-instruct',
    vision_encoder_path='/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/checkpoint/siglip-so400m-patch14-384',
    vision_encoder_pretrained='google',
    num_vision_tokens=128,
    image_aspect_ratio='anyres',
    anyres_patch_sampling=True,
    anyres_grids=[(1, 2), (2, 1), (2, 2), (3, 1), (1, 3)],
    ckpt_pth=model_ckpt,
)
cfg = OmegaConf.create(cfg)

# Extra keyword arguments forwarded verbatim to create_model_and_transforms.
additional_kwargs = {
    "num_vision_tokens": cfg.num_vision_tokens,
    "image_aspect_ratio": cfg.image_aspect_ratio,
    "anyres_patch_sampling": cfg.anyres_patch_sampling,
}

# The language-model path doubles as the tokenizer path.
model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path=cfg.vision_encoder_path,
    clip_vision_encoder_pretrained=cfg.vision_encoder_pretrained,
    lang_model_path=cfg.lm_path,
    tokenizer_path=cfg.lm_path,
    model_family=cfg.model_family,
    **additional_kwargs,
)

# strict=True: every key in the checkpoint must match the model exactly.
ckpt = torch.load(cfg.ckpt_pth)
model.load_state_dict(ckpt, strict=True)
torch.cuda.empty_cache()
model = model.eval().cuda()

# Scale each (m, n) grid multiplier from the config by the model's base image
# size and hand the resulting list to the model.
base_img_size = model.base_img_size
anyres_grids = [[base_img_size * m, base_img_size * n] for m, n in cfg.anyres_grids]
model.anyres_grids = anyres_grids

# Image preprocessing with the processor and config bound once.
image_proc = partial(process_images, image_processor=image_processor, model_cfg=cfg)
def apply_prompt_template(prompt, cfg):
    """Wrap a user prompt in the chat template of the configured language model.

    Args:
        prompt: The user message to embed in the template.
        cfg: Configuration object exposing ``lm_path``; the template is chosen
            by checking whether that path contains ``'Phi-3'``.

    Returns:
        The full prompt string: a system turn, the user turn containing
        ``prompt``, and an opening assistant turn.

    Raises:
        NotImplementedError: If ``cfg.lm_path`` does not contain ``'Phi-3'``;
            no other template is defined.
    """
    if 'Phi-3' not in cfg.lm_path:
        raise NotImplementedError(
            f"No prompt template is defined for the language model at {cfg.lm_path!r}"
        )
    return (
        '<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
        "The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
        f'<|user|>\n{prompt}<|end|>\n<|assistant|>\n'
    )
# The two example images fed to the model, in order.
image_path_1 = 'example_images/image-1.jpeg'
image_path_2 = 'example_images/image-2.jpeg'

# convert() returns an independent RGB copy, so the underlying files can be
# closed as soon as both conversions are done.
with Image.open(image_path_1) as raw_1, Image.open(image_path_2) as raw_2:
    image_1 = raw_1.convert('RGB')
    image_2 = raw_2.convert('RGB')
images = [image_1, image_2]

# Sizes and processed images are each wrapped in one extra outer list
# (presumably the batch dimension, i.e. a single sample holding two images —
# confirm against model.generate).
image_size = [[img.size for img in images]]
vision_x = [[image_proc([img]) for img in images]]
# NOTE(review): the pasted source is cut off after "Look at this image" (the
# string literal is unterminated and the tokenization step is missing). The
# `<image>` placeholders, the question, and the two statements that follow are
# a reconstruction based on the later use of lang_x['input_ids'] and
# lang_x['attention_mask'] — confirm against the original script.
prompt = "Look at this image <image> and this image <image>. Describe the second image in detail."
prompt = apply_prompt_template(prompt, cfg)
lang_x = tokenizer([prompt], return_tensors="pt")
# Deterministic decoding settings: sampling disabled, a single beam, and at
# most 1024 newly generated tokens.
kwargs_default = dict(do_sample=False, temperature=0, max_new_tokens=1024, top_p=None, num_beams=1)

# Token ids and attention mask are moved to the first CUDA device before
# generation.
device = torch.device('cuda:0')
generated_text = model.generate(
    vision_x=vision_x,
    lang_x=lang_x['input_ids'].to(device),
    image_size=image_size,
    attention_mask=lang_x['attention_mask'].to(device),
    **kwargs_default,
)

# Decode the first (only) returned sequence.
generated_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)

# The Phi-3 chat template closes each turn with <|end|>; keep only the text
# before the first occurrence.
if 'Phi-3' in cfg.lm_path:
    text = generated_text.split('<|end|>')[0]
else:
    text = generated_text

print(text)
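Great job! I encountered the following problem when running the multi-image inference example above: the response starts with a reasonable description of the second image, but then degenerates into the phrase "person (to the left/right of the center)" repeated over and over (see Model Response below). Looking forward to your reply!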
Environment:
torch==2.0.1
transformers==4.41.1
Model Response:
The second image is a close-up of a cat's face, focusing on its eyes and nose. The cat appears to be looking directly at the camera, and its eyes are wide open, giving it a curious and alert expression. The fur is predominantly white with black markings around the eyes and nose. The background is blurred, emphasizing the cat's face. The image contains person (in the center), person (to the left of the center), person (to the right of the center), person (to the right of the center), person (to the left of the center), person (in the center), person (in the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), 
person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the 
center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the