Open vimal00r opened 3 months ago
You can switch the padding to left, then take outputs.last_hidden_state[:, -1, :].
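For example, a minimal sketch of that suggestion for text-only queries (untested; the model.llm.model attribute path is inferred from the stack trace further down in this thread, so treat it as an assumption):

from transformers import AutoTokenizer, AutoModel
import torch

model_path = 'openbmb/MiniCPM-V-2_6-int4'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.padding_side = 'left'  # pad on the left so index -1 is always a real token

model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
inputs = tokenizer(['a short query', 'a somewhat longer query'], padding=True, return_tensors='pt')
with torch.no_grad():
    # model.llm.model is the underlying base LM (see the traceback below)
    outputs = model.llm.model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
embeddings = outputs.last_hidden_state[:, -1, :]  # last real token of each sequence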
code = """
!pip install -q transformers bitsandbytes accelerate torch Pillow decord flash-attn --no-build-isolation
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor import torch from PIL import Image from decord import VideoReader, cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model_path = 'openbmb/MiniCPM-V-2_6-int4'
model = AutoModel.from_pretrained(model_path, trust_remote_code = True, device_map = device) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code = True) processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code = True)
def encode_image(image): if isinstance(image, str): image = Image.open(image).convert("RGB") else: image = image.convert("RGB") max_size = 448 * 16 if max(image.size) > max_size: image = image.resize((max_size, max_size), resample=Image.BICUBIC) return image
def encode_video(video_path, max_frames=32): def uniform_sample(l, n): gap = len(l) / n idxs = [int(i * gap + gap / 2) for i in range(n)] return [l[i] for i in idxs] vr = VideoReader(video_path, ctx=cpu(0)) sample_fps = round(vr.get_avg_fps() / 1) # FPS frame_idx = [i for i in range(0, len(vr), sample_fps)] if len(frame_idx) > max_frames: frame_idx = uniform_sample(frame_idx, max_frames) frames = vr.get_batch(frame_idx).asnumpy() frames = [Image.fromarray(v.astype('uint8')) for v in frames] return frames
def encode_query(query_text): return tokenizer(query_text, return_tensors='pt').to(device)
image = processor(images=encode_image("/content/image.jpg"), return_tensors="pt")
image
with torch.no_grad(): outputs = model(data = image) embeddings = outputs.last_hidden_state[:, 0, :] """
KeyError Traceback (most recent call last)
I think you should get your hands dirty.
code = """
!pip install -q transformers bitsandbytes accelerate torch Pillow decord flash-attn --no-build-isolation
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor import torch from PIL import Image from decord import VideoReader, cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model_path = 'openbmb/MiniCPM-V-2_6-int4'
model = AutoModel.from_pretrained(model_path, trust_remote_code = True, device_map = device) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code = True) processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code = True)
def encode_image(image): if isinstance(image, str): image = Image.open(image).convert("RGB") else: image = image.convert("RGB") max_size = 448 * 16 if max(image.size) > max_size: image = image.resize((max_size, max_size), resample=Image.BICUBIC) return image
def encode_video(video_path, max_frames=32): def uniform_sample(l, n): gap = len(l) / n idxs = [int(i * gap + gap / 2) for i in range(n)] return [l[i] for i in idxs] vr = VideoReader(video_path, ctx=cpu(0)) sample_fps = round(vr.get_avg_fps() / 1) # FPS frame_idx = [i for i in range(0, len(vr), sample_fps)] if len(frame_idx) > max_frames: frame_idx = uniform_sample(frame_idx, max_frames) frames = vr.get_batch(frame_idx).asnumpy() frames = [Image.fromarray(v.astype('uint8')) for v in frames] return frames
def encode_query(query_text): return tokenizer(query_text, return_tensors='pt').to(device)
image = processor(images=encode_image("/content/image.jpg"), return_tensors="pt")
image
with torch.no_grad(): outputs = model(data = image) embeddings = outputs.last_hidden_state[:, 0, :] """
Error = """
KeyError Traceback (most recent call last) in <cell line: 1>() 1 with torch.no_grad(): ----> 2 outputs = model(data = image) 3 embeddings = outputs.last_hidden_state[:, 0, :]
5 frames /usr/local/lib/python3.10/dist-packages/transformers/feature_extraction_utils.py in getitem(self, item) 85 """ 86 if isinstance(item, str): ---> 87 return self.data[item] 88 else: 89 raise KeyError("Indexing with integers is not available when using Python based feature extractors")
KeyError: 'input_ids'"""
This is the code I am using. Could you try this code and help me fix the issue?
The input should be a dict object.
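For example, something along these lines (a sketch only: the traceback confirms the forward pass reads data['input_ids'], but whether the image tensors can simply be passed alongside it this way is an assumption):

text_inputs = tokenizer("describe this image", return_tensors="pt")
data = dict(image)  # the AutoImageProcessor output alone has no 'input_ids'
data["input_ids"] = text_inputs["input_ids"]
with torch.no_grad():
    outputs = model(data=data)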
Hello, has this been solved? If yes, could you please share the code?
Hello, we also ran into the same trouble. We followed the code provided here but got the same error message:
File "/***/miniconda3/envs/***/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/***/miniconda3/envs/ColorName/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/***/miniconda3/envs/***/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
output = module._old_forward(*args, **kwargs)
File "/***/.cache/huggingface/modules/transformers_modules/openbmb/MiniCPM-V-2_6-int4/6f1555d8e2359cb18595da9f1864cb41631e0617/modeling_minicpmv.py", line 171, in forward
vllm_embedding, vision_hidden_states = self.get_vllm_embedding(data)
File "/***/.cache/huggingface/modules/transformers_modules/openbmb/MiniCPM-V-2_6-int4/6f1555d8e2359cb18595da9f1864cb41631e0617/modeling_minicpmv.py", line 147, in get_vllm_embedding
vllm_embedding = self.llm.model.embed_tokens(data['input_ids'])
File "/***/miniconda3/envs/***/lib/python3.9/site-packages/transformers/feature_extraction_utils.py", line 87, in __getitem__
return self.data[item]
KeyError: 'input_ids'
Our code is as follows:

def get_miniCPM_image_embeddings(imagepath):
    """
    Params
        imagepath: str, path to the input png image file
    Return
        embeddings: torch.Tensor
    """
    model_name = "openbmb/MiniCPM-V-2_6-int4"
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16)
    processor = AutoImageProcessor.from_pretrained(model_name, trust_remote_code=True)
    # read image
    image = Image.open(imagepath)
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")
    else:
        image = image.convert("RGB")
    # resize
    max_size = 448 * 16
    if max(image.size) > max_size:
        image = image.resize((max_size, max_size), resample=Image.BICUBIC)
    # get embedding
    image = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(data=image)  # fails here: the processor output has no 'input_ids'
        embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings
If anyone has found a solution, would you mind sharing the code? Thank you all in advance!
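Judging from the stack trace above, one direction worth trying is to call get_vllm_embedding directly with a dict that contains both the image tensors and input_ids. This is a sketch pieced together from the traceback, not a verified fix; everything except the input_ids requirement and the return tuple (both visible in the trace) is an assumption:

import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer, AutoImageProcessor

model_name = "openbmb/MiniCPM-V-2_6-int4"
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(model_name, trust_remote_code=True)

image = Image.open("example.png").convert("RGB")
image_inputs = processor(images=image, return_tensors="pt")
text_inputs = tokenizer("describe this image", return_tensors="pt")

# Per the traceback: forward() calls self.get_vllm_embedding(data), which reads
# data['input_ids'] and returns (vllm_embedding, vision_hidden_states).
data = dict(image_inputs)  # image tensors from the processor
data["input_ids"] = text_inputs["input_ids"]
with torch.no_grad():
    vllm_embedding, vision_hidden_states = model.get_vllm_embedding(data)

# One possible pooling into a single vector (an arbitrary choice, not official):
embeddings = vllm_embedding.mean(dim=1)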
起始日期 | Start Date
No response
实现PR | Implementation PR
No response
相关Issues | Reference Issues
No response
摘要 | Summary
I want to create embeddings for text, images, and videos using the MiniCPM model, in the same way as with the LLaVA model. How can I create multimodal embeddings using this model?
基本示例 | Basic Example
There are a few multimodal embedding models, such as CLIP and LLaVA, that can be used to create embeddings for text, images, and videos.
code ="""from transformers import AutoTokenizer, AutoModel, AutoImageProcessor import torch
缺陷 | Drawbacks
I am trying to do the same with this model, but I am running into an error.
未解决问题 | Unresolved questions
No response