OpenBMB / MiniCPM-V

MiniCPM-V 2.6: A GPT-4V Level MLLM for Single Image, Multi Image and Video on Your Phone

💡 [REQUEST] How to create multimodal embeddings for text, images, and videos using this model #506

vimal00r opened this issue 3 months ago

vimal00r commented 3 months ago

Start Date

No response

Implementation PR

No response

Reference Issues

No response

Summary

I want to create embeddings for text, images, and videos with the MiniCPM model, the way models like LLaVA are used. How can I create multimodal embeddings with this model?

Basic Example

There are a few multimodal embedding models, such as CLIP and LLaVA, that can be used to create embeddings for text and images as well as videos.
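For reference, a minimal sketch of what this looks like with CLIP (the checkpoint name openai/clip-vit-base-patch32 and the file name sample.jpg are only illustrations, not part of this issue):

from transformers import CLIPModel, CLIPProcessor
from PIL import Image
import torch

clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Text embedding
text_inputs = clip_processor(text=["a sample text"], return_tensors="pt", padding=True)
with torch.no_grad():
    text_emb = clip.get_text_features(**text_inputs)      # shape: (1, 512)

# Image embedding in the same space
image_inputs = clip_processor(images=Image.open("sample.jpg"), return_tensors="pt")
with torch.no_grad():
    image_emb = clip.get_image_features(**image_inputs)   # shape: (1, 512)

Here is my attempt to do the same with this model: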

code ="""from transformers import AutoTokenizer, AutoModel, AutoImageProcessor import torch

            # Load the pre-trained  model
            model_path = "MLLM path "
            model = AutoModel.from_pretrained(model_path, trust_remote_code = True, device_map = device)
            tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code = True)
            processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code = True)

            # Preprocess the text
            text = "This is a sample text"
            inputs = processor(text, return_tensors="pt")

            # Generate text embeddings
            with torch.no_grad():
                outputs = model(**inputs)
                embeddings = outputs.last_hidden_state[:, 0, :]

            # Use the embeddings
            print(embeddings.shape)   """

Drawbacks

I am trying to do the same with this model, but I am running into an error.

Unresolved questions

No response

saicoco commented 3 months ago

You can switch the padding to left, then take outputs.last_hidden_state[:, -1, :]
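A minimal sketch of this suggestion for text-only embeddings. It assumes the wrapped language model is reachable as model.llm (the traceback later in this thread shows self.llm.model) and takes the last layer via output_hidden_states, since the wrapper's own forward may not expose last_hidden_state directly:

from transformers import AutoModel, AutoTokenizer
import torch

model_path = "openbmb/MiniCPM-V-2_6-int4"
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.padding_side = "left"  # left padding keeps each real last token at index -1

batch = tokenizer(["a sample query", "another, longer sample query"],
                  padding=True, return_tensors="pt").to(model.device)

with torch.no_grad():
    # Call the wrapped text LLM directly and ask for hidden states.
    out = model.llm(input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    output_hidden_states=True)
    # With left padding, [:, -1, :] is the final real token of every sequence.
    embeddings = out.hidden_states[-1][:, -1, :]

print(embeddings.shape)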

vimal00r commented 3 months ago

> You can switch the padding to left, then take outputs.last_hidden_state[:, -1, :]

code = """

!pip install -q transformers bitsandbytes accelerate torch Pillow decord flash-attn --no-build-isolation

from transformers import AutoTokenizer, AutoModel, AutoImageProcessor import torch from PIL import Image from decord import VideoReader, cpu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model_path = 'openbmb/MiniCPM-V-2_6-int4'

model = AutoModel.from_pretrained(model_path, trust_remote_code = True, device_map = device) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code = True) processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code = True)

def encode_image(image): if isinstance(image, str): image = Image.open(image).convert("RGB") else: image = image.convert("RGB") max_size = 448 * 16 if max(image.size) > max_size: image = image.resize((max_size, max_size), resample=Image.BICUBIC) return image

def encode_video(video_path, max_frames=32): def uniform_sample(l, n): gap = len(l) / n idxs = [int(i * gap + gap / 2) for i in range(n)] return [l[i] for i in idxs] vr = VideoReader(video_path, ctx=cpu(0)) sample_fps = round(vr.get_avg_fps() / 1) # FPS frame_idx = [i for i in range(0, len(vr), sample_fps)] if len(frame_idx) > max_frames: frame_idx = uniform_sample(frame_idx, max_frames) frames = vr.get_batch(frame_idx).asnumpy() frames = [Image.fromarray(v.astype('uint8')) for v in frames] return frames

def encode_query(query_text): return tokenizer(query_text, return_tensors='pt').to(device)

image = processor(images=encode_image("/content/image.jpg"), return_tensors="pt")

image

with torch.no_grad(): outputs = model(data = image) embeddings = outputs.last_hidden_state[:, 0, :] """

Error = """

KeyError Traceback (most recent call last)

in () 1 with torch.no_grad(): ----> 2 outputs = model(data = image) 3 embeddings = outputs.last_hidden_state[:, 0, :] 5 frames /usr/local/lib/python3.10/dist-packages/transformers/feature_extraction_utils.py in __getitem__(self, item) 85 """ 86 if isinstance(item, str): ---> 87 return self.data[item] 88 else: 89 raise KeyError("Indexing with integers is not available when using Python based feature extractors") KeyError: 'input_ids'""" This is the code I am using. Can you give a try in this code and help me to fix this issue. ![Screenshot 2024-08-23 124302](https://github.com/user-attachments/assets/3d279f48-3692-4463-bfaa-141aef6500d0)
saicoco commented 3 months ago

I think you should get your hands dirty

saicoco commented 3 months ago

> This is the code I am using. Could you try it and help me fix this issue?

The input should be a dict object.
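Concretely, a sketch of what the error means, reusing the names from the code above: the image processor returns a BatchFeature containing image features only, while the model's forward looks up data['input_ids'], so the dict you pass must also carry tokenized text:

image_inputs = processor(images=encode_image("/content/image.jpg"), return_tensors="pt")
print(list(image_inputs.keys()))  # image feature keys only; no 'input_ids', hence the KeyError

# At minimum the dict needs tokenized text as well:
text_inputs = tokenizer("a sample prompt", return_tensors="pt")
data = {**image_inputs, "input_ids": text_inputs["input_ids"]}
# Depending on the modeling code, further keys (e.g. image position info)
# may still be required; this only illustrates why the KeyError occurs.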

vlavorini commented 2 months ago

Hello, is this solved? If yes, can you please share the code?

ruoxining commented 1 month ago

> This is the code I am using. Could you try it and help me fix this issue?

Hello, we also ran into the same trouble. We followed the code provided here but got the same error message:

 File "/***/miniconda3/envs/***/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/***/miniconda3/envs/ColorName/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/***/miniconda3/envs/***/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/***/.cache/huggingface/modules/transformers_modules/openbmb/MiniCPM-V-2_6-int4/6f1555d8e2359cb18595da9f1864cb41631e0617/modeling_minicpmv.py", line 171, in forward
    vllm_embedding, vision_hidden_states = self.get_vllm_embedding(data)
  File "/***/.cache/huggingface/modules/transformers_modules/openbmb/MiniCPM-V-2_6-int4/6f1555d8e2359cb18595da9f1864cb41631e0617/modeling_minicpmv.py", line 147, in get_vllm_embedding
    vllm_embedding = self.llm.model.embed_tokens(data['input_ids'])
  File "/***/miniconda3/envs/***/lib/python3.9/site-packages/transformers/feature_extraction_utils.py", line 87, in __getitem__
    return self.data[item]
KeyError: 'input_ids'

Our code is as follows:

from transformers import AutoModel, AutoImageProcessor
from PIL import Image
import torch

def get_miniCPM_image_embeddings(imagepath):
    """
    Params
    imagepath: str, path to the input png image file

    Return
    embeddings: torch.Tensor
    """

    model_name = "openbmb/MiniCPM-V-2_6-int4"
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16)
    processor = AutoImageProcessor.from_pretrained(model_name, trust_remote_code=True)

    # read image (Image.open returns a PIL image, so no str check is needed)
    image = Image.open(imagepath).convert("RGB")

    # resize
    max_size = 448 * 16
    if max(image.size) > max_size:
        image = image.resize((max_size, max_size), resample=Image.BICUBIC)

    # get embedding
    image = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(data=image)
        embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings

If anyone has found a solution, would you mind sharing the code? Thank you all in advance!
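In the meantime, a workaround sketch that sidesteps the custom forward entirely: embed uniformly sampled frames with a dedicated embedding model such as CLIP (as mentioned in the issue summary) and pool them. This does not use MiniCPM-V itself, and the checkpoint name openai/clip-vit-base-patch32 is only an illustration:

import torch
from PIL import Image
from decord import VideoReader, cpu
from transformers import CLIPModel, CLIPProcessor

clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def clip_video_embedding(video_path, max_frames=32):
    # Uniformly sample up to max_frames frames, as encode_video does above.
    vr = VideoReader(video_path, ctx=cpu(0))
    step = max(1, len(vr) // max_frames)
    idxs = list(range(0, len(vr), step))[:max_frames]
    frames = [Image.fromarray(f.astype("uint8")) for f in vr.get_batch(idxs).asnumpy()]
    inputs = clip_processor(images=frames, return_tensors="pt")
    with torch.no_grad():
        frame_emb = clip.get_image_features(**inputs)  # (num_frames, dim)
    return frame_emb.mean(dim=0)                       # pool frames into one vector

This only covers the embedding use case until someone posts a MiniCPM-specific solution.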