OpenBMB / MiniCPM-V

MiniCPM-V 2.6: A GPT-4V Level MLLM for Single Image, Multi Image and Video on Your Phone
Apache License 2.0

[BUG] Sizes of tensors must match except in dimension 1. Expected size 123 but got size 122 for tensor number 1 in the list. #417

Closed rish-hyun closed 1 month ago

rish-hyun commented 1 month ago

Is there an existing issue / discussion for this?

Is there an existing answer for this in the FAQ?

Current Behavior

Video Question Answering sometimes fails.

Expected Behavior

Inference should complete successfully.

Steps To Reproduce

No response

Environment

- OS:
- Python:
- Transformers:
- PyTorch:
- CUDA (`python -c 'import torch; print(torch.version.cuda)'`):

Anything else?

No response

LDLINGLINGLING commented 1 month ago

Please provide your code; it is hard to diagnose from a single error line.

rish-hyun commented 1 month ago

@LDLINGLINGLING

import gc
import logging
import os
from typing import Any, Dict, List, Optional
from uuid import uuid4

import cv2
import torch
from fastapi import Body, FastAPI, File, UploadFile
from PIL import Image
from transformers import AutoModel, AutoTokenizer

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

class Model:

    def __init__(self):
        model_id = "openbmb/MiniCPM-V-2_6-int4"
        self._tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True,
        )
        self._model = AutoModel.from_pretrained(
            model_id,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
        )
        self._model.eval()
        logger.info(f"Model loaded: {model_id}")

    def run(
        self,
        images: List[Image.Image],
        question: str,
        is_video: bool = False,
        params: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        try:
            # Copy the params dict: a mutable default argument would be shared
            # across calls, and it is mutated below for videos.
            params = dict(params or {})
            logger.info(f"Images: {len(images)}")
            logger.info(f"Question: {question}")

            if is_video:
                params.update(
                    {
                        "use_image_id": False,
                        "max_slice_nums": 1,  # use 1 if cuda OOM and video resolution > 448*448
                    }
                )

            messages = [{"role": "user", "content": [*images, question]}]
            answer = self._model.chat(
                image=None,
                msgs=messages,
                tokenizer=self._tokenizer,
                **params,
            )

        except Exception as exc:
            # The `as` target is cleared when the except block ends, so keep a
            # copy; the original `return` in `finally` hit a NameError on `error`.
            error = str(exc)
            answer = f"Error: {error}"
            logger.error(answer)

        else:
            error = None

        finally:
            # Free CUDA memory between requests; a `return` inside `finally`
            # would also swallow unexpected exceptions, so it sits outside.
            gc.collect()
            torch.cuda.empty_cache()

        return {"answer": answer, "error": error}

model = Model()

app = FastAPI()

@app.get("/")
def home():
    return "Welcome to MiniCPM-V-2_6 API"

@app.post("/answer/image/")
def answer_from_image(
    file: UploadFile = File(...),
    question: str = Body("Describe the image in detail."),
) -> Dict[str, Any]:
    # Convert to RGB so PNGs with an alpha channel do not trip the model.
    image = Image.open(file.file).convert("RGB")
    logger.info(f"Image: {image.size}")
    return model.run([image], question)

@app.post("/answer/video/")
def answer_from_video(
    file: UploadFile = File(...),
    question: str = Body("Describe the video in detail."),
) -> Dict[str, Any]:
    with open(video_path := f"{uuid4().hex[:12]}.mp4", "wb") as f:
        f.write(file.file.read())

    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)

    logger.info(f"FPS: {fps}, Height: {height}, Width: {width}")

    is_resize_required: bool = False
    if max(height, width) > (max_dim := 1080):
        if width < height:
            width, height = int(max_dim * width / height), max_dim
        elif width > height:
            width, height = max_dim, int(max_dim * height / width)
        else:
            width, height = max_dim, max_dim

        is_resize_required = True
        logger.info(f"New Height: {height}, New Width: {width}")

    images = []
    _index: int = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        # Sample ~2 frames per second; guard against fps < 2 (a step of 0).
        if _index % max(int(fps) // 2, 1) == 0:
            if is_resize_required:
                frame = cv2.resize(frame, (width, height))
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            images.append(Image.fromarray(frame))
        _index += 1
    cap.release()

    logger.info(f"Total Frames: {len(images)}")

    os.remove(video_path)
    return model.run(images, question, is_video=True)

rish-hyun commented 1 month ago

Input video dimensions: 720 × 1280, duration: 01:25

GPU: NVIDIA A100-SXM4-40GB

iceflame89 commented 1 month ago

Looks like there are too many frames, which exceeds the max_inp_length of 8192; limiting the max number of frames to a smaller value should fix it.
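
A minimal sketch of one way to do this with the OpenCV pipeline already used above: sample a fixed number of frames spread uniformly over the clip, instead of a fixed 2 fps. MAX_NUM_FRAMES and sample_frames are illustrative names, and 64 is an assumed budget, not a documented limit:

import cv2
from PIL import Image

MAX_NUM_FRAMES = 64  # assumed frame budget; tune against the token limit

def sample_frames(video_path: str, max_frames: int = MAX_NUM_FRAMES) -> list:
    """Decode at most `max_frames` frames, spaced uniformly across the clip."""
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    step = max(total // max_frames, 1)  # keep every `step`-th frame
    wanted = set(range(0, total, step))
    images, idx = [], 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if idx in wanted and len(images) < max_frames:
            images.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        idx += 1
    cap.release()
    return images

For the 85 s clip above this yields 64 frames instead of 170, keeping the total image-token count well under the limit.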

rish-hyun commented 1 month ago

@iceflame89 I take 2 frames every second, so 01:25 -> 170 frames, and neither side (w or h) ever exceeds 1080 px. Please clarify what `max_inp_length 8192` means here. Does it account for both text and images?

Cuiunbo commented 1 month ago

Yes, both text and images are counted toward the input length. You can try resizing the frames to be smaller before inference (448×448 = 64 tokens and 1344×1344 = 640 tokens), or modifying max_inp_length (but this may make the model more prone to hallucinations).
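
That arithmetic matches the failure above: at roughly 64 tokens per 448×448 frame, 170 frames ≈ 170 × 64 = 10,880 image tokens, already over 8192 before any text is counted. A rough budget check under those assumptions (TEXT_BUDGET is assumed headroom for the question and chat template, not a model constant):

MAX_INP_LENGTH = 8192   # input-length limit mentioned above
TOKENS_PER_FRAME = 64   # approx. cost per frame at max_slice_nums=1 (per comment above)
TEXT_BUDGET = 512       # assumed headroom for prompt text

def max_safe_frames() -> int:
    """Estimate how many frames fit under the input-length limit."""
    return (MAX_INP_LENGTH - TEXT_BUDGET) // TOKENS_PER_FRAME

print(max_safe_frames())  # -> 120 frames under these assumptions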