Closed rish-hyun closed 1 month ago
请提供您的代码,仅一行报错不容易判断
@LDLINGLINGLING
import gc
import logging
import os
from typing import Any, Dict, List, Optional
from uuid import uuid4
import cv2
import torch
from fastapi import Body, FastAPI, File, UploadFile
from PIL import Image
from transformers import AutoModel, AutoTokenizer
# Configure the root logger first: INFO and above, with a timestamped
# "time - logger name - level - message" layout.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger used by the model wrapper and the API endpoints.
logger = logging.getLogger(__name__)
class Model:
    """Thin wrapper around the MiniCPM-V 2.6 (int4) checkpoint and its tokenizer.

    The tokenizer and model are loaded once in ``__init__``; ``run`` sends a
    single user turn (images followed by a text question) to the model's
    ``chat`` method.
    """

    def __init__(self):
        """Load the tokenizer and model, then switch the model to eval mode."""
        model_id = "openbmb/MiniCPM-V-2_6-int4"
        self._tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True,
        )
        self._model = AutoModel.from_pretrained(
            model_id,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
        )
        self._model.eval()
        logger.info(f"Model loaded: {model_id}")

    def run(
        self,
        images: List[Image.Image],
        question: str,
        is_video: Optional[bool] = False,
        params: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Ask ``question`` about ``images`` and return the model's reply.

        Args:
            images: Images (or sampled video frames) placed before the
                question in the user message.
            question: Text prompt appended after the images.
            is_video: When true, ``use_image_id=False`` and
                ``max_slice_nums=1`` are added to the chat arguments.
            params: Extra keyword arguments forwarded to the model's ``chat``
                call. The mapping is copied, never mutated.

        Returns:
            ``{"answer": ..., "error": ...}``. On success ``error`` is
            ``None``. On failure ``answer`` is ``"Error: <message>"`` and
            ``error`` is the exception message.
        """
        # Work on a private copy: the previous mutable ``{}`` default was
        # updated in place, so video-only settings leaked into later calls.
        chat_params: Dict[str, Any] = dict(params) if params else {}
        if is_video:
            chat_params.update(
                {
                    "use_image_id": False,
                    "max_slice_nums": 1,  # use 1 if cuda OOM and video resolution > 448*448
                }
            )

        # Kept outside the ``except`` clause: a name bound by ``except ... as``
        # is unbound when the handler exits, so it cannot be returned later.
        error: Optional[str] = None
        try:
            logger.info(f"Images: {len(images)}")
            logger.info(f"Question: {question}")
            messages = [{"role": "user", "content": [*images, question]}]
            answer = self._model.chat(
                image=None,
                msgs=messages,
                tokenizer=self._tokenizer,
                **chat_params,
            )
        except Exception as exc:
            # Top-level boundary for inference: report the failure to the
            # caller in the response instead of propagating it.
            error = str(exc)
            answer = f"Error: {error}"
            logger.exception(answer)
        finally:
            # Drop Python garbage and cached CUDA blocks after every request.
            gc.collect()
            torch.cuda.empty_cache()
        return {"answer": answer, "error": error}
# Single shared model wrapper, built at import time; both answer endpoints call model.run().
model = Model()
# FastAPI application object on which the route decorators below register handlers.
app = FastAPI()
@app.get("/")
def home():
    """Serve the static greeting shown at the API root."""
    greeting = "Welcome to MiniCPM-V-2_6 API"
    return greeting
@app.post("/answer/image/")
def answer_from_image(
    file: UploadFile = File(...),
    question: str = Body("Describe the image in detail."),
) -> Dict[str, Any]:
    """Answer a question about a single uploaded image.

    Args:
        file: Uploaded image file, opened with PIL.
        question: Prompt passed to the model together with the image.

    Returns:
        The ``{"answer": ..., "error": ...}`` dict produced by ``model.run``.
    """
    # The return annotation is a dict because FastAPI treats it as the
    # response model; the former ``-> str`` did not match what is returned.
    image = Image.open(file.file)
    logger.info(f"Image: {image.size}")
    return model.run([image], question)
@app.post("/answer/video/")
def answer_from_video(
    file: UploadFile = File(...),
    question: str = Body("Describe the video in detail."),
) -> Dict[str, Any]:
    """Answer a question about an uploaded video by sampling frames from it.

    The upload is written to a temporary ``.mp4`` in the working directory and
    decoded with OpenCV. Roughly two frames per second are sampled, then
    thinned uniformly to at most ``max_frames``. Frames whose longer side
    exceeds ``max_dim`` are downscaled with the aspect ratio preserved.

    Args:
        file: Uploaded video file.
        question: Prompt passed to the model together with the frames.

    Returns:
        The ``{"answer": ..., "error": ...}`` dict produced by ``model.run``.
    """
    max_dim = 1080
    # Every frame counts toward the model's input-length limit, so long clips
    # must be thinned before inference.
    # NOTE(review): 64 mirrors the upstream MiniCPM-V 2.6 video example - confirm/tune.
    max_frames = 64

    video_path = f"{uuid4().hex[:12]}.mp4"
    sampled = []
    is_resize_required: bool = False
    width = height = 0
    cap = None
    try:
        with open(video_path, "wb") as f:
            f.write(file.file.read())

        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
        width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
        logger.info(f"FPS: {fps}, Height: {height}, Width: {width}")

        if max(height, width) > max_dim:
            # Pin the longer side to max_dim and scale the other side to match.
            if width < height:
                width, height = int(max_dim * width / height), max_dim
            else:
                width, height = max_dim, int(max_dim * height / width)
            is_resize_required = True
            logger.info(f"New Height: {height}, New Width: {width}")

        # Keep about two frames per second. The stride is clamped to 1 so an
        # FPS below 2 (or an unreadable FPS of 0) cannot cause a modulo by zero.
        step = max(int(fps) // 2, 1) if fps > 0 else 1

        _index: int = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if _index % step == 0:
                sampled.append(frame)
            _index += 1
    finally:
        # Release the decoder and delete the temp file even when decoding fails.
        if cap is not None:
            cap.release()
        if os.path.exists(video_path):
            os.remove(video_path)

    if len(sampled) > max_frames:
        # Uniformly pick max_frames frames, each from the middle of its bucket.
        gap = len(sampled) / max_frames
        sampled = [sampled[int(i * gap + gap / 2)] for i in range(max_frames)]

    images = []
    for frame in sampled:
        if is_resize_required:
            frame = cv2.resize(frame, (width, height))
        # OpenCV decodes to BGR; PIL expects RGB.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        images.append(Image.fromarray(frame))
    logger.info(f"Total Frames: {len(images)}")

    return model.run(images, question, is_video=True)
Input Video given Dimension: 720 × 1280 Duration: 01:25
GPU: NVIDIA A100-SXM4-40GB
It looks like there are too many frames, so the input exceeds max_inp_length (8192); limiting the maximum number of frames to a smaller value should fix it.
@iceflame89 I take 2 frames every second, so 01:25 -> 170 frames, in which neither side (w or h) ever exceeds 1080 px. Please clarify what max_inp_length 8192 means.
Does it take into account both text and images?
Yes, both text and images count toward the input length. You can try resizing the frames to be smaller before inference (448*448 = 64 tokens and 1344*1344 = 640 tokens), or increasing max_inp_length (but this may make the model more prone to hallucinations).
是否已有关于该错误的issue或讨论? | Is there an existing issue / discussion for this?
该问题是否在FAQ中有解答? | Is there an existing answer for this in FAQ?
当前行为 | Current Behavior
Video question answering sometimes fails.
期望行为 | Expected Behavior
Inference should have completed successfully.
复现方法 | Steps To Reproduce
No response
运行环境 | Environment
备注 | Anything else?
No response