import torch
from mplug_owl_video.modeling_mplug_owl import MplugOwlForConditionalGeneration
from transformers import AutoTokenizer
from mplug_owl_video.processing_mplug_owl import MplugOwlImageProcessor, MplugOwlProcessor
# Single-video captioning demo for mPLUG-Owl: load the checkpoint, build one
# prompt that references one video, generate, and print the decoded text.

# Checkpoint location shared by the model, the image processor and the tokenizer.
pretrained_ckpt = 'mplug-owl-llama-7b-video'

# Load the weights in bfloat16.
# `device_map` is the keyword `from_pretrained` uses for device placement; the
# earlier spelling `devices_map` is not that keyword, so no automatic placement
# was being requested.
# NOTE(review): device_map="auto" is resolved through accelerate's
# infer_auto_device_map. If that step fails in your environment, try a newer
# accelerate or an explicit map such as {"": 0} -- not verified here.
model = MplugOwlForConditionalGeneration.from_pretrained(
    pretrained_ckpt,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("model load success")

# Release cached CUDA blocks left over from loading, when the API is available.
if hasattr(torch.cuda, 'empty_cache'):
    torch.cuda.empty_cache()

# The processor pairs the image/video preprocessing with the text tokenizer.
image_processor = MplugOwlImageProcessor.from_pretrained(pretrained_ckpt)
tokenizer = AutoTokenizer.from_pretrained(pretrained_ckpt)
processor = MplugOwlProcessor(image_processor, tokenizer)
print("tokenizer load success")

# A human/AI template organizes the context as a multi-turn conversation.
# <|video|> is the placeholder that marks where a video goes in the prompt.
prompts = [
'''The following is a conversation between a curious human and AI assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
Human: <|video|>
Human: short caption the highlight monent in the video
AI: ''']

# One video path per <|video|> placeholder used above.
video_list = ['test.mp4']

# Keyword arguments forwarded unchanged to model.generate().
generate_kwargs = {
    'do_sample': True,
    'top_k': 1,
    'max_length': 256,
}

inputs = processor(text=prompts, videos=video_list, num_frames=4, return_tensors='pt')
# Cast float32 tensors to bfloat16 to match the model dtype; other tensors
# keep their dtype.
inputs = {k: v.bfloat16() if v.dtype == torch.float else v for k, v in inputs.items()}
# Move every input tensor to the device reported by the model.
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Inference only: no gradients are needed.
with torch.no_grad():
    res = model.generate(**inputs, **generate_kwargs)

# Decode the first (and only) generated sequence.
sentence = tokenizer.decode(res.tolist()[0], skip_special_tokens=True)
print(sentence)
Traceback (most recent call last):
File "infer_video.py", line 8, in <module>
model = MplugOwlForConditionalGeneration.from_pretrained(
File "/home/qspace/conda/lib/python3.8/site-packages/transformers/modeling_utils.py", line 2732, in from_pretrained
device_map = infer_auto_device_map(model, dtype=torch_dtype if not load_in_8bit else torch.int8, **kwargs)
File "/home/qspace/conda/lib/python3.8/site-packages/accelerate/utils/modeling.py", line 387, in infer_auto_device_map
max_layer_size, max_layer_names = get_max_layer_size(modules_to_treat, module_sizes, no_split_module_classes)
File "/home/qspace/conda/lib/python3.8/site-packages/accelerate/utils/modeling.py", line 258, in get_max_layer_size
modules_children = list(module.named_children())
AttributeError: 'Parameter' object has no attribute 'named_children'
Traceback (most recent call last):
File "infer_video.py", line 8, in <module>
model = MplugOwlForConditionalGeneration.from_pretrained(
File "/home/qspace/conda/lib/python3.8/site-packages/transformers/modeling_utils.py", line 2732, in from_pretrained
device_map = infer_auto_device_map(model, dtype=torch_dtype if not load_in_8bit else torch.int8, **kwargs)
File "/home/qspace/conda/lib/python3.8/site-packages/accelerate/utils/modeling.py", line 387, in infer_auto_device_map
max_layer_size, max_layer_names = get_max_layer_size(modules_to_treat, module_sizes, no_split_module_classes)
File "/home/qspace/conda/lib/python3.8/site-packages/accelerate/utils/modeling.py", line 258, in get_max_layer_size
modules_children = list(module.named_children())
AttributeError: 'Parameter' object has no attribute 'named_children'
相关资料说这个报错是因为显存不够；我的显卡是 A100，显存 40G。