Open · lucasxu777 opened this issue 4 months ago
Could you please share your full script? I need to check `model_name`. This is usually caused by loading LLM weights via a mismatched `Videollama2xxxForCausalLM` class.
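For context: the builder picks a `Videollama2*ForCausalLM` class from the model name/base it is given, so a wrong pairing loads one architecture's weights into another class. Below is a rough, hypothetical sketch of that dispatch idea; the real logic lives in `videollama2/model/builder.py`, and the conditions and the Mistral class name used here are assumptions, not the repo's actual code.

```python
from typing import Optional

# Illustrative sketch only -- NOT the actual builder.py logic.
# The point: model_name / model_base decide which Videollama2*ForCausalLM
# architecture the checkpoint weights are loaded into.
def guess_videollama2_class(model_name: str, model_base: Optional[str]) -> str:
    name = (model_base or model_name).lower()
    if "vicuna" in name or "llama" in name:
        return "Videollama2LlamaForCausalLM"    # expects Llama/Vicuna-shaped weights
    if "mistral" in name:
        return "Videollama2MistralForCausalLM"  # assumed class name; expects Mistral-shaped weights
    return "Videollama2MistralForCausalLM"      # assumed default for this checkpoint family

# Passing model_base="lmsys/vicuna-7b-v1.5" alongside a Mistral-based checkpoint
# routes Mistral weights into the Llama class, hence the load mismatch.
print(guess_videollama2_class("VideoLLaMA2-7B", "lmsys/vicuna-7b-v1.5"))
```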
Here is the code. I only changed `model_base` from `None` to the vicuna checkpoint.
```python
import os
import sys
import torch
import transformers
import pandas as pd

sys.path.append('/content/VideoLLaMA2')

from videollama2.conversation import conv_templates
from videollama2.constants import DEFAULT_MMODAL_TOKEN, MMODAL_TOKEN_INDEX
from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, process_video, process_image
from videollama2.model.builder import load_pretrained_model


def inference():
    # Collect videos and pair each with the same question.
    video_dir = '/content/drive/My Drive/ColabNotebooks/sample_data/data/beta_vids/'
    video_paths = [os.path.join(video_dir, f) for f in os.listdir(video_dir) if f.endswith('.mp4') or f.endswith('.MOV')]
    questions = ['On the scale of 1-5 how anxious is the person in the video?'] * len(video_paths)
    modal_list = ['video'] * len(video_paths)

    # Load the model (model_base changed from None to vicuna).
    model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'
    model_name = get_model_name_from_path(model_path)
    model_base = "lmsys/vicuna-7b-v1.5"
    tokenizer, model, processor, context_len = load_pretrained_model(model_path, model_base, model_name)
    model = model.to('cuda:0')
    conv_mode = 'llama_2'

    results = []
    for path, question in zip(video_paths, questions):
        # Preprocess the video and build the prompt with the video token prepended.
        tensor = process_video(path, processor, model.config.image_aspect_ratio).to(dtype=torch.float16, device='cuda', non_blocking=True)
        default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
        modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
        tensor = [tensor]

        question = default_mm_token + "\n" + question
        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], question)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').unsqueeze(0).to('cuda:0')

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images_or_videos=tensor,
                modal_list=['video'],
                do_sample=True,
                temperature=0.2,
                max_new_tokens=1024,
                use_cache=True,
            )

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        results.append({'video': path, 'output': outputs[0]})

    # Save all outputs to CSV.
    df = pd.DataFrame(results)
    output_file = '/content/drive/My Drive/ColabNotebooks/gemma_anxious_result.csv'
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")


if __name__ == "__main__":
    inference()
```
Setting `model_base=lmsys/vicuna-7b-v1.5` returns the Llama-based VideoLLaMA2 model, i.e., `Videollama2LlamaForCausalLM`. However, `DAMO-NLP-SG/VideoLLaMA2-7B` was trained on top of `mistralai/Mistral-7B-Instruct-v0.2`, which does not match the architecture of `Videollama2LlamaForCausalLM`.
If you want to use `DAMO-NLP-SG/VideoLLaMA2-7B` for inference, please set `model_base=None`.
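For that first option, a minimal correction to the script above (assuming everything else is left unchanged) is simply to drop the separate base model, since the checkpoint already bundles its Mistral-based LLM:

```python
from videollama2.mm_utils import get_model_name_from_path
from videollama2.model.builder import load_pretrained_model

# DAMO-NLP-SG/VideoLLaMA2-7B bundles its own Mistral-based LLM,
# so no separate model_base is needed.
model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'
model_name = get_model_name_from_path(model_path)
tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name)
```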
If you want to use VideoLLaMA2 with `lmsys/vicuna-7b-v1.5`, you need to retrain a new VideoLLaMA2 on top of `lmsys/vicuna-7b-v1.5`.
How can we use vicuna as the LLM in the inference code? Below is the error message. Thanks.