I have problems in adapting VQAscore in Longva, when I add "{answer}<|im_end|>\n" or "answer" in the end of the prompt, the logits and the lables are wrong, could help me VQAscore in Longva?
from longva.longva.model.builder import load_pretrained_model
from longva.longva.mm_utils import tokenizer_image_token, process_images
from longva.longva.constants import IMAGE_TOKEN_INDEX, IGNORE_INDEX
from PIL import Image
from decord import VideoReader, cpu
import torch
import numpy as np
import copy
#from longva.longva.conversation import Conversation
max_frame_num = 16
prompt = "<|im_start|>system\nYou're an expert in data annotation.<|im_end|>\n<|im_start|>user\n<image>\n{}\n <|im_end|>\n<|im_start|>assistant\n"
question_template = "Here is the caption:[{}], help me determine if the caption matches the content of the video, please answer yes or no"
answer_template = 'yes'
prompt_question_template = "<|im_start|>system\nYou're an expert in data annotation.<|im_end|>\n<|im_start|>user\n<image>\n{}\n <|im_end|>\n<|im_start|>assistant\n"
prompt_answer_template = '{}<|im_end|>\n'
#prompt_answer_template = '{}'
def get_longva_QAscore_single_data(model, image_processor, tokenizer, videos_frames, captions, answers, device):
gen_kwargs = {"do_sample": True, "temperature": 0.5, "top_p": None, "num_beams": 1, "use_cache": True, "max_new_tokens": 1024}
vs = []
for video_path in videos_frames:
vr = VideoReader(video_path, ctx=cpu(0))
total_frame_num = len(vr)
uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frame_num, dtype=int)
frame_idx = uniform_sampled_frames.tolist()
frames = vr.get_batch(frame_idx).asnumpy()
vs.append(image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(model.device, dtype=torch.float16))
modalities= ['video' for i in vs]
# print(modalities)
q = question_template.format(captions)
a = answer_template.format(answers)
questions = prompt_question_template.format(q)
answers = prompt_answer_template.format(a)
prompts = questions + answers
#prompts = questions
input_ids = tokenizer_image_token(prompts, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0)
labels = copy.deepcopy(input_ids)
# for label, qs in zip(labels, questions):
tokenized_len = len(tokenizer_image_token(questions, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0))
if questions[-1] == ' ':
tokenized_len -= 1
labels[:tokenized_len] = IGNORE_INDEX
input_ids = input_ids[:tokenizer.model_max_length]
labels = labels[:tokenizer.model_max_length]
attention_mask = input_ids.ne(tokenizer.pad_token_id)
input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
position_ids = None
past_key_values = None
images = vs
modalities = ['video']
image_sizes= None
(input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = model.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes)
with torch.no_grad():
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids = position_ids,
past_key_values = past_key_values,
inputs_embeds = inputs_embeds,
labels = labels,
images = images,
image_sizes = image_sizes,
modalities = modalities,
dpo_forward = False,
return_dict=True
)
# print(outputs.keys())
logits = outputs.logits
loss = outputs.loss
return loss
if __name__ == '__main__':
model_path = '/home/css/t2v_metrics/LongVA-7B-DPO'
image_path = "local_demo/assets/lmms-eval.png"
video_path = "/home/css/LongVA/local_demo/assets/dc_demo.mp4"
device = 'cuda'
max_frames_num = 16 # you can change this to several thousands so long you GPU memory can handle it :)
# gen_kwargs = {"do_sample": True, "return_dict":True, "temperature": 0.5, "top_p": None, "num_beams": 1, "use_cache": True, "max_new_tokens": 1024}
tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, "llava_qwen", device_map="cuda:0")
videos = [video_path]
descriptions = 'a man is dancing in a mountain'
answers = 'yes'
scores = get_longva_QAscore_single_data(model, image_processor, tokenizer, videos, descriptions, answers,device)
I have problems in adapting VQAscore in Longva, when I add "{answer}<|im_end|>\n" or "answer" in the end of the prompt, the logits and the lables are wrong, could help me VQAscore in Longva?