EvolvingLMMs-Lab / LongVA

Long Context Transfer from Language to Vision
Apache License 2.0
182 stars 11 forks source link

Adapting VQAScore to Longva #9

Open qingfengcss opened 6 days ago

qingfengcss commented 6 days ago

I have problems in adapting VQAscore in Longva, when I add "{answer}<|im_end|>\n" or "answer" in the end of the prompt, the logits and the lables are wrong, could help me VQAscore in Longva?

from longva.longva.model.builder import load_pretrained_model
from longva.longva.mm_utils import tokenizer_image_token, process_images
from longva.longva.constants import IMAGE_TOKEN_INDEX, IGNORE_INDEX
from PIL import Image
from decord import VideoReader, cpu
import torch
import numpy as np
import copy
#from longva.longva.conversation import Conversation

max_frame_num = 16
prompt = "<|im_start|>system\nYou're an expert in data annotation.<|im_end|>\n<|im_start|>user\n<image>\n{}\n <|im_end|>\n<|im_start|>assistant\n"
question_template = "Here is the caption:[{}], help me determine if the caption matches the content of the video, please answer yes or no"
answer_template = 'yes'

prompt_question_template = "<|im_start|>system\nYou're an expert in data annotation.<|im_end|>\n<|im_start|>user\n<image>\n{}\n <|im_end|>\n<|im_start|>assistant\n"
prompt_answer_template = '{}<|im_end|>\n'
#prompt_answer_template = '{}'
def get_longva_QAscore_single_data(model, image_processor, tokenizer, videos_frames, captions, answers, device):
    gen_kwargs = {"do_sample": True, "temperature": 0.5, "top_p": None, "num_beams": 1, "use_cache": True, "max_new_tokens": 1024}
    vs = []
    for video_path in videos_frames:
        vr = VideoReader(video_path, ctx=cpu(0))
        total_frame_num = len(vr)
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frame_num, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        frames = vr.get_batch(frame_idx).asnumpy()
        vs.append(image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(model.device, dtype=torch.float16))
    modalities= ['video' for i in vs]
    # print(modalities)
    q = question_template.format(captions)
    a = answer_template.format(answers) 
    questions = prompt_question_template.format(q)
    answers = prompt_answer_template.format(a)
    prompts = questions + answers
    #prompts = questions
    input_ids = tokenizer_image_token(prompts, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0) 
    labels = copy.deepcopy(input_ids)
    # for label, qs in zip(labels, questions):
    tokenized_len = len(tokenizer_image_token(questions, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0))
    if questions[-1] == ' ':
        tokenized_len -= 1
    labels[:tokenized_len] = IGNORE_INDEX
    input_ids = input_ids[:tokenizer.model_max_length]
    labels = labels[:tokenizer.model_max_length]
    attention_mask = input_ids.ne(tokenizer.pad_token_id)

    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    position_ids = None
    past_key_values = None
    images = vs
    modalities = ['video']
    image_sizes= None
    (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = model.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes)
    with torch.no_grad():
        outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids = position_ids,
        past_key_values = past_key_values,
        inputs_embeds = inputs_embeds,
        labels = labels,
        images = images,
        image_sizes = image_sizes,
        modalities = modalities,
        dpo_forward = False,
        return_dict=True
        )
    # print(outputs.keys())
    logits = outputs.logits
    loss = outputs.loss
    return loss
if __name__ == '__main__':
    model_path = '/home/css/t2v_metrics/LongVA-7B-DPO'
    image_path = "local_demo/assets/lmms-eval.png"
    video_path = "/home/css/LongVA/local_demo/assets/dc_demo.mp4"
    device = 'cuda'
    max_frames_num = 16 # you can change this to several thousands so long you GPU memory can handle it :)
    # gen_kwargs = {"do_sample": True, "return_dict":True, "temperature": 0.5, "top_p": None, "num_beams": 1, "use_cache": True, "max_new_tokens": 1024}
    tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, "llava_qwen", device_map="cuda:0")
    videos = [video_path]
    descriptions = 'a man is dancing in a mountain'
    answers = 'yes'
    scores = get_longva_QAscore_single_data(model, image_processor, tokenizer, videos, descriptions, answers,device)