Open yogkul2000 opened 1 month ago
I am getting "" as output, can't seem to figure out the issue.
import torch from videollava.conversation import conv_templates, SeparatorStyle from videollava.model.builder import load_pretrained_model from videollava.constants import DEFAULT_IM_START_TOKEN, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_END_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_VID_START_TOKEN, DEFAULT_VID_END_TOKEN from videollava.utils import disable_torch_init from videollava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria import argparse, json, os from tqdm import tqdm def inference_single_video(video_path, inp, model, processor): disable_torch_init() if model.config.mm_use_x_start_end: qs = DEFAULT_VID_START_TOKEN + ''.join([DEFAULT_IMAGE_TOKEN]*8) + DEFAULT_VID_END_TOKEN + '\n' + inp else: qs = ''.join([DEFAULT_IMAGE_TOKEN]*8) + '\n' + inp conv_mode = "llava_v0" args.conv_mode = conv_mode conv = conv_templates[args.conv_mode].copy() conv.append_message(conv.roles[0], qs) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() video_tensor = processor['video'].preprocess(video_path, return_tensors='pt')['pixel_values'][0].half().to("cuda") # print(video_tensor.shape) input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to("cuda") stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 keywords = [stop_str] stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) with torch.inference_mode(): output_ids = model.generate( input_ids, images=[video_tensor], do_sample=True, temperature=0.2, max_new_tokens=1024, use_cache=True, stopping_criteria=[stopping_criteria]) input_token_len = input_ids.shape[1] n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() if n_diff_input_output > 0: print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] outputs = outputs.strip() if outputs.endswith(stop_str): outputs = outputs[:-len(stop_str)] outputs = outputs.strip() #print(outputs) return outputs answer_prompt = { # "multi-choice": "\nBest Option:", # The old version "multi-choice": "\nPlease directly give the best option:", "yes_no": "\nPlease answer yes or no:", # "caption_matching": "\nBest Option:", #The old version "caption_matching": "\nPlease directly give the best option:", "captioning": "" # The answer "Generated Caption:" is already contained in the question } if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--video_path', default='/scratch/ykulka10/TempCompass/videos') parser.add_argument('--output_path', default='predictions/ppo') parser.add_argument('--task_type', default='multi-choice', choices=['multi-choice', 'captioning', 'caption_matching', 'yes_no']) args = parser.parse_args() # Loading questions question_path = f"../questions/{args.task_type}.json" with open(question_path, 'r') as f: input_datas = json.load(f) if not os.path.exists(args.output_path): os.makedirs(args.output_path) pred_file = f"{args.output_path}/{args.task_type}.json" # Loading existing predictions if os.path.isfile(pred_file): with open(f"{args.output_path}/{args.task_type}.json", 'r') as f: predictions = json.load(f) else: predictions = {} # Loading Video-LLaVA model_path = '/scratch/ykulka10/vlm_rlaif_video_llava_7b' device = 'cuda' model_name = get_model_name_from_path(model_path) tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name, device=device) for vid, data in tqdm(input_datas.items()): if vid not in predictions: predictions[vid] = {} video_path = os.path.join(args.video_path, f'{vid}.mp4') for dim, questions in data.items(): predictions[vid][dim] = [] for question in questions: inp = question['question'] + answer_prompt[args.task_type] video_llm_pred = inference_single_video(video_path, inp, model, processor) predictions[vid][dim].append({'question': question['question'], 'answer': question['answer'], 'prediction': video_llm_pred}) with open(pred_file, 'w') as f: json.dump(predictions, f, indent=4)```
Above is the input prompt to the model.
I am getting "" as output, can't seem to figure out the issue.