rxtan2 / Koala-video-llm

BSD 3-Clause "New" or "Revised" License

Single Inference #4

Open insafim opened 2 months ago

insafim commented 2 months ago

I have written code for single inference, but I am not getting the expected output.

```
User Query: Please explain the video in detail.
Koala Response: What query are you answering?
```

Here is the code:

`""" Adapted from: https://github.com/Vision-CAIR/MiniGPT-4/blob/main/demo.py

Koala Single Video Inference """ import argparse import os import random import numpy as np import torch import torch.backends.cudnn as cudnn from koala.common.config import Config from koala.common.dist_utils import get_rank from koala.common.registry import registry from koala.conversation.conversation_video import Chat, Conversation, default_conversation, SeparatorStyle, conv_llava_llama_2 import decord decord.bridge.set_bridge('torch') import copy

Imports modules for registration

from koala.datasets.builders import from koala.models import from koala.processors import from koala.runners import from koala.tasks import *

def parse_args(): parser = argparse.ArgumentParser(description="Koala Inference") parser.add_argument("--cfg-path", default="./eval_configs/conversation_demo.yaml", help="Path to configuration file.") parser.add_argument("--video-path", default="/share/softwares/uzair/datasets/LoViQA/test_video3_converted/excercise_trick.mp4", help="Path to the input video file.") parser.add_argument("--gpu-id", type=int, default=0, help="Specify the GPU to load the model.") parser.add_argument("--model_type", type=str, default='vicuna', help="Specify LLM.")

parser.add_argument('--pretrained_weight_path', type=str, default="./koala_model.pth", help='Path to pretrained weight path')

parser.add_argument('--num_frames_per_clip', type=int, default=16, help='Specify how many frames to use per clip')
parser.add_argument('--num_segments', type=int, default=4, help='Specify number of video segments')
parser.add_argument('--hierarchical_agg_function', type=str, default="without-top-final-global-prompts-region-segment-full-dis-spatiotemporal-prompts-attn-early-attn-linear-learned", help='Specify function to merge global and clip visual representations')
parser.add_argument("--num-beams", type=int, default=1, help="Beam search numbers.")
parser.add_argument("--temperature", type=float, default=1.0, help="Sampling temperature.")

parser.add_argument("--prompt", type=str, default="Please explain the video in detail.", help="User query for video inference.")

parser.add_argument(
    "--options",
    nargs="+",
    help="override some settings in the used config, the key-value pair "
    "in xxx=yyy format will be merged into config file (deprecate), "
    "change to --cfg-options instead.",
)
args = parser.parse_args()
return args

========================================

Model Initialization

========================================

print('Initializing Chat') args = parse_args() cfg = Config(args)

model_config = cfg.model_cfg model_config.device_8bit = args.gpu_id model_cls = registry.get_model_class(model_config.arch) model = model_cls.from_config(model_config).to('cuda:{}'.format(args.gpu_id))

model.num_frames_per_clip = args.num_frames_per_clip # model.num_segments = args.num_segments # model.hierarchical_agg_function = args.hierarchical_agg_function # model.global_region_embed_weight = None #

model.initialize_visual_agg_function() #*

best_checkpoint = torch.load(args.pretrained_weight_path, map_location='cpu')['model_state_dict'] # pretrained_dict = {} # for k, v in best_checkpoint.items(): # pretrained_dict[k.replace('module.', '')] = v #

model_dict = model.state_dict() # model_dict.update(pretrained_dict) # model.load_state_dict(model_dict) # model.cuda().eval() #

vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train

vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg) chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id)) print('Initialization Finished')

========================================

Prepare conversation

if args.model_type == 'vicuna': chat_state = default_conversation.copy() else: chat_state = conv_llava_llama_2.copy()

Upload the single video

chat_state.system = "You are able to understand the visual content that the user provides. Follow the instructions carefully and explain your answers in detail." img_list = [] llm_message = chat.upload_video_without_audio(video_path, chat_state, img_list) #VIDEO_PATH

chat_state_seperate = copy.deepcopy(chat_state) user_question = args.prompt #TEXT_INPUT chat.ask(user_question, chat_state_seperate)

llm_message = chat.answer(conv=chat_state_seperate, img_list=img_list, num_beams=args.num_beams, temperature=args.temperature, max_new_tokens=300, max_length=2000)[0]

Display the response

print("User Query:", args.prompt) print("Koala Response:", llm_message) `

Can you please help me correct this?

rxtan2 commented 1 month ago

@insafim I have uploaded the correct version of demo.py now. I realized that it was the older version that I pushed previously. Please let me know if you face any issues.