Official implementation for paper:
Direct Preference Optimization of Video Large Multimodal Models from Language Model Reward
In our Hugging Face repo, we release:
Datasets:
Models:
# setup requirements
source setup/setup_env.sh
set_path.sh
# Inference Example for DPO/SFT Model
```bash
cd llava_hound_dpo
sudo apt-get install ffmpeg
```
# Inference example: load a released checkpoint and ask one question about a video.
import os

from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
from inference.inference_utils import ModelInference, decode2frame

video_path = "examples/sample_msrvtt.mp4"

# options ["ShareGPTVideo/LLaVA-Hound-DPO", "ShareGPTVideo/LLaVA-Hound-SFT", "ShareGPTVideo/LLaVA-Hound-SFT-Image_only"]
model_path = "ShareGPTVideo/LLaVA-Hound-DPO"
model_name = get_model_name_from_path(model_path)

# CACHE_DIR must be set in the environment; if it is missing, the lookup
# below raises KeyError before any model loading starts.
tokenizer, model, processor, context_len = load_pretrained_model(
    model_path,
    model_base=None,
    model_name=model_name,
    cache_dir=os.environ["CACHE_DIR"],
)
inference_model = ModelInference(
    model=model,
    tokenizer=tokenizer,
    processor=processor,
    context_len=context_len,
)

question = "What is the evident theme in the video?"

# our pipeline
# The frame directory is the video path with its extension removed
# (here "examples/sample_msrvtt"); decode2frame presumably writes the
# extracted frames there -- confirm against inference_utils.
frame_dir, _ = os.path.splitext(video_path)
decode2frame(video_path, frame_dir, verbose=True)
response = inference_model.generate(
    question=question,
    modal_path=frame_dir,
    temperature=0,
)
print(response)

# using decord
# Here the video file itself is passed, together with the decord backend,
# instead of the pre-extracted frame directory.
response = inference_model.generate(
    question=question,
    modal_path=video_path,
    temperature=0,
    video_decode_backend="decord",
)
print(response)
To generate detailed video captions with our pretrained checkpoint, use:
# Captioning example: load the pretrained checkpoint and request a detailed
# caption using one of the template questions shipped with the project.
import os

import numpy as np

from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
from inference.inference_utils import ModelInference, decode2frame, detail_templates

video_path = "examples/sample_msrvtt.mp4"

model_path = "ShareGPTVideo/LLaVA-Hound-Pretrain"
model_name = get_model_name_from_path(model_path)

# CACHE_DIR must be set in the environment; if it is missing, the lookup
# below raises KeyError before any model loading starts.
tokenizer, model, processor, context_len = load_pretrained_model(
    model_path,
    model_base=None,
    model_name=model_name,
    cache_dir=os.environ["CACHE_DIR"],
)
inference_model = ModelInference(
    model=model,
    tokenizer=tokenizer,
    processor=processor,
    context_len=context_len,
)

# use pretrained template questions (one is picked at random on each run)
question = np.random.choice(detail_templates)

# our pipeline
# The frame directory is the video path with its extension removed
# (here "examples/sample_msrvtt"); decode2frame presumably writes the
# extracted frames there -- confirm against inference_utils.
frame_dir, _ = os.path.splitext(video_path)
decode2frame(video_path, frame_dir, verbose=True)
response = inference_model.generate(
    question=question,
    modal_path=frame_dir,
    temperature=0,
)
print(response)

# using decord
# Here the video file itself is passed, together with the decord backend,
# instead of the pre-extracted frame directory.
response = inference_model.generate(
    question=question,
    modal_path=video_path,
    temperature=0,
    video_decode_backend="decord",
)
print(response)
# setup data
source setup/setup_test_data.sh
# Set model_output_name and model_name before running the commands below;
# each pipeline script receives them as its two positional arguments.
# The expansions are quoted so values containing spaces or glob characters
# are passed through as a single argument each.
# Eval for official (a subset of 5k qa)
bash test/pipeline/outdomain_official_test_pipeline.sh \
"$model_output_name" \
"$model_name"
# Eval for our in-domain
bash test/pipeline/indomain_test_pipeline.sh \
"$model_output_name" \
"$model_name"
# Eval for our out-of-domain
bash test/pipeline/outdomain_test_pipeline.sh \
"$model_output_name" \
"$model_name"
Example of official testing with the DPO model:
# Arguments: output name for this run, then the model path to evaluate.
bash test/pipeline/outdomain_official_test_pipeline.sh videollava_dpo ShareGPTVideo/LLaVA-Hound-DPO
For more details, including discussion, testing of other SOTA models, and testing of customized models, refer to the test README.
For DPO training, refer to DPO data setup and training.
For pretraining + SFT, refer to Pretrain + SFT.
@misc{zhang2024direct,
title={Direct Preference Optimization of Video Large Multimodal Models from Language Model Reward},
author={Ruohong Zhang and Liangke Gui and Zhiqing Sun and Yihao Feng and Keyang Xu and Yuanhan Zhang and Di Fu and Chunyuan Li and Alexander Hauptmann and Yonatan Bisk and Yiming Yang},
year={2024},
eprint={2404.01258},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
The code is built upon the following projects:
Thanks for their great work!