pip install -r requirements.txt
git lfs install
git clone https://huggingface.co/CompVis/stable-diffusion-v1-4 checkpoints/stable-diffusion-v1-4
To fine-tune the text-to-image diffusion models for text-to-video generation, run this command:
accelerate launch train_camel.py --config="configs/loveu-tgve-2023/DAVIS_480p/gold-fish.yaml"
We employ UMTScore for textual alignment evaluation and CLIPScore for frame consistency evaluation.
% FETV benchmark paper, an arXiv preprint. The arXiv identifier is stored in
% the eprint fields (plus a resolvable url) instead of being embedded in a
% journal field, so styles can format and link it themselves.
@misc{liu2023fetv,
  author        = {Liu, Yuanxin and Li, Lei and Ren, Shuhuai and Gao, Rundong and Li, Shicheng and Chen, Sishuo and Sun, Xu and Hou, Lu},
  title         = {{FETV}: A Benchmark for Fine-Grained Evaluation of Open-Domain Text-to-Video Generation},
  year          = {2023},
  eprint        = {2311.01813},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2311.01813},
}
% Tune-A-Video conference paper (IEEE/CVF International Conference on Computer
% Vision, 2023). The title is stored in Title Case with the method name
% brace-protected so sentence-casing styles cannot lowercase it.
@inproceedings{wu2023tune,
  author    = {Wu, Jay Zhangjie and Ge, Yixiao and Wang, Xintao and Lei, Stan Weixian and Gu, Yuchao and Shi, Yufei and Hsu, Wynne and Shan, Ying and Qie, Xiaohu and Shou, Mike Zheng},
  title     = {{Tune-A-Video}: One-Shot Tuning of Image Diffusion Models for Text-to-Video Generation},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision},
  pages     = {7623--7633},
  year      = {2023},
}