In this paper, we introduced VisualNews, the largest and most diverse news image captioning dataset. We also proposed VisualNews-Captioner, increasing CIDEr by 10+ points with fewer parameters than competing methods.
% arXiv preprint; the identifier is carried by the eprint fields below.
@misc{liu2020visualnews,
  author        = {Liu, Fuxiao and Wang, Yinghan and Wang, Tianlu and Ordonez, Vicente},
  title         = {{VisualNews}: Benchmark and Challenges in Entity-aware Image Captioning},
  year          = {2020},
  eprint        = {2010.03743},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}
We introduce COVID-VTS, a fact-checking dataset for short video platforms. We also propose TwtrDetective, a new explainable fact-checking framework for short videos. It is closely related to your work.
% arXiv preprint: the arXiv id lives in eprint/archivePrefix rather than in a
% fake journal name. NOTE(review): primaryClass is not stated in the original
% entry; add it once confirmed against the arXiv listing.
@misc{liu2023covid,
  author        = {Liu, Fuxiao and Yacoob, Yaser and Shrivastava, Abhinav},
  title         = {{COVID-VTS}: Fact Extraction and Verification on Short Video Platforms},
  year          = {2023},
  eprint        = {2302.07919},
  archivePrefix = {arXiv},
}
In this paper, we introduced LRV-Instruction, a more robust visual instruction tuning dataset. Fine-tuning current LMMs on our data mitigates hallucination.
% arXiv preprint: the arXiv id lives in eprint/archivePrefix rather than in a
% fake journal name. NOTE(review): primaryClass is not stated in the original
% entry; add it once confirmed against the arXiv listing.
@misc{liu2023aligning,
  author        = {Liu, Fuxiao and Lin, Kevin and Li, Linjie and Wang, Jianfeng and Yacoob, Yaser and Wang, Lijuan},
  title         = {Aligning Large Multi-Modal Model with Robust Instruction Tuning},
  year          = {2023},
  eprint        = {2306.14565},
  archivePrefix = {arXiv},
}
(4) DocumentCLIP: Linking Figures and Main Body Text in Reflowed Documents
In this paper, we propose a large multimodal Wikipedia article dataset to help align images with text.
% arXiv preprint: the arXiv id lives in eprint/archivePrefix rather than in a
% fake journal name. NOTE(review): primaryClass is not stated in the original
% entry; add it once confirmed against the arXiv listing.
@misc{liu2023documentclip,
  author        = {Liu, Fuxiao and Tan, Hao and Tensmeyer, Chris},
  title         = {{DocumentCLIP}: Linking Figures and Main Body Text in Reflowed Documents},
  year          = {2023},
  eprint        = {2306.06306},
  archivePrefix = {arXiv},
}
(1) Visual News: Benchmark and Challenges in News Image Captioning. (EMNLP 2021)
In this paper, we introduced VisualNews, the largest and most diverse news image captioning dataset. We also proposed VisualNews-Captioner, increasing CIDEr by 10+ points with fewer parameters than competing methods.
% arXiv preprint; the identifier is carried by the eprint fields below.
% NOTE(review): the key liu2020visualnews is also defined earlier in this file;
% BibTeX reports repeated entries, so keep only one copy in a real .bib file.
@misc{liu2020visualnews,
  author        = {Liu, Fuxiao and Wang, Yinghan and Wang, Tianlu and Ordonez, Vicente},
  title         = {{VisualNews}: Benchmark and Challenges in Entity-aware Image Captioning},
  year          = {2020},
  eprint        = {2010.03743},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}
(2) COVID-VTS: Fact Extraction and Verification on Short Video Platforms. (EACL 2023)
We introduce COVID-VTS, a fact-checking dataset for short video platforms. We also propose TwtrDetective, a new explainable fact-checking framework for short videos. It is closely related to your work.
% arXiv preprint: the arXiv id lives in eprint/archivePrefix rather than in a
% fake journal name. NOTE(review): primaryClass is not stated in the original
% entry; add it once confirmed against the arXiv listing.
% NOTE(review): the key liu2023covid is also defined earlier in this file;
% BibTeX reports repeated entries, so keep only one copy in a real .bib file.
@misc{liu2023covid,
  author        = {Liu, Fuxiao and Yacoob, Yaser and Shrivastava, Abhinav},
  title         = {{COVID-VTS}: Fact Extraction and Verification on Short Video Platforms},
  year          = {2023},
  eprint        = {2302.07919},
  archivePrefix = {arXiv},
}
(3) Aligning Large Multi-Modal Model with Robust Instruction Tuning.
In this paper, we introduced LRV-Instruction, a more robust visual instruction tuning dataset. Fine-tuning current LMMs on our data mitigates hallucination.
% arXiv preprint: the arXiv id lives in eprint/archivePrefix rather than in a
% fake journal name. NOTE(review): primaryClass is not stated in the original
% entry; add it once confirmed against the arXiv listing.
% NOTE(review): the key liu2023aligning is also defined earlier in this file;
% BibTeX reports repeated entries, so keep only one copy in a real .bib file.
@misc{liu2023aligning,
  author        = {Liu, Fuxiao and Lin, Kevin and Li, Linjie and Wang, Jianfeng and Yacoob, Yaser and Wang, Lijuan},
  title         = {Aligning Large Multi-Modal Model with Robust Instruction Tuning},
  year          = {2023},
  eprint        = {2306.14565},
  archivePrefix = {arXiv},
}
(4) DocumentCLIP: Linking Figures and Main Body Text in Reflowed Documents
In this paper, we propose a large multimodal Wikipedia article dataset to help align images with text.
% arXiv preprint: the arXiv id lives in eprint/archivePrefix rather than in a
% fake journal name. NOTE(review): primaryClass is not stated in the original
% entry; add it once confirmed against the arXiv listing.
% NOTE(review): the key liu2023documentclip is also defined earlier in this
% file; BibTeX reports repeated entries, so keep only one copy in a real .bib file.
@misc{liu2023documentclip,
  author        = {Liu, Fuxiao and Tan, Hao and Tensmeyer, Chris},
  title         = {{DocumentCLIP}: Linking Figures and Main Body Text in Reflowed Documents},
  year          = {2023},
  eprint        = {2306.06306},
  archivePrefix = {arXiv},
}