@inproceedings{a94dc6a5d6e54e1eb1f499cbc08a8833,
title = "InfiMM: Advancing Multimodal Understanding with an Open-Sourced Visual Language Model",
abstract = "In this work, we present InfiMM, an advanced Multimodal Large Language Model adapted to intricate vision-language tasks. InfiMM, inspired by the Flamingo architecture, distinguishes itself through its use of large-scale training data, a three-stage training strategy, and diverse large language models. This approach preserves Flamingo's foundational strengths while introducing augmented capabilities. Empirical evaluations across a variety of benchmarks underscore InfiMM's remarkable capability in multimodal understanding. The code and model can be found at: https://huggingface.co/Infi-MM.",
author = "Haogeng Liu and Quanzeng You and Yiqi Wang and Xiaotian Han and Bohan Zhai and Yongfei Liu and Wentao Chen and Yiren Jian and Yunzhe Tao and Jianbo Yuan and Ran He and Hongxia Yang",
note = "Publisher Copyright: {\textcopyright} 2024 Association for Computational Linguistics; Findings of the 62nd Annual Meeting of the Association for Computational Linguistics, ACL 2024; Conference date: 11-08-2024 through 16-08-2024",
year = "2024",
month = aug,
doi = "10.18653/v1/2024.findings-acl.27",
language = "English",
series = "Proceedings of the Annual Meeting of the Association for Computational Linguistics",
publisher = "Association for Computational Linguistics (ACL)",
pages = "485--492",
editor = "Lun-Wei Ku and Andre Martins and Vivek Srikumar",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
address = "Bangkok, Thailand",
}