% Conference paper in the ICME 2023 proceedings (IEEE Computer Society).
% NOTE(review): `address` holds a country rather than the publisher's city,
% which is what classic BibTeX styles expect -- confirm and replace.
@inproceedings{689169771f3b408388878f7247b53e4d,
  author    = {You, Jiuxiang and Yang, Zhenguo and Li, Qing and Liu, Wenyin},
  title     = {A Retriever-Reader Framework with Visual Entity Linking for Knowledge-Based Visual Question Answering},
  booktitle = {Proceedings - 2023 {IEEE} International Conference on Multimedia and Expo, {ICME} 2023},
  series    = {Proceedings - {IEEE} International Conference on Multimedia and Expo},
  publisher = {IEEE Computer Society},
  address   = {United States},
  year      = {2023},
  month     = aug,
  pages     = {13--18},
  doi       = {10.1109/ICME55011.2023.00011},
  language  = {English},
  keywords  = {Entity linking, Knowledge graph, VQA},
  abstract  = {In this paper, we propose a Retriever-Reader framework with Visual Entity Linking (RR-VEL) for knowledge-based visual question answering. Given images and original questions, the visual entity linking (VEL) module extracts key entities in images to replace the question referents for semantic disambiguation, achieving entity-oriented queries with explicit entities. Furthermore, the Retriever encodes the queries and knowledge items by Bert with a feed-forward layer, and obtains a set of knowledge candidates. The Reader encodes the questions with image captions and knowledge candidates in two branches, which avoids their interference during self-attentive encoding. Finally, the decoder of Reader fuses the encoded features to generate answers. Extensive experiments conducted on the two public datasets show that our method significantly outperforms the existing baselines.},
  note      = {Conference date: 10--14 July 2023. Publisher copyright: {\textcopyright} 2023 IEEE.},
}