@inproceedings{8a431973463e4492b173953e948bee82,
title = "Cross-Modal Audio-Visual Co-Learning for Text-Independent Speaker Verification",
abstract = "Visual speech (i.e., lip motion) is highly related to auditory speech due to the co-occurrence and synchronization in speech production. This paper investigates this correlation and proposes a cross-modal speech co-learning paradigm. The primary motivation of our cross-modal co-learning method is modeling one modality aided by exploiting knowledge from another modality. Specifically, two cross-modal boosters are introduced based on an audio-visual pseudo-siamese structure to learn the modality-transformed correlation. Inside each booster, a max-feature-map embedded Transformer variant is proposed for modality alignment and enhanced feature generation. The network is co-learned both from scratch and with pretrained models. Experimental results on the test scenarios demonstrate that our proposed method achieves around 60% and 20% average relative performance improvement over baseline unimodal and fusion systems, respectively.",
keywords = "co-learning, cross-modal, lip biometrics, speaker verification, visual speech",
author = "Meng Liu and Lee, {Kong Aik} and Longbiao Wang and Hanyi Zhang and Chang Zeng and Jianwu Dang",
note = "Publisher Copyright: {\textcopyright} 2023 IEEE.; 48th IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2023 ; Conference date: 04-06-2023 Through 10-06-2023",
year = "2023",
month = may,
doi = "10.1109/ICASSP49357.2023.10095883",
language = "English",
series = "ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
booktitle = "ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing, Proceedings",
}