% Workshop paper introducing the CompLex-ZH dataset (TSAR 2024 proceedings).
% Cleaned from an automated export: the escaped braces inside author names were
% removed (they would print as literal curly braces), names use "Last, First"
% form, case-sensitive title words are brace-protected, and the series field
% that only repeated booktitle was dropped.
@inproceedings{e742aa2a74fa49c79467467e536662df,
  author    = {Qiu, Le and Guo, Shanyue and Wong, Tak Sum and Chersoni, Emmanuele and Lee, John S. Y. and Huang, Chu Ren},
  title     = {{CompLex-ZH}: A New Dataset for Lexical Complexity Prediction in {Mandarin} and {Cantonese}},
  booktitle = {TSAR 2024 - 3rd Workshop on Text Simplification, Accessibility and Readability, Proceedings of the Workshop},
  editor    = {Shardlow, Matthew and Saggion, Horacio and Alva-Manchego, Fernando and Zampieri, Marcos and North, Kai and Stajner, Sanja and Stodden, Regina},
  pages     = {20--26},
  publisher = {Association for Computational Linguistics (ACL)},
  address   = {United States},
  year      = {2024},
  month     = nov,
  doi       = {10.18653/v1/2024.tsar-1.3},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2024 Association for Computational Linguistics.; 3rd Workshop on Text Simplification, Accessibility and Readability, TSAR 2024 ; Conference date: 15-11-2024},
  abstract  = {The prediction of lexical complexity in context is assuming an increasing relevance in Natural Language Processing research, since identifying complex words is often the first step of text simplification pipelines. To the best of our knowledge, though, datasets annotated with complex words are available only for English and for a limited number of Western languages. In our paper, we introduce CompLex-ZH, a dataset including words annotated with complexity scores in sentential contexts for Chinese. Our data include sentences in Mandarin and Cantonese, which were selected from a variety of sources and textual genres. We provide a first evaluation with baselines combining hand-crafted and language models-based features.},
}