@inproceedings{8f37cc729bc84c919b2d40713eb8a659,
title = "Rethinking Chinese word segmentation: Tokenization, character classification, or wordbreak identification",
abstract = "This paper addresses two remaining challenges in Chinese word segmentation. The challenge in HLT is to find a robust segmentation method that requires no prior lexical knowledge and no extensive training to adapt to new types of data. The challenge in modelling human cognition and acquisition it to segment words efficiently without using knowledge of wordhood. We propose a radical method of word segmentation to meet both challenges. The most critical concept that we introduce is that Chinese word segmentation is the classification of a string of character-boundaries (CB{\textquoteright}s) into either word-boundaries (WB{\textquoteright}s) and non-word-boundaries. In Chinese, CB{\textquoteright}s are delimited and distributed in between two characters. Hence we can use the distributional properties of CB among the background character strings to predict which CB{\textquoteright}s are WB{\textquoteright}s.",
author = "Huang, {Chu Ren} and Petr {\v S}imon and Hsieh, {Shu Kai} and Laurent Pr{\'e}vot",
note = "Publisher Copyright: {\textcopyright} 2007 Association for Computational Linguistics; 45th Annual Meeting of the Association for Computational Linguistics, ACL 2007 ; Conference date: 25-06-2007 Through 27-06-2007",
year = "2007",
month = jun,
language = "English",
series = "Proceedings of the Annual Meeting of the Association for Computational Linguistics",
publisher = "Association for Computational Linguistics (ACL)",
pages = "69--72",
editor = "Sophia Ananiadou",
booktitle = "Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics Companion Volume Proceedings of the Demo and Poster Sessions",
address = "United States",
}