% NusaX paper, published in the EACL 2023 conference proceedings (pages 815--834).
% The funding, copyright and eventdate fields are not printed by classic BibTeX
% styles; they keep the acknowledgement, copyright and conference-date text out
% of the rendered "note" while preserving it in the database.
% NOTE(review): "address" holds a country rather than a publisher city; the
% correct value cannot be determined from this entry alone -- confirm against
% the publisher record before relying on it.
@inproceedings{a26912cfa0674ebea0544539e24d2bbd,
  author    = {Winata, Genta Indra and
               Aji, Alham Fikri and
               Cahyawijaya, Samuel and
               Mahendra, Rahmad and
               Koto, Fajri and
               Romadhony, Ade and
               Kurniawan, Kemal and
               Moeljadi, David and
               Prasojo, Radityo Eko and
               Fung, Pascale and
               Baldwin, Timothy and
               Lau, Jey Han and
               Sennrich, Rico and
               Ruder, Sebastian},
  title     = {{NusaX}: Multilingual Parallel Sentiment Dataset for 10 {Indonesian} Local Languages},
  booktitle = {{EACL} 2023 -- 17th Conference of the European Chapter of the Association for Computational Linguistics, Proceedings of the Conference},
  pages     = {815--834},
  year      = {2023},
  month     = may,
  publisher = {Association for Computational Linguistics (ACL)},
  address   = {United States},
  language  = {English},
  eventdate = {2023-05-02/2023-05-06},
  abstract  = {Natural language processing (NLP) has significant impact on society via technologies such as machine translation and search engines. Despite its success, NLP technology is only widely available for high-resource languages such as English and Mandarin Chinese, and remains inaccessible to many languages due to the unavailability of data resources and benchmarks. In this work, we focus on developing resources for languages of Indonesia. Despite being the second most linguistically-diverse country, most languages in Indonesia are categorized as endangered and some are even extinct. We develop the first-ever parallel resource for 10 low-resource languages in Indonesia. Our resource includes sentiment and machine translation datasets, and bilingual lexicons. We provide extensive analysis, and describe challenges for creating such resources. Our hope is that this work will spark more NLP research on Indonesian and other underrepresented languages.},
  funding   = {We thank Dea Adhista and all annotators who helped us in building the corpus. We are grateful to Alexander Gutkin and Xinyu Hua for feedback on a draft of this manuscript. This work has been partially funded by Kata.ai (001/SD/YGI-NLP/1/2022) and PF20-43679 Hong Kong PhD Fellowship Scheme, Research Grant Council, Hong Kong.},
  copyright = {{\textcopyright} 2023 Association for Computational Linguistics.},
}