% Conference paper describing the Curras (Palestinian) and Baladi (Lebanese) corpora.
% Compound surname "El Haff" is kept together with a real brace group.
% The former series field was removed because it repeated booktitle verbatim.
@inproceedings{bf60d55bcc524deb9ad23a170c11870f,
  author    = {{El Haff}, Karim and Jarrar, Mustafa and Hammouda, Tymaa and Zaraket, Fadi},
  title     = {{Curras} + {Baladi}: Towards a {Levantine} Corpus},
  booktitle = {2022 Language Resources and Evaluation Conference, {LREC} 2022},
  editor    = {Calzolari, Nicoletta and Bechet, Frederic and Blache, Philippe and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, Helene and Odijk, Jan and Piperidis, Stelios},
  publisher = {European Language Resources Association (ELRA)},
  year      = {2022},
  pages     = {769--778},
  language  = {English},
  keywords  = {Annotated Corpus, Arabic Dialect, Arabic morphology, Lebanese Arabic, Levantine, Palestinian Arabic},
  abstract  = {This paper presents two-fold contributions: a full revision of the Palestinian morphologically annotated corpus (Curras), and a newly annotated Lebanese corpus (Baladi). Both corpora can be used as a more general Levantine corpus. Baladi consists of around 9.6K morphologically annotated tokens. Each token was manually annotated with several morphological features and using LDC's SAMA lemmas and tags. The inter-annotator evaluation on most features illustrates 78.5\% Kappa and 90.1\% F1-Score. Curras was revised by refining all annotations for accuracy, normalization and unification of POS tags, and linking with SAMA lemmas. This revision was also important to ensure that both corpora are compatible and can help to bridge the nuanced linguistic gaps that exist between the two highly mutually intelligible dialects. Both corpora are publicly available through a web portal.},
  note      = {Publisher Copyright: {\textcopyright} European Language Resources Association (ELRA), licensed under CC-BY-NC-4.0; 13th International Conference on Language Resources and Evaluation Conference, LREC 2022; Conference date: 20-06-2022 Through 25-06-2022},
}