common/transforms/sat_Olck-sat_FONIPA.xml

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
<!--
Copyright © 1991-2015 Unicode, Inc.
CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
For terms of use, see http://www.unicode.org/copyright.html
-->
<supplementalData>
	<version number="$Revision: 12347 $"/>
	<transforms>
		<transform source="sat_Olck" target="sat_FONIPA" direction="forward" alias="sat-fonipa-t-sat-olck">
			<tRule><![CDATA[
# Santali (Ol Chiki) → Santali (International Phonetic Alphabet)


# Output
# ------
# m mː n nː ɳ ɳː ɲ ɲː ŋ ŋː
# p pʰ pʼ b bʰ t tʰ tʼ d dʰ ʈ ʈʰ ɖ ɖʰ c cʰ cʼ k kʰ kʼ ɡ ʔ
# s sː h
# d͡ʒ
# ɽ r
# l lː
# w wː w̃ w̃ː
#
# i iː ĩ ĩː u uː ũ ũː
# e eː ẽ ẽː ə əː ə̃ ə̃ː o oː õ õː
# ɛ ɛː ɛ̃ ɛ̃ː ɔ ɔː ɔ̃ ɔ̃ː
# a aː ã ãː


# References
# ----------
# [1] Michael Everson: Final proposal to encode the Ol Chiki script
#     in the UCS.  ISO/IEC JTC1/SC2/WG2 Working Group Document N2984R,
#     September 21, 2005.  http://std.dkuug.dk/jtc1/sc2/wg2/docs/n2984.pdf
#
# [2] George L. Campbell: Compendium of the World's Languages.
#     Volume 2: Ladakhi to Zuni. ISBN 0-415-20297-3.  Taylor & Francis, 2000.
#     Pages 1454 to 1458.


# Notes
# -----
# According to [1] (page 3), ᱽ can only follow the four ejective
# consonants ᱵ /pʼ/, ᱡ /cʼ/, ᱫ /tʼ/, and ᱜ /kʼ/; these become
# ᱵᱽ /b/, ᱫᱽ /d/, ᱡᱽ /d͡ʒ/, and ᱜᱽ /ɡ/.  In online texts, however,
# we have occasionally encountered ᱽ following non-ejective plosives,
# for example after ᱯ /p/. These might possibly be typos.  Our rules
# try to be resilient and handle ᱯᱽ as /b/.
#
# According to [1] (page 2), U+1C7C PHAARKAA follows the four “glottal”
# consonants ᱵ /pʼ/, ᱡ /cʼ/, ᱫ /tʼ/, and ᱜ /kʼ/ (these are actually
# ejective, not glottal).  In online texts, however, we have frequently
# encountered ᱼ following non-ejective consonants.

# Set of all letters and marks.  The ejective rules below use it as a
# left context: a match means the character being converted is preceded
# by a letter or mark, i.e. it is not the first character of a word.
$inword = [[:L:][:M:]];

# Some online texts use a decomposed form of U+1C7A MU-GAAHLAA TTUDDAAG,
# writing ᱹ and ᱸ as two separate characters in either order.  Compose
# the pair into the single character ᱺ so that the rules further down
# only have to deal with one spelling.
ᱹᱸ → ᱺ ;
ᱸᱹ → ᱺ ;
# Pass boundary: the rules that follow run on the composed text.
::null();

# To simplify the rules below, enforce a uniform ordering of marks:
# whenever ᱻ or ᱼ is written directly before ᱹ, ᱸ or ᱺ, swap the two so
# that ᱹ/ᱸ/ᱺ always comes first.  The two captured segments cover the
# same six two-character sequences that could be listed one by one.
([ᱻᱼ]) ([ᱹᱸᱺ]) → $2 $1 ;
::null();

# Some online texts use U+1C7C PHAARKAA instead of U+1C7B RELAA for indicating
# long phonemes, presumably because the graphemes look similar in some fonts.
# Since phaarkaa is used for voicing ejectives and plosives (which cannot
# be lengthened), we rewrite phaarkaa to relaa.
#
# Only the ᱼ inside the braces is replaced.  The text to its left is
# context: one of the listed vowels or sonorants, optionally followed by
# any number of ᱹ/ᱸ/ᱺ.  ᱼ after any other letter is left alone here and
# is handled by the per-letter rules below.
[ᱚᱟᱤᱩᱮᱳᱶᱢᱝᱞᱱ] [ᱹᱸᱺ]* {ᱼ} → ᱻ ;
::null();

# ᱚ → /ɔ/.  ᱹ leaves the vowel unchanged, ᱸ and ᱺ both nasalize it, and
# a trailing ᱻ adds length.  Longer sequences are listed before shorter
# ones because the first matching rule wins.
ᱚᱹᱻ → ɔː ;
ᱚᱹ → ɔ ;
ᱚᱸᱻ → ɔ̃ː ;
ᱚᱸ → ɔ̃ ;
ᱚᱺᱻ → ɔ̃ː ;
ᱚᱺ → ɔ̃ ;
ᱚᱻ → ɔː ;
ᱚ → ɔ ;

# ᱛ is the plain (non-ejective) stop /t/.  ᱷ aspirates it; ᱽ voices it
# (tolerated even though ᱽ should only follow ejectives, see Notes); a
# stray ᱼ is consumed without effect.
# Unlike the four ejectives ᱵ ᱡ ᱫ ᱜ, a bare ᱛ is NOT voiced inside a
# word, so there is deliberately no $inword rule here: ᱥᱟᱱᱛᱟᱲᱤ must
# come out as santaɽi, not sandaɽi.
ᱛᱼ → t ;
ᱛᱷ → tʰ ;
ᱛᱽ → d ;
ᱛ → t ;

# ᱜ is one of the four ejectives.  With an explicit ᱼ it is always /kʼ/,
# with ᱷ it is /kʰ/, with ᱽ it is /ɡ/.  A bare ᱜ becomes /ɡ/ when it is
# preceded by a letter or mark and /kʼ/ otherwise.
ᱜᱼ → kʼ ;
ᱜᱷ → kʰ ;
ᱜᱽ → ɡ ;
$inword {ᱜ} → ɡ ;
ᱜ → kʼ ;

# ᱝ → /ŋ/, lengthened by a trailing ᱻ.
ᱝᱻ → ŋː ;
ᱝ → ŋ ;

# ᱞ → /l/, lengthened by a trailing ᱻ.
ᱞᱻ → lː ;
ᱞ → l ;

# ᱟ → /a/.  ᱹ changes the vowel to /ə/, ᱸ nasalizes /a/, and ᱺ yields
# the nasalized /ə̃/.  A trailing ᱻ adds length.  Longer sequences come
# first so the bare-letter rule does not pre-empt them.
ᱟᱹᱻ → əː ;
ᱟᱹ → ə ;
ᱟᱸᱻ → ãː ;
ᱟᱸ → ã ;
ᱟᱺᱻ → ə̃ː ;
ᱟᱺ → ə̃ ;
ᱟᱻ → aː ;
ᱟ → a ;

# ᱠ is the plain stop /k/.  ᱷ aspirates it, ᱽ voices it (resilience for
# texts that put ᱽ after a non-ejective, see Notes), and a following ᱼ
# is consumed without changing the output.
ᱠᱼ → k ;
ᱠᱷ → kʰ ;
ᱠᱽ → ɡ ;
ᱠ → k ;

# ᱡ is one of the four ejectives.  With an explicit ᱼ it is always /cʼ/,
# with ᱷ it is /cʰ/, with ᱽ it is /d͡ʒ/.  A bare ᱡ becomes /d͡ʒ/ when it
# is preceded by a letter or mark and /cʼ/ otherwise.
ᱡᱼ → cʼ ;
ᱡᱷ → cʰ ;
ᱡᱽ →  d͡ʒ ;
$inword {ᱡ} →  d͡ʒ ;
ᱡ → cʼ ;

# ᱢ → /m/, lengthened by a trailing ᱻ.
ᱢᱻ → mː ;
ᱢ → m ;

# According to [1], ᱣ is sometimes /v/ and sometimes /w/.
# TODO: Find out if there is a rule for this.
#
# ᱸ nasalizes and a trailing ᱻ lengthens, giving the four forms
# w wː w̃ w̃ː listed under Output.  Without the two long rules a ᱻ after
# ᱣ would be left untransliterated in the IPA output.  Longer sequences
# come first because the first matching rule wins.
ᱣᱸᱻ → w̃ː ;
ᱣᱸ → w̃ ;
ᱣᱻ → wː ;
ᱣ → w ;

# ᱤ → /i/.  ᱹ leaves the vowel unchanged, ᱸ and ᱺ both nasalize it, and
# a trailing ᱻ adds length.  Longer sequences are listed first.
ᱤᱹᱻ → iː ;
ᱤᱹ → i ;
ᱤᱸᱻ → ĩː ;
ᱤᱸ → ĩ ;
ᱤᱺᱻ → ĩː ;
ᱤᱺ → ĩ ;
ᱤᱻ → iː ;
ᱤ → i ;

# ᱥ → /s/, lengthened by a trailing ᱻ.
ᱥᱻ → sː ;
ᱥ → s ;

# According to [1], ᱦ is sometimes /h/ and sometimes /ʔ/.
# TODO: Find out if there is a rule for this.
# Until then ᱦ is always rendered as /h/.
ᱦ → h ;

# ᱧ → /ɲ/, lengthened by a trailing ᱻ.
ᱧᱻ → ɲː ;
ᱧ → ɲ ;

# ᱨ → /r/.  A trailing ᱻ is consumed but no length mark is emitted.
ᱨᱻ → r ;
ᱨ → r ;

# ᱩ → /u/.  ᱹ leaves the vowel unchanged, ᱸ and ᱺ both nasalize it, and
# a trailing ᱻ adds length.  Longer sequences are listed first.
ᱩᱹᱻ → uː ;
ᱩᱹ → u ;
ᱩᱸᱻ → ũː ;
ᱩᱸ → ũ ;
ᱩᱺᱻ → ũː ;
ᱩᱺ → ũ ;
ᱩᱻ → uː ;
ᱩ → u ;

# ᱪ is the plain stop /c/.  ᱷ aspirates it, ᱽ voices it to /d͡ʒ/
# (resilience for ᱽ after a non-ejective, see Notes), and a following ᱼ
# is consumed without changing the output.
ᱪᱼ → c ;
ᱪᱷ → cʰ ;
ᱪᱽ →  d͡ʒ ;
ᱪ → c ;

# ᱫ is one of the four ejectives.  With an explicit ᱼ it is always /tʼ/,
# with ᱽ it is /d/.  A bare ᱫ becomes /d/ when it is preceded by a letter
# or mark and /tʼ/ otherwise.
# ᱫᱷ is the voiced aspirate /dʰ/ listed under Output, parallel to
# ᱵᱷ → bʰ and ᱰᱷ → ɖʰ; the voiceless /tʰ/ is written ᱛᱷ.
ᱫᱼ → tʼ ;
ᱫᱷ → dʰ ;
ᱫᱽ → d ;
$inword {ᱫ} → d ;
ᱫ → tʼ ;

# ᱬ → /ɳ/, lengthened by a trailing ᱻ.
ᱬᱻ → ɳː ;
ᱬ → ɳ ;

# ᱭ is currently rendered as /h/, the same output as ᱦ.
# TODO: ᱵᱷᱭᱨᱚᱵ → bʰhrɔb seems unlikely; would be good to verify.
ᱭ → h ;

# ᱮ → /e/.  ᱹ opens the vowel to /ɛ/, ᱸ nasalizes /e/, and ᱺ yields the
# nasalized /ɛ̃/.  A trailing ᱻ adds length.  Longer sequences are listed
# first.
ᱮᱹᱻ → ɛː ;
ᱮᱹ → ɛ ;
ᱮᱺᱻ → ɛ̃ː ;
ᱮᱺ → ɛ̃ ;
ᱮᱸᱻ → ẽː ;
ᱮᱸ → ẽ ;
ᱮᱻ → eː ;
ᱮ → e ;

# ᱯ is the plain stop /p/.  ᱷ aspirates it.  ᱯᱽ is handled as /b/ for
# resilience against texts that put ᱽ after a non-ejective (see Notes),
# and a following ᱼ is consumed without changing the output.
ᱯᱼ → p ;
ᱯᱷ → pʰ ;
ᱯᱽ → b ;
ᱯ → p ;

# ᱰ → /ɖ/, aspirated by a following ᱷ.
ᱰᱷ → ɖʰ ;
ᱰ → ɖ ;

# ᱱ → /n/, lengthened by a trailing ᱻ.
ᱱᱻ → nː ;
ᱱ → n ;

# ᱲ → /ɽ/.  A trailing ᱻ is consumed but no length mark is emitted.
ᱲᱻ → ɽ ;
ᱲ → ɽ ;

# ᱳ → /o/.  ᱸ nasalizes it and a trailing ᱻ adds length.
# NOTE(review): unlike the other vowels there are no rules for ᱹ or ᱺ
# after ᱳ, so those marks would not be consumed here; confirm that this
# is intended.
ᱳᱸᱻ → õː ;
ᱳᱸ → õ ;
ᱳᱻ → oː ;
ᱳ → o ;

# ᱴ is the plain stop /ʈ/.  ᱷ aspirates it, ᱽ voices it to /ɖ/
# (resilience for ᱽ after a non-ejective, see Notes), and a following ᱼ
# is consumed without changing the output.
ᱴᱼ → ʈ ;
ᱴᱷ → ʈʰ ;
ᱴᱽ → ɖ ;
ᱴ → ʈ ;

# ᱵ is one of the four ejectives.  With an explicit ᱼ it is always /pʼ/,
# with ᱷ it is the voiced aspirate /bʰ/, with ᱽ it is /b/.  A bare ᱵ
# becomes /b/ when it is preceded by a letter or mark and /pʼ/ otherwise.
ᱵᱼ → pʼ ;
ᱵᱷ → bʰ ;
ᱵᱽ → b ;
$inword {ᱵ} → b ;
ᱵ → pʼ ;

# ᱶ → nasalized /w̃/, lengthened by a trailing ᱻ.
ᱶᱻ → w̃ː ;
ᱶ → w̃ ;

			]]></tRule>
		</transform>
	</transforms>
</supplementalData>