# © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html#License
#
# File: Zawgyi_my.txt
# Generated from CLDR
#

# This transform converts Zawgyi-encoded Burmese text into standard
# Unicode. Zawgyi is a popular encoding scheme in Myanmar. It uses
# the Myanmar Unicode range but assigns different characters or
# glyphs to some code points. In addition to the character mapping,
# code points have to be reordered to match the expected Unicode
# order. This reordering is context-based.
#
# This transform is done in two main stages:
#   (1) Map all Zawgyi code points to their Unicode counterparts.
#   (2) Perform reordering (carried out in the passes labeled
#       Stage 2 through Stage 6 below).

# Character classes and sequences shared by the rules below.

# Any character other than the Myanmar digits \u1040-\u1049.
$nondigits = [^\u1040-\u1049];
# Code points \u1000-\u1021, used as the base character in the rules below.
$consonant = [\u1000-\u1021];
$vowelsign = [\u102B-\u1030\u1032]; # Unicode vowel signs except E (1031)
$umedial = [\u103B-\u103E]; # Medial code points in Unicode
# Union of $vowelsign and $umedial, plus \u103F.
# NOTE(review): the range ends at \u103F, one past the end of $umedial
# (\u103E) - confirm that including \u103F is intended.
$vowelmedial = [\u102B-\u1030\u1032\u103B-\u103F]; # Union of vowel signs and medials
$ukinzi = \u1004\u103A\u1039; # Code point sequence representing kinzi in Unicode
# ZAWGYI MYANMAR CONSONANT SIGN MEDIAL RA
# This character has multiple representations in the Zawgyi font.
# Zawgyi code points for medial RA. Every member of this set is mapped to
# \u103C by the "yayit" rule near the end of Stage 1.
$zmedialra = [\u103B\u107E-\u1084];
####
#### STAGE (1): CODEPOINT MAPPING FROM ZAWGYI TO UNICODE
####
# Kinzi (predefined ligatures)
# Move base character to the right: in the input the kinzi code point
# \u1064 follows the base consonant; in the output $ukinzi precedes it.
# A \u103A between the two is emitted as \u103B after the consonant.
($consonant) \u103A \u1064 → $ukinzi $1 \u103B;
($consonant) \u1064 → $ukinzi $1;
\u1064 → $ukinzi;
# Special cases moving the base character to the right:
# \u108B, \u108C and \u108D each become kinzi plus one sign
# (\u102D, \u102E and \u1036 respectively), with the consonant in between.
($consonant) \u108b → $ukinzi $1 \u102D;
($consonant) \u108C → $ukinzi $1 \u102E;
($consonant) \u108D → $ukinzi $1 \u1036;
# Special cases moving Kinzi block to left
# Same code points, but with \u103A (and, in two rules, \u1033) next to
# them; \u103A is emitted as \u103B and \u1033 as \u102F.
($consonant) \u103A \u1033 \u108B → $ukinzi $1 \u103B \u102D \u102F;
($consonant) \u103A \u108b → $ukinzi $1 \u103B \u102D ;
($consonant) \u103A \u108C \u1033 → $ukinzi $1 \u103B \u102E \u102F;
($consonant) \u103A \u108C → $ukinzi $1 \u103B \u102E ;
($consonant) \u103A \u108D → $ukinzi $1 \u103B \u1036 ;
# \u108E produces no kinzi: it expands to \u102D \u1036 (see Vowels below).
($consonant) \u103A \u108e → $1 \u103B \u102D \u1036 ;
# Context-free mappings for the same code points.
\u108B → $ukinzi \u102D ;
\u108C → $ukinzi \u102E ;
\u108D → $ukinzi \u1036 ;
# Consonants (only the ones that have to change)
# \u106A becomes \u1025 when followed by a vowel sign and \u1038,
# and \u1009 otherwise.
\u106A ($vowelsign) \u1038 → \u1025 $1 \u1038 ; # U sound
\u106A → \u1009 ; # NYA
\u106B → \u100A ;
\u108F → \u1014 ;
\u1090 → \u101B ;
\u1086 → \u103F ;
# yapin: both Zawgyi code points map to \u103B
\u103A → \u103B ;
\u107D → \u103B ;
# wasway
# Mapped separately, \u103C and \u108A would yield \u103D \u103D \u103E;
# the combined rule emits \u103D only once.
\u103C \u108A → \u103D \u103E; # To avoid duplicate medials
\u103C → \u103D ;
\u108A → \u103D \u103E ;
# hatoh: \u1088 and \u1089 additionally carry \u102F and \u1030
\u103D → \u103E ;
\u1087 → \u103E ;
\u1088 → \u103E \u102F ;
\u1089 → \u103E \u1030 ;
# Single diacritics with space - use non-breaking
# TODO(ccornelius): determine if this breaks transliteration
# (No rules are currently listed under this heading.)
# asat
\u1039 → \u103A ;
# Vowels
\u1033 → \u102F ;
\u1034 → \u1030 ;
\u105A → \u102B \u103A ;
\u108E → \u102D \u1036 ;
# lDot (dot below): \u1094 and \u1095 both map to \u1037
# Special cases to move dot to right of base consonant
# Here \u103D is emitted as \u103E, matching the hatoh mapping above.
\u1031 \u1094 ($consonant) \u103D → $1 \u103E \u1031 \u1037 ;
\u1094 → \u1037 ;
\u1095 → \u1037 ;
# Special cases for 1025 vs 1009
# When \u1025 is followed by one of these Zawgyi stacked-consonant code
# points, \u1009 is emitted in place of \u1025, followed by \u1039 and
# the stacked consonant.
\u1025 \u1061 → \u1009 \u1039 \u1001;
\u1025 \u1062 → \u1009 \u1039 \u1002;
# \u1025 followed by a Zawgyi stacked-consonant code point: emit \u1009
# in place of \u1025, followed by \u1039 and the stacked consonant.
\u1025 \u1065 → \u1009 \u1039 \u1005;
\u1025 \u1068 → \u1009 \u1039 \u1007;
\u1025 \u1076 → \u1009 \u1039 \u1013;
\u1025 \u1078 → \u1009 \u1039 \u1015;
\u1025 \u107A → \u1009 \u1039 \u1017;
\u1025 \u1079 → \u1009 \u1039 \u1016;
# Consonant followed by Zawgyi \u103A and \u1039: the output carries
# \u103A before \u103B, i.e. the two mapped code points change places.
($consonant) \u103A \u1039 → $1 \u103A \u103B;
# Stacked Consonants
# Each Zawgyi code point becomes \u1039 followed by the stacked consonant.
# Several consonants have two Zawgyi code points mapping to the same
# output (\u1006, \u1010, \u1011, \u1018).
\u1060 → \u1039 \u1000 ;
\u1061 → \u1039 \u1001 ;
\u1062 → \u1039 \u1002 ;
\u1063 → \u1039 \u1003 ;
\u1065 → \u1039 \u1005 ;
\u1066 → \u1039 \u1006 ;
\u1067 → \u1039 \u1006 ;
\u1068 → \u1039 \u1007 ;
\u1069 → \u1039 \u1008 ;
\u106C → \u1039 \u100B ;
\u106D → \u1039 \u100C ;
\u1070 → \u1039 \u100F ;
\u1071 → \u1039 \u1010 ;
\u1072 → \u1039 \u1010 ;
# \u1096 additionally carries \u103D
\u1096 → \u1039 \u1010 \u103D;
\u1073 → \u1039 \u1011 ;
\u1074 → \u1039 \u1011 ;
\u1075 → \u1039 \u1012 ;
\u1076 → \u1039 \u1013 ;
\u1077 → \u1039 \u1014 ;
\u1078 → \u1039 \u1015 ;
\u1079 → \u1039 \u1016 ;
\u107A → \u1039 \u1017 ;
\u107B → \u1039 \u1018 ;
\u1093 → \u1039 \u1018 ;
\u107C → \u1039 \u1019 ;
\u1085 → \u1039 \u101C ;
# Pre-defined ligatures
# Single Zawgyi code points that expand to a consonant, \u1039 and a
# second consonant.
\u106E → \u100D\u1039\u100D ;
\u106F → \u100D\u1039\u100E ;
\u1091 → \u100F\u1039\u100D ;
\u1092 → \u100B\u1039\u100C ;
\u1097 → \u100B\u1039\u100B ;
# \u104E is kept and followed by \u1004 \u103A \u1038
\u104E → \u104E\u1004\u103A\u1038 ;
# yayit: every Zawgyi medial RA variant becomes \u103C
$zmedialra → \u103C ;
####
#### STAGE (2): POST REORDERING RULES FOR UNICODE RENDERING
#### Now every code point is Unicode. This starts conversion
#### from semi-visual order to logical order.
####
::Null;
# Case of MYANMAR digit being used instead of a letter
# Digit \u1044 before \u103A is replaced by \u104E. The | is the cursor
# marker; here it is placed before the replacement text.
\u1044 \u103a → | \u104E \u103A ;
# Lone zero with diacritic mark
# A \u1040 that sits between \u1031 and a non-digit, or between a
# non-digit and a code point in \u102B-\u103F, is replaced by \u101D.
\u1031 \u1040 ($nondigits) → \u1031 \u101D $1;
($nondigits) \u1040 ([\u102B-\u103F]) → $1 \u101D $2;
# cwc: Simpler replacements for Zawgyi 1025
\u1025 \u103A → \u1009 \u103A;
\u1025 \u102E → \u1026;
# Asat and dot below reordering.
# Put \u103A before \u1037 when they occur in the opposite order.
\u1037\u103A → \u103A\u1037;
# Reorder some vowel signs
# \u1036 moves behind any medials and the vowel signs that follow it.
\u1036 ($umedial*) ($vowelsign+) → $1 $2 \u1036 ;
# \u102D, \u102E and \u1032 go before \u102B, \u102C, \u102F and \u1030.
([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) → $2 $1;
# Move ra medial, but not others.
\u103C ($consonant) → $1 \u103C;
####
#### Stage 3
#### Move \u1031, \u1036, and \u103C after consonants.
::Null;
# 1031 moved after consonant, with and without kinzi or medials
([\u1031]+) $ukinzi ($consonant) → $ukinzi $2 $1;
([\u1031]+) ($consonant) ($umedial+) → $2 $3 $1;
# The text after } is context only: this rule applies when the consonant
# is not followed by one of the medials \u103B-\u103E.
([\u1031]+) ($consonant) } [^\u103B\u103C\u103D\u103E] → $2 $1;
# \u103A \u1039 are the last two code points of $ukinzi.
# NOTE(review): presumably this handles a \u103C left inside a kinzi
# sequence by the Stage 2 ra-medial rule - confirm.
\u103C \u103A \u1039 ($consonant) → \u103A \u1039 $1 \u103C;
# \u1036 goes after any run of medials.
\u1036 ($umedial+) → $1 \u1036;
####
#### Stage 4
#### Reordering medials, dot below, contractions, E sign, and asat.
::Null;
# Reorder the medials
# \u103B goes first, then \u103C, then \u103D, then \u103E.
([\u103C\u103D\u103E]+) \u103B → \u103B $1;
([\u103D\u103E]+) \u103C → \u103C $1;
\u103E\u103D → \u103D\u103E ;
# Contractions with vowel signs
# \u1039 and the consonant after it move in front of preceding \u1031
# and vowel signs.
([\u1031]+) ($vowelsign*) \u1039 ($consonant) → \u1039 $3 $1 $2;
($vowelsign+) \u1039 ($consonant) → \u1039 $2 $1;
# Move vowel sign E \u1031 after medials, but not across consonants
($umedial*) ([\u1031]+) ($umedial*) → $1 $3 $2;
# Reorder dot below after medials and vowel diacritics
\u1037 ([\u102D-\u1030\u1032\u1036]) → $1 \u1037;
\u1037 ($umedial+) → $1 \u1037;
# Move vowel signs after medials
($vowelsign+) ($umedial+) → $2 $1;
# Reorder modifiers and asat
# Between two consonants, \u103A moves in front of a single preceding
# vowel sign, \u1036 or medial.
($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant) → $1 \u103A $2 $3;
####
#### Stage 5.
#### More reorderings
#### Vowel signs after medials, sort medials.
####
::Null;
# \u1031 moves after any run of medials.
([\u1031]+) ($umedial+) → $2 $1;
# More moving vowel signs after medials
# One vowel sign and one medial are swapped per match.
($vowelsign) ($umedial) → $2 $1;
# Sort the medials
# \u103B goes first, then \u103C, then \u103D, then \u103E.
([\u103C\u103D\u103E]) \u103B → \u103B $1;
([\u103D\u103E]) \u103C → \u103C $1;
\u103E\u103D → \u103D\u103E ;
# Move visarga (\u1038) after other signs
# These are any member of $vowelmedial, or \u1036, \u1037 or \u103A.
\u1038 ([$vowelmedial]) → $1 \u1038;
\u1038 ([\u1036\u1037\u103A]) → $1 \u1038;
###
### Stage 6
### Finish medial sorting, fix conflicting and extra diacritics
###
::Null;
# Fix 103B/103A order for asat.
# After a consonant, \u103A is placed before \u103B.
($consonant) \u103B \u103A → $1 \u103A \u103B;
