# © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html#License
#
# File: Latn_Kana.txt
# Generated from CLDR
#

# note: a global filter is more efficient, but MUST include all source chars
#:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]] ;
# MINIMAL FILTER GENERATED FOR: Latin-Katakana
### WARNING -- must add width filter, both here and below!!! ###
# Global filter for the forward (Latin → Katakana) direction: only characters
# in this set are considered by the rules that follow.
:: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ;
# In each `::` directive below, the part before the parentheses is applied in
# the forward direction and the part inside the parentheses in the reverse
# direction; empty parentheses mean "nothing in reverse".
:: [:Latin:] fullwidth-halfwidth ();
:: NFD (NFC);
:: Lower (); # whenever transliterating from cased to uncased script, include this
# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
# Uses modified Hepburn. Small changes to make unambiguous.
# The left column spells each kana as a Kunrei-style base syllable, optionally
# followed by an explicit small kana (written ~x); the right column is the
# romaji used by the rules in this file.
# | Kunrei-shiki: Hepburn/MHepburn
# | ------------------------------
# | si: shi
# | si ~ya: sha
# | si ~yu: shu
# | si ~yo: sho
# | zi: ji
# | zi ~ya: ja
# | zi ~yu: ju
# | zi ~yo: jo
# | ti: chi
# | ti ~ya: cha
# | ti ~yu: chu
# | ti ~yo: cho
# | tu: tsu
# | di: ji/dji
# | du: zu/dzu
# | hu: fu
# | For foreign words:
# | -----------------
# | se ~i si
# | si ~e she
# |
# | ze ~i zi
# | zi ~e je
# |
# | te ~i ti
# | ti ~e che
# | te ~u tu
# |
# | de ~i di
# | de ~u du
# | de ~i di
# |
# | he ~u: hu
# | hu ~a fa
# | hu ~i fi
# | hu ~e fe
# | hu ~o fo
# Most small forms are generated, but if necessary
# explicit small forms are given with ~a, ~ya, etc.
#------------------------------------------------------
# Variables
$vowel = [aeiou] ;
$consonant = [bcdfghjklmnpqrstvwxyz] ;
$macron = \u0304 ;
# Variables used for doubled-consonants with tsu
# NOTE(review): in the rules shown in this file, $kana and $iteration are
# referenced only by the commented-out iteration-mark TODO below; the active
# iteration rules use the literal ヽ instead -- confirm before removing.
$kana = [ぁ-ゔ] ;
# Combining and spacing forms of the (semi-)voiced sound marks.
$voice = [\u3099゛];
$semivoice = [\u309A゜];
# Kana that can start a syllable for each romaji consonant; used as the
# following context when converting a doubled consonant to/from ッ.
$k_start = [カキクケコかきくけこ] ;
$s_start = [サシスセソさしすせそ] ;
$j_start = [シし] $voice ;
$t_start = [タチツテトたちつてと] ;
$n_start = [ナニヌネノンなにぬねの] ;
$h_start = [ハヒヘホはひへほ] ;
$f_start = [フふ] ;
$m_start = [マミムメモまみむめも] ;
$y_start = [ヤユヨやゆよ] ;
$r_start = [ラリルレロらりるれろ] ;
$w_start = [ワヰヱヲわゐゑを] ;
$v_start = [ワヰヱヲ]\u3099 ;
# Base kana after which a bare "h" is turned into the prolonged sound mark
# (see the h- rule).
$voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ;
# if ン is followed by $n_quoter, then it needs an
# apostrophe after its romaji form to disambiguate it.
# e.g., ン ア != ナ, so represent as "n'a", not "na".
$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;
$small_y = [ャィュェョ] ;
$iteration = ゝ ;
#------------------------------------------------------
# katakana rules
# Rule notation used throughout:
#   a → b    applies only Latin → Katakana
#   a ← b    applies only Katakana → Latin
#   a ↔ b    applies in both directions
#   x } y    y is required following context (not replaced)
#   y { x    y is required preceding context (not replaced)
#   |        where the cursor is left in the output, so the text after it is
#            processed again by later matching
# Rules are tried in file order, so longer/more specific rules must come
# before the shorter ones they would otherwise be hidden by.
# Punctuation
'.' ↔ 。;
',' ↔ 、;
# ' ' } [a-z] → ; # delete spaces before latin
# ' ' ← [^' '゠-ヿ] {} ['゠-ヿ] ; #insert spaces before hiragana
# Iteration Mark
# Copy previous letter § marks
# TODO
# | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration
# Specials for katakana -- not shared with hiragana
va ↔ ワ\u3099 ;
vi ↔ ヰ\u3099 ;
ve ↔ ヱ\u3099 ;
vo ↔ ヲ\u3099 ;
'~ka' ↔ ヵ ;
'~ke' ↔ ヶ ;
# ~~~ begin shared rules ~~~
#special
# Reverse only: a small y-kana that is still preceded by the '~' marker
# becomes the plain "y" + vowel. The marker is produced by the
# "<consonant> | '~' ← <i-row kana> } $small_y" rules below, which emit the
# consonant and leave '~' in front of the small kana for these rules to pick up
# (e.g. ヒ\u3099 ャ → b, then '~'ャ → ya, giving "bya").
ya ← '~'ャ;
yi ← '~'ィ ;
yu ← '~'ュ;
ye ← '~'ェ;
yo ← '~'ョ;
#normal
# Forward counterpart of the mechanism above: "<consonant>y } $vowel" emits the
# i-row kana and rewrites the pending input to '~y', so the following
# "~ya"/"~yu"/"~yo" small-form rules produce the small kana.
a ↔ ア ;
b | '~' ← ヒ \u3099} $small_y ;
by } $vowel → ヒ\u3099 | '~y' ;
ba ↔ ハ\u3099 ;
bi ↔ ヒ\u3099 ;
bu ↔ フ\u3099 ;
be ↔ ヘ\u3099 ;
bo ↔ ホ\u3099 ;
# "c" before i/e is re-read as "s"; other "c" is handled by the backup
# substitutions near the end of the shared rules.
c } i → | s ;
c } e → | s ;
da ↔ タ\u3099 ;
di ↔ テ\u3099ィ ;
du ↔ テ\u3099ゥ ;
de ↔ テ\u3099 ;
do ↔ ト\u3099 ;
dzu ↔ ツ\u3099 ;
dja ← チ\u3099ャ ;
dji'~i' ← チ\u3099ィ ; # liu
dju ← チ\u3099ュ ;
dje ← チ\u3099ェ ;
djo ← チ\u3099ョ ;
dji ↔ チ\u3099 ;
dj } $vowel → チ\u3099 | '~y' ;
# TODO: QUESTION: use ĵĴżŻ instead of dj, dz
cha ← チャ ;
chi'~i' ← チィ ; # liu
chu ← チュ ;
che ← チェ ;
cho ← チョ ;
chi ↔ チ ;
ch } $vowel → チ | '~y' ;
e ↔ エ ;
g | '~' ← キ\u3099} $small_y ;
gy } $vowel → キ\u3099 | '~y' ;
ga ↔ カ\u3099 ;
gi ↔ キ\u3099 ;
gu ↔ ク\u3099 ;
ge ↔ ケ\u3099 ;
go ↔ コ\u3099 ;
i ↔ イ ;
# j } $vowel → シ\u3099 | '~y' ;
ja ↔ シ\u3099ャ ;
ji'~i' ← シ\u3099ィ ; # liu
ju ↔ シ\u3099ュ ;
je ↔ シ\u3099ェ ;
jo ↔ シ\u3099ョ ;
ji ↔ シ\u3099 ;
k | '~' ← キ} $small_y ;
ky } $vowel → キ | '~y' ;
ka ↔ カ ;
ki ↔ キ ;
ku ↔ ク ;
ke ↔ ケ ;
ko ↔ コ ;
m | '~' ← ミ} $small_y ;
my } $vowel → ミ | '~y' ;
ma ↔ マ ;
mi ↔ ミ ;
mu ↔ ム ;
me ↔ メ ;
mo ↔ モ ;
# "m" directly before p/b/f/v is written with the syllabic ン.
m } [pbfv] → ン ;
n | '~' ← ニ } $small_y ;
ny } $vowel → ニ | '~y' ;
na ↔ ナ ;
ni ↔ ニ ;
nu ↔ ヌ ;
ne ↔ ネ ;
no ↔ ノ ;
o ↔ オ ;
p | '~' ← ヒ\u309A } $small_y ;
py } $vowel → ヒ\u309A | '~y' ;
pa ↔ ハ\u309A ;
pi ↔ ヒ\u309A ;
pu ↔ フ\u309A ;
pe ↔ ヘ\u309A ;
po ↔ ホ\u309A ;
h | '~' ← ヒ } $small_y ;
hy } $vowel → ヒ | '~y' ;
ha ↔ ハ ;
hi ↔ ヒ ;
# "hu" is kept distinct from "fu" (フ) by spelling it ヘ + small ゥ.
hu ↔ ヘゥ ;
he ↔ ヘ ;
ho ↔ ホ ;
# f | '~' ← フ } $small_y ;
# f } $vowel → フ | '~' ;
fa ↔ ファ ;
fi ↔ フィ ;
fe ↔ フェ ;
fo ↔ フォ ;
fu ↔ フ ;
r | '~' ← リ } $small_y ;
ry } $vowel → リ | '~y' ;
ra ↔ ラ ;
ri ↔ リ ;
ru ↔ ル ;
re ↔ レ ;
ro ↔ ロ ;
za ↔ サ\u3099 ;
zi ↔ セ\u3099ィ ;
zu ↔ ス\u3099 ;
ze ↔ セ\u3099 ;
zo ↔ ソ\u3099 ;
sa ↔ サ ;
si ↔ セィ ;
su ↔ ス ;
se ↔ セ ;
so ↔ ソ ;
sha ← シャ ;
shi'~i' ← シィ ; # liu
shu ← シュ ;
she ← シェ ;
sho ← ショ ;
shi ↔ シ ;
sh } $vowel → シ | '~y' ;
ta ↔ タ ;
ti ↔ ティ ;
tu ↔ テゥ ;
te ↔ テ ;
to ↔ ト ;
tsu ↔ ツ ;
# v } $vowel → ウ\u3099 | '~' ;
#'v~a' ← ウ\u3099ァ ; # liu
#'v~i' ← ウ\u3099ィ ; # liu
#'v~e' ← ウ\u3099ェ ; # liu
#'v~o' ← ウ\u3099ォ ; # liu
vu ↔ ウ\u3099 ;
u ↔ ウ ;
# w } $vowel → ウ | '~' ;
# wu, yi and ye are forward-only: they map onto the plain vowel kana, which
# reverse to u, i and e through the single-vowel rules.
wa ↔ ワ ;
wi ↔ ヰ ;
wu → ウ ;
we ↔ ヱ ;
wo ↔ ヲ ;
ya ↔ ヤ ;
yi → イ ;
yu ↔ ユ ;
ye → エ ;
yo ↔ ヨ ;
# double consonants
# A consonant that is immediately followed by the same consonant becomes the
# small tsu ッ; in reverse, ッ becomes the consonant that the following kana
# starts with (the $*_start sets, plus a voicing mark where required).
#specials
s } sh → ッ ;
t } ch → ッ ;
#voiced
j } j ↔ ッ } $j_start ;
b } b ↔ ッ } [$h_start$f_start] $voice;
d } d ↔ ッ } $t_start $voice;
g } g ↔ ッ } $k_start $voice;
p } p ↔ ッ } [$h_start$f_start] $semivoice;
# v } v ↔ ッ } [ワヰウヱヲう] $voice ;
z } z ↔ ッ } $s_start $voice;
v } v ↔ ッ } $v_start;
# normal
k } k ↔ ッ } $k_start ;
m } m ↔ ッ } $m_start ;
n } n ↔ ッ } $n_start ;
h } h ↔ ッ } $h_start ;
f } f ↔ ッ } $f_start ;
r } r ↔ ッ } $r_start ;
t } t ↔ ッ } $t_start ;
s } s ↔ ッ } $s_start ;
w } w ↔ ッ } $w_start;
y } y ↔ ッ } $y_start;
# completeness
# Forward-only doubles for letters that have no kana row of their own.
x } x → ッ ;
c } k → ッ ;
c } c → ッ ;
c } q → ッ ;
l } l → ッ ;
q } q → ッ ;
# y } y → ッ ;
# w } w → ッ ;
# prolonged vowel mark. this indicates a doubling of
# the preceding vowel sound
#a ← a { ー ; # liu
#e ← e { ー ; # liu
#i ← i { ー ; # liu
#o ← o { ー ; # liu
#u ← u { ー ; # liu
# The combining macron (left over after NFD) and ー map to each other.
$macron ↔ ー ;
# small forms
'~a' ↔ ァ ;
'~i' ↔ ィ ;
'~u' ↔ ゥ ;
'~e' ↔ ェ ;
'~o' ↔ ォ ;
'~tsu' ↔ ッ ;
'~wa' ↔ ヮ ;
'~ya' ↔ ャ ;
'~yi' → ィ ;
'~yu' ↔ ュ ;
'~ye' → ェ ;
'~yo' ↔ ョ ;
# iteration marks
# TODO: make more accurate
# Reverse only. The preceding context is the romaji already produced for the
# previous syllable; ($1) captures its "y* vowel" tail, and the voiced
# iteration mark ヽ + $voice is replaced by a consonant followed by that tail.
# NOTE(review): in the first group the output is the voiced counterpart of the
# context consonant (sh→j, ch→dj, ts→dz, k→g, s→z, t→d, w→v), but the h/b rule
# is written the other way round (context b, output h) -- confirm whether
# "b $1 ← h ..." was intended.
# NOTE(review): the sh/ch/ts rules in the second group have the same left-hand
# pattern as earlier rules in the first group, so they look unreachable --
# confirm against the CLDR source.
j $1 ← sh (y* $vowel) {ヽ$voice ;
dj $1 ← ch (y* $vowel) {ヽ$voice ;
dz $1 ← ts (y* $vowel) {ヽ$voice ;
g $1 ← k (y* $vowel) {ヽ$voice ;
z $1 ← s (y* $vowel) {ヽ$voice ;
d $1 ← t (y* $vowel) {ヽ$voice ;
h $1 ← b (y* $vowel) {ヽ$voice ;
v $1 ← w (y* $vowel) {ヽ$voice ;
sh $1 ← sh (y* $vowel) {ヽ$voice ;
j $1 ← j (y* $vowel) {ヽ$voice ;
ch $1 ← ch (y* $vowel) {ヽ$voice ;
dj $1 ← dj(y* $vowel) {ヽ$voice ;
ts $1 ← ts (y* $vowel) {ヽ$voice ;
dz $1 ← dz (y* $vowel) {ヽ$voice ;
$1 ← ($consonant y* $vowel) {ヽ$voice? ;
$1 ← (.) {ヽ $voice? ; # otherwise repeat last character
← ヽ $voice? ; # delete if no characters found
# h- rule: lengthens vowel if not followed by a vowel.
# At the point this is applied, latin [cons]?vowel sequences
# have been converted to katakana in NFD form.
$voweled_basekana [\u3099 \u309A]? { h → ー ;
# one-way latin- → kana rules. these do not occur in
# well-formed romaji representing actual japanese text.
# their purpose is to make all romaji map to kana of
# some sort.
# the following are not really necessary, but produce
# slightly more natural results.
cy → セィ ;
dy → テ\u3099ィ ;
hy → ヒ ;
sy → セィ ;
ty → ティ ;
zy → セ\u3099ィ ;
h → ヘ ;
# isolated consonants listed here so as not to mask
# longer rules above.
# Forward-only fallbacks for a consonant that is not followed by a vowel: each
# is written with a default kana for that consonant. The digraphs come first
# so that "ch"/"sh"/"dz"/"dj" are not split into single letters.
ch → チ;
sh → シ ;
dz → ツ\u3099 ;
dj → チ\u3099;
b → フ\u3099 ;
d → テ\u3099 ;
g → ク\u3099 ;
k → ク ;
m → ム ;
# Reverse only: ン before a vowel, n-row or y-row kana (or another ン) is
# written "n'" ('' is an escaped apostrophe) so it cannot be re-read as the
# start of the next syllable. Must precede the plain n ↔ ン rule.
n'' ← ン } $n_quoter ;
n ↔ ン ;
p → フ\u309A ;
r → ル ;
s → ス ;
t → テ ;
y → イ ;
z → ス\u3099 ;
v → ウ\u3099 ;
f → フ;
j → シ\u3099;
w → ウ;
# Non-ASCII Latin letters are rewritten to ASCII and, because the cursor is
# placed before the replacement, run through the rules again.
ß → | ss ;
æ → | e ;
ð → | d ;
ø → | u ;
þ → | th ;
# simple substitutions using backup
c → | k ;
l → | r ;
q → | k ;
x → | ks ;
# ~~~ END shared rules ~~~
#------------------------------------------------------
# Final cleanup
'~' → ; # delete stray tildes between letters
[:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters
# [ʾ[:Nonspacing Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use
# Post-processing: recompose in the forward direction (decompose in reverse);
# the parenthesised directive below applies only in the reverse direction.
:: NFC (NFD) ;
:: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);
# note: a global filter is more efficient, but MUST include all source chars!!
#:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]]);
# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
:: ( [[\ -~¢-£¥-¦¬\u0304₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
# eof
