1 /** 2 ******************************************************************************* 3 * Copyright (C) 2002-2010, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 8 9 package com.ibm.icu.dev.tool.layout; 10 11 import com.ibm.icu.lang.UCharacter; 12 import com.ibm.icu.lang.UScript; 13 import com.ibm.icu.text.UTF16; 14 import com.ibm.icu.text.UnicodeSet; 15 16 /** 17 * @author Eric Mader 18 * 19 * Notes: 20 * 21 * The property \p{Decomposition_Type=Canonical} will match all characters with a canonical 22 * decomposition. 23 * 24 * So "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]" 25 * will match all Latin, Greek and Cyrillic characters with a canonical decomposition. 26 * 27 * Are these three scripts enough? Do we want to collect them all at once and distribute by script, 28 * or process them one script at a time. It's probably a good idea to build a single table for 29 * however many scripts there are. 30 * 31 * It might be better to collect all the characters that have a canonical decomposition and just 32 * sort them into however many scripts there are... unless we'll get characters in COMMON??? 33 */ 34 public class CanonGSUBBuilder 35 { convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable)36 static public String convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable) 37 { 38 int leftType = ArabicShaping.VALUE_NONE; 39 int rightType = ArabicShaping.VALUE_NONE; 40 41 switch (type) { 42 case UCharacter.DecompositionType.ISOLATED: 43 break; 44 45 case UCharacter.DecompositionType.FINAL: 46 rightType = ArabicShaping.VALUE_LEFT; 47 break; 48 49 case UCharacter.DecompositionType.INITIAL: 50 leftType = ArabicShaping.VALUE_RIGHT; 51 break; 52 53 case UCharacter.DecompositionType.MEDIAL: 54 rightType = ArabicShaping.VALUE_LEFT; 55 leftType = ArabicShaping.VALUE_RIGHT; 56 break; 57 58 default: 59 return decomp + UCharacter.toString(ligature); 60 } 61 62 char[] chars = decomp.toCharArray(); 63 64 ArabicShaping.shape(chars, leftType, rightType, isolClassTable); 65 66 return new String(chars) + UCharacter.toString(ligature); 67 } 68 buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable, ClassTable finaClassTable, ClassTable isolClassTable)69 static void buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable, 70 ClassTable finaClassTable, ClassTable isolClassTable) 71 { 72 System.out.print("Finding Arabic contextual forms... "); 73 74 for (int i = 0; i < data.countRecords(); i += 1) { 75 ArabicCharacterData.Record record = data.getRecord(i); 76 String decomposition = record.getDecomposition(); 77 78 if (decomposition != null && decomposition.length() == 1) { 79 int contextual = record.getCodePoint(); 80 int isolated = UTF16.charAt(record.getDecomposition(), 0); 81 82 switch (record.getDecompositionType()) { 83 case UCharacter.DecompositionType.INITIAL: 84 initClassTable.addMapping(isolated, contextual); 85 break; 86 87 case UCharacter.DecompositionType.MEDIAL: 88 mediClassTable.addMapping(isolated, contextual); 89 break; 90 91 case UCharacter.DecompositionType.FINAL: 92 finaClassTable.addMapping(isolated, contextual); 93 break; 94 95 case UCharacter.DecompositionType.ISOLATED: 96 isolClassTable.addMapping(isolated, contextual); 97 break; 98 99 default: 100 // issue some error message? 101 break; 102 } 103 } 104 } 105 106 System.out.println("Done."); 107 } 108 buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable)109 static LigatureTree buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable) 110 { 111 LigatureTree contextualTree = new LigatureTree(); 112 int ligatureCount = 0; 113 114 System.out.print("Building Arabic ligature tree... "); 115 116 for (int i = 0; i < data.countRecords(); i += 1) { 117 ArabicCharacterData.Record record = data.getRecord(i); 118 String decomposition = record.getDecomposition(); 119 120 if (decomposition != null && decomposition.length() > 1) { 121 int ligature = record.getCodePoint(); 122 int decompType = record.getDecompositionType(); 123 124 switch (decompType) { 125 case UCharacter.DecompositionType.FINAL: 126 case UCharacter.DecompositionType.INITIAL: 127 case UCharacter.DecompositionType.MEDIAL: 128 case UCharacter.DecompositionType.ISOLATED: 129 contextualTree.insert(convertArabicString(decompType, ligature, decomposition, isolClassTable)); 130 ligatureCount += 1; 131 break; 132 133 case UCharacter.DecompositionType.CANONICAL: 134 //cannonicalTree.insert(decomposition + UCharacter.toString(ligature)); 135 break; 136 } 137 } 138 } 139 140 System.out.println(ligatureCount + " ligatures."); 141 142 return contextualTree; 143 } 144 145 static final int SIMPLE_GLYPH = 1; 146 static final int LIGATURE_GLYPH = 2; 147 static final int MARK_GLYPH = 3; 148 static final int COMPONENT_GLYPH = 4; 149 150 static final int categoryClassMap[] = { 151 0, // UNASSIGNED 152 SIMPLE_GLYPH, // UPPERCASE_LETTER 153 SIMPLE_GLYPH, // LOWERCASE_LETTER 154 SIMPLE_GLYPH, // TITLECASE_LETTER 155 SIMPLE_GLYPH, // MODIFIER_LETTER 156 SIMPLE_GLYPH, // OTHER_LETTER 157 MARK_GLYPH, // NON_SPACING_MARK 158 MARK_GLYPH, // ENCLOSING_MARK ?? 159 MARK_GLYPH, // COMBINING_SPACING_MARK ?? 160 SIMPLE_GLYPH, // DECIMAL_NUMBER 161 SIMPLE_GLYPH, // LETTER_NUMBER 162 SIMPLE_GLYPH, // OTHER_NUMBER; 163 0, // SPACE_SEPARATOR 164 0, // LINE_SEPARATOR 165 0, // PARAGRAPH_SEPARATOR 166 0, // CONTROL 167 0, // FORMAT 168 0, // PRIVATE_USE 169 0, // SURROGATE 170 SIMPLE_GLYPH, // DASH_PUNCTUATION 171 SIMPLE_GLYPH, // START_PUNCTUATION 172 SIMPLE_GLYPH, // END_PUNCTUATION 173 SIMPLE_GLYPH, // CONNECTOR_PUNCTUATION 174 SIMPLE_GLYPH, // OTHER_PUNCTUATION 175 SIMPLE_GLYPH, // MATH_SYMBOL; 176 SIMPLE_GLYPH, // CURRENCY_SYMBOL 177 SIMPLE_GLYPH, // MODIFIER_SYMBOL 178 SIMPLE_GLYPH, // OTHER_SYMBOL 179 SIMPLE_GLYPH, // INITIAL_PUNCTUATION 180 SIMPLE_GLYPH // FINAL_PUNCTUATION 181 }; 182 getGlyphClass(ArabicCharacterData.Record record)183 static int getGlyphClass(ArabicCharacterData.Record record) 184 { 185 String decomp = record.getDecomposition(); 186 187 if (decomp != null && decomp.length() > 1) { 188 return LIGATURE_GLYPH; 189 } 190 191 return categoryClassMap[record.getGeneralCategory()]; 192 } 193 addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable)194 static void addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable) 195 { 196 System.out.print("Adding Arabic glyph classes... "); 197 198 for (int i = 0; i < data.countRecords(); i += 1) { 199 ArabicCharacterData.Record record = data.getRecord(i); 200 classTable.addMapping(record.getCodePoint(), getGlyphClass(record)); 201 } 202 203 System.out.println("Done."); 204 } 205 buildArabicTables(ScriptList scriptList, FeatureList featureList, LookupList lookupList, ClassTable classTable)206 private static void buildArabicTables(ScriptList scriptList, FeatureList featureList, 207 LookupList lookupList, ClassTable classTable) { 208 // TODO: Might want to have the ligature table builder explicitly check for ligatures 209 // which start with space and tatweel rather than pulling them out here... 210 UnicodeSet arabicBlock = new UnicodeSet("[[\\p{block=Arabic}] & [[:Cf:][:Po:][:So:][:Mn:][:Nd:][:Lm:]]]"); 211 UnicodeSet oddLigatures = new UnicodeSet("[\\uFC5E-\\uFC63\\uFCF2-\\uFCF4\\uFE70-\\uFE7F]"); 212 UnicodeSet arabicLetters = new UnicodeSet("[\\p{Arabic}]"); 213 ArabicCharacterData arabicData = ArabicCharacterData.factory(arabicLetters.addAll(arabicBlock).removeAll(oddLigatures)); 214 215 addArabicGlyphClasses(arabicData, classTable); 216 217 ClassTable initClassTable = new ClassTable(); 218 ClassTable mediClassTable = new ClassTable(); 219 ClassTable finaClassTable = new ClassTable(); 220 ClassTable isolClassTable = new ClassTable(); 221 222 buildArabicContextualForms(arabicData, initClassTable, mediClassTable, finaClassTable, isolClassTable); 223 isolClassTable.snapshot(); 224 LigatureTree ligaTree = buildArabicLigatureTree(arabicData, isolClassTable); 225 226 LigatureTreeWalker ligaWalker = new LigatureTreeWalker(); 227 228 ligaTree.walk(ligaWalker); 229 230 Lookup initLookup, mediLookup, finaLookup, ligaLookup; 231 232 initLookup = new Lookup(Lookup.GSST_Single, 0); 233 initLookup.addSubtable(initClassTable); 234 235 mediLookup = new Lookup(Lookup.GSST_Single, 0); 236 mediLookup.addSubtable(mediClassTable); 237 238 finaLookup = new Lookup(Lookup.GSST_Single, 0); 239 finaLookup.addSubtable(finaClassTable); 240 241 ligaLookup = new Lookup(Lookup.GSST_Ligature, Lookup.LF_IgnoreMarks); 242 ligaLookup.addSubtable(ligaWalker); 243 244 Feature init = new Feature("init"); 245 Feature medi = new Feature("medi"); 246 Feature fina = new Feature("fina"); 247 Feature liga = new Feature("liga"); 248 249 init.addLookup(lookupList.addLookup(initLookup)); 250 medi.addLookup(lookupList.addLookup(mediLookup)); 251 fina.addLookup(lookupList.addLookup(finaLookup)); 252 liga.addLookup(lookupList.addLookup(ligaLookup)); 253 254 featureList.addFeature(init); 255 featureList.addFeature(medi); 256 featureList.addFeature(fina); 257 featureList.addFeature(liga); 258 259 scriptList.addFeature("arab", "(default)", init); 260 scriptList.addFeature("arab", "(default)", medi); 261 scriptList.addFeature("arab", "(default)", fina); 262 scriptList.addFeature("arab", "(default)", liga); 263 264 System.out.println(); 265 } 266 buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree)267 public static void buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree) 268 { 269 int ligatureCount = 0; 270 271 System.out.print("building composition ligature tree for " + UScript.getName(script) + "... "); 272 273 for (int i = 0; i < data.countRecords(script); i += 1) { 274 CanonicalCharacterData.Record record = data.getRecord(script, i); 275 String composed = UCharacter.toString(record.getComposedCharacter()); 276 277 for (int e = 0; e < record.countEquivalents(); e += 1) { 278 String equivalent = record.getEquivalent(e); 279 280 ligatureTree.insert(equivalent + composed); 281 ligatureCount += 1; 282 } 283 } 284 285 System.out.println(ligatureCount + " ligatures."); 286 } 287 buildDecompTables(CanonicalCharacterData data, int script)288 public static DecompTable[] buildDecompTables(CanonicalCharacterData data, int script) 289 { 290 int maxDecompCount = data.getMaxEquivalents(script); 291 DecompTable[] decompTables = new DecompTable[maxDecompCount]; 292 293 System.out.print("Building decompositon tables for " + UScript.getName(script) + 294 "... total decompositions: " + data.countRecords(script) + 295 ", max: " + maxDecompCount + "..."); 296 297 for (int i = 0; i < maxDecompCount; i += 1) { 298 DecompTable table = new DecompTable(); 299 300 for (int r = 0; r < data.countRecords(script); r += 1) { 301 CanonicalCharacterData.Record record = data.getRecord(script, r); 302 303 if (record.countEquivalents() > i) { 304 table.add(record.getComposedCharacter(), record.getEquivalent(i)); 305 } 306 } 307 308 decompTables[i] = table; 309 } 310 311 System.out.println(" Done."); 312 313 return decompTables; 314 } 315 buildLookups(CanonicalCharacterData data, LookupList lookupList, int script)316 public static int[] buildLookups(CanonicalCharacterData data, LookupList lookupList, int script) 317 { 318 int[] lookups = new int[2]; 319 320 DecompTable[] decompTables = buildDecompTables(data, script); 321 322 LigatureTree compTree = new LigatureTree(); 323 324 buildLigatureTree(data, script, compTree); 325 326 System.out.println(); 327 328 LigatureTreeWalker compWalker = new LigatureTreeWalker(); 329 330 compTree.walk(compWalker); 331 332 Lookup compLookup, dcmpLookup; 333 //int compLookupIndex, dcmpLookupIndex; 334 335 compLookup = new Lookup(Lookup.GSST_Ligature, 0); 336 compLookup.addSubtable(compWalker); 337 338 dcmpLookup = new Lookup(Lookup.GSST_Multiple, 0); 339 for (int i = 0; i < decompTables.length; i += 1) { 340 dcmpLookup.addSubtable(decompTables[i]); 341 } 342 343 lookups[0] = lookupList.addLookup(compLookup); 344 lookups[1] = lookupList.addLookup(dcmpLookup); 345 346 return lookups; 347 } 348 addLookups(Feature feature, int[] lookups)349 public static void addLookups(Feature feature, int[] lookups) 350 { 351 for (int i = 0; i < lookups.length; i += 1) { 352 feature.addLookup(lookups[i]); 353 } 354 } 355 356 /* 357 * Hebrew mark order taken from the SBL Hebrew Font manual 358 * Arabic mark order per Thomas Milo: hamza < shadda < combining_alef < sukun, vowel_marks < madda < qur'anic_marks 359 */ buildCombiningClassTable()360 public static ClassTable buildCombiningClassTable() 361 { 362 UnicodeSet markSet = new UnicodeSet("[\\P{CanonicalCombiningClass=0}]"); 363 ClassTable exceptions = new ClassTable(); 364 ClassTable combiningClasses = new ClassTable(); 365 int markCount = markSet.size(); 366 367 exceptions.addMapping(0x05C1, 10); // Point Shin Dot 368 exceptions.addMapping(0x05C2, 11); // Point Sin Dot 369 exceptions.addMapping(0x05BC, 21); // Point Dagesh or Mapiq 370 exceptions.addMapping(0x05BF, 23); // Point Rafe 371 exceptions.addMapping(0x05B9, 27); // Point Holam 372 exceptions.addMapping(0x0323, 220); // Comb. Dot Below (low punctum) 373 exceptions.addMapping(0x0591, 220); // Accent Etnahta 374 exceptions.addMapping(0x0596, 220); // Accent Tipeha 375 exceptions.addMapping(0x059B, 220); // Accent Tevir 376 exceptions.addMapping(0x05A3, 220); // Accent Munah 377 exceptions.addMapping(0x05A4, 220); // Accent Mahapakh 378 exceptions.addMapping(0x05A5, 220); // Accent Merkha 379 exceptions.addMapping(0x05A6, 220); // Accent Merkha Kefula 380 exceptions.addMapping(0x05A7, 220); // Accent Darga 381 exceptions.addMapping(0x05AA, 220); // Accent Yerah Ben Yomo 382 exceptions.addMapping(0x05B0, 220); // Point Sheva 383 exceptions.addMapping(0x05B1, 220); // Point Hataf Segol 384 exceptions.addMapping(0x05B2, 220); // Point Hataf Patah 385 exceptions.addMapping(0x05B3, 220); // Point Hataf Qamats 386 exceptions.addMapping(0x05B4, 220); // Point Hiriq 387 exceptions.addMapping(0x05B5, 220); // Point Tsere 388 exceptions.addMapping(0x05B6, 220); // Point Segol 389 exceptions.addMapping(0x05B7, 220); // Point Patah 390 exceptions.addMapping(0x05B8, 220); // Point Qamats 391 exceptions.addMapping(0x05BB, 220); // Point Qubuts 392 exceptions.addMapping(0x05BD, 220); // Point Meteg 393 exceptions.addMapping(0x059A, 222); // Accent Yetiv 394 exceptions.addMapping(0x05AD, 222); // Accent Dehi 395 exceptions.addMapping(0x05C4, 230); // Mark Upper Dot (high punctum) 396 exceptions.addMapping(0x0593, 230); // Accent Shalshelet 397 exceptions.addMapping(0x0594, 230); // Accent Zaqef Qatan 398 exceptions.addMapping(0x0595, 230); // Accent Zaqef Gadol 399 exceptions.addMapping(0x0597, 230); // Accent Revia 400 exceptions.addMapping(0x0598, 230); // Accent Zarqa 401 exceptions.addMapping(0x059F, 230); // Accent Qarney Para 402 exceptions.addMapping(0x059E, 230); // Accent Gershayim 403 exceptions.addMapping(0x059D, 230); // Accent Geresh Muqdam 404 exceptions.addMapping(0x059C, 230); // Accent Geresh 405 exceptions.addMapping(0x0592, 230); // Accent Segolta 406 exceptions.addMapping(0x05A0, 230); // Accent Telisha Gedola 407 exceptions.addMapping(0x05AC, 230); // Accent Iluy 408 exceptions.addMapping(0x05A8, 230); // Accent Qadma 409 exceptions.addMapping(0x05AB, 230); // Accent Ole 410 exceptions.addMapping(0x05AF, 230); // Mark Masora Circle 411 exceptions.addMapping(0x05A1, 230); // Accent Pazer 412 //exceptions.addMapping(0x0307, 230); // Mark Number/Masora Dot 413 exceptions.addMapping(0x05AE, 232); // Accent Zinor 414 exceptions.addMapping(0x05A9, 232); // Accent Telisha Qetana 415 exceptions.addMapping(0x0599, 232); // Accent Pashta 416 417 exceptions.addMapping(0x0655, 27); // ARABIC HAMZA BELOW 418 exceptions.addMapping(0x0654, 27); // ARABIC HAMZA ABOVE 419 420 exceptions.addMapping(0x0651, 28); // ARABIC SHADDA 421 422 exceptions.addMapping(0x0656, 29); // ARABIC SUBSCRIPT ALEF 423 exceptions.addMapping(0x0670, 29); // ARABIC LETTER SUPERSCRIPT ALEF 424 425 exceptions.addMapping(0x064D, 30); // ARABIC KASRATAN 426 exceptions.addMapping(0x0650, 30); // ARABIC KASRA 427 428 exceptions.addMapping(0x0652, 31); // ARABIC SUKUN 429 exceptions.addMapping(0x06E1, 31); // ARABIC SMALL HIGH DOTLESS HEAD OF KHAH 430 431 exceptions.addMapping(0x064B, 31); // ARABIC FATHATAN 432 exceptions.addMapping(0x064C, 31); // ARABIC DAMMATAN 433 exceptions.addMapping(0x064E, 31); // ARABIC FATHA 434 exceptions.addMapping(0x064F, 31); // ARABIC DAMMA 435 exceptions.addMapping(0x0657, 31); // ARABIC INVERTED DAMMA 436 exceptions.addMapping(0x0658, 31); // ARABIC MARK NOON GHUNNA 437 438 exceptions.addMapping(0x0653, 32); // ARABIC MADDAH ABOVE 439 440 exceptions.snapshot(); 441 442 for (int i = 0; i < markCount; i += 1) { 443 int mark = markSet.charAt(i); 444 int markClass = exceptions.getGlyphClassID(mark); 445 446 if (markClass == 0) { 447 markClass = UCharacter.getCombiningClass(mark); 448 } 449 450 combiningClasses.addMapping(mark, markClass); 451 } 452 453 combiningClasses.snapshot(); 454 return combiningClasses; 455 } 456 buildDecompTables(String fileName)457 public static void buildDecompTables(String fileName) 458 { 459 // F900 - FAFF are compatibility ideographs. They all decompose to a single other character, and can be ignored. 460 //UnicodeSet decompSet = new UnicodeSet("[[[\\P{Hangul}] & [\\p{DecompositionType=Canonical}]] - [\uF900-\uFAFF]]"); 461 UnicodeSet decompSet = new UnicodeSet("[[\\p{DecompositionType=Canonical}] & [\\P{FullCompositionExclusion}] & [\\P{Hangul}]]"); 462 CanonicalCharacterData data = CanonicalCharacterData.factory(decompSet); 463 ClassTable classTable = new ClassTable(); 464 465 LookupList lookupList = new LookupList(); 466 FeatureList featureList = new FeatureList(); 467 ScriptList scriptList = new ScriptList(); 468 469 // build common, inherited lookups... 470 // int[] commonLookups = buildLookups(data, lookupList, UScript.COMMON); 471 // int[] inheritedLookups = buildLookups(data, lookupList, UScript.INHERITED); 472 473 for (int script = 0; script < UScript.CODE_LIMIT; script += 1) { 474 475 // This is a bit lame, but it's the only way I can think of 476 // to make this work w/o knowing the values of COMMON and INHERITED... 477 if (script == UScript.COMMON || script == UScript.INHERITED || 478 data.getMaxEquivalents(script) == 0) { 479 continue; 480 } 481 482 int[] lookups = buildLookups(data, lookupList, script); 483 484 Feature ccmp = new Feature("ccmp"); 485 486 addLookups(ccmp, lookups); 487 // addLookups(ccmp, commonLookups); 488 // addLookups(ccmp, inheritedLookups); 489 490 featureList.addFeature(ccmp); 491 492 String scriptTag = TagUtilities.tagLabel(UScript.getShortName(script)); 493 494 scriptList.addFeature(scriptTag, "(default)", ccmp); 495 496 if (script == UScript.ARABIC) { 497 buildArabicTables(scriptList, featureList, lookupList, classTable); 498 } 499 } 500 501 featureList.finalizeFeatureList(); 502 503 ClassTable markClassTable = buildCombiningClassTable(); 504 505 GSUBWriter gsubWriter = new GSUBWriter("Canon", scriptList, featureList, lookupList); 506 GDEFWriter gdefWriter = new GDEFWriter("Canon", classTable, markClassTable); 507 String[] includeFiles = {"LETypes.h", "CanonShaping.h"}; 508 509 LigatureModuleWriter writer = new LigatureModuleWriter(); 510 511 writer.openFile(fileName); 512 writer.writeHeader(null, includeFiles); 513 writer.writeTable(gsubWriter); 514 writer.writeTable(gdefWriter); 515 writer.writeTrailer(); 516 writer.closeFile(); 517 } 518 main(String[] args)519 public static void main(String[] args) 520 { 521 buildDecompTables(args[0]); 522 } 523 } 524