1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2002-2010, International Business Machines Corporation and    *
6  * others. All Rights Reserved.                                                *
7  *******************************************************************************
8  */
9 
10 
11 package com.ibm.icu.dev.tool.layout;
12 
13 import com.ibm.icu.lang.UCharacter;
14 import com.ibm.icu.lang.UScript;
15 import com.ibm.icu.text.UTF16;
16 import com.ibm.icu.text.UnicodeSet;
17 
18 /**
19  * @author Eric Mader
20  *
21  * Notes:
22  *
23  * The property \p{Decomposition_Type=Canonical} will match all characters with a canonical
24  * decomposition.
25  *
26  * So "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]"
27  * will match all Latin, Greek and Cyrillic characters with a canonical decomposition.
28  *
 * Are these three scripts enough? Do we want to collect them all at once and distribute by script,
 * or process them one script at a time? It's probably a good idea to build a single table for
 * however many scripts there are.
32  *
33  * It might be better to collect all the characters that have a canonical decomposition and just
34  * sort them into however many scripts there are... unless we'll get characters in COMMON???
35  */
36 public class CanonGSUBBuilder
37 {
convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable)38     static public String convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable)
39     {
40         int leftType  = ArabicShaping.VALUE_NONE;
41         int rightType = ArabicShaping.VALUE_NONE;
42 
43         switch (type) {
44             case UCharacter.DecompositionType.ISOLATED:
45                 break;
46 
47             case UCharacter.DecompositionType.FINAL:
48                 rightType = ArabicShaping.VALUE_LEFT;
49                 break;
50 
51             case UCharacter.DecompositionType.INITIAL:
52                 leftType = ArabicShaping.VALUE_RIGHT;
53                 break;
54 
55             case UCharacter.DecompositionType.MEDIAL:
56                rightType = ArabicShaping.VALUE_LEFT;
57                leftType  = ArabicShaping.VALUE_RIGHT;
58                break;
59 
60            default:
61                return decomp + UCharacter.toString(ligature);
62         }
63 
64         char[] chars = decomp.toCharArray();
65 
66         ArabicShaping.shape(chars, leftType, rightType, isolClassTable);
67 
68         return new String(chars) + UCharacter.toString(ligature);
69     }
70 
buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable, ClassTable finaClassTable, ClassTable isolClassTable)71     static void buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable,
72                                      ClassTable finaClassTable, ClassTable isolClassTable)
73     {
74         System.out.print("Finding Arabic contextual forms... ");
75 
76         for (int i = 0; i < data.countRecords(); i += 1) {
77             ArabicCharacterData.Record record = data.getRecord(i);
78             String decomposition = record.getDecomposition();
79 
80             if (decomposition != null && decomposition.length() == 1) {
81                 int contextual = record.getCodePoint();
82                 int isolated   = UTF16.charAt(record.getDecomposition(), 0);
83 
84                 switch (record.getDecompositionType()) {
85                 case UCharacter.DecompositionType.INITIAL:
86                     initClassTable.addMapping(isolated, contextual);
87                     break;
88 
89                 case UCharacter.DecompositionType.MEDIAL:
90                     mediClassTable.addMapping(isolated, contextual);
91                     break;
92 
93                case UCharacter.DecompositionType.FINAL:
94                    finaClassTable.addMapping(isolated, contextual);
95                    break;
96 
97                case UCharacter.DecompositionType.ISOLATED:
98                    isolClassTable.addMapping(isolated, contextual);
99                    break;
100 
101                default:
102                    // issue some error message?
103                    break;
104                 }
105             }
106         }
107 
108         System.out.println("Done.");
109     }
110 
buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable)111     static LigatureTree buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable)
112     {
113         LigatureTree contextualTree = new LigatureTree();
114         int ligatureCount = 0;
115 
116         System.out.print("Building Arabic ligature tree... ");
117 
118         for (int i = 0; i < data.countRecords(); i += 1) {
119             ArabicCharacterData.Record record = data.getRecord(i);
120             String decomposition = record.getDecomposition();
121 
122             if (decomposition != null && decomposition.length() > 1) {
123                 int ligature   = record.getCodePoint();
124                 int decompType = record.getDecompositionType();
125 
126                 switch (decompType) {
127                 case UCharacter.DecompositionType.FINAL:
128                 case UCharacter.DecompositionType.INITIAL:
129                 case UCharacter.DecompositionType.MEDIAL:
130                 case UCharacter.DecompositionType.ISOLATED:
131                     contextualTree.insert(convertArabicString(decompType, ligature, decomposition, isolClassTable));
132                     ligatureCount += 1;
133                     break;
134 
135                 case UCharacter.DecompositionType.CANONICAL:
136                     //cannonicalTree.insert(decomposition + UCharacter.toString(ligature));
137                     break;
138                 }
139             }
140         }
141 
142         System.out.println(ligatureCount + " ligatures.");
143 
144         return contextualTree;
145     }
146 
147     static final int SIMPLE_GLYPH = 1;
148     static final int LIGATURE_GLYPH = 2;
149     static final int MARK_GLYPH = 3;
150     static final int COMPONENT_GLYPH = 4;
151 
152     static final int categoryClassMap[] = {
153     0,              // UNASSIGNED
154     SIMPLE_GLYPH,   // UPPERCASE_LETTER
155     SIMPLE_GLYPH,   // LOWERCASE_LETTER
156     SIMPLE_GLYPH,   // TITLECASE_LETTER
157     SIMPLE_GLYPH,   // MODIFIER_LETTER
158     SIMPLE_GLYPH,   // OTHER_LETTER
159     MARK_GLYPH,     // NON_SPACING_MARK
160     MARK_GLYPH,     // ENCLOSING_MARK ??
161     MARK_GLYPH,     // COMBINING_SPACING_MARK ??
162     SIMPLE_GLYPH,   // DECIMAL_NUMBER
163     SIMPLE_GLYPH,   // LETTER_NUMBER
164     SIMPLE_GLYPH,   // OTHER_NUMBER;
165     0,              // SPACE_SEPARATOR
166     0,              // LINE_SEPARATOR
167     0,              // PARAGRAPH_SEPARATOR
168     0,              // CONTROL
169     0,              // FORMAT
170     0,              // PRIVATE_USE
171     0,              // SURROGATE
172     SIMPLE_GLYPH,   // DASH_PUNCTUATION
173     SIMPLE_GLYPH,   // START_PUNCTUATION
174     SIMPLE_GLYPH,   // END_PUNCTUATION
175     SIMPLE_GLYPH,   // CONNECTOR_PUNCTUATION
176     SIMPLE_GLYPH,   // OTHER_PUNCTUATION
177     SIMPLE_GLYPH,   // MATH_SYMBOL;
178     SIMPLE_GLYPH,   // CURRENCY_SYMBOL
179     SIMPLE_GLYPH,   // MODIFIER_SYMBOL
180     SIMPLE_GLYPH,   // OTHER_SYMBOL
181     SIMPLE_GLYPH,   // INITIAL_PUNCTUATION
182     SIMPLE_GLYPH    // FINAL_PUNCTUATION
183     };
184 
getGlyphClass(ArabicCharacterData.Record record)185     static int getGlyphClass(ArabicCharacterData.Record record)
186     {
187         String decomp = record.getDecomposition();
188 
189         if (decomp != null && decomp.length() > 1) {
190             return LIGATURE_GLYPH;
191         }
192 
193         return categoryClassMap[record.getGeneralCategory()];
194     }
195 
addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable)196     static void addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable)
197     {
198         System.out.print("Adding Arabic glyph classes... ");
199 
200         for (int i = 0; i < data.countRecords(); i += 1) {
201             ArabicCharacterData.Record record = data.getRecord(i);
202             classTable.addMapping(record.getCodePoint(), getGlyphClass(record));
203         }
204 
205         System.out.println("Done.");
206     }
207 
buildArabicTables(ScriptList scriptList, FeatureList featureList, LookupList lookupList, ClassTable classTable)208     private static void buildArabicTables(ScriptList scriptList, FeatureList featureList,
209                                                 LookupList lookupList, ClassTable classTable) {
210         // TODO: Might want to have the ligature table builder explicitly check for ligatures
211         // which start with space and tatweel rather than pulling them out here...
212         UnicodeSet arabicBlock   = new UnicodeSet("[[\\p{block=Arabic}] & [[:Cf:][:Po:][:So:][:Mn:][:Nd:][:Lm:]]]");
213         UnicodeSet oddLigatures  = new UnicodeSet("[\\uFC5E-\\uFC63\\uFCF2-\\uFCF4\\uFE70-\\uFE7F]");
214         UnicodeSet arabicLetters = new UnicodeSet("[\\p{Arabic}]");
215         ArabicCharacterData arabicData = ArabicCharacterData.factory(arabicLetters.addAll(arabicBlock).removeAll(oddLigatures));
216 
217         addArabicGlyphClasses(arabicData, classTable);
218 
219         ClassTable initClassTable = new ClassTable();
220         ClassTable mediClassTable = new ClassTable();
221         ClassTable finaClassTable = new ClassTable();
222         ClassTable isolClassTable = new ClassTable();
223 
224         buildArabicContextualForms(arabicData, initClassTable, mediClassTable, finaClassTable, isolClassTable);
225         isolClassTable.snapshot();
226         LigatureTree ligaTree = buildArabicLigatureTree(arabicData, isolClassTable);
227 
228         LigatureTreeWalker ligaWalker = new LigatureTreeWalker();
229 
230         ligaTree.walk(ligaWalker);
231 
232         Lookup initLookup, mediLookup, finaLookup, ligaLookup;
233 
234         initLookup = new Lookup(Lookup.GSST_Single, 0);
235         initLookup.addSubtable(initClassTable);
236 
237         mediLookup = new Lookup(Lookup.GSST_Single, 0);
238         mediLookup.addSubtable(mediClassTable);
239 
240         finaLookup = new Lookup(Lookup.GSST_Single, 0);
241         finaLookup.addSubtable(finaClassTable);
242 
243         ligaLookup = new Lookup(Lookup.GSST_Ligature, Lookup.LF_IgnoreMarks);
244         ligaLookup.addSubtable(ligaWalker);
245 
246         Feature init = new Feature("init");
247         Feature medi = new Feature("medi");
248         Feature fina = new Feature("fina");
249         Feature liga = new Feature("liga");
250 
251         init.addLookup(lookupList.addLookup(initLookup));
252         medi.addLookup(lookupList.addLookup(mediLookup));
253         fina.addLookup(lookupList.addLookup(finaLookup));
254         liga.addLookup(lookupList.addLookup(ligaLookup));
255 
256         featureList.addFeature(init);
257         featureList.addFeature(medi);
258         featureList.addFeature(fina);
259         featureList.addFeature(liga);
260 
261         scriptList.addFeature("arab", "(default)", init);
262         scriptList.addFeature("arab", "(default)", medi);
263         scriptList.addFeature("arab", "(default)", fina);
264         scriptList.addFeature("arab", "(default)", liga);
265 
266         System.out.println();
267     }
268 
buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree)269     public static void buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree)
270     {
271         int ligatureCount = 0;
272 
273         System.out.print("building composition ligature tree for " + UScript.getName(script) + "... ");
274 
275         for (int i = 0; i < data.countRecords(script); i += 1) {
276             CanonicalCharacterData.Record record = data.getRecord(script, i);
277             String composed = UCharacter.toString(record.getComposedCharacter());
278 
279             for (int e = 0; e < record.countEquivalents(); e += 1) {
280                 String equivalent = record.getEquivalent(e);
281 
282                 ligatureTree.insert(equivalent + composed);
283                 ligatureCount += 1;
284             }
285         }
286 
287         System.out.println(ligatureCount + " ligatures.");
288     }
289 
buildDecompTables(CanonicalCharacterData data, int script)290     public static DecompTable[] buildDecompTables(CanonicalCharacterData data, int script)
291     {
292         int maxDecompCount = data.getMaxEquivalents(script);
293         DecompTable[] decompTables = new DecompTable[maxDecompCount];
294 
295         System.out.print("Building decompositon tables for " + UScript.getName(script) +
296                          "... total decompositions: " + data.countRecords(script) +
297                          ", max: " + maxDecompCount + "...");
298 
299         for (int i = 0; i < maxDecompCount; i += 1) {
300             DecompTable table = new DecompTable();
301 
302             for (int r = 0; r < data.countRecords(script); r += 1) {
303                 CanonicalCharacterData.Record record = data.getRecord(script, r);
304 
305                 if (record.countEquivalents() > i) {
306                     table.add(record.getComposedCharacter(), record.getEquivalent(i));
307                 }
308             }
309 
310             decompTables[i] = table;
311         }
312 
313         System.out.println(" Done.");
314 
315         return decompTables;
316     }
317 
buildLookups(CanonicalCharacterData data, LookupList lookupList, int script)318     public static int[] buildLookups(CanonicalCharacterData data, LookupList lookupList, int script)
319     {
320         int[] lookups = new int[2];
321 
322         DecompTable[] decompTables = buildDecompTables(data, script);
323 
324         LigatureTree compTree = new LigatureTree();
325 
326         buildLigatureTree(data, script, compTree);
327 
328         System.out.println();
329 
330         LigatureTreeWalker compWalker = new LigatureTreeWalker();
331 
332         compTree.walk(compWalker);
333 
334         Lookup compLookup, dcmpLookup;
335         //int compLookupIndex, dcmpLookupIndex;
336 
337         compLookup = new Lookup(Lookup.GSST_Ligature, 0);
338         compLookup.addSubtable(compWalker);
339 
340         dcmpLookup = new Lookup(Lookup.GSST_Multiple, 0);
341         for (int i = 0; i < decompTables.length; i += 1) {
342             dcmpLookup.addSubtable(decompTables[i]);
343         }
344 
345         lookups[0] = lookupList.addLookup(compLookup);
346         lookups[1] = lookupList.addLookup(dcmpLookup);
347 
348         return lookups;
349     }
350 
addLookups(Feature feature, int[] lookups)351     public static void addLookups(Feature feature, int[] lookups)
352     {
353         for (int i = 0; i < lookups.length; i += 1) {
354             feature.addLookup(lookups[i]);
355         }
356     }
357 
358     /*
359      * Hebrew mark order taken from the SBL Hebrew Font manual
360      * Arabic mark order per Thomas Milo: hamza < shadda < combining_alef < sukun, vowel_marks < madda < qur'anic_marks
361      */
buildCombiningClassTable()362     public static ClassTable buildCombiningClassTable()
363     {
364         UnicodeSet markSet = new UnicodeSet("[\\P{CanonicalCombiningClass=0}]");
365         ClassTable exceptions = new ClassTable();
366         ClassTable combiningClasses = new ClassTable();
367         int markCount = markSet.size();
368 
369         exceptions.addMapping(0x05C1,  10); // Point Shin Dot
370         exceptions.addMapping(0x05C2,  11); // Point Sin Dot
371         exceptions.addMapping(0x05BC,  21); // Point Dagesh or Mapiq
372         exceptions.addMapping(0x05BF,  23); // Point Rafe
373         exceptions.addMapping(0x05B9,  27); // Point Holam
374         exceptions.addMapping(0x0323, 220); // Comb. Dot Below (low punctum)
375         exceptions.addMapping(0x0591, 220); // Accent Etnahta
376         exceptions.addMapping(0x0596, 220); // Accent Tipeha
377         exceptions.addMapping(0x059B, 220); // Accent Tevir
378         exceptions.addMapping(0x05A3, 220); // Accent Munah
379         exceptions.addMapping(0x05A4, 220); // Accent Mahapakh
380         exceptions.addMapping(0x05A5, 220); // Accent Merkha
381         exceptions.addMapping(0x05A6, 220); // Accent Merkha Kefula
382         exceptions.addMapping(0x05A7, 220); // Accent Darga
383         exceptions.addMapping(0x05AA, 220); // Accent Yerah Ben Yomo
384         exceptions.addMapping(0x05B0, 220); // Point Sheva
385         exceptions.addMapping(0x05B1, 220); // Point Hataf Segol
386         exceptions.addMapping(0x05B2, 220); // Point Hataf Patah
387         exceptions.addMapping(0x05B3, 220); // Point Hataf Qamats
388         exceptions.addMapping(0x05B4, 220); // Point Hiriq
389         exceptions.addMapping(0x05B5, 220); // Point Tsere
390         exceptions.addMapping(0x05B6, 220); // Point Segol
391         exceptions.addMapping(0x05B7, 220); // Point Patah
392         exceptions.addMapping(0x05B8, 220); // Point Qamats
393         exceptions.addMapping(0x05BB, 220); // Point Qubuts
394         exceptions.addMapping(0x05BD, 220); // Point Meteg
395         exceptions.addMapping(0x059A, 222); // Accent Yetiv
396         exceptions.addMapping(0x05AD, 222); // Accent Dehi
397         exceptions.addMapping(0x05C4, 230); // Mark Upper Dot (high punctum)
398         exceptions.addMapping(0x0593, 230); // Accent Shalshelet
399         exceptions.addMapping(0x0594, 230); // Accent Zaqef Qatan
400         exceptions.addMapping(0x0595, 230); // Accent Zaqef Gadol
401         exceptions.addMapping(0x0597, 230); // Accent Revia
402         exceptions.addMapping(0x0598, 230); // Accent Zarqa
403         exceptions.addMapping(0x059F, 230); // Accent Qarney Para
404         exceptions.addMapping(0x059E, 230); // Accent Gershayim
405         exceptions.addMapping(0x059D, 230); // Accent Geresh Muqdam
406         exceptions.addMapping(0x059C, 230); // Accent Geresh
407         exceptions.addMapping(0x0592, 230); // Accent Segolta
408         exceptions.addMapping(0x05A0, 230); // Accent Telisha Gedola
409         exceptions.addMapping(0x05AC, 230); // Accent Iluy
410         exceptions.addMapping(0x05A8, 230); // Accent Qadma
411         exceptions.addMapping(0x05AB, 230); // Accent Ole
412         exceptions.addMapping(0x05AF, 230); // Mark Masora Circle
413         exceptions.addMapping(0x05A1, 230); // Accent Pazer
414       //exceptions.addMapping(0x0307, 230); // Mark Number/Masora Dot
415         exceptions.addMapping(0x05AE, 232); // Accent Zinor
416         exceptions.addMapping(0x05A9, 232); // Accent Telisha Qetana
417         exceptions.addMapping(0x0599, 232); // Accent Pashta
418 
419         exceptions.addMapping(0x0655,  27); // ARABIC HAMZA BELOW
420         exceptions.addMapping(0x0654,  27); // ARABIC HAMZA ABOVE
421 
422         exceptions.addMapping(0x0651,  28); // ARABIC SHADDA
423 
424         exceptions.addMapping(0x0656,  29); // ARABIC SUBSCRIPT ALEF
425         exceptions.addMapping(0x0670,  29); // ARABIC LETTER SUPERSCRIPT ALEF
426 
427         exceptions.addMapping(0x064D,  30); // ARABIC KASRATAN
428         exceptions.addMapping(0x0650,  30); // ARABIC KASRA
429 
430         exceptions.addMapping(0x0652,  31); // ARABIC SUKUN
431         exceptions.addMapping(0x06E1,  31); // ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
432 
433         exceptions.addMapping(0x064B,  31); // ARABIC FATHATAN
434         exceptions.addMapping(0x064C,  31); // ARABIC DAMMATAN
435         exceptions.addMapping(0x064E,  31); // ARABIC FATHA
436         exceptions.addMapping(0x064F,  31); // ARABIC DAMMA
437         exceptions.addMapping(0x0657,  31); // ARABIC INVERTED DAMMA
438         exceptions.addMapping(0x0658,  31); // ARABIC MARK NOON GHUNNA
439 
440         exceptions.addMapping(0x0653,  32); // ARABIC MADDAH ABOVE
441 
442         exceptions.snapshot();
443 
444         for (int i = 0; i < markCount; i += 1) {
445             int mark = markSet.charAt(i);
446             int markClass = exceptions.getGlyphClassID(mark);
447 
448             if (markClass == 0) {
449                 markClass = UCharacter.getCombiningClass(mark);
450             }
451 
452             combiningClasses.addMapping(mark, markClass);
453         }
454 
455         combiningClasses.snapshot();
456         return combiningClasses;
457     }
458 
buildDecompTables(String fileName)459     public static void buildDecompTables(String fileName)
460     {
461         // F900 - FAFF are compatibility ideographs. They all decompose to a single other character, and can be ignored.
462       //UnicodeSet decompSet = new UnicodeSet("[[[\\P{Hangul}] & [\\p{DecompositionType=Canonical}]] - [\uF900-\uFAFF]]");
463         UnicodeSet decompSet = new UnicodeSet("[[\\p{DecompositionType=Canonical}] & [\\P{FullCompositionExclusion}] & [\\P{Hangul}]]");
464         CanonicalCharacterData data = CanonicalCharacterData.factory(decompSet);
465         ClassTable classTable = new ClassTable();
466 
467         LookupList  lookupList  = new LookupList();
468         FeatureList featureList = new FeatureList();
469         ScriptList  scriptList  = new ScriptList();
470 
471         // build common, inherited lookups...
472 //        int[] commonLookups = buildLookups(data, lookupList, UScript.COMMON);
473 //        int[] inheritedLookups = buildLookups(data, lookupList, UScript.INHERITED);
474 
475         for (int script = 0; script < UScript.CODE_LIMIT; script += 1) {
476 
477             // This is a bit lame, but it's the only way I can think of
478             // to make this work w/o knowing the values of COMMON and INHERITED...
479             if (script == UScript.COMMON || script == UScript.INHERITED ||
480                 data.getMaxEquivalents(script) == 0) {
481                 continue;
482             }
483 
484             int[] lookups = buildLookups(data, lookupList, script);
485 
486             Feature ccmp = new Feature("ccmp");
487 
488             addLookups(ccmp, lookups);
489 //            addLookups(ccmp, commonLookups);
490 //            addLookups(ccmp, inheritedLookups);
491 
492             featureList.addFeature(ccmp);
493 
494             String scriptTag = TagUtilities.tagLabel(UScript.getShortName(script));
495 
496             scriptList.addFeature(scriptTag, "(default)", ccmp);
497 
498             if (script == UScript.ARABIC) {
499                 buildArabicTables(scriptList, featureList, lookupList, classTable);
500             }
501         }
502 
503         featureList.finalizeFeatureList();
504 
505         ClassTable markClassTable = buildCombiningClassTable();
506 
507         GSUBWriter gsubWriter = new GSUBWriter("Canon", scriptList, featureList, lookupList);
508         GDEFWriter gdefWriter = new GDEFWriter("Canon", classTable, markClassTable);
509         String[] includeFiles = {"LETypes.h", "CanonShaping.h"};
510 
511         LigatureModuleWriter writer = new LigatureModuleWriter();
512 
513         writer.openFile(fileName);
514         writer.writeHeader(null, includeFiles);
515         writer.writeTable(gsubWriter);
516         writer.writeTable(gdefWriter);
517         writer.writeTrailer();
518         writer.closeFile();
519     }
520 
main(String[] args)521     public static void main(String[] args)
522     {
523         buildDecompTables(args[0]);
524     }
525 }
526