1 /**
2  *******************************************************************************
3  * Copyright (C) 2002-2010, International Business Machines Corporation and    *
4  * others. All Rights Reserved.                                                *
5  *******************************************************************************
6  */
7 
8 package com.ibm.icu.dev.tool.layout;
9 
10 import java.util.Vector;
11 
12 import com.ibm.icu.impl.Utility;
13 import com.ibm.icu.lang.UCharacter;
14 import com.ibm.icu.lang.UScript;
15 import com.ibm.icu.text.CanonicalIterator;
16 import com.ibm.icu.text.UTF16;
17 import com.ibm.icu.text.UnicodeSet;
18 
19 public class CanonicalCharacterData
20 {
21     private static int THRESHOLD = 4;
22 
23     public class Record
24     {
25         // TODO: might want to save arrays of Char32's rather than UTF16 strings...
Record(int character, int script)26         Record(int character, int script)
27         {
28             String char32 = UCharacter.toString(character);
29             CanonicalIterator iterator = new CanonicalIterator(char32);
30             Vector equivs = new Vector();
31 
32             composed = character;
33 
34             for (String equiv = iterator.next(); equiv != null; equiv = iterator.next()) {
35                 // Skip all equivalents of length 1; it's either the original
36                 // characeter or something like Angstrom for A-Ring, which we don't care about
37                 if (UTF16.countCodePoint(equiv) > 1) {
38                     equivs.add(equiv);
39                 }
40             }
41 
42             int nEquivalents = equivs.size();
43 
44             if (nEquivalents > maxEquivalents[script]) {
45                 maxEquivalents[script] = nEquivalents;
46             }
47 
48             if (nEquivalents > 0) {
49                 equivalents = new String[nEquivalents];
50 
51                 if (nEquivalents > THRESHOLD) {
52                     dumpEquivalents(character, equivs);
53                 }
54 
55                 sortEquivalents(equivalents, equivs);
56             }
57         }
58 
getComposedCharacter()59         public int getComposedCharacter()
60         {
61             return composed;
62         }
63 
countEquivalents()64         public int countEquivalents()
65         {
66             if (equivalents == null) {
67                 return 0;
68             }
69 
70             return equivalents.length;
71         }
72 
getEquivalents()73         public String[] getEquivalents()
74         {
75             return equivalents;
76         }
77 
getEquivalent(int index)78         public String getEquivalent(int index)
79         {
80             if (equivalents == null || index < 0 || index >= equivalents.length) {
81                 return null;
82             }
83 
84             return equivalents[index];
85         }
86 
dumpEquivalents(int character, Vector equivs)87         private void dumpEquivalents(int character, Vector equivs)
88         {
89             int count = equivs.size();
90 
91             System.out.println(Utility.hex(character, 6) + " - " + count + ":");
92 
93             for (int i = 0; i < count; i += 1) {
94                 String equiv = (String) equivs.elementAt(i);
95                 int codePoints = UTF16.countCodePoint(equiv);
96 
97                 for (int c = 0; c < codePoints; c += 1) {
98                     if (c > 0) {
99                         System.out.print(" ");
100                     }
101 
102                     System.out.print(Utility.hex(UTF16.charAt(equiv, c), 6));
103                 }
104 
105                 System.out.println();
106             }
107 
108             System.out.println();
109         }
110 
111         private int composed;
112         private String[] equivalents = null;
113     }
114 
CanonicalCharacterData()115     public CanonicalCharacterData()
116     {
117         // nothing to do...
118     }
119 
add(int character)120     public void add(int character)
121     {
122         int script = UScript.getScript(character);
123         Vector recordVector = recordVectors[script];
124 
125         if (recordVector == null) {
126             recordVector = recordVectors[script] = new Vector();
127         }
128 
129         recordVector.add(new Record(character, script));
130     }
131 
getMaxEquivalents(int script)132     public int getMaxEquivalents(int script)
133     {
134         if (script < 0 || script >= UScript.CODE_LIMIT) {
135             return 0;
136         }
137 
138         return maxEquivalents[script];
139     }
140 
getRecord(int script, int index)141     public Record getRecord(int script, int index)
142     {
143         if (script < 0 || script >= UScript.CODE_LIMIT) {
144             return null;
145         }
146 
147         Vector recordVector = recordVectors[script];
148 
149         if (recordVector == null || index < 0 || index >= recordVector.size()) {
150             return null;
151         }
152 
153         return (Record) recordVector.elementAt(index);
154     }
155 
countRecords(int script)156     public int countRecords(int script)
157     {
158         if (script < 0 || script >= UScript.CODE_LIMIT ||
159             recordVectors[script] == null) {
160             return 0;
161         }
162 
163         return recordVectors[script].size();
164     }
165 
factory(UnicodeSet characterSet)166     public static CanonicalCharacterData factory(UnicodeSet characterSet)
167     {
168         int charCount = characterSet.size();
169         CanonicalCharacterData data = new CanonicalCharacterData();
170 
171         System.out.println("There are " + charCount + " characters with a canonical decomposition.");
172 
173         for (int i = 0; i < charCount; i += 1) {
174             data.add(characterSet.charAt(i));
175         }
176 
177         return data;
178     }
179 
compareEquivalents(String a, String b)180     private static int compareEquivalents(String a, String b)
181     {
182         int result = UTF16.countCodePoint(a) - UTF16.countCodePoint(b);
183 
184         if (result == 0) {
185             return a.compareTo(b);
186         }
187 
188         return result;
189     }
190 
191     //
192     // Straight insertion sort from Knuth vol. III, pg. 81
193     //
sortEquivalents(String[] equivalents, Vector unsorted)194     private static void sortEquivalents(String[] equivalents, Vector unsorted)
195     {
196         int nEquivalents = equivalents.length;
197 
198         for (int e = 0; e < nEquivalents; e += 1) {
199             String v = (String) unsorted.elementAt(e);
200             int i;
201 
202             for (i = e - 1; i >= 0; i -= 1) {
203                 if (compareEquivalents(v, equivalents[i]) >= 0) {
204                   break;
205                 }
206 
207                 equivalents[i + 1] = equivalents[i];
208             }
209 
210             equivalents[i + 1] = v;
211        }
212     }
213 
214     private Vector recordVectors[] = new Vector[UScript.CODE_LIMIT];
215     private int maxEquivalents[] = new int[UScript.CODE_LIMIT];
216 
217 }
218