1 /** 2 ******************************************************************************* 3 * Copyright (C) 2002-2010, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 8 package com.ibm.icu.dev.tool.layout; 9 10 import java.util.Vector; 11 12 import com.ibm.icu.impl.Utility; 13 import com.ibm.icu.lang.UCharacter; 14 import com.ibm.icu.lang.UScript; 15 import com.ibm.icu.text.CanonicalIterator; 16 import com.ibm.icu.text.UTF16; 17 import com.ibm.icu.text.UnicodeSet; 18 19 public class CanonicalCharacterData 20 { 21 private static int THRESHOLD = 4; 22 23 public class Record 24 { 25 // TODO: might want to save arrays of Char32's rather than UTF16 strings... Record(int character, int script)26 Record(int character, int script) 27 { 28 String char32 = UCharacter.toString(character); 29 CanonicalIterator iterator = new CanonicalIterator(char32); 30 Vector equivs = new Vector(); 31 32 composed = character; 33 34 for (String equiv = iterator.next(); equiv != null; equiv = iterator.next()) { 35 // Skip all equivalents of length 1; it's either the original 36 // characeter or something like Angstrom for A-Ring, which we don't care about 37 if (UTF16.countCodePoint(equiv) > 1) { 38 equivs.add(equiv); 39 } 40 } 41 42 int nEquivalents = equivs.size(); 43 44 if (nEquivalents > maxEquivalents[script]) { 45 maxEquivalents[script] = nEquivalents; 46 } 47 48 if (nEquivalents > 0) { 49 equivalents = new String[nEquivalents]; 50 51 if (nEquivalents > THRESHOLD) { 52 dumpEquivalents(character, equivs); 53 } 54 55 sortEquivalents(equivalents, equivs); 56 } 57 } 58 getComposedCharacter()59 public int getComposedCharacter() 60 { 61 return composed; 62 } 63 countEquivalents()64 public int countEquivalents() 65 { 66 if (equivalents == null) { 67 return 0; 68 } 69 70 return equivalents.length; 71 } 72 getEquivalents()73 public String[] getEquivalents() 74 { 75 return equivalents; 76 } 77 getEquivalent(int index)78 public String getEquivalent(int index) 79 { 80 if (equivalents == null || index < 0 || index >= equivalents.length) { 81 return null; 82 } 83 84 return equivalents[index]; 85 } 86 dumpEquivalents(int character, Vector equivs)87 private void dumpEquivalents(int character, Vector equivs) 88 { 89 int count = equivs.size(); 90 91 System.out.println(Utility.hex(character, 6) + " - " + count + ":"); 92 93 for (int i = 0; i < count; i += 1) { 94 String equiv = (String) equivs.elementAt(i); 95 int codePoints = UTF16.countCodePoint(equiv); 96 97 for (int c = 0; c < codePoints; c += 1) { 98 if (c > 0) { 99 System.out.print(" "); 100 } 101 102 System.out.print(Utility.hex(UTF16.charAt(equiv, c), 6)); 103 } 104 105 System.out.println(); 106 } 107 108 System.out.println(); 109 } 110 111 private int composed; 112 private String[] equivalents = null; 113 } 114 CanonicalCharacterData()115 public CanonicalCharacterData() 116 { 117 // nothing to do... 118 } 119 add(int character)120 public void add(int character) 121 { 122 int script = UScript.getScript(character); 123 Vector recordVector = recordVectors[script]; 124 125 if (recordVector == null) { 126 recordVector = recordVectors[script] = new Vector(); 127 } 128 129 recordVector.add(new Record(character, script)); 130 } 131 getMaxEquivalents(int script)132 public int getMaxEquivalents(int script) 133 { 134 if (script < 0 || script >= UScript.CODE_LIMIT) { 135 return 0; 136 } 137 138 return maxEquivalents[script]; 139 } 140 getRecord(int script, int index)141 public Record getRecord(int script, int index) 142 { 143 if (script < 0 || script >= UScript.CODE_LIMIT) { 144 return null; 145 } 146 147 Vector recordVector = recordVectors[script]; 148 149 if (recordVector == null || index < 0 || index >= recordVector.size()) { 150 return null; 151 } 152 153 return (Record) recordVector.elementAt(index); 154 } 155 countRecords(int script)156 public int countRecords(int script) 157 { 158 if (script < 0 || script >= UScript.CODE_LIMIT || 159 recordVectors[script] == null) { 160 return 0; 161 } 162 163 return recordVectors[script].size(); 164 } 165 factory(UnicodeSet characterSet)166 public static CanonicalCharacterData factory(UnicodeSet characterSet) 167 { 168 int charCount = characterSet.size(); 169 CanonicalCharacterData data = new CanonicalCharacterData(); 170 171 System.out.println("There are " + charCount + " characters with a canonical decomposition."); 172 173 for (int i = 0; i < charCount; i += 1) { 174 data.add(characterSet.charAt(i)); 175 } 176 177 return data; 178 } 179 compareEquivalents(String a, String b)180 private static int compareEquivalents(String a, String b) 181 { 182 int result = UTF16.countCodePoint(a) - UTF16.countCodePoint(b); 183 184 if (result == 0) { 185 return a.compareTo(b); 186 } 187 188 return result; 189 } 190 191 // 192 // Straight insertion sort from Knuth vol. III, pg. 81 193 // sortEquivalents(String[] equivalents, Vector unsorted)194 private static void sortEquivalents(String[] equivalents, Vector unsorted) 195 { 196 int nEquivalents = equivalents.length; 197 198 for (int e = 0; e < nEquivalents; e += 1) { 199 String v = (String) unsorted.elementAt(e); 200 int i; 201 202 for (i = e - 1; i >= 0; i -= 1) { 203 if (compareEquivalents(v, equivalents[i]) >= 0) { 204 break; 205 } 206 207 equivalents[i + 1] = equivalents[i]; 208 } 209 210 equivalents[i + 1] = v; 211 } 212 } 213 214 private Vector recordVectors[] = new Vector[UScript.CODE_LIMIT]; 215 private int maxEquivalents[] = new int[UScript.CODE_LIMIT]; 216 217 } 218