1 /* 2 ********************************************************************** 3 * Copyright (c) 2006-2007, Google and others. All Rights Reserved. 4 ********************************************************************** 5 * Author: Mark Davis 6 ********************************************************************** 7 */ 8 package org.unicode.cldr.util; 9 10 import java.util.ArrayList; 11 import java.util.Collection; 12 import java.util.Comparator; 13 import java.util.HashSet; 14 import java.util.Iterator; 15 import java.util.List; 16 import java.util.Map; 17 import java.util.Map.Entry; 18 import java.util.Set; 19 import java.util.TreeMap; 20 import java.util.TreeSet; 21 22 import org.unicode.cldr.util.VariantFolder.CaseVariantFolder; 23 24 import com.ibm.icu.dev.util.UnicodeMap; 25 import com.ibm.icu.impl.Utility; 26 import com.ibm.icu.lang.UCharacter; 27 import com.ibm.icu.text.CanonicalIterator; 28 import com.ibm.icu.text.CollationElementIterator; 29 import com.ibm.icu.text.Collator; 30 import com.ibm.icu.text.Normalizer; 31 import com.ibm.icu.text.RawCollationKey; 32 import com.ibm.icu.text.RuleBasedCollator; 33 import com.ibm.icu.text.Transliterator; 34 import com.ibm.icu.text.UTF16; 35 import com.ibm.icu.text.UnicodeSet; 36 import com.ibm.icu.text.UnicodeSetIterator; 37 import com.ibm.icu.util.ULocale; 38 39 public class CollationMapMaker { 40 41 public static final boolean SHOW_DEBUG = false; 42 43 /** 44 * Used to pick the "best" sample from a set for using as the canonical form. 45 * 46 * @author markdavis 47 * 48 */ 49 static class ExemplarComparator implements java.util.Comparator { 50 Comparator comp; 51 52 @Override compare(Object o1, Object o2)53 public int compare(Object o1, Object o2) { 54 String s1 = (String) o1; 55 String s2 = (String) o2; 56 int result; 57 58 // sort normalized first 59 int n1 = Normalizer.isNormalized(s1, Normalizer.DECOMP_COMPAT, 0) ? 0 : 1; 60 int n2 = Normalizer.isNormalized(s2, Normalizer.DECOMP_COMPAT, 0) ? 0 : 1; 61 if ((result = n1 - n2) != 0) { 62 return result; 63 } 64 65 // choose the shortest 66 result = UTF16.countCodePoint(s2) - UTF16.countCodePoint(s1); 67 if (result != 0) { // special fix to make zero be first 68 if (s1.length() == 0) 69 return -1; 70 if (s2.length() == 0) 71 return 1; 72 return result; 73 } 74 result = comp.compare(s1, s2); 75 if (result != 0) 76 return result; 77 return s1.compareTo(s2); 78 } 79 ExemplarComparator(Comparator comp)80 public ExemplarComparator(Comparator comp) { 81 this.comp = comp; 82 } 83 84 } 85 86 public static class Folder implements Cloneable { 87 private UnicodeMap m = new UnicodeMap(); 88 89 @Override clone()90 public Object clone() { 91 try { 92 Folder result = (Folder) super.clone(); 93 result.m = m.cloneAsThawed(); 94 return result; 95 } catch (CloneNotSupportedException e) { 96 throw new InternalError("Clone problem"); 97 } 98 } 99 getValue(int i)100 public Object getValue(int i) { 101 return m.getValue(i); 102 } 103 keySet()104 public UnicodeSet keySet() { 105 return m.keySet(); 106 } 107 put(int cp, String result)108 public Folder put(int cp, String result) { 109 m.put(cp, result); 110 return this; 111 } 112 putAll(UnicodeSet set, String result)113 public Folder putAll(UnicodeSet set, String result) { 114 m.putAll(set, result); 115 return this; 116 } 117 getCharactersMapped()118 public UnicodeSet getCharactersMapped() { 119 return m.keySet(); 120 } 121 minimalize()122 public void minimalize() { 123 UnicodeMap newMap = (m.cloneAsThawed()); 124 125 for (UnicodeSetIterator i = new UnicodeSetIterator(m.keySet()); i.next();) { 126 String s = (String) m.getValue(i.codepoint); 127 newMap.put(i.codepoint, null); 128 String t = newMap.transform(newMap.transform(i.getString())); 129 if (!t.equals(s)) { // restore 130 newMap.put(i.codepoint, s); 131 } 132 } 133 } 134 complete()135 public void complete() { 136 while (fixNeeded()) 137 ; 138 } 139 fixNeeded()140 public boolean fixNeeded() { 141 UnicodeMap newMap = new UnicodeMap(); 142 UnicodeSet values = m.keySet(); 143 boolean changed = false; 144 for (UnicodeSetIterator i = new UnicodeSetIterator(values); i.next();) { 145 String result = (String) m.getValue(i.codepoint); 146 String newResult = fold(result); 147 if (!newResult.equals(result)) { 148 changed = true; 149 if (SHOW_DEBUG) { 150 System.out.println(i.getString()); 151 System.out.println("->\t" + result); 152 System.out.println("=>\t" + newResult); 153 } 154 } 155 newMap.put(i.codepoint, newResult); 156 } 157 m = newMap; 158 return changed; 159 } 160 fold(String s)161 public String fold(String s) { 162 return m.transform(s); 163 } 164 165 /** 166 * Re 167 * 168 * @param toRemove 169 * @param addIdentity 170 * TODO 171 * @param old 172 * @return 173 */ removeEquals(Folder toRemove, boolean addIdentity)174 Folder removeEquals(Folder toRemove, boolean addIdentity) { 175 UnicodeMap result = new UnicodeMap(); 176 for (int i = 0; i <= 0x10FFFF; ++i) { 177 Object x = m.getValue(i); 178 Object y = toRemove.m.getValue(i); 179 if (!UnicodeMap.areEqual(x, y)) { 180 if (x != null) { 181 result.put(i, x); 182 } else if (addIdentity) { 183 result.put(i, UTF16.valueOf(i)); // have to add mapping 184 } 185 } 186 } 187 m = result; 188 return this; 189 } 190 191 static Transliterator FullWidthKana = Transliterator 192 .getInstance("fullwidth-halfwidth; [[:script=katakana:][:script=hangul:]] halfwidth-fullwidth; katakana-hiragana"); 193 getSpecialFolded(String a)194 static String getSpecialFolded(String a) { 195 String result = a; 196 result = Normalizer.normalize(result, Normalizer.NFKC); 197 result = FullWidthKana.transliterate(result); 198 result = UCharacter.foldCase(result, true); 199 result = Normalizer.normalize(result, Normalizer.NFKC); 200 return result; 201 } 202 getUnicodeMap()203 public UnicodeMap getUnicodeMap() { 204 return m.cloneAsThawed(); 205 206 } 207 208 } 209 210 static final boolean showDetails = false; 211 static final RuleBasedCollator uca = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); 212 static final UnicodeSet filteredChars = new UnicodeSet( 213 "[{ss}[^[:Co:][:Cf:][:Cc:][:Cn:][:Cs:][:script=Han:][:script=Hangul:]-[:nfkcquickcheck=no:]]]").freeze(); // skip 214 // a 215 // bunch 216 // of 217 // stuff, 218 // but 219 // include 220 // the 221 // items 222 // that 223 // are 224 // not 225 // nfkc 226 227 RuleBasedCollator equivalenceClassCollator; 228 Comparator exemplarComparator; 229 Map<String, String> reasonMap = new TreeMap(); 230 XEquivalenceMap equivMap = new XEquivalenceMap(); 231 generateCollatorFolding(RuleBasedCollator equivalenceClassCollator, Map<CharSequence, String> mapping)232 public Map<CharSequence, String> generateCollatorFolding(RuleBasedCollator equivalenceClassCollator, 233 Map<CharSequence, String> mapping) { 234 this.equivalenceClassCollator = equivalenceClassCollator; 235 try { 236 RuleBasedCollator exemplarCollator = (RuleBasedCollator) equivalenceClassCollator.clone(); 237 exemplarCollator.setStrength(Collator.IDENTICAL); 238 exemplarCollator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); 239 exemplarCollator.setUpperCaseFirst(false); 240 exemplarCollator.setAlternateHandlingShifted(false); 241 exemplarCollator.setNumericCollation(false); 242 exemplarCollator.setCaseLevel(false); 243 exemplarCollator.setFrenchCollation(false); 244 exemplarCollator.setHiraganaQuaternary(false); 245 exemplarComparator = new ExemplarComparator(exemplarCollator); 246 } catch (CloneNotSupportedException e) { 247 } // will never happen 248 249 UnicodeSet expansions = new UnicodeSet(); 250 UnicodeSet contractions = new UnicodeSet(); 251 try { 252 equivalenceClassCollator.getContractionsAndExpansions(contractions, expansions, true); 253 } catch (Exception e) { 254 } 255 256 UnicodeSet trialCharacters = new UnicodeSet(filteredChars).addAll(equivalenceClassCollator.getTailoredSet()) 257 .addAll(contractions).addAll(expansions); 258 259 for (UnicodeSetIterator i = new UnicodeSetIterator(trialCharacters); i.next();) { 260 String item = i.getString(); 261 addItems(item); 262 } 263 264 for (UnicodeSetIterator i = new UnicodeSetIterator(getFullExpansions()); i.next();) { 265 String item = i.getString(); 266 addItems(item); 267 } 268 269 closeUnderFolding(); 270 271 if (showDetails) Log.getLog().println("Printing Values: " + equivMap.size()); 272 int count = 0; 273 int countMapped = 0; 274 275 // Now process the results to figure out what we need to keep 276 Set<String> values = new TreeSet(exemplarComparator); 277 278 for (Iterator it = equivMap.iterator(); it.hasNext();) { 279 Set<String> values1 = (Set) it.next(); 280 // if there is only one result, drop it 281 if (values1.size() == 1) { 282 if (false && SHOW_DEBUG) { 283 String item = values1.iterator().next(); 284 System.out.println("Skipping: " + item + "\t" 285 + equivalenceClassCollator.getRawCollationKey(item, null)); 286 } 287 continue; 288 } 289 values.clear(); 290 values.addAll(values1); 291 // if (showDetails) Log.getLog().println(bf.showSetNames(values)); 292 Iterator chars = values.iterator(); 293 // the lowest value is the exemplar value, so use it as the base 294 String target = (String) chars.next(); 295 if (SHOW_DEBUG) { 296 System.out.println("Target: <" + target + ">\t " + Utility.hex(target) + "\t" 297 + equivalenceClassCollator.getRawCollationKey(target, null)); 298 } 299 while (chars.hasNext()) { 300 String source = (String) chars.next(); 301 mapping.put(source, target); 302 if (SHOW_DEBUG) { 303 System.out.println("\tSource: <" + source + ">\t " + Utility.hex(source) + "\t" 304 + equivalenceClassCollator.getRawCollationKey(source, null)); 305 } 306 } 307 // for (Iterator it2 = values.iterator(); it.hasNext();) { 308 // bf. 309 // } 310 count++; 311 countMapped += values.size() - 1; 312 } 313 return mapping; 314 // // convert mapping to UnicodeMap 315 // Set problems = new TreeSet(); 316 // Folder folder = new Folder(); 317 // //folder.put('\u2215', "/"); 318 // 319 // for (Iterator it = mapping.keySet().iterator(); it.hasNext();) { 320 // String source = (String) it.next(); 321 // folder.put(UTF16.charAt(source, 0), (String) mapping.get(source)); 322 // } 323 // // redo folder until it stabilizes 324 // // folder.complete(); 325 // 326 // for (Iterator it = problems.iterator(); it.hasNext();) { 327 // String source = (String) it.next(); 328 // String target = folder.fold((String) mapping.get(source)); 329 // String other = folder.fold(source); 330 // if (target.equals(other)) { 331 // it.remove(); 332 // } else { 333 // if (showDetails) Log.getLog().println("Couldn't map source " + Utility.escape(source) 334 // + "\t" + Utility.escape(target) + "\t" + Utility.escape(other)); 335 // } 336 // } 337 // if (showDetails && problems.size() != 0) { 338 // Log.getLog().println("Problems"); 339 // for (Iterator it = problems.iterator(); it.hasNext();) { 340 // String item = (String) it.next(); 341 // //Log.getLog().println(bf.showSetNames(item)); 342 // //Log.getLog().println("\t-" + bf.showSetNames(reasonMap.get(item))); 343 // } 344 // } 345 // 346 // if (showDetails) Log.getLog().println( "\tEquivalence Classes:\t" + count + "\tMapped characters:\t" + 347 // countMapped + "\tProblems:\t" + problems.size()); 348 // return folder; 349 } 350 351 VariantFolder caseFolder = new VariantFolder(new CaseVariantFolder()); 352 353 VariantFolder.AlternateFetcher COLLATION_FETCHER = new VariantFolder.AlternateFetcher() { 354 @Override 355 public Set<String> getAlternates(String item, Set<String> output) { 356 output.add(item); 357 Set set = equivMap.getEquivalences(item); 358 if (set != null) { 359 output.addAll(set); 360 } 361 return output; 362 } 363 }; 364 closeUnderFolding()365 private void closeUnderFolding() { 366 if (false) return; 367 // TODO Generalize 368 Set<String> others = new HashSet<>(); 369 List<Collection<String>> input = new ArrayList<>(); 370 VariantFolder recursiveFolder = new VariantFolder(COLLATION_FETCHER); 371 TreeSet<CharSequence> hack = new TreeSet(); 372 hack.add("aa"); 373 374 while (true) { 375 others.clear(); 376 for (CharSequence item : hack) { // seenSoFar 377 if (item.length() == 1) { 378 continue; 379 } 380 String str = item.toString(); 381 if (UTF16.countCodePoint(str) <= 1) { 382 continue; 383 } 384 Set<String> results = recursiveFolder.getClosure(item.toString()); 385 results.removeAll(seenSoFar); 386 Log.logln(item + "\t" + results); 387 others.addAll(results); 388 } 389 if (others.size() == 0) { 390 break; 391 } 392 for (String item : others) { 393 addToEquiv(item, item); 394 } 395 } 396 } 397 398 private static UnicodeSet fullExpansions = null; 399 getFullExpansions()400 static UnicodeSet getFullExpansions() { 401 if (fullExpansions == null) addExpansionResults(fullExpansions = new UnicodeSet()); 402 return fullExpansions; 403 } 404 addExpansionResults(UnicodeSet fullExpansions)405 private static UnicodeSet addExpansionResults(UnicodeSet fullExpansions) { 406 StringBuffer trialString = new StringBuffer(); 407 Map stringToKey = new TreeMap(); 408 Map keyToString = new TreeMap(); 409 Set nfkc = new HashSet(); 410 for (int i = 0; i < 0x10FFFF; ++i) { 411 int cat = UCharacter.getType(i); 412 if (cat == UCharacter.UNASSIGNED || cat == UCharacter.PRIVATE_USE || cat == UCharacter.SURROGATE) continue; 413 String source = UTF16.valueOf(i); 414 nfkc.add(Normalizer.compose(source, true)); 415 416 CollationElementIterator x = uca.getCollationElementIterator(source); 417 trialString.setLength(0); 418 while (true) { 419 int element = x.next(); 420 if (element == CollationElementIterator.NULLORDER) break; 421 char primaryOrder = (char) CollationElementIterator.primaryOrder(element); 422 if (primaryOrder == 0) continue; 423 trialString.append(primaryOrder); 424 } 425 if (trialString.length() == 0) continue; 426 String key = trialString.toString(); 427 stringToKey.put(source, key); 428 String newSource = (String) keyToString.get(key); 429 if (newSource == null) { 430 keyToString.put(key, source); 431 } 432 } 433 UnicodeSet expansions = new UnicodeSet(); 434 UnicodeSet contractions = new UnicodeSet(); 435 try { 436 uca.getContractionsAndExpansions(contractions, expansions, true); 437 } catch (Exception e1) { 438 throw new IllegalArgumentException(e1); 439 } 440 441 fullExpansions = new UnicodeSet(); 442 global: for (UnicodeSetIterator usi = new UnicodeSetIterator(expansions); usi.next();) { 443 trialString.setLength(0); 444 String source = usi.getString(); 445 String key = (String) stringToKey.get(source); 446 if (key == null || key.length() == 1) continue; 447 main: while (key.length() > 0) { 448 for (Iterator it = keyToString.entrySet().iterator(); it.hasNext();) { 449 Entry e = (Entry) it.next(); 450 String otherKey = (String) e.getKey(); 451 if (key.startsWith(otherKey)) { 452 trialString.append((String) e.getValue()); 453 key = key.substring(otherKey.length()); 454 continue main; 455 } 456 } 457 System.out.println("Failed with: " + source); 458 continue global; 459 } 460 String result = trialString.toString(); 461 if (contractions.contains(result) || nfkc.contains(result)) { 462 continue global; 463 } 464 if (SHOW_DEBUG & false) { 465 System.out.println("Adding: " + usi.getString() + "\t=>\t" + trialString); 466 } 467 fullExpansions.add(result); 468 } 469 fullExpansions.freeze(); 470 return fullExpansions; 471 } 472 473 CanonicalIterator canonicalIterator = new CanonicalIterator(""); 474 475 /** 476 * Adds items, looking for all canonically equivalent strings as well. 477 * 478 * @param item 479 */ addItems(String item)480 private void addItems(String item) { 481 addItems2(item, item); 482 String minNFKD = getMinimalNKFD(item, equivalenceClassCollator); 483 if (!minNFKD.equals(item)) { 484 addItems2(minNFKD, item); 485 } 486 canonicalIterator.setSource(item); 487 for (String nextItem = canonicalIterator.next(); nextItem != null; nextItem = canonicalIterator.next()) { 488 addItems2(nextItem, item); 489 } 490 } 491 492 /** 493 * Adds items, looking for all case-equivalent strings as well. 494 * 495 * @param item 496 * @param original 497 */ addItems2(String item, String original)498 private void addItems2(String item, String original) { 499 addItems3(item, original); 500 for (String nextItem : caseFolder.getClosure(item)) { 501 addItems3(nextItem, original); 502 } 503 } 504 addItems3(String item, String original)505 private void addItems3(String item, String original) { 506 addToEquiv(item, original); 507 canonicalIterator.setSource(item); 508 for (String newItem = canonicalIterator.next(); newItem != null; newItem = canonicalIterator.next()) { 509 addToEquiv(newItem, original); 510 } 511 } 512 513 Set<CharSequence> seenSoFar = new TreeSet<>(); 514 addToEquiv(String item, String original)515 private void addToEquiv(String item, String original) { 516 if (item.equals("aA")) { 517 System.out.println("ouch"); 518 } 519 if (seenSoFar.contains(item)) { 520 return; 521 } 522 seenSoFar.add(item); 523 // String norm = Normalizer.compose(item, true); 524 // if (UTF16.countCodePoint(norm) < UTF16.countCodePoint(item)) { 525 // item = norm; 526 // } 527 RawCollationKey k = equivalenceClassCollator.getRawCollationKey(item, null); 528 equivMap.add(item, k); 529 reasonMap.put(item, original); 530 } 531 532 static UnicodeSet spaceTatweelAndNSM = new UnicodeSet("[\\u0020\\u0640[:Mn:][:Me:]]"); 533 static UnicodeSet NSM = new UnicodeSet("[[:Mn:][:Me:]]"); 534 535 /** 536 * Return the minimal NFKD string that has the same uca key 537 * 538 * @param item 539 * @param k 540 * @param ucaWeak 541 * @return 542 */ getMinimalNKFD(String item, Collator ucaWeak)543 private String getMinimalNKFD(String item, Collator ucaWeak) { 544 String nfkd = com.ibm.icu.text.Normalizer.decompose(item, true); 545 if (nfkd.startsWith(" ")) { 546 if (spaceTatweelAndNSM.containsAll(nfkd)) { 547 return item; // fails 548 } 549 } 550 String start = ""; 551 String end = nfkd; 552 while (end.length() != 0) { 553 int cp = UTF16.charAt(end, 0); 554 String tryEnd = end.substring(UTF16.getCharCount(cp)); 555 String trial = start + tryEnd; 556 if (!ucaWeak.equals(trial, item)) { // retain character 557 start += UTF16.valueOf(cp); 558 } 559 end = tryEnd; 560 } 561 return start; 562 } 563 564 } 565