1 /*
2  **********************************************************************
3  * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
4  **********************************************************************
5  * Author: Mark Davis
6  **********************************************************************
7  */
8 package org.unicode.cldr.util;
9 
10 import java.util.ArrayList;
11 import java.util.Collection;
12 import java.util.Comparator;
13 import java.util.HashSet;
14 import java.util.Iterator;
15 import java.util.List;
16 import java.util.Map;
17 import java.util.Map.Entry;
18 import java.util.Set;
19 import java.util.TreeMap;
20 import java.util.TreeSet;
21 
22 import org.unicode.cldr.util.VariantFolder.CaseVariantFolder;
23 
24 import com.ibm.icu.dev.util.UnicodeMap;
25 import com.ibm.icu.impl.Utility;
26 import com.ibm.icu.lang.UCharacter;
27 import com.ibm.icu.text.CanonicalIterator;
28 import com.ibm.icu.text.CollationElementIterator;
29 import com.ibm.icu.text.Collator;
30 import com.ibm.icu.text.Normalizer;
31 import com.ibm.icu.text.RawCollationKey;
32 import com.ibm.icu.text.RuleBasedCollator;
33 import com.ibm.icu.text.Transliterator;
34 import com.ibm.icu.text.UTF16;
35 import com.ibm.icu.text.UnicodeSet;
36 import com.ibm.icu.text.UnicodeSetIterator;
37 import com.ibm.icu.util.ULocale;
38 
39 public class CollationMapMaker {
40 
41     public static final boolean SHOW_DEBUG = false;
42 
43     /**
44      * Used to pick the "best" sample from a set for using as the canonical form.
45      *
46      * @author markdavis
47      *
48      */
49     static class ExemplarComparator implements java.util.Comparator {
50         Comparator comp;
51 
52         @Override
compare(Object o1, Object o2)53         public int compare(Object o1, Object o2) {
54             String s1 = (String) o1;
55             String s2 = (String) o2;
56             int result;
57 
58             // sort normalized first
59             int n1 = Normalizer.isNormalized(s1, Normalizer.DECOMP_COMPAT, 0) ? 0 : 1;
60             int n2 = Normalizer.isNormalized(s2, Normalizer.DECOMP_COMPAT, 0) ? 0 : 1;
61             if ((result = n1 - n2) != 0) {
62                 return result;
63             }
64 
65             // choose the shortest
66             result = UTF16.countCodePoint(s2) - UTF16.countCodePoint(s1);
67             if (result != 0) { // special fix to make zero be first
68                 if (s1.length() == 0)
69                     return -1;
70                 if (s2.length() == 0)
71                     return 1;
72                 return result;
73             }
74             result = comp.compare(s1, s2);
75             if (result != 0)
76                 return result;
77             return s1.compareTo(s2);
78         }
79 
ExemplarComparator(Comparator comp)80         public ExemplarComparator(Comparator comp) {
81             this.comp = comp;
82         }
83 
84     }
85 
86     public static class Folder implements Cloneable {
87         private UnicodeMap m = new UnicodeMap();
88 
89         @Override
clone()90         public Object clone() {
91             try {
92                 Folder result = (Folder) super.clone();
93                 result.m = m.cloneAsThawed();
94                 return result;
95             } catch (CloneNotSupportedException e) {
96                 throw new InternalError("Clone problem");
97             }
98         }
99 
getValue(int i)100         public Object getValue(int i) {
101             return m.getValue(i);
102         }
103 
keySet()104         public UnicodeSet keySet() {
105             return m.keySet();
106         }
107 
put(int cp, String result)108         public Folder put(int cp, String result) {
109             m.put(cp, result);
110             return this;
111         }
112 
putAll(UnicodeSet set, String result)113         public Folder putAll(UnicodeSet set, String result) {
114             m.putAll(set, result);
115             return this;
116         }
117 
getCharactersMapped()118         public UnicodeSet getCharactersMapped() {
119             return m.keySet();
120         }
121 
minimalize()122         public void minimalize() {
123             UnicodeMap newMap = (m.cloneAsThawed());
124 
125             for (UnicodeSetIterator i = new UnicodeSetIterator(m.keySet()); i.next();) {
126                 String s = (String) m.getValue(i.codepoint);
127                 newMap.put(i.codepoint, null);
128                 String t = newMap.transform(newMap.transform(i.getString()));
129                 if (!t.equals(s)) { // restore
130                     newMap.put(i.codepoint, s);
131                 }
132             }
133         }
134 
complete()135         public void complete() {
136             while (fixNeeded())
137                 ;
138         }
139 
fixNeeded()140         public boolean fixNeeded() {
141             UnicodeMap newMap = new UnicodeMap();
142             UnicodeSet values = m.keySet();
143             boolean changed = false;
144             for (UnicodeSetIterator i = new UnicodeSetIterator(values); i.next();) {
145                 String result = (String) m.getValue(i.codepoint);
146                 String newResult = fold(result);
147                 if (!newResult.equals(result)) {
148                     changed = true;
149                     if (SHOW_DEBUG) {
150                         System.out.println(i.getString());
151                         System.out.println("->\t" + result);
152                         System.out.println("=>\t" + newResult);
153                     }
154                 }
155                 newMap.put(i.codepoint, newResult);
156             }
157             m = newMap;
158             return changed;
159         }
160 
fold(String s)161         public String fold(String s) {
162             return m.transform(s);
163         }
164 
165         /**
166          * Re
167          *
168          * @param toRemove
169          * @param addIdentity
170          *            TODO
171          * @param old
172          * @return
173          */
removeEquals(Folder toRemove, boolean addIdentity)174         Folder removeEquals(Folder toRemove, boolean addIdentity) {
175             UnicodeMap result = new UnicodeMap();
176             for (int i = 0; i <= 0x10FFFF; ++i) {
177                 Object x = m.getValue(i);
178                 Object y = toRemove.m.getValue(i);
179                 if (!UnicodeMap.areEqual(x, y)) {
180                     if (x != null) {
181                         result.put(i, x);
182                     } else if (addIdentity) {
183                         result.put(i, UTF16.valueOf(i)); // have to add mapping
184                     }
185                 }
186             }
187             m = result;
188             return this;
189         }
190 
191         static Transliterator FullWidthKana = Transliterator
192             .getInstance("fullwidth-halfwidth; [[:script=katakana:][:script=hangul:]] halfwidth-fullwidth; katakana-hiragana");
193 
getSpecialFolded(String a)194         static String getSpecialFolded(String a) {
195             String result = a;
196             result = Normalizer.normalize(result, Normalizer.NFKC);
197             result = FullWidthKana.transliterate(result);
198             result = UCharacter.foldCase(result, true);
199             result = Normalizer.normalize(result, Normalizer.NFKC);
200             return result;
201         }
202 
getUnicodeMap()203         public UnicodeMap getUnicodeMap() {
204             return m.cloneAsThawed();
205 
206         }
207 
208     }
209 
210     static final boolean showDetails = false;
211     static final RuleBasedCollator uca = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
212     static final UnicodeSet filteredChars = new UnicodeSet(
213         "[{ss}[^[:Co:][:Cf:][:Cc:][:Cn:][:Cs:][:script=Han:][:script=Hangul:]-[:nfkcquickcheck=no:]]]").freeze(); // skip
214     // a
215     // bunch
216     // of
217     // stuff,
218     // but
219     // include
220     // the
221     // items
222     // that
223     // are
224     // not
225     // nfkc
226 
227     RuleBasedCollator equivalenceClassCollator;
228     Comparator exemplarComparator;
229     Map<String, String> reasonMap = new TreeMap();
230     XEquivalenceMap equivMap = new XEquivalenceMap();
231 
generateCollatorFolding(RuleBasedCollator equivalenceClassCollator, Map<CharSequence, String> mapping)232     public Map<CharSequence, String> generateCollatorFolding(RuleBasedCollator equivalenceClassCollator,
233         Map<CharSequence, String> mapping) {
234         this.equivalenceClassCollator = equivalenceClassCollator;
235         try {
236             RuleBasedCollator exemplarCollator = (RuleBasedCollator) equivalenceClassCollator.clone();
237             exemplarCollator.setStrength(Collator.IDENTICAL);
238             exemplarCollator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
239             exemplarCollator.setUpperCaseFirst(false);
240             exemplarCollator.setAlternateHandlingShifted(false);
241             exemplarCollator.setNumericCollation(false);
242             exemplarCollator.setCaseLevel(false);
243             exemplarCollator.setFrenchCollation(false);
244             exemplarCollator.setHiraganaQuaternary(false);
245             exemplarComparator = new ExemplarComparator(exemplarCollator);
246         } catch (CloneNotSupportedException e) {
247         } // will never happen
248 
249         UnicodeSet expansions = new UnicodeSet();
250         UnicodeSet contractions = new UnicodeSet();
251         try {
252             equivalenceClassCollator.getContractionsAndExpansions(contractions, expansions, true);
253         } catch (Exception e) {
254         }
255 
256         UnicodeSet trialCharacters = new UnicodeSet(filteredChars).addAll(equivalenceClassCollator.getTailoredSet())
257             .addAll(contractions).addAll(expansions);
258 
259         for (UnicodeSetIterator i = new UnicodeSetIterator(trialCharacters); i.next();) {
260             String item = i.getString();
261             addItems(item);
262         }
263 
264         for (UnicodeSetIterator i = new UnicodeSetIterator(getFullExpansions()); i.next();) {
265             String item = i.getString();
266             addItems(item);
267         }
268 
269         closeUnderFolding();
270 
271         if (showDetails) Log.getLog().println("Printing Values: " + equivMap.size());
272         int count = 0;
273         int countMapped = 0;
274 
275         // Now process the results to figure out what we need to keep
276         Set<String> values = new TreeSet(exemplarComparator);
277 
278         for (Iterator it = equivMap.iterator(); it.hasNext();) {
279             Set<String> values1 = (Set) it.next();
280             // if there is only one result, drop it
281             if (values1.size() == 1) {
282                 if (false && SHOW_DEBUG) {
283                     String item = values1.iterator().next();
284                     System.out.println("Skipping: " + item + "\t"
285                         + equivalenceClassCollator.getRawCollationKey(item, null));
286                 }
287                 continue;
288             }
289             values.clear();
290             values.addAll(values1);
291             // if (showDetails) Log.getLog().println(bf.showSetNames(values));
292             Iterator chars = values.iterator();
293             // the lowest value is the exemplar value, so use it as the base
294             String target = (String) chars.next();
295             if (SHOW_DEBUG) {
296                 System.out.println("Target: <" + target + ">\t " + Utility.hex(target) + "\t"
297                     + equivalenceClassCollator.getRawCollationKey(target, null));
298             }
299             while (chars.hasNext()) {
300                 String source = (String) chars.next();
301                 mapping.put(source, target);
302                 if (SHOW_DEBUG) {
303                     System.out.println("\tSource: <" + source + ">\t " + Utility.hex(source) + "\t"
304                         + equivalenceClassCollator.getRawCollationKey(source, null));
305                 }
306             }
307             // for (Iterator it2 = values.iterator(); it.hasNext();) {
308             // bf.
309             // }
310             count++;
311             countMapped += values.size() - 1;
312         }
313         return mapping;
314         // // convert mapping to UnicodeMap
315         // Set problems = new TreeSet();
316         // Folder folder = new Folder();
317         // //folder.put('\u2215', "/");
318         //
319         // for (Iterator it = mapping.keySet().iterator(); it.hasNext();) {
320         // String source = (String) it.next();
321         // folder.put(UTF16.charAt(source, 0), (String) mapping.get(source));
322         // }
323         // // redo folder until it stabilizes
324         // // folder.complete();
325         //
326         // for (Iterator it = problems.iterator(); it.hasNext();) {
327         // String source = (String) it.next();
328         // String target = folder.fold((String) mapping.get(source));
329         // String other = folder.fold(source);
330         // if (target.equals(other)) {
331         // it.remove();
332         // } else {
333         // if (showDetails) Log.getLog().println("Couldn't map source " + Utility.escape(source)
334         // + "\t" + Utility.escape(target) + "\t" + Utility.escape(other));
335         // }
336         // }
337         // if (showDetails && problems.size() != 0) {
338         // Log.getLog().println("Problems");
339         // for (Iterator it = problems.iterator(); it.hasNext();) {
340         // String item = (String) it.next();
341         // //Log.getLog().println(bf.showSetNames(item));
342         // //Log.getLog().println("\t-" + bf.showSetNames(reasonMap.get(item)));
343         // }
344         // }
345         //
346         // if (showDetails) Log.getLog().println( "\tEquivalence Classes:\t" + count + "\tMapped characters:\t" +
347         // countMapped + "\tProblems:\t" + problems.size());
348         // return folder;
349     }
350 
351     VariantFolder caseFolder = new VariantFolder(new CaseVariantFolder());
352 
353     VariantFolder.AlternateFetcher COLLATION_FETCHER = new VariantFolder.AlternateFetcher() {
354         @Override
355         public Set<String> getAlternates(String item, Set<String> output) {
356             output.add(item);
357             Set set = equivMap.getEquivalences(item);
358             if (set != null) {
359                 output.addAll(set);
360             }
361             return output;
362         }
363     };
364 
closeUnderFolding()365     private void closeUnderFolding() {
366         if (false) return;
367         // TODO Generalize
368         Set<String> others = new HashSet<>();
369         List<Collection<String>> input = new ArrayList<>();
370         VariantFolder recursiveFolder = new VariantFolder(COLLATION_FETCHER);
371         TreeSet<CharSequence> hack = new TreeSet();
372         hack.add("aa");
373 
374         while (true) {
375             others.clear();
376             for (CharSequence item : hack) { // seenSoFar
377                 if (item.length() == 1) {
378                     continue;
379                 }
380                 String str = item.toString();
381                 if (UTF16.countCodePoint(str) <= 1) {
382                     continue;
383                 }
384                 Set<String> results = recursiveFolder.getClosure(item.toString());
385                 results.removeAll(seenSoFar);
386                 Log.logln(item + "\t" + results);
387                 others.addAll(results);
388             }
389             if (others.size() == 0) {
390                 break;
391             }
392             for (String item : others) {
393                 addToEquiv(item, item);
394             }
395         }
396     }
397 
398     private static UnicodeSet fullExpansions = null;
399 
getFullExpansions()400     static UnicodeSet getFullExpansions() {
401         if (fullExpansions == null) addExpansionResults(fullExpansions = new UnicodeSet());
402         return fullExpansions;
403     }
404 
addExpansionResults(UnicodeSet fullExpansions)405     private static UnicodeSet addExpansionResults(UnicodeSet fullExpansions) {
406         StringBuffer trialString = new StringBuffer();
407         Map stringToKey = new TreeMap();
408         Map keyToString = new TreeMap();
409         Set nfkc = new HashSet();
410         for (int i = 0; i < 0x10FFFF; ++i) {
411             int cat = UCharacter.getType(i);
412             if (cat == UCharacter.UNASSIGNED || cat == UCharacter.PRIVATE_USE || cat == UCharacter.SURROGATE) continue;
413             String source = UTF16.valueOf(i);
414             nfkc.add(Normalizer.compose(source, true));
415 
416             CollationElementIterator x = uca.getCollationElementIterator(source);
417             trialString.setLength(0);
418             while (true) {
419                 int element = x.next();
420                 if (element == CollationElementIterator.NULLORDER) break;
421                 char primaryOrder = (char) CollationElementIterator.primaryOrder(element);
422                 if (primaryOrder == 0) continue;
423                 trialString.append(primaryOrder);
424             }
425             if (trialString.length() == 0) continue;
426             String key = trialString.toString();
427             stringToKey.put(source, key);
428             String newSource = (String) keyToString.get(key);
429             if (newSource == null) {
430                 keyToString.put(key, source);
431             }
432         }
433         UnicodeSet expansions = new UnicodeSet();
434         UnicodeSet contractions = new UnicodeSet();
435         try {
436             uca.getContractionsAndExpansions(contractions, expansions, true);
437         } catch (Exception e1) {
438             throw new IllegalArgumentException(e1);
439         }
440 
441         fullExpansions = new UnicodeSet();
442         global: for (UnicodeSetIterator usi = new UnicodeSetIterator(expansions); usi.next();) {
443             trialString.setLength(0);
444             String source = usi.getString();
445             String key = (String) stringToKey.get(source);
446             if (key == null || key.length() == 1) continue;
447             main: while (key.length() > 0) {
448                 for (Iterator it = keyToString.entrySet().iterator(); it.hasNext();) {
449                     Entry e = (Entry) it.next();
450                     String otherKey = (String) e.getKey();
451                     if (key.startsWith(otherKey)) {
452                         trialString.append((String) e.getValue());
453                         key = key.substring(otherKey.length());
454                         continue main;
455                     }
456                 }
457                 System.out.println("Failed with: " + source);
458                 continue global;
459             }
460             String result = trialString.toString();
461             if (contractions.contains(result) || nfkc.contains(result)) {
462                 continue global;
463             }
464             if (SHOW_DEBUG & false) {
465                 System.out.println("Adding: " + usi.getString() + "\t=>\t" + trialString);
466             }
467             fullExpansions.add(result);
468         }
469         fullExpansions.freeze();
470         return fullExpansions;
471     }
472 
473     CanonicalIterator canonicalIterator = new CanonicalIterator("");
474 
475     /**
476      * Adds items, looking for all canonically equivalent strings as well.
477      *
478      * @param item
479      */
addItems(String item)480     private void addItems(String item) {
481         addItems2(item, item);
482         String minNFKD = getMinimalNKFD(item, equivalenceClassCollator);
483         if (!minNFKD.equals(item)) {
484             addItems2(minNFKD, item);
485         }
486         canonicalIterator.setSource(item);
487         for (String nextItem = canonicalIterator.next(); nextItem != null; nextItem = canonicalIterator.next()) {
488             addItems2(nextItem, item);
489         }
490     }
491 
492     /**
493      * Adds items, looking for all case-equivalent strings as well.
494      *
495      * @param item
496      * @param original
497      */
addItems2(String item, String original)498     private void addItems2(String item, String original) {
499         addItems3(item, original);
500         for (String nextItem : caseFolder.getClosure(item)) {
501             addItems3(nextItem, original);
502         }
503     }
504 
addItems3(String item, String original)505     private void addItems3(String item, String original) {
506         addToEquiv(item, original);
507         canonicalIterator.setSource(item);
508         for (String newItem = canonicalIterator.next(); newItem != null; newItem = canonicalIterator.next()) {
509             addToEquiv(newItem, original);
510         }
511     }
512 
513     Set<CharSequence> seenSoFar = new TreeSet<>();
514 
addToEquiv(String item, String original)515     private void addToEquiv(String item, String original) {
516         if (item.equals("aA")) {
517             System.out.println("ouch");
518         }
519         if (seenSoFar.contains(item)) {
520             return;
521         }
522         seenSoFar.add(item);
523         // String norm = Normalizer.compose(item, true);
524         // if (UTF16.countCodePoint(norm) < UTF16.countCodePoint(item)) {
525         // item = norm;
526         // }
527         RawCollationKey k = equivalenceClassCollator.getRawCollationKey(item, null);
528         equivMap.add(item, k);
529         reasonMap.put(item, original);
530     }
531 
532     static UnicodeSet spaceTatweelAndNSM = new UnicodeSet("[\\u0020\\u0640[:Mn:][:Me:]]");
533     static UnicodeSet NSM = new UnicodeSet("[[:Mn:][:Me:]]");
534 
535     /**
536      * Return the minimal NFKD string that has the same uca key
537      *
538      * @param item
539      * @param k
540      * @param ucaWeak
541      * @return
542      */
getMinimalNKFD(String item, Collator ucaWeak)543     private String getMinimalNKFD(String item, Collator ucaWeak) {
544         String nfkd = com.ibm.icu.text.Normalizer.decompose(item, true);
545         if (nfkd.startsWith(" ")) {
546             if (spaceTatweelAndNSM.containsAll(nfkd)) {
547                 return item; // fails
548             }
549         }
550         String start = "";
551         String end = nfkd;
552         while (end.length() != 0) {
553             int cp = UTF16.charAt(end, 0);
554             String tryEnd = end.substring(UTF16.getCharCount(cp));
555             String trial = start + tryEnd;
556             if (!ucaWeak.equals(trial, item)) { // retain character
557                 start += UTF16.valueOf(cp);
558             }
559             end = tryEnd;
560         }
561         return start;
562     }
563 
564 }
565