1 package org.unicode.cldr.util;
2 import java.util.ArrayList;
3 import java.util.Collection;
4 import java.util.EnumSet;
5 import java.util.HashSet;
6 import java.util.LinkedHashSet;
7 import java.util.List;
8 import java.util.Map;
9 import java.util.Map.Entry;
10 import java.util.Set;
11 import java.util.TreeMap;
12 import java.util.TreeSet;
13 
14 import org.unicode.cldr.util.StandardCodes.LstrType;
15 
16 import com.google.common.base.Joiner;
17 import com.google.common.base.MoreObjects;
18 import com.google.common.base.Objects;
19 import com.google.common.collect.ComparisonChain;
20 import com.google.common.collect.ImmutableMap;
21 import com.google.common.collect.ImmutableMultimap;
22 import com.google.common.collect.ImmutableSet;
23 import com.google.common.collect.Multimap;
24 import com.google.common.collect.TreeMultimap;
25 import com.ibm.icu.impl.Row.R2;
26 
27 /**
28  * Provides Unicode Language Identifier canonicalization for use in testing.
29  * The implementation is designed to be simple, and is not at all optimized for production use.
30  * It is used to verify the correctness of the specification algorithm,
31  * sanity-check the supplementalMetadata.xml alias data,
32  * and generate test files for use by implementations.
33  */
34 public class LsrvCanonicalizer {
35 
36     public static final Set<LstrType> LSRV = ImmutableSet.of(LstrType.language, LstrType.script, LstrType.region, LstrType.variant);
37     public static final Joiner UNDERBAR_JOINER = Joiner.on('_');
38 
39     /**
40      * A representation of a Unicode Language Identifier in a format that makes it simple to process.
41      * The LSRV fields are represented as multimaps, though the LSR fields restricted to have only have 0 or 1 element.
42      */
43     public static class XLanguageTag {
44         final Multimap<LstrType, String> data;
45 
XLanguageTag(Multimap<LstrType, String> result)46         private XLanguageTag(Multimap<LstrType, String> result) {
47             data = ImmutableMultimap.copyOf(result);
48         }
keys()49         public Set<LstrType> keys() {
50             return data.keySet();
51         }
get(LstrType lstrType)52         public Collection<String> get(LstrType lstrType) {
53             return data.get(lstrType);
54         }
toLocaleString()55         public String toLocaleString() {
56             StringBuilder buffer = new StringBuilder();
57             final Collection<String> region = data.get(LstrType.language);
58             if (!region.isEmpty()) {
59                 buffer.append(UNDERBAR_JOINER.join(region));
60             } else {
61                 buffer.append("und");
62             }
63             addItem(buffer, LstrType.script, "", "_", UNDERBAR_JOINER);
64             addItem(buffer, LstrType.region, "", "_", UNDERBAR_JOINER);
65             addItem(buffer, LstrType.variant, "", "_", UNDERBAR_JOINER);
66 
67             return buffer.toString();
68         }
69         @Override
toString()70         public String toString() {
71             StringBuilder buffer = new StringBuilder();
72             addItem(buffer, LstrType.language, "", "L:", UNDERBAR_JOINER);
73             addItem(buffer, LstrType.script, ";", "S:", UNDERBAR_JOINER);
74             addItem(buffer, LstrType.region, ";", "R:", UNDERBAR_JOINER);
75             addItem(buffer, LstrType.variant, ";", "V:", UNDERBAR_JOINER);
76             return buffer.toString();
77         }
78 
addItem(StringBuilder buffer, LstrType lstrType, String separator, String prefix, final Joiner dashJoiner)79         public void addItem(StringBuilder buffer, LstrType lstrType, String separator, String prefix, final Joiner dashJoiner) {
80             final Collection<String> region = data.get(lstrType);
81             if (!region.isEmpty()) {
82                 if (buffer.length() > 0) {
83                     buffer.append(separator);
84                 }
85                 buffer.append(prefix).append(dashJoiner.join(region));
86             }
87         }
88 
fromTag(LstrType lstrType, String tag)89         public static XLanguageTag fromTag(LstrType lstrType, String tag) {
90             Multimap<LstrType,String> result = TreeMultimap.create();
91             LanguageTagParser source = new LanguageTagParser();
92             final boolean isLanguage = lstrType == LstrType.language;
93             String prefix = isLanguage ? "" : "und_";
94             try {
95                 source.set(prefix + tag);
96             } catch (Exception e) {
97                 return null;  // skip ill-formed for now
98 //                if (lstrType == LstrType.region && tag.length() == 3) {
99 //                    //result.put(LstrType.language, "und");
100 //                    result.put(LstrType.region, tag);
101 //                } else {
102 //                    result.put(LstrType.language, tag);
103 //                }
104 //                //System.out.println("ILLEGAL SOURCE\t" + lstrType + ":\t" + tag + " ⇒ " + result); // for debugging
105 //                return new XLanguageTag(result);
106             }
107             if (!source.getLanguage().isEmpty()
108                 && !source.getLanguage().contains("und")) {
109                 result.put(LstrType.language, source.getLanguage());
110             }
111             if (!source.getScript().isEmpty()) {
112                 result.put(LstrType.script, source.getScript());
113             }
114             if (!source.getRegion().isEmpty()) {
115                 result.put(LstrType.region, source.getRegion());
116             }
117             if (!source.getVariants().isEmpty()) {
118                 result.putAll(LstrType.variant, source.getVariants());
119             }
120             return new XLanguageTag(result);
121         }
122         @Override
equals(Object obj)123         public boolean equals(Object obj) {
124             return data.equals(((XLanguageTag)obj).data);
125         }
126         @Override
hashCode()127         public int hashCode() {
128             return data.hashCode();
129         }
set(LstrType lstrType, String string)130         public XLanguageTag set(LstrType lstrType, String string) {
131             Multimap<LstrType,String> result = TreeMultimap.create(data);
132             if (lstrType != LstrType.variant) {
133                 result.removeAll(lstrType);
134             }
135             result.put(lstrType, string);
136             return new XLanguageTag(result);
137         }
138 
139         /**
140          * containsAll is used in matching a ReplacementRule.<br>
141          * It is here instead of on ReplacementRule so we can use in the denormalization utility used in testing.
142          */
containsAll(XLanguageTag type)143         public boolean containsAll(XLanguageTag type) {
144             for (LstrType lstrType : LSRV) {
145                 final Collection<String> sources = get(lstrType);
146                 final Collection<String> types = type.get(lstrType);
147                 if (!sources.containsAll(types)) {
148                     return false;
149                 }
150             }
151             return true;
152         }
153 
154         /**
155          * Once a rule matches, this actually does the replacement.<br>
156          * It is here instead of on ReplacementRule so we can use it in the denormalization utility used in testing.
157          */
replacePartsFrom(XLanguageTag typeParts, XLanguageTag replacementParts)158         public XLanguageTag replacePartsFrom(XLanguageTag typeParts, XLanguageTag replacementParts) {
159             Multimap<LstrType,String> result = TreeMultimap.create();
160             for (LstrType lstrType : LSRV) {
161                 Collection<String> sources = get(lstrType);
162                 Collection<String> types = typeParts.get(lstrType);
163                 Collection<String> replacements = replacementParts.get(lstrType);
164                 result.putAll(lstrType, sources);
165                 if (!types.isEmpty() && !replacements.isEmpty()) {
166                     removeAll(result, lstrType, types);
167                     result.putAll(lstrType, replacements);
168                 } else if (!types.isEmpty() && replacements.isEmpty()) {
169                     removeAll(result, lstrType, types);
170                 } else if (types.isEmpty() && !replacements.isEmpty()) {
171                     if (sources.isEmpty()) {
172                         result.putAll(lstrType, replacements);
173                     }
174                 } else {
175                     // otherwise both empty, skip
176                 }
177             }
178             return new XLanguageTag(result);
179         }
180     }
181 
182     /**
183      * A representation of the alias data for Unicode Language Identifiers in the supplementalMetadata.txt file.
184      */
185 
186     public static class ReplacementRule implements Comparable<ReplacementRule> {
187         private final XLanguageTag typeParts;
188         final XLanguageTag replacementParts;
189         final List<XLanguageTag> secondaryReplacementSet; // TODO, using this information in special cases to impute the best language according to LDML
190         final String reason;
191         final boolean regular;
192 
ReplacementRule(LstrType lstrType, String type, XLanguageTag typeParts, XLanguageTag replacementParts, List<XLanguageTag> secondaryReplacementSet, String reason)193         private ReplacementRule(LstrType lstrType, String type, XLanguageTag typeParts, XLanguageTag replacementParts,
194             List<XLanguageTag> secondaryReplacementSet, String reason) {
195             this.typeParts = typeParts;
196             this.replacementParts = replacementParts;
197             this.secondaryReplacementSet = secondaryReplacementSet;
198             this.reason = reason;
199             this.regular = typeParts.keys().equals(replacementParts.keys()) &&
200                 typeParts.get(LstrType.variant).size() == replacementParts.get(LstrType.variant).size();
201         }
202 
from(LstrType lstrType, String type, List<String> replacement, String reason)203         static ReplacementRule from(LstrType lstrType, String type, List<String> replacement, String reason) {
204             XLanguageTag typeParts = XLanguageTag.fromTag(lstrType, type);
205             if (typeParts == null) {
206                 return null; // skip ill-formed for now
207             }
208             XLanguageTag replacementParts = XLanguageTag.fromTag(lstrType, replacement.get(0));
209             if (replacementParts == null) {
210                 return null; // skip ill-formed for now
211             }
212             List<XLanguageTag> secondaryReplacementSet = new ArrayList<>();
213             for (int i = 1; i < replacement.size(); ++i) {
214                 secondaryReplacementSet.add(XLanguageTag.fromTag(lstrType, replacement.get(i)));
215             }
216             return new ReplacementRule(lstrType, type, typeParts, replacementParts, secondaryReplacementSet, reason);
217         }
218 
219         @Override
compareTo(ReplacementRule o)220         public int compareTo(ReplacementRule o) {
221             return ComparisonChain.start()
222                 .compare(-getType().keys().size(), -o.getType().keys().size()) // sort most keys first
223                 .compare(getType().toString(), o.getType().toString())
224                 .result();
225         }
226         @Override
equals(Object obj)227         public boolean equals(Object obj) {
228             return compareTo((ReplacementRule) obj) == 0;
229         }
230         @Override
hashCode()231         public int hashCode() {
232             return Objects.hashCode(getType());
233         }
234         @Override
toString()235         public String toString() {
236             return MoreObjects.toStringHelper(getClass())
237                 .add("type", getType())
238                 .add("replacement", replacementParts)
239                 .toString();
240         }
getType()241         public XLanguageTag getType() {
242             return typeParts;
243         }
getReplacement()244         public XLanguageTag getReplacement() {
245             return replacementParts;
246         }
247     }
248 
249     /**
250      * Utility to remove multiple items from Multimap
251      */
removeAll(Multimap<K, V> result, K key, Iterable<V> value)252     public static <K,V> Multimap<K, V> removeAll(Multimap<K, V> result, K key, Iterable<V> value) {
253         for (V type : value) {
254             result.remove(key, type);
255         }
256         return result;
257     }
258 
259     private Set<ReplacementRule> rules = new TreeSet<>();
260     private Multimap<LstrType, String> inType = TreeMultimap.create();
261     private Map<LstrType, String> irrelevant = new TreeMap<>();
262 
add(ReplacementRule replacementRule)263     private void add(ReplacementRule replacementRule) {
264         getRules().add(replacementRule);
265     }
266 
267     /**
268      * Canonicalize a Unicode Language Identifier (LSRV - language, script, region, variants)
269      * @param lstrType This is a special flag used to indicate which supplementalMetadata alias type the languageTag is from.
270      * That determines whether to extend the type and replacement to be full LSRVs if they are partial, by adding "und_", for example.
271      * @param languageTag May be partial, if the lstrType is not LstrType.language.
272      */
canonicalize(LstrType lstrType, String languageTag)273     public String canonicalize(LstrType lstrType, String languageTag) {
274         XLanguageTag newTag = canonicalizeToX(XLanguageTag.fromTag(lstrType, languageTag), null);
275         return newTag.toString();
276     }
277 
278     /**
279      * Canonicalize a Unicode Language Identifier (LSRV - language, script, region, variants) in the XLanguageTag format.
280      * Also returns the rules used in the canonicalization.<br>
281      * NOT OPTIMIZED: just uses a linear search for simplicity; production code would use more efficient mechanisms
282      */
canonicalizeToX(XLanguageTag fromTag, List<ReplacementRule> rulesUsed)283     public XLanguageTag canonicalizeToX(XLanguageTag fromTag, List<ReplacementRule> rulesUsed) {
284         if (rulesUsed != null) {
285             rulesUsed.clear();
286         }
287         XLanguageTag newTag = fromTag;
288         startAtTheTop:
289             while (true) {
290                 for (ReplacementRule rule : getRules()) {
291                     if (newTag.containsAll(rule.getType())) {
292                         XLanguageTag temp = newTag.replacePartsFrom(rule.getType(), rule.getReplacement());
293                         if (!temp.equals(newTag)) {
294                             newTag = temp;
295                             if (rulesUsed != null) {
296                                 rulesUsed.add(rule);
297                             }
298                             continue startAtTheTop;
299                         }
300                     }
301                 }
302                 return newTag;
303             }
304     }
305 
306     /**
307      * Decanonicalize a Unicode Language Identifier (LSRV - language, script, region, variants) in the XLanguageTag format.
308      * Also returns the rules used in the canonicalization. Used in test case generation
309      * NOT OPTIMIZED: just for testing
310      */
decanonicalizeToX(XLanguageTag fromTag)311     public Set<XLanguageTag> decanonicalizeToX(XLanguageTag fromTag) {
312         Set<XLanguageTag> result = new HashSet<>();
313         result.add(fromTag);
314         Set<XLanguageTag> intermediate = new HashSet<>();
315         while (true) {
316             for (ReplacementRule rule : getRules()) {
317                 if (!rule.getType().get(LstrType.variant).isEmpty()) {
318                     continue;
319                 }
320                 for (XLanguageTag newTag : result) {
321                     if (newTag.containsAll(rule.getReplacement())) { // reverse normal order
322                         XLanguageTag changed = newTag.replacePartsFrom(rule.getReplacement(), rule.getType()); // reverse normal order
323                         if (!intermediate.contains(changed)
324                             && !result.contains(changed)) {
325                             intermediate.add(changed);
326                         }
327                     }
328                 }
329             }
330             if (intermediate.isEmpty()) {
331                 result.remove(fromTag);
332                 return result;
333             }
334             result.addAll(intermediate);
335             intermediate.clear();
336         }
337     }
338 
339 
340     /**
341      * Utility for getting a filtered list of rules, mostly useful in debugging.
342      */
filter(LstrType lstrType, String value)343     public List<ReplacementRule> filter(LstrType lstrType, String value) {
344         List<ReplacementRule> result = new ArrayList<>();
345         for (ReplacementRule rule : getRules()) {
346             final Collection<String> items = rule.getType().get(lstrType);
347             if (value == null && !items.isEmpty()
348                 || value != null && items.contains(value)) {
349                 result.add(rule);
350             }
351         }
352         return result;
353     }
354 
getInstance()355     public static final LsrvCanonicalizer getInstance() {
356         return SINGLETON;
357     }
358     private static final LsrvCanonicalizer SINGLETON = load();
359 
load()360     private static LsrvCanonicalizer load() {
361         SupplementalDataInfo SDI = CLDRConfig.getInstance().getSupplementalDataInfo();
362         Map<String, Map<String, R2<List<String>, String>>> aliases = SDI.getLocaleAliasInfo();
363         // type -> tag -> , like "language" -> "sh" -> <{"sr_Latn"}, reason>
364 
365         LsrvCanonicalizer rrs = new LsrvCanonicalizer();
366         for (Entry<String, Map<String, R2<List<String>, String>>> typeTagReplacement : aliases.entrySet()) {
367             String type = typeTagReplacement.getKey();
368             if (type.contains("-")) {
369                 throw new IllegalArgumentException("Bad format for alias: should have _ instead of -.");
370             }
371             LstrType lstrType = LstrType.fromString(type);
372             if (!LSRV.contains(lstrType)) {
373                 continue;
374             }
375             for (Entry<String, R2<List<String>, String>> tagReplacementReason : typeTagReplacement.getValue().entrySet()) {
376                 String tag = tagReplacementReason.getKey();
377                 if (tag.contains("-")) {
378                     throw new IllegalArgumentException("Bad format for alias: should have _ instead of -.");
379                 }
380                 List<String> replacement = tagReplacementReason.getValue().get0();
381                 if (replacement == null) {
382                     System.out.println("No replacement: " + tagReplacementReason);
383                     continue;
384                 }
385                 String reason = tagReplacementReason.getValue().get1();
386                 final ReplacementRule replacementRule = ReplacementRule.from(lstrType, tag, replacement, reason);
387                 if (replacementRule == null) {
388                     // System.out.println("No rule: " + tagReplacementReason);
389                     continue;
390                 }
391                 rrs.add(replacementRule);
392             }
393         }
394         rrs.rules = ImmutableSet.copyOf(rrs.rules);
395         for (ReplacementRule rule :  rrs.rules) {
396             XLanguageTag type = rule.getType();
397             XLanguageTag replacement = rule.getReplacement();
398             for (LstrType lstrType : LsrvCanonicalizer.LSRV) {
399                 rrs.inType.putAll(lstrType, type.get(lstrType));
400                 rrs.inType.putAll(lstrType, replacement.get(lstrType));
401             }
402         }
403         rrs.inType = ImmutableMultimap.copyOf(rrs.inType);
404 
405         for (LstrType lstrType : LsrvCanonicalizer.LSRV) {
406             Set<String> all = new LinkedHashSet<>(Validity.getInstance().getStatusToCodes(lstrType).get(Validity.Status.regular));
407             all.removeAll(rrs.inType.get(lstrType));
408             if (lstrType == LstrType.variant && all.contains("fonipa")) {
409                 rrs.irrelevant.put(lstrType, "fonipa");
410             } else {
411                 rrs.irrelevant.put(lstrType, all.iterator().next());
412             }
413         }
414         rrs.irrelevant = ImmutableMap.copyOf(rrs.irrelevant);
415         return rrs;
416     }
417 
418     /**
419      * Returns the set of all the Replacement rules in the canonicalizer.
420      */
getRules()421     public Set<ReplacementRule> getRules() {
422         return rules;
423     }
424 
425     /**
426      * Types of test data
427      */
428     public enum TestDataTypes {explicit, fromAliases, decanonicalized, withIrrelevants}
429 
430     /**
431      * Returns test data for the rules, used to generate test data files.
432      * @param testDataTypes if null, returns all the data; otherwise the specified set.
433      * @return
434      */
getTestData(Set<TestDataTypes> testDataTypes)435     public Map<TestDataTypes,Map<String, String>> getTestData(Set<TestDataTypes> testDataTypes) {
436         Map<TestDataTypes,Map<String, String>> result = new TreeMap<>();
437 
438         if (testDataTypes == null) {
439             testDataTypes = EnumSet.allOf(TestDataTypes.class);
440         }
441         Set<String> allToTest = new TreeSet<>();
442         if (testDataTypes.contains(TestDataTypes.explicit)) {
443             Map<String, String> testData2 = new TreeMap<>();
444             String[][] tests = {
445                 {"hye_arevmda", "hyw"},
446                 {"art_lojban", "jbo"},
447                 {"en_arevela", "en"},
448                 {"hy_arevela", "hy"},
449                 {"en_arevmda_arevela", "en"},
450                 {"hy_arevmda", "hyw"},
451                 {"hy_arevmda_arevela", "hyw"},
452                 {"en_lojban", "en"},
453                 {"en_US_polytoni", "en_US_polyton"},
454                 {"en_US_heploc", "en_US_alalc97"},
455                 {"en_US_aaland", "en_US"},
456                 {"en_aaland", "en_AX"},
457                 {"no_nynorsk_bokmal", "nb"},
458                 {"no_bokmal_nynorsk", "nb"},
459                 {"zh_guoyu_hakka_xiang", "hak"},
460                 {"zh_hakka_xiang", "hak"},
461             };
462             for (String row[] : tests) {
463                 String toTest = row[0];
464                 String expected = row[1];
465                 testData2.put(toTest, expected);
466             }
467             allToTest.addAll(testData2.keySet());
468             result.put(TestDataTypes.explicit, ImmutableMap.copyOf(testData2));
469         }
470 
471         if (testDataTypes.contains(TestDataTypes.fromAliases)) {
472             Map<String, String> testData2 = new TreeMap<>();
473             for (ReplacementRule rule : getRules()) {
474                 String toTest = rule.getType().toLocaleString();
475                 String expected = rule.getReplacement().toLocaleString();
476                 if (!allToTest.contains(toTest)) {
477                     testData2.put(toTest,expected);
478                 }
479             }
480             allToTest.addAll(testData2.keySet());
481             result.put(TestDataTypes.fromAliases, ImmutableMap.copyOf(testData2));
482         }
483 
484         if (testDataTypes.contains(TestDataTypes.decanonicalized)) {
485             Map<String, String> testData2 = new TreeMap<>();
486             for (String testItem: allToTest) {
487                 for (XLanguageTag decon : decanonicalizeToX(XLanguageTag.fromTag(LstrType.language, testItem))) {
488                     XLanguageTag newTag = canonicalizeToX(decon, null);
489                     final String toTest = decon.toLocaleString();
490                     if (!allToTest.contains(toTest)) {
491                         testData2.put(toTest, newTag.toLocaleString());
492                     }
493                 }
494             }
495             allToTest.addAll(testData2.keySet());
496             result.put(TestDataTypes.decanonicalized, ImmutableMap.copyOf(testData2));
497         }
498 
499         if (testDataTypes.contains(TestDataTypes.withIrrelevants)) {
500             Map<String, String> testData2 = new TreeMap<>();
501             for (String testItem: allToTest) {
502                 XLanguageTag fluffedUp = fluff(XLanguageTag.fromTag(LstrType.language, testItem), irrelevant);
503                 XLanguageTag newTag = canonicalizeToX(fluffedUp, null);
504                 final String toTest = fluffedUp.toLocaleString();
505                 if (!allToTest.contains(toTest)) {
506                     testData2.put(toTest, newTag.toLocaleString());
507                 }
508            }
509             allToTest.addAll(testData2.keySet());
510             result.put(TestDataTypes.withIrrelevants, ImmutableMap.copyOf(testData2));
511         }
512 
513         result = ImmutableMap.copyOf(result);
514         return result;
515     }
516 
fluff(XLanguageTag type, Map<LstrType, String> toAddIfMissing)517     private static XLanguageTag fluff(XLanguageTag type, Map<LstrType, String> toAddIfMissing) {
518         XLanguageTag newTag = type;
519         for (LstrType lstrType : LsrvCanonicalizer.LSRV) {
520             if (type.get(lstrType).isEmpty() || lstrType == LstrType.variant) {
521                 newTag = newTag.set(lstrType, toAddIfMissing.get(lstrType));
522             }
523         }
524         return newTag;
525     }
526 
527     /**
528      * Returns all the fields used in the type attribute of the alias rule.
529      */
getInType(LstrType language)530     public Collection<String> getInType(LstrType language) {
531         return inType.get(language);
532     }
533 
534     /**
535      * Returns some sample fields that do not appear in the type attribute of the alias rule, used for testing.
536      */
getIrrelevantField(LstrType language)537     public String getIrrelevantField(LstrType language) {
538         return irrelevant.get(language);
539     }
540 
541 }
542