package org.unicode.cldr.util;

import java.util.ArrayList;
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.unicode.cldr.util.StandardCodes.LstrType;

import com.google.common.base.Joiner;
import com.google.common.base.MoreObjects;
import com.google.common.base.Objects;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.impl.Row.R2;

/**
 * Provides Unicode Language Identifier canonicalization for use in testing.
 * The implementation is designed to be simple, and is not at all optimized for production use.
 * It is used to verify the correctness of the specification algorithm,
 * sanity-check the supplementalMetadata.xml alias data,
 * and generate test files for use by implementations.
 */
public class LsrvCanonicalizer {

    /** The four field types handled here: language, script, region, variant. */
    public static final Set<LstrType> LSRV = ImmutableSet.of(LstrType.language, LstrType.script, LstrType.region, LstrType.variant);
    public static final Joiner UNDERBAR_JOINER = Joiner.on('_');

    /**
     * A representation of a Unicode Language Identifier in a format that makes it simple to process.
     * The LSRV fields are represented as multimaps, though the LSR fields are restricted to have only 0 or 1 element.
     * Instances are immutable: the backing multimap is copied into an {@link ImmutableMultimap}.
     */
    public static class XLanguageTag {
        final Multimap<LstrType, String> data;

        private XLanguageTag(Multimap<LstrType, String> result) {
            data = ImmutableMultimap.copyOf(result);
        }

        /** Returns the field types that have at least one value in this tag. */
        public Set<LstrType> keys() {
            return data.keySet();
        }

        /** Returns the values for the given field type; empty if the field is absent. */
        public Collection<String> get(LstrType lstrType) {
            return data.get(lstrType);
        }

        /**
         * Formats this tag as an underscore-separated locale string, e.g. {@code en_Latn_US_variant}.
         * A missing language is written as {@code und}.
         */
        public String toLocaleString() {
            StringBuilder buffer = new StringBuilder();
            final Collection<String> language = data.get(LstrType.language);
            if (!language.isEmpty()) {
                buffer.append(UNDERBAR_JOINER.join(language));
            } else {
                buffer.append("und");
            }
            // The buffer is never empty here, so each present field is appended as "_" + value(s).
            addItem(buffer, LstrType.script, "", "_", UNDERBAR_JOINER);
            addItem(buffer, LstrType.region, "", "_", UNDERBAR_JOINER);
            addItem(buffer, LstrType.variant, "", "_", UNDERBAR_JOINER);

            return buffer.toString();
        }

        /**
         * Returns a debugging form with labelled fields separated by semicolons,
         * e.g. {@code L:en;S:Latn;R:US;V:variant}. Absent fields are omitted.
         */
        @Override
        public String toString() {
            StringBuilder buffer = new StringBuilder();
            addItem(buffer, LstrType.language, "", "L:", UNDERBAR_JOINER);
            addItem(buffer, LstrType.script, ";", "S:", UNDERBAR_JOINER);
            addItem(buffer, LstrType.region, ";", "R:", UNDERBAR_JOINER);
            addItem(buffer, LstrType.variant, ";", "V:", UNDERBAR_JOINER);
            return buffer.toString();
        }

        /**
         * Appends the values of one field to the buffer, if the field is present.
         *
         * @param buffer where to append
         * @param lstrType the field to append
         * @param separator appended first, but only when the buffer already has content
         * @param prefix appended immediately before the joined values
         * @param dashJoiner joiner used to combine multiple values (any joiner may be passed, despite the name)
         */
        public void addItem(StringBuilder buffer, LstrType lstrType, String separator, String prefix, final Joiner dashJoiner) {
            final Collection<String> items = data.get(lstrType);
            if (!items.isEmpty()) {
                if (buffer.length() > 0) {
                    buffer.append(separator);
                }
                buffer.append(prefix).append(dashJoiner.join(items));
            }
        }

        /**
         * Parses a (possibly partial) tag into an XLanguageTag.
         *
         * @param lstrType the alias type the tag comes from; unless it is {@link LstrType#language},
         *        the tag is prefixed with {@code und_} before parsing
         * @param tag the tag to parse
         * @return the parsed tag, or {@code null} if the parser rejects it (ill-formed tags are skipped for now)
         */
        public static XLanguageTag fromTag(LstrType lstrType, String tag) {
            Multimap<LstrType, String> result = TreeMultimap.create();
            LanguageTagParser source = new LanguageTagParser();
            final boolean isLanguage = lstrType == LstrType.language;
            String prefix = isLanguage ? "" : "und_";
            try {
                source.set(prefix + tag);
            } catch (Exception ignored) {
                // Skip ill-formed tags for now. (A possible alternative would be to store the raw tag
                // under the region or language field instead of dropping it.)
                return null;
            }
            // An "und" language carries no information, so it is treated as absent.
            if (!source.getLanguage().isEmpty()
                && !source.getLanguage().contains("und")) {
                result.put(LstrType.language, source.getLanguage());
            }
            if (!source.getScript().isEmpty()) {
                result.put(LstrType.script, source.getScript());
            }
            if (!source.getRegion().isEmpty()) {
                result.put(LstrType.region, source.getRegion());
            }
            if (!source.getVariants().isEmpty()) {
                result.putAll(LstrType.variant, source.getVariants());
            }
            return new XLanguageTag(result);
        }

        /** Two tags are equal when their field data is equal; never throws for null or foreign types. */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof XLanguageTag)) {
                return false;
            }
            return data.equals(((XLanguageTag) obj).data);
        }

        @Override
        public int hashCode() {
            return data.hashCode();
        }

        /**
         * Returns a copy of this tag with the given value set.
         * For language, script and region the previous value is replaced; for variant the value is added.
         */
        public XLanguageTag set(LstrType lstrType, String string) {
            Multimap<LstrType, String> result = TreeMultimap.create(data);
            if (lstrType != LstrType.variant) {
                result.removeAll(lstrType);
            }
            result.put(lstrType, string);
            return new XLanguageTag(result);
        }

        /**
         * containsAll is used in matching a ReplacementRule.<br>
         * It is here instead of on ReplacementRule so we can use it in the denormalization utility used in testing.
         *
         * @return true if, for every LSRV field, this tag contains all of the values that {@code type} has
         */
        public boolean containsAll(XLanguageTag type) {
            for (LstrType lstrType : LSRV) {
                final Collection<String> sources = get(lstrType);
                final Collection<String> types = type.get(lstrType);
                if (!sources.containsAll(types)) {
                    return false;
                }
            }
            return true;
        }

        /**
         * Once a rule matches, this actually does the replacement.<br>
         * It is here instead of on ReplacementRule so we can use it in the denormalization utility used in testing.
         * <p>Per field: values named in {@code typeParts} are removed and the corresponding replacement
         * values (if any) are added; if {@code typeParts} has nothing for the field, the replacement
         * values are only added when this tag has no value for that field.
         */
        public XLanguageTag replacePartsFrom(XLanguageTag typeParts, XLanguageTag replacementParts) {
            Multimap<LstrType, String> result = TreeMultimap.create();
            for (LstrType lstrType : LSRV) {
                Collection<String> sources = get(lstrType);
                Collection<String> types = typeParts.get(lstrType);
                Collection<String> replacements = replacementParts.get(lstrType);
                result.putAll(lstrType, sources);
                if (!types.isEmpty()) {
                    // The rule names this field: drop the matched values, then add any replacements.
                    removeAll(result, lstrType, types);
                    if (!replacements.isEmpty()) {
                        result.putAll(lstrType, replacements);
                    }
                } else if (!replacements.isEmpty() && sources.isEmpty()) {
                    // The rule only supplies a value: use it solely to fill an empty field.
                    result.putAll(lstrType, replacements);
                }
                // Otherwise there is nothing to do for this field.
            }
            return new XLanguageTag(result);
        }
    }

    /**
     * A representation of the alias data for Unicode Language Identifiers in the supplementalMetadata.xml file.
     * Rules order by number of fields in the type (most first), then by the type's string form.
     */
    public static class ReplacementRule implements Comparable<ReplacementRule> {
        private final XLanguageTag typeParts;
        final XLanguageTag replacementParts;
        final List<XLanguageTag> secondaryReplacementSet; // TODO, using this information in special cases to impute the best language according to LDML
        final String reason;
        /** True when type and replacement have the same set of fields and the same number of variants. */
        final boolean regular;

        private ReplacementRule(LstrType lstrType, String type, XLanguageTag typeParts, XLanguageTag replacementParts,
            List<XLanguageTag> secondaryReplacementSet, String reason) {
            this.typeParts = typeParts;
            this.replacementParts = replacementParts;
            this.secondaryReplacementSet = secondaryReplacementSet;
            this.reason = reason;
            this.regular = typeParts.keys().equals(replacementParts.keys()) &&
                typeParts.get(LstrType.variant).size() == replacementParts.get(LstrType.variant).size();
        }

        /**
         * Builds a rule from raw alias data.
         *
         * @param lstrType the alias type, which controls how partial tags are parsed
         * @param type the tag to be replaced
         * @param replacement the replacement tags; the first is the primary one, the rest are secondary
         * @param reason the reason given in the alias data
         * @return the rule, or {@code null} if there is no replacement or the type or primary replacement
         *         is ill-formed (skipped for now). Ill-formed secondary replacements are dropped.
         */
        static ReplacementRule from(LstrType lstrType, String type, List<String> replacement, String reason) {
            if (replacement == null || replacement.isEmpty()) {
                return null; // nothing to replace with
            }
            XLanguageTag typeParts = XLanguageTag.fromTag(lstrType, type);
            if (typeParts == null) {
                return null; // skip ill-formed for now
            }
            XLanguageTag replacementParts = XLanguageTag.fromTag(lstrType, replacement.get(0));
            if (replacementParts == null) {
                return null; // skip ill-formed for now
            }
            List<XLanguageTag> secondaryReplacementSet = new ArrayList<>();
            for (int i = 1; i < replacement.size(); ++i) {
                XLanguageTag secondary = XLanguageTag.fromTag(lstrType, replacement.get(i));
                if (secondary != null) { // fromTag returns null for ill-formed tags; never store nulls
                    secondaryReplacementSet.add(secondary);
                }
            }
            return new ReplacementRule(lstrType, type, typeParts, replacementParts, secondaryReplacementSet, reason);
        }

        @Override
        public int compareTo(ReplacementRule o) {
            return ComparisonChain.start()
                .compare(o.getType().keys().size(), getType().keys().size()) // sort most keys first
                .compare(getType().toString(), o.getType().toString())
                .result();
        }

        /** Consistent with {@link #compareTo}; never throws for null or foreign types. */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof ReplacementRule)) {
                return false;
            }
            return compareTo((ReplacementRule) obj) == 0;
        }

        @Override
        public int hashCode() {
            return Objects.hashCode(getType());
        }

        @Override
        public String toString() {
            return MoreObjects.toStringHelper(getClass())
                .add("type", getType())
                .add("replacement", replacementParts)
                .toString();
        }

        /** Returns the parts that must be present in a tag for this rule to match. */
        public XLanguageTag getType() {
            return typeParts;
        }

        /** Returns the parts that replace the matched type parts. */
        public XLanguageTag getReplacement() {
            return replacementParts;
        }
    }

    /**
     * Utility to remove multiple items from Multimap
     *
     * @return the same multimap that was passed in, after removal
     */
    public static <K, V> Multimap<K, V> removeAll(Multimap<K, V> result, K key, Iterable<V> value) {
        for (V type : value) {
            result.remove(key, type);
        }
        return result;
    }

    // These fields are filled by load() and then replaced with immutable copies.
    private Set<ReplacementRule> rules = new TreeSet<>();
    private Multimap<LstrType, String> inType = TreeMultimap.create();
    private Map<LstrType, String> irrelevant = new TreeMap<>();

    private void add(ReplacementRule replacementRule) {
        getRules().add(replacementRule);
    }

    /**
     * Canonicalize a Unicode Language Identifier (LSRV - language, script, region, variants)
     * @param lstrType This is a special flag used to indicate which supplementalMetadata alias type the languageTag is from.
     * That determines whether to extend the type and replacement to be full LSRVs if they are partial, by adding "und_", for example.
     * @param languageTag May be partial, if the lstrType is not LstrType.language.
     * @return the canonicalized tag in the {@link XLanguageTag#toString()} format (labelled fields, e.g. {@code L:en;R:US})
     * @throws IllegalArgumentException if the tag is ill-formed and cannot be parsed
     */
    public String canonicalize(LstrType lstrType, String languageTag) {
        XLanguageTag parsed = XLanguageTag.fromTag(lstrType, languageTag);
        if (parsed == null) {
            throw new IllegalArgumentException("Ill-formed language tag for " + lstrType + ": " + languageTag);
        }
        return canonicalizeToX(parsed, null).toString();
    }

    /**
     * Canonicalize a Unicode Language Identifier (LSRV - language, script, region, variants) in the XLanguageTag format.
     * Also returns the rules used in the canonicalization.<br>
     * NOT OPTIMIZED: just uses a linear search for simplicity; production code would use more efficient mechanisms
     *
     * @param fromTag the tag to canonicalize
     * @param rulesUsed if not null, cleared and then filled with each rule that changed the tag, in order of application
     * @return the tag after no rule changes it any further
     */
    public XLanguageTag canonicalizeToX(XLanguageTag fromTag, List<ReplacementRule> rulesUsed) {
        if (rulesUsed != null) {
            rulesUsed.clear();
        }
        XLanguageTag newTag = fromTag;
        boolean changed;
        do {
            changed = false;
            for (ReplacementRule rule : getRules()) {
                if (!newTag.containsAll(rule.getType())) {
                    continue;
                }
                XLanguageTag temp = newTag.replacePartsFrom(rule.getType(), rule.getReplacement());
                if (!temp.equals(newTag)) {
                    newTag = temp;
                    if (rulesUsed != null) {
                        rulesUsed.add(rule);
                    }
                    // A rule fired: start again from the first rule.
                    changed = true;
                    break;
                }
            }
        } while (changed);
        return newTag;
    }

    /**
     * Decanonicalize a Unicode Language Identifier (LSRV - language, script, region, variants) in the XLanguageTag format.
     * Applies the rules in reverse (replacement to type) repeatedly until no new tags appear.
     * Rules whose type contains variants are skipped. Used in test case generation.
     * NOT OPTIMIZED: just for testing
     *
     * @return the set of tags reachable by reversing rules, not including {@code fromTag} itself
     */
    public Set<XLanguageTag> decanonicalizeToX(XLanguageTag fromTag) {
        Set<XLanguageTag> result = new HashSet<>();
        result.add(fromTag);
        Set<XLanguageTag> intermediate = new HashSet<>();
        while (true) {
            for (ReplacementRule rule : getRules()) {
                if (!rule.getType().get(LstrType.variant).isEmpty()) {
                    continue;
                }
                for (XLanguageTag newTag : result) {
                    if (newTag.containsAll(rule.getReplacement())) { // reverse normal order
                        XLanguageTag changed = newTag.replacePartsFrom(rule.getReplacement(), rule.getType()); // reverse normal order
                        if (!intermediate.contains(changed)
                            && !result.contains(changed)) {
                            intermediate.add(changed);
                        }
                    }
                }
            }
            if (intermediate.isEmpty()) {
                // Nothing new was found in this pass, so the search is complete.
                result.remove(fromTag);
                return result;
            }
            result.addAll(intermediate);
            intermediate.clear();
        }
    }

    /**
     * Utility for getting a filtered list of rules, mostly useful in debugging.
     *
     * @param lstrType the field to look at in each rule's type
     * @param value if null, selects rules whose type has any value for the field;
     *        otherwise selects rules whose type contains exactly this value for the field
     */
    public List<ReplacementRule> filter(LstrType lstrType, String value) {
        List<ReplacementRule> result = new ArrayList<>();
        for (ReplacementRule rule : getRules()) {
            final Collection<String> items = rule.getType().get(lstrType);
            if (value == null && !items.isEmpty()
                || value != null && items.contains(value)) {
                result.add(rule);
            }
        }
        return result;
    }

    /** Returns the shared instance, built from the supplemental alias data when this class is initialized. */
    public static final LsrvCanonicalizer getInstance() {
        return SINGLETON;
    }

    private static final LsrvCanonicalizer SINGLETON = load();

    /**
     * Builds the canonicalizer from the locale alias data: creates one rule per well-formed LSRV alias,
     * collects every field value seen in a rule, and picks one "irrelevant" regular code per field type
     * that no rule mentions.
     */
    private static LsrvCanonicalizer load() {
        SupplementalDataInfo supplementalDataInfo = CLDRConfig.getInstance().getSupplementalDataInfo();
        Map<String, Map<String, R2<List<String>, String>>> aliases = supplementalDataInfo.getLocaleAliasInfo();
        // type -> tag -> <replacements, reason>, like "language" -> "sh" -> <{"sr_Latn"}, reason>

        LsrvCanonicalizer rrs = new LsrvCanonicalizer();
        for (Entry<String, Map<String, R2<List<String>, String>>> typeTagReplacement : aliases.entrySet()) {
            String type = typeTagReplacement.getKey();
            if (type.contains("-")) {
                throw new IllegalArgumentException("Bad format for alias: should have _ instead of -.");
            }
            LstrType lstrType = LstrType.fromString(type);
            if (!LSRV.contains(lstrType)) {
                continue;
            }
            for (Entry<String, R2<List<String>, String>> tagReplacementReason : typeTagReplacement.getValue().entrySet()) {
                String tag = tagReplacementReason.getKey();
                if (tag.contains("-")) {
                    throw new IllegalArgumentException("Bad format for alias: should have _ instead of -.");
                }
                List<String> replacement = tagReplacementReason.getValue().get0();
                if (replacement == null) {
                    System.out.println("No replacement: " + tagReplacementReason);
                    continue;
                }
                String reason = tagReplacementReason.getValue().get1();
                final ReplacementRule replacementRule = ReplacementRule.from(lstrType, tag, replacement, reason);
                if (replacementRule == null) {
                    continue; // ill-formed alias, skipped for now
                }
                rrs.add(replacementRule);
            }
        }
        rrs.rules = ImmutableSet.copyOf(rrs.rules);

        // Record every field value that appears in the type or the replacement of any rule.
        for (ReplacementRule rule : rrs.rules) {
            XLanguageTag type = rule.getType();
            XLanguageTag replacement = rule.getReplacement();
            for (LstrType lstrType : LsrvCanonicalizer.LSRV) {
                rrs.inType.putAll(lstrType, type.get(lstrType));
                rrs.inType.putAll(lstrType, replacement.get(lstrType));
            }
        }
        rrs.inType = ImmutableMultimap.copyOf(rrs.inType);

        // For each field type, choose a regular code that no rule touches.
        for (LstrType lstrType : LsrvCanonicalizer.LSRV) {
            Set<String> all = new LinkedHashSet<>(Validity.getInstance().getStatusToCodes(lstrType).get(Validity.Status.regular));
            all.removeAll(rrs.inType.get(lstrType));
            if (lstrType == LstrType.variant && all.contains("fonipa")) {
                rrs.irrelevant.put(lstrType, "fonipa");
            } else if (!all.isEmpty()) {
                rrs.irrelevant.put(lstrType, all.iterator().next());
            }
            // If every regular code is used by some rule, no irrelevant code is recorded for this type.
        }
        rrs.irrelevant = ImmutableMap.copyOf(rrs.irrelevant);
        return rrs;
    }

    /**
     * Returns the set of all the Replacement rules in the canonicalizer.
     */
    public Set<ReplacementRule> getRules() {
        return rules;
    }

    /**
     * Types of test data
     */
    public enum TestDataTypes {explicit, fromAliases, decanonicalized, withIrrelevants}

    /**
     * Returns test data for the rules, used to generate test data files.
     * Each later category only contains inputs that did not already appear in an earlier requested category.
     *
     * @param testDataTypes if null, returns all the data; otherwise the specified set.
     * @return an immutable map from each requested test data type to its (input locale string, expected locale string) pairs
     */
    public Map<TestDataTypes, Map<String, String>> getTestData(Set<TestDataTypes> testDataTypes) {
        Map<TestDataTypes, Map<String, String>> result = new TreeMap<>();

        if (testDataTypes == null) {
            testDataTypes = EnumSet.allOf(TestDataTypes.class);
        }
        // All inputs produced so far, used both to avoid duplicates and as seeds for the derived categories.
        Set<String> allToTest = new TreeSet<>();
        if (testDataTypes.contains(TestDataTypes.explicit)) {
            Map<String, String> testData2 = new TreeMap<>();
            String[][] tests = {
                {"hye_arevmda", "hyw"},
                {"art_lojban", "jbo"},
                {"en_arevela", "en"},
                {"hy_arevela", "hy"},
                {"en_arevmda_arevela", "en"},
                {"hy_arevmda", "hyw"},
                {"hy_arevmda_arevela", "hyw"},
                {"en_lojban", "en"},
                {"en_US_polytoni", "en_US_polyton"},
                {"en_US_heploc", "en_US_alalc97"},
                {"en_US_aaland", "en_US"},
                {"en_aaland", "en_AX"},
                {"no_nynorsk_bokmal", "nb"},
                {"no_bokmal_nynorsk", "nb"},
                {"zh_guoyu_hakka_xiang", "hak"},
                {"zh_hakka_xiang", "hak"},
            };
            for (String[] row : tests) {
                String toTest = row[0];
                String expected = row[1];
                testData2.put(toTest, expected);
            }
            allToTest.addAll(testData2.keySet());
            result.put(TestDataTypes.explicit, ImmutableMap.copyOf(testData2));
        }

        if (testDataTypes.contains(TestDataTypes.fromAliases)) {
            Map<String, String> testData2 = new TreeMap<>();
            for (ReplacementRule rule : getRules()) {
                String toTest = rule.getType().toLocaleString();
                String expected = rule.getReplacement().toLocaleString();
                if (!allToTest.contains(toTest)) {
                    testData2.put(toTest, expected);
                }
            }
            allToTest.addAll(testData2.keySet());
            result.put(TestDataTypes.fromAliases, ImmutableMap.copyOf(testData2));
        }

        if (testDataTypes.contains(TestDataTypes.decanonicalized)) {
            Map<String, String> testData2 = new TreeMap<>();
            for (String testItem : allToTest) {
                for (XLanguageTag decon : decanonicalizeToX(XLanguageTag.fromTag(LstrType.language, testItem))) {
                    XLanguageTag newTag = canonicalizeToX(decon, null);
                    final String toTest = decon.toLocaleString();
                    if (!allToTest.contains(toTest)) {
                        testData2.put(toTest, newTag.toLocaleString());
                    }
                }
            }
            allToTest.addAll(testData2.keySet());
            result.put(TestDataTypes.decanonicalized, ImmutableMap.copyOf(testData2));
        }

        if (testDataTypes.contains(TestDataTypes.withIrrelevants)) {
            Map<String, String> testData2 = new TreeMap<>();
            for (String testItem : allToTest) {
                XLanguageTag fluffedUp = fluff(XLanguageTag.fromTag(LstrType.language, testItem), irrelevant);
                XLanguageTag newTag = canonicalizeToX(fluffedUp, null);
                final String toTest = fluffedUp.toLocaleString();
                if (!allToTest.contains(toTest)) {
                    testData2.put(toTest, newTag.toLocaleString());
                }
            }
            allToTest.addAll(testData2.keySet());
            result.put(TestDataTypes.withIrrelevants, ImmutableMap.copyOf(testData2));
        }

        return ImmutableMap.copyOf(result);
    }

    /**
     * Fills in fields from {@code toAddIfMissing}: language, script and region only when the tag has
     * no value for them, and variant always (as an additional variant).
     * Field types with no entry in the map are left untouched.
     */
    private static XLanguageTag fluff(XLanguageTag type, Map<LstrType, String> toAddIfMissing) {
        XLanguageTag newTag = type;
        for (LstrType lstrType : LsrvCanonicalizer.LSRV) {
            if (type.get(lstrType).isEmpty() || lstrType == LstrType.variant) {
                String filler = toAddIfMissing.get(lstrType);
                if (filler != null) {
                    newTag = newTag.set(lstrType, filler);
                }
            }
        }
        return newTag;
    }

    /**
     * Returns all the field values of the given type that appear in the type or the replacement of any alias rule.
     */
    public Collection<String> getInType(LstrType language) {
        return inType.get(language);
    }

    /**
     * Returns a sample field value of the given type that does not appear in the type or replacement
     * of any alias rule, used for testing; null if none was recorded for that type.
     */
    public String getIrrelevantField(LstrType language) {
        return irrelevant.get(language);
    }

}