1 package org.unicode.cldr.tool; 2 3 import java.io.IOException; 4 import java.io.PrintWriter; 5 import java.lang.invoke.MethodHandles; 6 import java.util.ArrayList; 7 import java.util.Collection; 8 import java.util.Collections; 9 import java.util.Comparator; 10 import java.util.HashMap; 11 import java.util.HashSet; 12 import java.util.LinkedHashSet; 13 import java.util.List; 14 import java.util.Locale; 15 import java.util.Map; 16 import java.util.Map.Entry; 17 import java.util.Set; 18 import java.util.TreeMap; 19 import java.util.TreeSet; 20 import java.util.regex.Pattern; 21 22 import org.unicode.cldr.tool.GenerateSubdivisions.SubdivisionInfo; 23 import org.unicode.cldr.util.CLDRConfig; 24 import org.unicode.cldr.util.CLDRFile; 25 import org.unicode.cldr.util.CLDRPaths; 26 import org.unicode.cldr.util.ChainedMap; 27 import org.unicode.cldr.util.ChainedMap.M3; 28 import org.unicode.cldr.util.DtdType; 29 import org.unicode.cldr.util.Factory; 30 import org.unicode.cldr.util.Pair; 31 import org.unicode.cldr.util.PatternCache; 32 import org.unicode.cldr.util.StandardCodes; 33 import org.unicode.cldr.util.StandardCodes.LstrField; 34 import org.unicode.cldr.util.StandardCodes.LstrType; 35 import org.unicode.cldr.util.SupplementalDataInfo; 36 import org.unicode.cldr.util.Validity; 37 import org.unicode.cldr.util.Validity.Status; 38 import org.unicode.cldr.util.WikiSubdivisionLanguages; 39 import org.unicode.cldr.util.XMLFileReader; 40 import org.unicode.cldr.util.XPathParts; 41 import org.unicode.cldr.util.XPathParts.Comments.CommentType; 42 43 import com.google.common.base.Joiner; 44 import com.ibm.icu.impl.Relation; 45 import com.ibm.icu.impl.Row.R2; 46 import com.ibm.icu.impl.Utility; 47 import com.ibm.icu.lang.UCharacter; 48 import com.ibm.icu.text.CaseMap; 49 import com.ibm.icu.text.Collator; 50 import com.ibm.icu.text.LocaleDisplayNames; 51 import com.ibm.icu.text.Normalizer2; 52 import com.ibm.icu.text.RuleBasedCollator; 53 import com.ibm.icu.util.ULocale; 54 55 public class SubdivisionNode { 56 static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance(); 57 static final Map<String, R2<List<String>, String>> territoryAliases = SDI.getLocaleAliasInfo().get("territory"); 58 static final Set<String> containment = SDI.getContainers(); 59 static final Map<String, Map<LstrField, String>> codeToData = StandardCodes.getEnumLstreg().get(LstrType.region); 60 61 static LocaleDisplayNames ENGLISH_ICU = LocaleDisplayNames.getInstance(ULocale.ENGLISH); 62 63 static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE = CaseMap.toTitle().wholeString().noLowercase(); 64 static final Comparator<String> ROOT_COL; 65 static { 66 RuleBasedCollator _ROOT_COL = (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH); 67 _ROOT_COL.setNumericCollation(true); _ROOT_COL.freeze()68 _ROOT_COL.freeze(); 69 ROOT_COL = (Comparator) _ROOT_COL; 70 } 71 static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); 72 static final CLDRFile ENGLISH_CLDR = CLDR_CONFIG.getEnglish(); 73 static final Normalizer2 nfc = Normalizer2.getNFCInstance(); 74 convertToCldr(String regionOrSubdivision)75 public static String convertToCldr(String regionOrSubdivision) { 76 return SubdivisionNames.isRegionCode(regionOrSubdivision) ? regionOrSubdivision.toUpperCase(Locale.ROOT) 77 : regionOrSubdivision.replace("-", "").toLowerCase(Locale.ROOT); 78 } 79 80 final SubdivisionSet sset; 81 final String code; 82 final int level; 83 final SubdivisionNode parent; 84 final Map<String, SubdivisionNode> children = new TreeMap<>(ROOT_COL); 85 SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset)86 public SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset) { 87 this.code = code; 88 this.level = parent == null ? -1 : parent.level + 1; 89 this.parent = parent; 90 this.sset = sset; 91 sset.ID_TO_NODE.put(code, this); 92 } 93 addName(String lang, String value)94 public SubdivisionNode addName(String lang, String value) { 95 sset.NAMES.put(code, lang, value); 96 return this; 97 } 98 99 static class SubdivisionSet { 100 101 final M3<String, String, String> NAMES = ChainedMap.of( 102 new TreeMap<String, Object>(), 103 new TreeMap<String, Object>(), 104 String.class); 105 final Map<String, String> TO_COUNTRY_CODE = new TreeMap<>(); 106 final Relation<String, String> ID_SAMPLE = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 107 final Map<String, String> SUB_TO_CAT = new TreeMap<>(); 108 final Relation<String, String> REGION_CONTAINS = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 109 final Map<String, SubdivisionNode> ID_TO_NODE = new HashMap<>(); 110 111 final SubdivisionNode BASE = new SubdivisionNode("001", null, this).addName("en", "World"); 112 addName(String code, String lang, String value)113 public void addName(String code, String lang, String value) { 114 int parenPos = value.indexOf("(see also separate country"); 115 if (parenPos >= 0) { 116 /* 117 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ1: expected "Caribbean Netherlands", got "Bonaire" 118 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ2: expected "Caribbean Netherlands", got "Saba" 119 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ3: expected "Caribbean Netherlands", got "Sint Eustatius" 120 Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-21: expected "Svalbard & Jan Mayen", got "Svalbard" 121 Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-22: expected "Svalbard & Jan Mayen", got "Jan Mayen" 122 */ 123 // OLD code to guess country from comment 124 // String paren = value.substring(value.length() - 3, value.length() - 1); 125 // if (!paren.equals("BQ") && !paren.equals("SJ")) { 126 // String old = TO_COUNTRY_CODE.get(code); 127 // if (old != null) { 128 // System.err.println("Duplicate: " + code + "\t" + old + "\t" + paren); 129 // } 130 // TO_COUNTRY_CODE.put(code, paren); 131 // } 132 value = value.substring(0, parenPos).trim(); 133 } 134 value = value.replace("*", ""); 135 NAMES.put(code, lang, value); 136 } 137 138 139 140 141 static final String[] CRUFT = { 142 "Emirate", 143 "Parish", 144 "County", 145 "District", 146 "Region", 147 "Province of", 148 "Province", 149 "Republic", 150 ", Barbados", 151 ", Burkina Faso", 152 "Governorate", 153 "Department", 154 "Canton of", 155 "(Région des)", 156 "(Région du)", 157 "(Région de la)", 158 "Autonomous", 159 "Archipelago of", 160 "Canton", 161 "kanton", 162 ", Bahamas", 163 "province", 164 "(Région)", 165 "(Région de l')", 166 ", Cameroon", 167 "State of", 168 "State", 169 "Metropolitan Borough of", 170 "London Borough of", 171 "Royal Borough of", 172 "Borough of", 173 "Borough", 174 "Council of", 175 "Council", 176 "City of", 177 ", The", 178 "prefecture", 179 "Prefecture", 180 "municipality" 181 }; 182 183 static final Pattern CRUFT_PATTERN = PatternCache.get("(?i)\\b" + String.join("|", CRUFT) + "\\b"); 184 static final Pattern BRACKETED = PatternCache.get("\\[.*\\]"); 185 clean(String input)186 static String clean(String input) { 187 if (input == null) { 188 return input; 189 } 190 // Quick & dirty 191 input = BRACKETED.matcher(input).replaceAll(""); 192 input = CRUFT_PATTERN.matcher(input).replaceAll(""); 193 // for (String cruft : CRUFT) { 194 // int pos = input.indexOf(cruft); 195 // if (pos >= 0) { 196 // input = input.substring(0,pos) + input.substring(pos + cruft.length()); 197 // } 198 // } 199 input = input.replace(" ", " "); 200 if (input.endsWith(",")) { 201 input = input.substring(0, input.length() - 1); 202 } 203 return fixName(input); 204 } 205 206 207 appendName(CLDRFile fileSubdivisions, final String sdCode, String name, String level)208 private static void appendName(CLDRFile fileSubdivisions, final String sdCode, String name, String level) throws IOException { 209 if (name == null) { 210 return; 211 } 212 String cldrCode = convertToCldr(sdCode); 213 String path = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"" + cldrCode + "\"]"; 214 String oldValue = fileSubdivisions.getStringValue(path); 215 if (oldValue != null) { 216 return; // don't override old values 217 } 218 fileSubdivisions.add(path, name); 219 if (level != null) { 220 fileSubdivisions.addComment(path, level, CommentType.LINE); 221 } 222 } 223 isKosher(String regionCode)224 private boolean isKosher(String regionCode) { 225 if (regionCode.equals("001")) { 226 return false; 227 } 228 if (territoryAliases.containsKey(regionCode) 229 || containment.contains(regionCode) 230 || codeToData.get(regionCode).get(LstrField.Description).contains("Private use")) { 231 Set<String> rc = REGION_CONTAINS.get(regionCode); 232 if (rc != null) { 233 throw new IllegalArgumentException("? " + regionCode + ": " + rc); 234 } 235 return false; 236 } 237 return true; 238 } 239 addChildren(Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2)240 private static void addChildren(Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2) { 241 TreeMap<String, SubdivisionNode> temp = new TreeMap<>(ROOT_COL); 242 temp.putAll(children2); 243 ordered.addAll(temp.values()); 244 for (SubdivisionNode n : temp.values()) { 245 if (!n.children.isEmpty()) { 246 addChildren(ordered, n.children); 247 } 248 } 249 } 250 251 static Map<String, String> NAME_CORRECTIONS = new HashMap<>(); 252 getBestName(String value, boolean useIso)253 private String getBestName(String value, boolean useIso) { 254 String cldrName = null; 255 cldrName = NAME_CORRECTIONS.get(value); 256 if (cldrName != null) { 257 return fixName(cldrName); 258 } 259 R2<List<String>, String> subdivisionAlias = SubdivisionInfo.SUBDIVISION_ALIASES_FORMER.get(value); 260 if (subdivisionAlias != null) { 261 String country = subdivisionAlias.get0().get(0); 262 cldrName = ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, country); 263 if (cldrName != null) { 264 return fixName(cldrName); 265 } 266 } 267 268 269 cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(value); 270 if (cldrName != null) { 271 return fixName(cldrName); 272 } 273 274 Collection<String> oldAliases = SubdivisionInfo.subdivisionIdToOld.get(value); 275 if (oldAliases != null) { 276 for (String oldAlias : oldAliases) { 277 cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(oldAlias); 278 if (cldrName != null) { 279 return fixName(cldrName); 280 } 281 } 282 } 283 284 if (useIso) { 285 cldrName = getIsoName(value); 286 if (cldrName == null) { 287 cldrName = "UNKNOWN"; 288 //throw new IllegalArgumentException("Failed to find name: " + value); 289 } 290 return fixName(cldrName); 291 } 292 return null; 293 } 294 fixName(String name)295 private static String fixName(String name) { 296 return name == null ? null : nfc.normalize(name.replace('\'', '’').replace(" ", " ").trim()); 297 } 298 SubdivisionSet(String sourceFile)299 public SubdivisionSet(String sourceFile) { 300 301 // <country id="AD" version="16"> 302 // <subdivision-code footnote="*">AD-02</subdivision-code> 303 // <subdivision-locale lang3code="eng" xml:lang="en"> 304 // <subdivision-locale-name>Otago</subdivision-locale-name> 305 306 List<Pair<String, String>> pathValues = XMLFileReader.loadPathValues( 307 sourceFile, 308 new ArrayList<Pair<String, String>>(), false); 309 int maxIndent = 0; 310 SubdivisionNode lastNode = null; 311 String lastCode = null; 312 Set<String> conflictingTargetCountries = new HashSet<>(); 313 314 for (Pair<String, String> pair : pathValues) { 315 String path = pair.getFirst(); 316 boolean code = path.contains("/subdivision-code"); 317 boolean name = path.contains("/subdivision-locale-name"); 318 boolean nameCat = path.contains("/category-name"); 319 boolean relatedCountry = path.contains("/subdivision-related-country"); 320 321 // <country id="AD" version="16"> 322 // <category id="262"> 323 // <category-name lang3code="fra" xml:lang="fr">paroisse</category-name> 324 // <category-name lang3code="eng" xml:lang="en">parish</category-name> 325 // also languages in region... 326 327 // new XML from ISO, so we don't have to guess the country code: 328 // <subdivision-code footnote="*">NL-BQ1</subdivision-code> 329 // <subdivision-related-country country-id="BQ" xml:lang="en">BONAIRE, SINT EUSTATIUS AND SABA</subdivision-related-country> 330 331 if (!code && !name && !nameCat && !relatedCountry) { 332 continue; 333 } 334 XPathParts parts = XPathParts.getFrozenInstance(path); 335 String value = pair.getSecond(); 336 if (relatedCountry) { 337 String target = parts.getAttributeValue(-1, "country-id"); 338 // remove conflicting target countries 339 for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) { 340 if (entry.getValue().equals(target)) { 341 conflictingTargetCountries.add(target); 342 TO_COUNTRY_CODE.remove(entry.getKey(), target); // there can be at most one 343 break; 344 } 345 } 346 if (!conflictingTargetCountries.contains(target)) { 347 TO_COUNTRY_CODE.put(lastCode, target); 348 //System.out.println(lastCode + " => " + target); 349 } 350 } else if (name) { 351 int elementNum = -2; 352 String lang = parts.getAttributeValue(elementNum, "xml:lang"); 353 if (lang == null) { 354 lang = parts.getAttributeValue(elementNum, "lang3code"); 355 } 356 addName(lastCode, lang, value); 357 //output.println(count + Utility.repeat("\t", indent) + "\tlang=" + lang + ":\t«" + value + "»\t"); 358 } else if (nameCat) { 359 //country-codes[@generated="2015-05-04T15:40:13.424465+02:00"]/country[@id="AD"][@version="16"]/category[@id="262"]/category-name[@lang3code="fra"][@xml:lang="fr"] 360 int elementNum = -1; 361 String lang = parts.getAttributeValue(elementNum, "xml:lang"); 362 if (lang == null) { 363 lang = parts.getAttributeValue(elementNum, "lang3code"); 364 } 365 String category = parts.getAttributeValue(-2, "id"); 366 addName(category, lang, value); 367 //output.println(count + Utility.repeat("\t", indent) + "\tlang=" + lang + ":\t«" + value + "»\t"); 368 } else { 369 int countSubdivision = 0; 370 for (int i = 0; i < parts.size(); ++i) { 371 if (parts.getElement(i).equals("subdivision")) { 372 ++countSubdivision; 373 } 374 } 375 if (maxIndent < countSubdivision) { 376 maxIndent = countSubdivision; 377 } 378 value = convertToCldr(value); 379 if (countSubdivision == 1) { 380 lastNode = addNode(null, value); 381 } else { 382 lastNode = addNode(lastNode, value); 383 } 384 lastCode = value; 385 int subdivisionElement = parts.findElement("subdivision"); 386 String id = parts.getAttributeValue(subdivisionElement, "category-id"); 387 addIdSample(id, value); 388 //<subdivision category-id="262">//<subdivision-code footnote="*">AD-06</subdivision-code> 389 // <subdivision category-id="262"> 390 //output.println(++count + Utility.repeat("\t", indent) + "code=" + value); 391 } 392 } 393 } 394 addIdSample(String id, String value)395 public void addIdSample(String id, String value) { 396 SUB_TO_CAT.put(value, id); 397 ID_SAMPLE.put(getIsoName(id), value); 398 } 399 addNode(SubdivisionNode lastSubdivision, String subdivision)400 final SubdivisionNode addNode(SubdivisionNode lastSubdivision, String subdivision) { 401 // "NZ-S", x 402 String region = SubdivisionNames.getRegionFromSubdivision(subdivision); 403 REGION_CONTAINS.put(region, subdivision); 404 if (lastSubdivision == null) { 405 lastSubdivision = BASE.children.get(region); 406 if (lastSubdivision == null) { 407 lastSubdivision = new SubdivisionNode(region, BASE, this).addName("en", ENGLISH_ICU.regionDisplayName(region)); 408 BASE.children.put(region, lastSubdivision); 409 } 410 return add(lastSubdivision, subdivision); 411 } 412 add(lastSubdivision, subdivision); 413 return lastSubdivision; 414 } 415 add(SubdivisionNode subdivisionNode1, String subdivision2)416 private SubdivisionNode add(SubdivisionNode subdivisionNode1, String subdivision2) { 417 SubdivisionNode subdivisionNode2 = subdivisionNode1.children.get(subdivision2); 418 if (subdivisionNode2 == null) { 419 subdivisionNode2 = new SubdivisionNode(subdivision2, subdivisionNode1, this); 420 } 421 subdivisionNode1.children.put(subdivision2, subdivisionNode2); 422 return subdivisionNode2; 423 } 424 getName(SubdivisionNode base2)425 private String getName(SubdivisionNode base2) { 426 return getIsoName(base2.code); 427 } 428 getIsoName(String code)429 private String getIsoName(String code) { 430 if (code == null) { 431 return null; 432 } 433 Map<String, String> map = NAMES.get(code); 434 if (map == null) { 435 return "???"; 436 } 437 String name = map.get("en"); 438 if (name != null) { 439 return name; 440 } 441 name = map.get("es"); 442 if (name != null) { 443 return name; 444 } 445 name = map.get("fr"); 446 if (name != null) { 447 return name; 448 } 449 if (name == null) { 450 name = map.entrySet().iterator().next().getValue(); 451 } 452 return name; 453 } print(PrintWriter out)454 public void print(PrintWriter out) { 455 print(out, 0, "", BASE); 456 for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) { 457 out.println(entry.getKey() + "\t" + entry.getValue()); 458 } 459 } print(PrintWriter out, int indent, String prefix, SubdivisionNode base2)460 private void print(PrintWriter out, int indent, String prefix, SubdivisionNode base2) { 461 if (!prefix.isEmpty()) { 462 prefix += "\t"; 463 } 464 prefix += base2.code; 465 final String indentString = Utility.repeat("\t", 4-indent); 466 out.println(prefix + indentString + getName(base2)); 467 if (base2.children.isEmpty()) { 468 return; 469 } 470 for (SubdivisionNode child : base2.children.values()) { 471 print(out, indent + 1, prefix, child); 472 } 473 } 474 } 475 476 static class SubDivisionExtractor { 477 final SubdivisionSet sdset; 478 final Validity validityFormer; 479 final Map<String, R2<List<String>, String>> subdivisionAliasesFormer; 480 final Relation<String, String> formerRegionToSubdivisions; 481 SubDivisionExtractor(SubdivisionSet sdset, Validity validityFormer, Map<String, R2<List<String>, String>> subdivisionAliasesFormer, Relation<String, String> formerRegionToSubdivisions)482 public SubDivisionExtractor(SubdivisionSet sdset, 483 Validity validityFormer, 484 Map<String, R2<List<String>, String>> subdivisionAliasesFormer, 485 Relation<String, String> formerRegionToSubdivisions) { 486 this.sdset = sdset; 487 this.validityFormer = validityFormer; 488 this.subdivisionAliasesFormer = subdivisionAliasesFormer; 489 this.formerRegionToSubdivisions = formerRegionToSubdivisions; 490 } 491 printXml(Appendable output)492 void printXml(Appendable output) throws IOException { 493 494 /* 495 <subdivisionContainment> 496 <group type="NZ" category="island" contains="NZ-N NZ-S"/> <!-- New Zealand --> 497 <group type="NZ" category="special island authority" contains="NZ-CIT"/> <!-- New Zealand --> 498 <group type="NZ-N" contains="NZ-AUK NZ-BOP NZ-GIS NZ-HKB NZ-MWT NZ-NTL NZ-AUK NZ-TKI NZ-WGN NZ-WKO"/> <!-- North Island --> 499 <group type="NZ-S" contains="NZ-CAN NZ-MBH NZ-STL NZ-NSN NZ-OTA NZ-TAS NZ-WTC"/> <!-- South Island --> 500 </subdivisionContainment> 501 */ 502 output.append( 503 DtdType.supplementalData.header(MethodHandles.lookup().lookupClass()) 504 + "\t<version number=\"$Revision" + "$\"/>\n" 505 + "\t<subdivisionContainment>\n"); 506 printXml(output, sdset.BASE, 0); 507 output.append("\t</subdivisionContainment>\n</supplementalData>\n"); 508 } 509 510 // private static String header(DtdType type) { 511 // return "<?xml version='1.0' encoding='UTF-8' ?>\n" 512 // + "<!DOCTYPE " + type // supplementalData 513 // + " SYSTEM '../../" + type.dtdPath + "'>\n" // "common/dtd/ldmlSupplemental.dtd" 514 // + "<!--\n" 515 // + "Copyright © 1991-2013 Unicode, Inc.\n" 516 // + "CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)\n" 517 // + "For terms of use, see http://www.unicode.org/copyright.html\n" 518 // + "-->\n"; 519 // } 520 printAliases(Appendable output)521 void printAliases(Appendable output) throws IOException { 522 addAliases(output, sdset.TO_COUNTRY_CODE.keySet()); 523 524 // Get the old validity data 525 Map<Status, Set<String>> oldSubdivisionData = validityFormer.getStatusToCodes(LstrType.subdivision); 526 Set<String> missing = new TreeSet<>(ROOT_COL); 527 missing.addAll(sdset.TO_COUNTRY_CODE.keySet()); 528 Set<String> nowValid = sdset.ID_TO_NODE.keySet(); 529 for (Entry<Status, Set<String>> e : oldSubdivisionData.entrySet()) { 530 Status v = e.getKey(); 531 if (v == Status.unknown) { 532 continue; 533 } 534 Set<String> set = e.getValue(); 535 for (String sdcodeRaw : set) { 536 String sdcode = sdcodeRaw; // .toUpperCase(Locale.ROOT); 537 // sdcode = sdcode.substring(0,2) + "-" + sdcode.substring(2); 538 if (!nowValid.contains(sdcode)) { 539 missing.add(sdcode); 540 } 541 } 542 } 543 missing.removeAll(sdset.TO_COUNTRY_CODE.keySet()); 544 addAliases(output, missing); 545 } 546 addAliases(Appendable output, Set<String> missing)547 private void addAliases(Appendable output, Set<String> missing) throws IOException { 548 for (String toReplace : missing) { 549 List<String> replaceBy = null; 550 String reason = "deprecated"; 551 R2<List<String>, String> aliasInfo = subdivisionAliasesFormer.get(toReplace); 552 if (aliasInfo != null) { 553 replaceBy = aliasInfo.get0(); 554 reason = aliasInfo.get1(); 555 System.out.println("Adding former alias: " + toReplace + " => " + replaceBy); 556 } else { 557 String replacement = sdset.TO_COUNTRY_CODE.get(toReplace); 558 if (replacement != null) { 559 replaceBy = Collections.singletonList(replacement); 560 reason = "overlong"; 561 System.out.println("Adding country code alias: " + toReplace + " => " + replaceBy); 562 } 563 } 564 addAlias(output, toReplace, replaceBy, reason); 565 } 566 } 567 addAlias(Appendable output, final String toReplace, final List<String> replaceBy, final String reason)568 private void addAlias(Appendable output, final String toReplace, final List<String> replaceBy, final String reason) throws IOException { 569 // <languageAlias type="art_lojban" replacement="jbo" reason="deprecated"/> <!-- Lojban --> 570 output.append("\t\t\t"); 571 if (replaceBy == null) { 572 output.append("<!-- "); 573 } 574 output.append("<subdivisionAlias" 575 + " type=\"" + toReplace + "\"" 576 + " replacement=\"" + (replaceBy == null ? toReplace.substring(0, 2) + "?" : 577 Joiner.on(" ").join(replaceBy)) + "\"" 578 + " reason=\"" + reason + "\"/>" 579 + (replaceBy == null ? " <!- - " : " <!-- ") 580 + sdset.getBestName(toReplace, true) + " => " + (replaceBy == null ? "??" : getBestName(replaceBy, true)) + " -->" 581 + "\n"); 582 } 583 getBestName(List<String> replaceBy, boolean useIso)584 private String getBestName(List<String> replaceBy, boolean useIso) { 585 StringBuilder result = new StringBuilder(); 586 for (String s : replaceBy) { 587 if (result.length() != 0) { 588 result.append(", "); 589 } 590 if (SubdivisionNames.isRegionCode(s)) { 591 result.append(ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, s)); 592 } else { 593 result.append(sdset.getBestName(s, useIso)); 594 } 595 } 596 return result.toString(); 597 } 598 printXml(Appendable output, SubdivisionNode base2, int indent)599 private void printXml(Appendable output, SubdivisionNode base2, int indent) throws IOException { 600 if (base2.children.isEmpty()) { 601 return; 602 } 603 String type = base2.code; 604 if (base2 != sdset.BASE) { 605 type = convertToCldr(type); 606 output.append("\t\t" + "<subgroup" 607 + " type=\"" + type + "\"" 608 + " contains=\""); 609 boolean first = true; 610 for (String child : base2.children.keySet()) { 611 if (first) { 612 first = false; 613 } else { 614 output.append(' '); 615 } 616 String subregion = convertToCldr(child); 617 output.append(subregion); 618 } 619 output.append("\"/>\n"); 620 } 621 for (SubdivisionNode child : base2.children.values()) { 622 printXml(output, child, indent); 623 } 624 } 625 printSamples(Appendable pw)626 public void printSamples(Appendable pw) throws IOException { 627 Set<String> seen = new HashSet<>(); 628 for (Entry<String, Set<String>> entry : sdset.ID_SAMPLE.keyValuesSet()) { 629 pw.append(entry.getKey()); 630 //int max = 10; 631 seen.clear(); 632 for (String sample : entry.getValue()) { 633 String region = sample.substring(0, 2); 634 if (seen.contains(region)) { 635 continue; 636 } 637 seen.add(region); 638 pw.append(";\t" + ENGLISH_ICU.regionDisplayName(region) + ": " + sdset.getIsoName(sample) 639 + " (" + sample + ")"); 640 //if (--max < 0) break; 641 } 642 pw.append(System.lineSeparator()); 643 } 644 } 645 printEnglishComp(Appendable output)646 public void printEnglishComp(Appendable output) throws IOException { 647 Set<String> countEqual = new TreeSet<>(); 648 String lastCC = null; 649 output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\tEqual\n"); 650 for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) { 651 final String countryCode = entry.getKey(); 652 if (!countryCode.equals(lastCC)) { 653 if (lastCC != null && countEqual.size() != 0) { 654 output.append(ENGLISH_ICU.regionDisplayName(lastCC) + "\t\t\tEquals:\t" + countEqual.size() + "\t" + countEqual + "\n"); 655 } 656 countEqual.clear(); 657 658 lastCC = countryCode; 659 } 660 for (String value : entry.getValue()) { 661 String cldrName = sdset.getBestName(value, false); 662 String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value); 663 final String iso = sdset.getIsoName(value); 664 if (iso.equals(wiki)) { 665 countEqual.add(iso); 666 continue; 667 } 668 output.append( 669 ENGLISH_ICU.regionDisplayName(countryCode) 670 // + "\t" + WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value) 671 + "\t" + cldrName 672 + "\t" + value 673 + "\t" + iso 674 + "\t" + wiki 675 + "\n"); 676 } 677 } 678 if (countEqual.size() != 0) { 679 output.append(ENGLISH_ICU.regionDisplayName(lastCC) + "\t\t\tEquals:\t" + countEqual.size() + "\t" + countEqual + "\n"); 680 } 681 } 682 printEnglishCompFull(Appendable output)683 public void printEnglishCompFull(Appendable output) throws IOException { 684 output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\n"); 685 for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) { 686 final String countryCode = entry.getKey(); 687 for (String value : entry.getValue()) { 688 String cldrName = sdset.getBestName(value, false); 689 //getBestName(value); 690 String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value); 691 final String iso = sdset.getIsoName(value); 692 output.append( 693 ENGLISH_ICU.regionDisplayName(countryCode) 694 // + "\t" + WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value) 695 + "\t" + value 696 + "\t" + cldrName 697 + "\t" + iso 698 + "\t" + wiki 699 + "\n"); 700 } 701 } 702 } 703 printEnglish(PrintWriter output)704 public void printEnglish(PrintWriter output) throws IOException { 705 TreeSet<String> allRegions = new TreeSet<>(); 706 allRegions.addAll(codeToData.keySet()); 707 allRegions.addAll(formerRegionToSubdivisions.keySet()); // override 708 709 Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*"); 710 CLDRFile oldFileSubdivisions = cldrFactorySubdivisions.make("en", false); 711 CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed(); 712 713 Set<String> skipped = new LinkedHashSet<>(); 714 715 for (String regionCode : allRegions) { 716 if (!sdset.isKosher(regionCode)) { 717 if (regionCode.length() != 3) { 718 skipped.add(regionCode); 719 } 720 continue; 721 } 722 Set<String> remainder = formerRegionToSubdivisions.get(regionCode); 723 remainder = remainder == null ? Collections.emptySet() : new LinkedHashSet<>(remainder); 724 725 SubdivisionNode regionNode = sdset.ID_TO_NODE.get(regionCode); 726 if (regionNode == null) { 727 continue; 728 } 729 730 Set<SubdivisionNode> ordered = new LinkedHashSet<>(); 731 SubdivisionSet.addChildren(ordered, regionNode.children); 732 733 for (SubdivisionNode node : ordered) { 734 final String sdCode = node.code; 735 String name = sdset.getBestName(sdCode, true); 736 String upper = UCharacter.toUpperCase(name); 737 String title = SubdivisionNode.TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(Locale.ROOT, null, name); 738 if (name.equals(upper) || !name.equals(title)) { 739 System.out.println("Suspicious name: " + name); 740 } 741 SubdivisionSet.appendName(fileSubdivisions, sdCode, name, null); 742 remainder.remove(sdCode); 743 } 744 for (String sdCode : remainder) { 745 String name = sdset.getBestName(sdCode, true); 746 if (!name.equals("???")) { 747 SubdivisionSet.appendName(fileSubdivisions, sdCode, name, "\t<!-- deprecated -->"); 748 } 749 } 750 } 751 System.out.println("Skipping: " + skipped); 752 fileSubdivisions.write(output); 753 } 754 printMissingMIDs(PrintWriter pw)755 public void printMissingMIDs(PrintWriter pw) { 756 // for (Entry<String, String> entry : WikiSubdivisionLanguages.WIKIDATA_TO_MID.entrySet()) { 757 // String mid = entry.getValue(); 758 // if (!mid.isEmpty()) { 759 // continue; 760 // } 761 // String subCode = entry.getKey(); 762 // String wiki = clean(getWikiName(subCode)); 763 // String iso = clean(getIsoName(subCode)); 764 // String countryCode = subCode.substring(0, 2); 765 // String cat = SUB_TO_CAT.get(subCode); 766 // String catName = getIsoName(cat); 767 // pw.append( 768 // ENGLISH_ICU.regionDisplayName(countryCode) 769 // + "\t" + mid 770 // + "\t" + subCode 771 // + "\t" + catName 772 // + "\t" + wiki 773 // + "\t" + iso 774 // + "\n" 775 // ); 776 // } 777 } 778 } 779 }