1 package org.unicode.cldr.tool;
2 
3 import java.io.IOException;
4 import java.io.PrintWriter;
5 import java.lang.invoke.MethodHandles;
6 import java.util.ArrayList;
7 import java.util.Collection;
8 import java.util.Collections;
9 import java.util.Comparator;
10 import java.util.HashMap;
11 import java.util.HashSet;
12 import java.util.LinkedHashSet;
13 import java.util.List;
14 import java.util.Locale;
15 import java.util.Map;
16 import java.util.Map.Entry;
17 import java.util.Set;
18 import java.util.TreeMap;
19 import java.util.TreeSet;
20 import java.util.regex.Pattern;
21 
22 import org.unicode.cldr.tool.GenerateSubdivisions.SubdivisionInfo;
23 import org.unicode.cldr.util.CLDRConfig;
24 import org.unicode.cldr.util.CLDRFile;
25 import org.unicode.cldr.util.CLDRPaths;
26 import org.unicode.cldr.util.ChainedMap;
27 import org.unicode.cldr.util.ChainedMap.M3;
28 import org.unicode.cldr.util.DtdType;
29 import org.unicode.cldr.util.Factory;
30 import org.unicode.cldr.util.Pair;
31 import org.unicode.cldr.util.PatternCache;
32 import org.unicode.cldr.util.StandardCodes;
33 import org.unicode.cldr.util.StandardCodes.LstrField;
34 import org.unicode.cldr.util.StandardCodes.LstrType;
35 import org.unicode.cldr.util.SupplementalDataInfo;
36 import org.unicode.cldr.util.Validity;
37 import org.unicode.cldr.util.Validity.Status;
38 import org.unicode.cldr.util.WikiSubdivisionLanguages;
39 import org.unicode.cldr.util.XMLFileReader;
40 import org.unicode.cldr.util.XPathParts;
41 import org.unicode.cldr.util.XPathParts.Comments.CommentType;
42 
43 import com.ibm.icu.dev.util.CollectionUtilities;
44 import com.ibm.icu.impl.Relation;
45 import com.ibm.icu.impl.Row.R2;
46 import com.ibm.icu.impl.Utility;
47 import com.ibm.icu.lang.UCharacter;
48 import com.ibm.icu.text.CaseMap;
49 import com.ibm.icu.text.Collator;
50 import com.ibm.icu.text.LocaleDisplayNames;
51 import com.ibm.icu.text.Normalizer2;
52 import com.ibm.icu.text.RuleBasedCollator;
53 import com.ibm.icu.util.ULocale;
54 
55 public class SubdivisionNode {
    /** Supplemental data singleton; the alias and containment constants below are read from it. */
    static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
    /** The "territory" entry of the locale alias info, keyed by territory code. */
    static final Map<String, R2<List<String>, String>> territoryAliases = SDI.getLocaleAliasInfo().get("territory");
    /** The container codes reported by {@code SDI.getContainers()}. */
    static final Set<String> containment = SDI.getContainers();
    /** Language-subtag-registry fields for each region code. */
    static final Map<String, Map<LstrField, String>> codeToData = StandardCodes.getEnumLstreg().get(LstrType.region);

    /** ICU display names for English; used for region display names. NOTE(review): not final. */
    static LocaleDisplayNames ENGLISH_ICU = LocaleDisplayNames.getInstance(ULocale.ENGLISH);

    /** Title-casing configuration: whole string, without lowercasing the remainder. */
    static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE = CaseMap.toTitle().wholeString().noLowercase();
    /**
     * Frozen collator with numeric collation enabled, exposed as a string comparator.
     * NOTE(review): despite the name, it is obtained for ULocale.ENGLISH rather than the root locale.
     */
    static final Comparator<String> ROOT_COL;
    static {
        RuleBasedCollator _ROOT_COL = (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
        _ROOT_COL.setNumericCollation(true);
        _ROOT_COL.freeze();
        // Raw cast: the collator is not declared as a Comparator<String>.
        ROOT_COL = (Comparator) _ROOT_COL;
    }
    /** Shared CLDR configuration. */
    static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
    /** The English CLDR file; used to look up territory names. */
    static final CLDRFile ENGLISH_CLDR = CLDR_CONFIG.getEnglish();
    /** NFC normalizer applied to names by {@code fixName}. */
    static final Normalizer2 nfc = Normalizer2.getNFCInstance();
74 
convertToCldr(String regionOrSubdivision)75     public static String convertToCldr(String regionOrSubdivision) {
76         return SubdivisionNames.isRegionCode(regionOrSubdivision) ? regionOrSubdivision.toUpperCase(Locale.ROOT)
77             : regionOrSubdivision.replace("-", "").toLowerCase(Locale.ROOT);
78     }
79 
    /** The set this node belongs to; the constructor registers the node in its ID_TO_NODE map. */
    final SubdivisionSet sset;
    /** The code identifying this node. */
    final String code;
    /** Depth in the tree: -1 when there is no parent, otherwise the parent's level plus one. */
    final int level;
    /** The parent node, or null. */
    final SubdivisionNode parent;
    /** Child nodes keyed by code, ordered by ROOT_COL. */
    final Map<String, SubdivisionNode> children = new TreeMap<>(ROOT_COL);
85 
SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset)86     public SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset) {
87         this.code = code;
88         this.level = parent == null ? -1 : parent.level + 1;
89         this.parent = parent;
90         this.sset = sset;
91         sset.ID_TO_NODE.put(code, this);
92     }
93 
addName(String lang, String value)94     public SubdivisionNode addName(String lang, String value) {
95         sset.NAMES.put(code, lang, value);
96         return this;
97     }
98 
99     static class SubdivisionSet {
100 
        /** Names keyed by code (subdivision code or category id), then by language. */
        final M3<String, String, String> NAMES = ChainedMap.of(
            new TreeMap<String, Object>(),
            new TreeMap<String, Object>(),
            String.class);
        /** Subdivision code to the country id from its subdivision-related-country element. */
        final Map<String, String> TO_COUNTRY_CODE = new TreeMap<String, String>();
        /** Category name (as returned by getIsoName for the category id) to sample subdivision codes. */
        final Relation<String, String> ID_SAMPLE = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
        /** Subdivision code to category id. */
        final Map<String, String> SUB_TO_CAT = new TreeMap<>();
        /** Region code to the subdivision codes added for it. */
        final Relation<String, String> REGION_CONTAINS = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
        /** Every node created for this set, keyed by code; filled by the SubdivisionNode constructor. */
        final Map<String, SubdivisionNode> ID_TO_NODE = new HashMap<>();

        /** Root of the tree: code "001", English name "World". Must stay below the maps it writes to. */
        final SubdivisionNode BASE = new SubdivisionNode("001", null, this).addName("en", "World");
112 
addName(String code, String lang, String value)113         public void addName(String code, String lang, String value) {
114             int parenPos = value.indexOf("(see also separate country");
115             if (parenPos >= 0) {
116                 /*
117                 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ1: expected "Caribbean Netherlands", got "Bonaire"
118                 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ2: expected "Caribbean Netherlands", got "Saba"
119                 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ3: expected "Caribbean Netherlands", got "Sint Eustatius"
120                 Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-21: expected "Svalbard & Jan Mayen", got "Svalbard"
121                 Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-22: expected "Svalbard & Jan Mayen", got "Jan Mayen"
122                  */
123                 // OLD code to guess country from comment
124 //              String paren = value.substring(value.length() - 3, value.length() - 1);
125 //                if (!paren.equals("BQ") && !paren.equals("SJ")) {
126 //                    String old = TO_COUNTRY_CODE.get(code);
127 //                    if (old != null) {
128 //                        System.err.println("Duplicate: " + code + "\t" + old + "\t" + paren);
129 //                    }
130 //                    TO_COUNTRY_CODE.put(code, paren);
131 //                }
132                 value = value.substring(0, parenPos).trim();
133             }
134             value = value.replace("*", "");
135             NAMES.put(code, lang, value);
136         }
137 
138 
139 
140 
141         static final String[] CRUFT = {
142             "Emirate",
143             "Parish",
144             "County",
145             "District",
146             "Region",
147             "Province of",
148             "Province",
149             "Republic",
150             ", Barbados",
151             ", Burkina Faso",
152             "Governorate",
153             "Department",
154             "Canton of",
155             "(Région des)",
156             "(Région du)",
157             "(Région de la)",
158             "Autonomous",
159             "Archipelago of",
160             "Canton",
161             "kanton",
162             ", Bahamas",
163             "province",
164             "(Région)",
165             "(Région de l')",
166             ", Cameroon",
167             "State of",
168             "State",
169             "Metropolitan Borough of",
170             "London Borough of",
171             "Royal Borough of",
172             "Borough of",
173             "Borough",
174             "Council of",
175             "Council",
176             "City of",
177             ", The",
178             "prefecture",
179             "Prefecture",
180             "municipality"
181         };
182 
183         static final Pattern CRUFT_PATTERN = PatternCache.get("(?i)\\b" + CollectionUtilities.join(CRUFT, "|") + "\\b");
184         static final Pattern BRACKETED = PatternCache.get("\\[.*\\]");
185 
clean(String input)186         static String clean(String input) {
187             if (input == null) {
188                 return input;
189             }
190             // Quick & dirty
191             input = BRACKETED.matcher(input).replaceAll("");
192             input = CRUFT_PATTERN.matcher(input).replaceAll("");
193 //            for (String cruft : CRUFT) {
194 //                int pos = input.indexOf(cruft);
195 //                if (pos >= 0) {
196 //                    input = input.substring(0,pos) + input.substring(pos + cruft.length());
197 //                }
198 //            }
199             input = input.replace("  ", " ");
200             if (input.endsWith(",")) {
201                 input = input.substring(0, input.length() - 1);
202             }
203             return fixName(input);
204         }
205 
206 
207 
appendName(CLDRFile fileSubdivisions, final String sdCode, String name, String level)208         private static void appendName(CLDRFile fileSubdivisions, final String sdCode, String name, String level) throws IOException {
209             if (name == null) {
210                 return;
211             }
212             String cldrCode = convertToCldr(sdCode);
213             String path = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"" + cldrCode + "\"]";
214             String oldValue = fileSubdivisions.getStringValue(path);
215             if (oldValue != null) {
216                 return; // don't override old values
217             }
218             fileSubdivisions.add(path, name);
219             if (level != null) {
220                 fileSubdivisions.addComment(path, level, CommentType.LINE);
221             }
222         }
223 
isKosher(String regionCode)224         private boolean isKosher(String regionCode) {
225             if (regionCode.equals("001")) {
226                 return false;
227             }
228             if (territoryAliases.containsKey(regionCode)
229                 || containment.contains(regionCode)
230                 || codeToData.get(regionCode).get(LstrField.Description).contains("Private use")) {
231                 Set<String> rc = REGION_CONTAINS.get(regionCode);
232                 if (rc != null) {
233                     throw new IllegalArgumentException("? " + regionCode + ": " + rc);
234                 }
235                 return false;
236             }
237             return true;
238         }
239 
addChildren(Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2)240         private static void addChildren(Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2) {
241             TreeMap<String, SubdivisionNode> temp = new TreeMap<>(ROOT_COL);
242             temp.putAll(children2);
243             ordered.addAll(temp.values());
244             for (SubdivisionNode n : temp.values()) {
245                 if (!n.children.isEmpty()) {
246                     addChildren(ordered, n.children);
247                 }
248             }
249         }
250 
        // Manual name overrides, consulted first by getBestName(String, boolean).
        // The loader that used to fill this map is commented out below, and no visible code
        // adds entries to it. NOTE(review): mutable and not final.
        static Map<String, String> NAME_CORRECTIONS = new HashMap<>();
252 //      static {
253 //          Splitter semi = Splitter.on(';').trimResults();
254 //          for (String s : FileUtilities.in(ISO_COUNTRY_CODES, "en-subdivisions-corrections.txt")) {
255 //              if (s.startsWith("#")) {
256 //                  continue;
257 //              }
258 //              s = s.trim();
259 //              if (s.isEmpty()) {
260 //                  continue;
261 //              }
262 //              List<String> parts = semi.splitToList(s);
263 //              NAME_CORRECTIONS.put(convertToCldr(parts.get(0)), parts.get(1));
264 //          }
265 //      }
266 
267 
getBestName(String value, boolean useIso)268         private String getBestName(String value, boolean useIso) {
269             if (value.equals("cnah")) {
270                 int debug = 0;
271             }
272             String cldrName = null;
273             cldrName = NAME_CORRECTIONS.get(value);
274             if (cldrName != null) {
275                 return fixName(cldrName);
276             }
277             R2<List<String>, String> subdivisionAlias = SubdivisionInfo.SUBDIVISION_ALIASES_FORMER.get(value);
278             if (subdivisionAlias != null) {
279                 String country = subdivisionAlias.get0().get(0);
280                 cldrName = ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, country);
281                 if (cldrName != null) {
282                     return fixName(cldrName);
283                 }
284             }
285 
286 
287             cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(value);
288             if (cldrName != null) {
289                 return fixName(cldrName);
290             }
291 
292             Collection<String> oldAliases = SubdivisionInfo.subdivisionIdToOld.get(value);
293             if (oldAliases != null) {
294                 for (String oldAlias : oldAliases) {
295                     cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(oldAlias);
296                     if (cldrName != null) {
297                         return fixName(cldrName);
298                     }
299                 }
300             }
301 
302             if (useIso) {
303                 cldrName = getIsoName(value);
304                 if (cldrName == null) {
305                     cldrName = "UNKNOWN";
306                     //throw new IllegalArgumentException("Failed to find name: " + value);
307                 }
308                 return fixName(cldrName);
309             }
310             return null;
311         }
312 
fixName(String name)313         private static String fixName(String name) {
314             return name == null ? null : nfc.normalize(name.replace('\'', '’').replace("  ", " ").trim());
315         }
316 
        /**
         * Builds the set from an XML source file.
         * The file is loaded as a list of path/value pairs and processed in order. Only paths
         * containing subdivision-code, subdivision-locale-name, category-name or
         * subdivision-related-country are used; all others are skipped. Names and related
         * countries are attached to the most recently seen subdivision code, so the result
         * depends on the order of the pairs.
         *
         * @param sourceFile the file handed to XMLFileReader.loadPathValues
         */
        public SubdivisionSet(String sourceFile) {

            //    <country id="AD" version="16">
            //           <subdivision-code footnote="*">AD-02</subdivision-code>
            //             <subdivision-locale lang3code="eng" xml:lang="en">
            //                  <subdivision-locale-name>Otago</subdivision-locale-name>

            List<Pair<String, String>> pathValues = XMLFileReader.loadPathValues(
                sourceFile,
                new ArrayList<Pair<String, String>>(), false);
            // One parser instance, re-set for every path of interest.
            XPathParts parts = new XPathParts();
            // Deepest subdivision nesting seen; computed below but not read in this constructor.
            int maxIndent = 0;
            // Node returned by the latest addNode call; passed as parent for nested codes.
            SubdivisionNode lastNode = null;
            // Latest subdivision code (CLDR form); names and related countries attach to it.
            String lastCode = null;
            // Country ids claimed by more than one subdivision; never mapped again once recorded.
            Set<String> conflictingTargetCountries = new HashSet<>();

            for (Pair<String, String> pair : pathValues) {
                String path = pair.getFirst();
                boolean code = path.contains("/subdivision-code");
                boolean name = path.contains("/subdivision-locale-name");
                boolean nameCat = path.contains("/category-name");
                boolean relatedCountry = path.contains("/subdivision-related-country");

                //    <country id="AD" version="16">
                //       <category id="262">
                //  <category-name lang3code="fra" xml:lang="fr">paroisse</category-name>
                //  <category-name lang3code="eng" xml:lang="en">parish</category-name>
                // also languages in region...

                // new XML from ISO, so we don't have to guess the country code:
                //            <subdivision-code footnote="*">NL-BQ1</subdivision-code>
                //            <subdivision-related-country country-id="BQ" xml:lang="en">BONAIRE, SINT EUSTATIUS AND SABA</subdivision-related-country>

                if (!code && !name && !nameCat && !relatedCountry) {
                    continue;
                }
                parts.set(path);
                String value = pair.getSecond();
                if (relatedCountry) {
                    // country-id is read from the last element of the path.
                    String target = parts.getAttributeValue(-1, "country-id");
                    // remove conflicting target countries
                    for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) {
                        if (entry.getValue().equals(target)) {
                            conflictingTargetCountries.add(target);
                            // Removing during iteration is only safe because of the immediate break.
                            TO_COUNTRY_CODE.remove(entry.getKey(), target); // there can be at most one
                            break;
                        }
                    }
                    if (!conflictingTargetCountries.contains(target)) {
                        TO_COUNTRY_CODE.put(lastCode, target);
                        //System.out.println(lastCode + " => " + target);
                    }
                } else if (name) {
                    // The language attributes are on the second-to-last element.
                    int elementNum = -2;
                    String lang = parts.getAttributeValue(elementNum, "xml:lang");
                    if (lang == null) {
                        // Fall back to the three-letter code when xml:lang is absent.
                        lang = parts.getAttributeValue(elementNum, "lang3code");
                    }
                    addName(lastCode, lang, value);
                    //output.println(count + Utility.repeat("\t", indent) + "\tlang=" + lang + ":\t«" + value + "»\t");
                } else if (nameCat) {
                    //country-codes[@generated="2015-05-04T15:40:13.424465+02:00"]/country[@id="AD"][@version="16"]/category[@id="262"]/category-name[@lang3code="fra"][@xml:lang="fr"]
                    // Here the language attributes are on the last element itself.
                    int elementNum = -1;
                    String lang = parts.getAttributeValue(elementNum, "xml:lang");
                    if (lang == null) {
                        lang = parts.getAttributeValue(elementNum, "lang3code");
                    }
                    // Category names are stored in NAMES under the category id.
                    String category = parts.getAttributeValue(-2, "id");
                    addName(category, lang, value);
                    //output.println(count + Utility.repeat("\t", indent) + "\tlang=" + lang + ":\t«" + value + "»\t");
                } else {
                    // A subdivision-code path: nesting depth is the number of "subdivision" elements.
                    int countSubdivision = 0;
                    for (int i = 0; i < parts.size(); ++i) {
                        if (parts.getElement(i).equals("subdivision")) {
                            ++countSubdivision;
                        }
                    }
                    if (maxIndent < countSubdivision) {
                        maxIndent = countSubdivision;
                    }
                    value = convertToCldr(value);
                    if (countSubdivision == 1) {
                        // Top-level subdivision: attached under its region node.
                        lastNode = addNode(null, value);
                    } else {
                        // Nested subdivision: attached under lastNode; addNode returns that same
                        // parent, so lastNode does not move to the new child.
                        lastNode = addNode(lastNode, value);
                    }
                    lastCode = value;
                    int subdivisionElement = parts.findElement("subdivision");
                    String id = parts.getAttributeValue(subdivisionElement, "category-id");
                    addIdSample(id, value);
                    //<subdivision category-id="262">//<subdivision-code footnote="*">AD-06</subdivision-code>
                    // <subdivision category-id="262">
                    //output.println(++count + Utility.repeat("\t", indent) + "code=" + value);
                }
            }
        }
413 
addIdSample(String id, String value)414         public void addIdSample(String id, String value) {
415             SUB_TO_CAT.put(value, id);
416             ID_SAMPLE.put(getIsoName(id), value);
417         }
418 
addNode(SubdivisionNode lastSubdivision, String subdivision)419         final SubdivisionNode addNode(SubdivisionNode lastSubdivision, String subdivision) {
420             // "NZ-S", x
421             String region = SubdivisionNames.getRegionFromSubdivision(subdivision);
422             REGION_CONTAINS.put(region, subdivision);
423             if (lastSubdivision == null) {
424                 lastSubdivision = BASE.children.get(region);
425                 if (lastSubdivision == null) {
426                     lastSubdivision = new SubdivisionNode(region, BASE, this).addName("en", ENGLISH_ICU.regionDisplayName(region));
427                     BASE.children.put(region, lastSubdivision);
428                 }
429                 return add(lastSubdivision, subdivision);
430             }
431             add(lastSubdivision, subdivision);
432             return lastSubdivision;
433         }
434 
add(SubdivisionNode subdivisionNode1, String subdivision2)435         private SubdivisionNode add(SubdivisionNode subdivisionNode1, String subdivision2) {
436             SubdivisionNode subdivisionNode2 = subdivisionNode1.children.get(subdivision2);
437             if (subdivisionNode2 == null) {
438                 subdivisionNode2 = new SubdivisionNode(subdivision2, subdivisionNode1, this);
439             }
440             subdivisionNode1.children.put(subdivision2, subdivisionNode2);
441             return subdivisionNode2;
442         }
443 
getName(SubdivisionNode base2)444         private String getName(SubdivisionNode base2) {
445             return getIsoName(base2.code);
446         }
447 
getIsoName(String code)448         private String getIsoName(String code) {
449             if (code == null) {
450                 return null;
451             }
452             Map<String, String> map = NAMES.get(code);
453             if (map == null) {
454                 return "???";
455             }
456             String name = map.get("en");
457             if (name != null) {
458                 return name;
459             }
460             name = map.get("es");
461             if (name != null) {
462                 return name;
463             }
464             name = map.get("fr");
465             if (name != null) {
466                 return name;
467             }
468             if (name == null) {
469                 name = map.entrySet().iterator().next().getValue();
470             }
471             return name;
472         }
print(PrintWriter out)473         public void print(PrintWriter out) {
474             print(out, 0, "", BASE);
475             for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) {
476                 out.println(entry.getKey() + "\t" + entry.getValue());
477             }
478         }
print(PrintWriter out, int indent, String prefix, SubdivisionNode base2)479         private void print(PrintWriter out, int indent, String prefix, SubdivisionNode base2) {
480             if (!prefix.isEmpty()) {
481                 prefix += "\t";
482             }
483             prefix += base2.code;
484             final String indentString = Utility.repeat("\t", 4-indent);
485             out.println(prefix + indentString + getName(base2));
486             if (base2.children.isEmpty()) {
487                 return;
488             }
489             for (SubdivisionNode child : base2.children.values()) {
490                 print(out, indent + 1, prefix, child);
491             }
492         }
493     }
494 
495     static class SubDivisionExtractor {
        /** The subdivision data the output is generated from. */
        final SubdivisionSet sdset;
        /** Earlier validity data; printAliases reads its subdivision codes from it. */
        final Validity validityFormer;
        /** Earlier subdivision aliases: code to (replacement list, reason). */
        final Map<String, R2<List<String>, String>> subdivisionAliasesFormer;
        /** Stored by the constructor; not read by the methods visible in this part of the file. */
        final Relation<String, String> formerRegionToSubdivisions;
500 
SubDivisionExtractor(SubdivisionSet sdset, Validity validityFormer, Map<String, R2<List<String>, String>> subdivisionAliasesFormer, Relation<String, String> formerRegionToSubdivisions)501         public SubDivisionExtractor(SubdivisionSet sdset,
502             Validity validityFormer,
503             Map<String, R2<List<String>, String>> subdivisionAliasesFormer,
504             Relation<String, String> formerRegionToSubdivisions) {
505             this.sdset = sdset;
506             this.validityFormer = validityFormer;
507             this.subdivisionAliasesFormer = subdivisionAliasesFormer;
508             this.formerRegionToSubdivisions = formerRegionToSubdivisions;
509         }
510 
printXml(Appendable output)511         void printXml(Appendable output) throws IOException {
512 
513             /*
514             <subdivisionContainment>
515             <group type="NZ" category="island" contains="NZ-N NZ-S"/> <!-- New Zealand -->
516             <group type="NZ" category="special island authority" contains="NZ-CIT"/> <!-- New Zealand -->
517             <group type="NZ-N" contains="NZ-AUK NZ-BOP NZ-GIS NZ-HKB NZ-MWT NZ-NTL NZ-AUK NZ-TKI NZ-WGN NZ-WKO"/> <!-- North Island -->
518             <group type="NZ-S" contains="NZ-CAN NZ-MBH NZ-STL NZ-NSN NZ-OTA NZ-TAS NZ-WTC"/> <!-- South Island -->
519             </subdivisionContainment>
520              */
521             output.append(
522                 DtdType.supplementalData.header(MethodHandles.lookup().lookupClass())
523                 + "\t<version number=\"$Revision" /*hack to stop SVN changing this*/ + "$\"/>\n"
524                 + "\t<subdivisionContainment>\n");
525             printXml(output, sdset.BASE, 0);
526             output.append("\t</subdivisionContainment>\n</supplementalData>\n");
527         }
528 
529 //        private static String header(DtdType type) {
530 //            return "<?xml version='1.0' encoding='UTF-8' ?>\n"
531 //                + "<!DOCTYPE " + type // supplementalData
532 //                + " SYSTEM '../../" + type.dtdPath + "'>\n" // "common/dtd/ldmlSupplemental.dtd"
533 //                + "<!--\n"
534 //                + "Copyright © 1991-2013 Unicode, Inc.\n"
535 //                + "CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)\n"
536 //                + "For terms of use, see http://www.unicode.org/copyright.html\n"
537 //                + "-->\n";
538 //        }
539 
printAliases(Appendable output)540         void printAliases(Appendable output) throws IOException {
541             addAliases(output, sdset.TO_COUNTRY_CODE.keySet());
542 
543             // Get the old validity data
544             Map<Status, Set<String>> oldSubdivisionData = validityFormer.getStatusToCodes(LstrType.subdivision);
545             Set<String> missing = new TreeSet<>(ROOT_COL);
546             missing.addAll(sdset.TO_COUNTRY_CODE.keySet());
547             Set<String> nowValid = sdset.ID_TO_NODE.keySet();
548             for (Entry<Status, Set<String>> e : oldSubdivisionData.entrySet()) {
549                 Status v = e.getKey();
550                 if (v == Status.unknown) {
551                     continue;
552                 }
553                 Set<String> set = e.getValue();
554                 for (String sdcodeRaw : set) {
555                     String sdcode = sdcodeRaw; // .toUpperCase(Locale.ROOT);
556 //                  sdcode = sdcode.substring(0,2) + "-" + sdcode.substring(2);
557                     if (!nowValid.contains(sdcode)) {
558                         missing.add(sdcode);
559                     }
560                 }
561             }
562             missing.removeAll(sdset.TO_COUNTRY_CODE.keySet());
563             addAliases(output, missing);
564         }
565 
addAliases(Appendable output, Set<String> missing)566         private void addAliases(Appendable output, Set<String> missing) throws IOException {
567             for (String toReplace : missing) {
568                 List<String> replaceBy = null;
569                 String reason = "deprecated";
570                 R2<List<String>, String> aliasInfo = subdivisionAliasesFormer.get(toReplace);
571                 if (aliasInfo != null) {
572                     replaceBy = aliasInfo.get0(); //  == null ? null : CollectionUtilities.join(aliasInfo.get0(), " ");
573                     reason = aliasInfo.get1();
574                     System.out.println("Adding former alias: " + toReplace + " => " + replaceBy);
575                 } else {
576                     String replacement = sdset.TO_COUNTRY_CODE.get(toReplace);
577                     if (replacement != null) {
578                         replaceBy = Collections.singletonList(replacement);
579                         reason = "overlong";
580                         System.out.println("Adding country code alias: " + toReplace + " => " + replaceBy);
581                     }
582                 }
583                 addAlias(output, toReplace, replaceBy, reason);
584             }
585         }
586 
addAlias(Appendable output, final String toReplace, final List<String> replaceBy, final String reason)587         private void addAlias(Appendable output, final String toReplace, final List<String> replaceBy, final String reason) throws IOException {
588             // <languageAlias type="art_lojban" replacement="jbo" reason="deprecated"/> <!-- Lojban -->
589             output.append("\t\t\t");
590             if (replaceBy == null) {
591                 output.append("<!-- ");
592             }
593             output.append("<subdivisionAlias"
594                 + " type=\"" + toReplace + "\""
595                 + " replacement=\"" + (replaceBy == null ? toReplace.substring(0, 2) + "?" : CollectionUtilities.join(replaceBy, " ")) + "\""
596                 + " reason=\"" + reason + "\"/>"
597                 + (replaceBy == null ? " <!- - " : " <!-- ")
598                 + sdset.getBestName(toReplace, true) + " => " + (replaceBy == null ? "??" : getBestName(replaceBy, true)) + " -->"
599                 + "\n");
600         }
601 
getBestName(List<String> replaceBy, boolean useIso)602         private String getBestName(List<String> replaceBy, boolean useIso) {
603             StringBuilder result = new StringBuilder();
604             for (String s : replaceBy) {
605                 if (result.length() != 0) {
606                     result.append(", ");
607                 }
608                 if (SubdivisionNames.isRegionCode(s)) {
609                     result.append(ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, s));
610                 } else {
611                     result.append(sdset.getBestName(s, useIso));
612                 }
613             }
614             return result.toString();
615         }
616 
printXml(Appendable output, SubdivisionNode base2, int indent)617         private void printXml(Appendable output, SubdivisionNode base2, int indent) throws IOException {
618             if (base2.children.isEmpty()) {
619                 return;
620             }
621             String type = base2.code;
622             if (base2 != sdset.BASE) {
623                 type = convertToCldr(type);
624                 output.append("\t\t" + "<subgroup"
625                     + " type=\"" + type + "\""
626                     + " contains=\"");
627                 boolean first = true;
628                 for (String child : base2.children.keySet()) {
629                     if (first) {
630                         first = false;
631                     } else {
632                         output.append(' ');
633                     }
634                     String subregion = convertToCldr(child);
635                     output.append(subregion);
636                 }
637                 output.append("\"/>\n");
638             }
639             for (SubdivisionNode child : base2.children.values()) {
640                 printXml(output, child, indent);
641             }
642         }
643 
printSamples(Appendable pw)644         public void printSamples(Appendable pw) throws IOException {
645             Set<String> seen = new HashSet<>();
646             for (Entry<String, Set<String>> entry : sdset.ID_SAMPLE.keyValuesSet()) {
647                 pw.append(entry.getKey());
648                 //int max = 10;
649                 seen.clear();
650                 for (String sample : entry.getValue()) {
651                     String region = sample.substring(0, 2);
652                     if (seen.contains(region)) {
653                         continue;
654                     }
655                     seen.add(region);
656                     pw.append(";\t" + ENGLISH_ICU.regionDisplayName(region) + ": " + sdset.getIsoName(sample)
657                     + " (" + sample + ")");
658                     //if (--max < 0) break;
659                 }
660                 pw.append(System.lineSeparator());
661             }
662         }
663 
printEnglishComp(Appendable output)664         public void printEnglishComp(Appendable output) throws IOException {
665             Set<String> countEqual = new TreeSet<>();
666             String lastCC = null;
667             output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\tEqual\n");
668             for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) {
669                 final String countryCode = entry.getKey();
670                 if (!countryCode.equals(lastCC)) {
671                     if (lastCC != null && countEqual.size() != 0) {
672                         output.append(ENGLISH_ICU.regionDisplayName(lastCC) + "\t\t\tEquals:\t" + countEqual.size() + "\t" + countEqual + "\n");
673                     }
674                     countEqual.clear();
675                     ;
676                     lastCC = countryCode;
677                 }
678                 for (String value : entry.getValue()) {
679                     String cldrName = sdset.getBestName(value, false);
680                     String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value);
681                     final String iso = sdset.getIsoName(value);
682                     if (iso.equals(wiki)) {
683                         countEqual.add(iso);
684                         continue;
685                     }
686                     output.append(
687                         ENGLISH_ICU.regionDisplayName(countryCode)
688 //                        + "\t" + WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value)
689                         + "\t" + cldrName
690                         + "\t" + value
691                         + "\t" + iso
692                         + "\t" + wiki
693                         + "\n");
694                 }
695             }
696             if (countEqual.size() != 0) {
697                 output.append(ENGLISH_ICU.regionDisplayName(lastCC) + "\t\t\tEquals:\t" + countEqual.size() + "\t" + countEqual + "\n");
698             }
699         }
700 
printEnglishCompFull(Appendable output)701         public void printEnglishCompFull(Appendable output) throws IOException {
702             output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\n");
703             for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) {
704                 final String countryCode = entry.getKey();
705                 for (String value : entry.getValue()) {
706                     String cldrName = sdset.getBestName(value, false);
707                     //getBestName(value);
708                     String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value);
709                     final String iso = sdset.getIsoName(value);
710                     output.append(
711                         ENGLISH_ICU.regionDisplayName(countryCode)
712 //                        + "\t" + WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value)
713                         + "\t" + value
714                         + "\t" + cldrName
715                         + "\t" + iso
716                         + "\t" + wiki
717                         + "\n");
718                 }
719             }
720         }
721 
printEnglish(PrintWriter output)722         public void printEnglish(PrintWriter output) throws IOException {
723             TreeSet<String> allRegions = new TreeSet<>();
724             allRegions.addAll(codeToData.keySet());
725             allRegions.addAll(formerRegionToSubdivisions.keySet()); // override
726 
727             Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*");
728             CLDRFile oldFileSubdivisions = cldrFactorySubdivisions.make("en", false);
729             CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed();
730 
731             // <subdivisions>
732             // <subdivisiontype="NZ-AUK">Auckland</territory>
733 //            output.append(
734 //                DtdType.ldml.header(MethodHandles.lookup().lookupClass())
735 //                + "\t<identity>\n"
736 //                + "\t\t<version number=\"$Revision" /*hack to stop SVN changing this*/ + "$\"/>\n"
737 //                + "\t\t<language type=\"en\"/>\n"
738 //                + "\t</identity>\n"
739 //                + "\t<localeDisplayNames>\n"
740 //                + "\t\t<subdivisions>\n");
741             Set<String> skipped = new LinkedHashSet<>();
742 
743             for (String regionCode : allRegions) {
744                 if (regionCode.equals("FR")) {
745                     int debug = 0;
746                 }
747                 if (!sdset.isKosher(regionCode)) {
748                     if (regionCode.length() != 3) {
749                         skipped.add(regionCode);
750                     }
751                     continue;
752                 }
753                 Set<String> remainder = formerRegionToSubdivisions.get(regionCode);
754                 remainder = remainder == null ? Collections.emptySet() : new LinkedHashSet<>(remainder);
755 
756                 SubdivisionNode regionNode = sdset.ID_TO_NODE.get(regionCode);
757 //                output.append("\t\t<!-- ")
758 //                .append(convertToCldr(regionCode)).append(" : ")
759 //                .append(TransliteratorUtilities.toXML.transform(ENGLISH_ICU.regionDisplayName(regionCode)));
760                 if (regionNode == null) {
761 //                    output.append(" : NO SUBDIVISIONS -->\n");
762                     continue;
763                 }
764 //                output.append(" -->\n");
765 
766                 Set<SubdivisionNode> ordered = new LinkedHashSet<>();
767                 SubdivisionSet.addChildren(ordered, regionNode.children);
768 
769                 for (SubdivisionNode node : ordered) {
770                     final String sdCode = node.code;
771                     String name = sdset.getBestName(sdCode, true);
772                     String upper = UCharacter.toUpperCase(name);
773                     String title = SubdivisionNode.TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(Locale.ROOT, null, name);
774                     if (name.equals(upper) || !name.equals(title)) {
775                         System.out.println("Suspicious name: " + name);
776                     }
777 
778                     SubdivisionNode sd = sdset.ID_TO_NODE.get(sdCode);
779 
780 //                    String level = sd.level == 1 ? "" : "\t<!-- in " + sd.parent.code
781 //                        + " : " + TransliteratorUtilities.toXML.transform(sdset.getBestName(sd.parent.code, true)) + " -->";
782                     SubdivisionSet.appendName(fileSubdivisions, sdCode, name, null);
783                     remainder.remove(sdCode);
784                 }
785                 for (String sdCode : remainder) {
786                     String name = sdset.getBestName(sdCode, true);
787                     if (!name.equals("???")) {
788                         SubdivisionSet.appendName(fileSubdivisions, sdCode, name, "\t<!-- deprecated -->");
789                     }
790                 }
791             }
792 //            output.append(
793 //                "\t\t</subdivisions>\n"
794 //                    + "\t</localeDisplayNames>\n"
795 //                    + "</ldml>");
796             System.out.println("Skipping: " + skipped);
797 //            if (!missing.isEmpty()) {
798 //                throw new IllegalArgumentException("No name for: " + missing.size() + ", " + missing);
799 //            }
800             fileSubdivisions.write(output);
801         }
802 
printMissingMIDs(PrintWriter pw)803         public void printMissingMIDs(PrintWriter pw) {
804 //          for (Entry<String, String> entry : WikiSubdivisionLanguages.WIKIDATA_TO_MID.entrySet()) {
805 //              String mid = entry.getValue();
806 //              if (!mid.isEmpty()) {
807 //                  continue;
808 //              }
809 //              String subCode = entry.getKey();
810 //              String wiki = clean(getWikiName(subCode));
811 //              String iso = clean(getIsoName(subCode));
812 //              String countryCode = subCode.substring(0, 2);
813 //              String cat = SUB_TO_CAT.get(subCode);
814 //              String catName = getIsoName(cat);
815 //              pw.append(
816 //                  ENGLISH_ICU.regionDisplayName(countryCode)
817 //                  + "\t" + mid
818 //                  + "\t" + subCode
819 //                  + "\t" + catName
820 //                  + "\t" + wiki
821 //                  + "\t" + iso
822 //                  + "\n"
823 //                  );
824 //          }
825         }
826     }
827 }