1 package org.unicode.cldr.tool;
2 
3 import java.io.BufferedReader;
4 import java.io.IOException;
5 import java.io.PrintWriter;
6 import java.text.ParseException;
7 import java.util.ArrayList;
8 import java.util.Arrays;
9 import java.util.Collection;
10 import java.util.Collections;
11 import java.util.Comparator;
12 import java.util.EnumMap;
13 import java.util.HashMap;
14 import java.util.HashSet;
15 import java.util.Iterator;
16 import java.util.LinkedHashSet;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.Set;
20 import java.util.TreeMap;
21 import java.util.TreeSet;
22 import java.util.regex.Matcher;
23 
24 import org.unicode.cldr.draft.FileUtilities;
25 import org.unicode.cldr.draft.ScriptMetadata;
26 import org.unicode.cldr.draft.ScriptMetadata.IdUsage;
27 import org.unicode.cldr.draft.ScriptMetadata.Info;
28 import org.unicode.cldr.util.Builder;
29 import org.unicode.cldr.util.CLDRFile;
30 import org.unicode.cldr.util.CLDRPaths;
31 import org.unicode.cldr.util.CldrUtility;
32 import org.unicode.cldr.util.Factory;
33 import org.unicode.cldr.util.Iso639Data;
34 import org.unicode.cldr.util.Iso639Data.Scope;
35 import org.unicode.cldr.util.Iso639Data.Source;
36 import org.unicode.cldr.util.Iso639Data.Type;
37 import org.unicode.cldr.util.LanguageTagCanonicalizer;
38 import org.unicode.cldr.util.LanguageTagParser;
39 import org.unicode.cldr.util.LocaleIDParser;
40 import org.unicode.cldr.util.LocaleIDParser.Level;
41 import org.unicode.cldr.util.Log;
42 import org.unicode.cldr.util.Pair;
43 import org.unicode.cldr.util.PatternCache;
44 import org.unicode.cldr.util.SpreadSheet;
45 import org.unicode.cldr.util.StandardCodes;
46 import org.unicode.cldr.util.StandardCodes.LstrType;
47 import org.unicode.cldr.util.SupplementalDataInfo;
48 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
49 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
50 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
51 import org.unicode.cldr.util.TransliteratorUtilities;
52 import org.unicode.cldr.util.Validity;
53 import org.unicode.cldr.util.Validity.Status;
54 import org.unicode.cldr.util.XPathParts;
55 import org.unicode.cldr.util.XPathParts.Comments;
56 
57 import com.google.common.base.Joiner;
58 import com.google.common.collect.ImmutableSet;
59 import com.google.common.math.DoubleMath;
60 import com.ibm.icu.impl.Relation;
61 import com.ibm.icu.impl.Row;
62 import com.ibm.icu.impl.Row.R2;
63 import com.ibm.icu.text.Collator;
64 import com.ibm.icu.text.NumberFormat;
65 import com.ibm.icu.text.RuleBasedCollator;
66 import com.ibm.icu.text.UTF16;
67 import com.ibm.icu.util.ULocale;
68 
69 /**
70  * @author markdavis
71  *
72  */
73 public class ConvertLanguageData {
74 
75     private static final boolean DEBUG = false;
76     // change this if you need to override what is generated for the default contents.
77     private static final List<String> defaultOverrides = Arrays.asList("es_ES".split("\\s+")); // und_ZZ
78 
79     public static final boolean SHOW_DIFF = false;
80 
81     private static final boolean ALLOW_SMALL_NUMBERS = true;
82 
83     static final Comparator<String> GENERAL_COLLATOR = new GeneralCollator();
84     static final Comparator<String> INVERSE_GENERAL = new InverseComparator<>(GENERAL_COLLATOR);
85 
86     private static StandardCodes sc = StandardCodes.make();
87 
88     static final double populationFactor = 1;
89     static final double gdpFactor = 1;
90     static final int BAD_COUNTRY_NAME = 0, COUNTRY_CODE = 1, COUNTRY_POPULATION = 2, COUNTRY_LITERACY = 3,
91         COUNTRY_GDP = 4, OFFICIAL_STATUS = 5, BAD_LANGUAGE_NAME = 6, LANGUAGE_CODE = 7, LANGUAGE_POPULATION = 8,
92         LANGUAGE_LITERACY = 9, COMMENT = 10, NOTES = 11;
93     static final Map<String, CodeAndPopulation> languageToMaxCountry = new TreeMap<>();
94     static final Map<String, CodeAndPopulation> languageToMaxScript = new TreeMap<>();
95 
96     private static final double NON_OFFICIAL_WEIGHT = 0.40;
97 
98     private static final boolean SHOW_OLD_DEFAULT_CONTENTS = false;
99 
100     private static final ImmutableSet<String> scriptAssumedLocales = ImmutableSet.of(
101         "bm_ML", "ha_GH", "ha_NE", "ha_NG", "kk_KZ", "ks_IN", "ky_KG", "mn_MN", "ms_BN", "ms_MY", "ms_SG", "tk_TM", "tzm_MA", "ug_CN");
102 
103     static Set<String> skipLocales = new HashSet<>(
104         Arrays
105             .asList(
106                 "sh sh_BA sh_CS sh_YU characters supplementalData supplementalData-old supplementalData-old2 supplementalData-old3 supplementalMetadata root"
107                     .split("\\s")));
108 
109     static Map<String, String> defaultContent = new TreeMap<>();
110 
111     static Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
112     static CLDRFile english = cldrFactory.make("en", true);
113 
114     static SupplementalDataInfo supplementalData = SupplementalDataInfo
115         .getInstance(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY);
116 
main(String[] args)117     public static void main(String[] args) throws IOException, ParseException {
118         BufferedReader oldFile = null;
119         try {
120             // load elements we care about
121             Log.setLogNoBOM(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalData.xml");
122             // Log.println("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>");
123             // Log.println("<!DOCTYPE supplementalData SYSTEM \"http://www.unicode.org/cldr/data/dtd/ldmlSupplemental.dtd\">");
124             // Log.println("<supplementalData version=\"1.5\">");
125 
126             oldFile = FileUtilities.openUTF8Reader(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY, "supplementalData.xml");
127             CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<languageData>\\s*"), Log.getLog(), false);
128 
129             Set<String> available = cldrFactory.getAvailable();
130 
131             Set<String> cldrParents = getCldrParents(available);
132 
133             List<String> failures = new ArrayList<>();
134             Map<String, RowData> localeToRowData = new TreeMap<>();
135 
136             Set<RowData> sortedInput = getExcelData(failures, localeToRowData);
137 
138             // get the locales (including parents)
139             Set<String> localesWithData = new TreeSet<>(localeToRowData.keySet());
140             for (String locale : localeToRowData.keySet()) {
141                 while (true) {
142                     String parent = LocaleIDParser.getParent(locale);
143                     if (parent == null) break;
144                     localesWithData.add(parent);
145                     locale = parent;
146                 }
147             }
148 
149             final LanguageTagParser languageTagParser = new LanguageTagParser();
150 
151             for (String localeRaw : available) {
152                 String locale = languageTagCanonicalizer.transform(localeRaw);
153                 if (!localesWithData.contains(locale)) {
154                     CLDRFile locFile = cldrFactory.make(localeRaw, false);
155                     if (locFile.isAliasedAtTopLevel()) {
156                         continue;
157                     }
158                     if (scriptAssumedLocales.contains(locale)) {
159                         continue;
160                     }
161                     languageTagParser.set(locale);
162                     if (languageTagParser.getVariants().size() != 0) {
163                         continue;
164                     }
165                     String withoutScript = languageTagParser.setScript("").toString();
166                     if (!localesWithData.contains(withoutScript)) {
167                         String region = new LanguageTagParser().set(locale).getRegion();
168                         if (StandardCodes.isCountry(region)) {
169                             BadItem.ERROR.show("missing language/population data for CLDR locale", locale + " = " + getLanguageCodeAndName(locale));
170                         }
171                     } else {
172                         // These exceptions are OK, because these locales by default use the non-default script
173                         Set<String> OKExceptions = ImmutableSet.of("sr_Cyrl_ME", "zh_Hans_HK", "zh_Hans_MO");
174                         if (OKExceptions.contains(locale)) {
175                             continue;
176                         }
177                         BadItem.ERROR.show("missing language/population data for CLDR locale", locale + " = " + getLanguageCodeAndName(locale)
178                             + " but have data for " + getLanguageCodeAndName(withoutScript));
179                     }
180                 }
181             }
182 
183             // TODO sort by country code, then functionalPopulation, then language code
184             // and keep the top country for each language code (even if < 1%)
185 
186             addLanguageScriptData();
187 
188             // showAllBasicLanguageData(allLanguageData, "old");
189             getLanguage2Scripts(sortedInput);
190 
191             writeNewBasicData2(sortedInput);
192             // writeNewBasicData(sortedInput);
193 
194             writeTerritoryLanguageData(failures, sortedInput);
195 
196             checkBasicData(localeToRowData);
197 
198             Set<String> defaultLocaleContent = new TreeSet<>();
199 
200             showDefaults(cldrParents, nf, defaultContent, localeToRowData, defaultLocaleContent);
201 
202             // showContent(available);
203 
204             // certain items are overridden
205 
206             List<String> toRemove = new ArrayList<>();
207             for (String override : defaultOverrides) {
208                 String replacement = getReplacement(override, defaultLocaleContent);
209                 if (replacement != null) {
210                     toRemove.add(replacement);
211                 }
212             }
213             defaultLocaleContent.removeAll(toRemove);
214             defaultLocaleContent.addAll(defaultOverrides);
215 
216             showFailures(failures);
217 
218             CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</territoryInfo>\\s*"), null, false);
219             CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<references>\\s*"), Log.getLog(), false);
220             // generateIso639_2Data();
221             references.printReferences();
222             CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</references>\\s*"), null, false);
223             CldrUtility.copyUpTo(oldFile, null, Log.getLog(), false);
224             // Log.println("</supplementalData>");
225             Log.close();
226             oldFile.close();
227 
228             Log.setLogNoBOM(CLDRPaths.GEN_DIRECTORY + "/supplemental", "language_script_raw.txt");
229             getLanguageScriptSpreadsheet(Log.getLog());
230             Log.close();
231         } catch (Exception e) {
232             e.printStackTrace();
233         } finally {
234             if (oldFile != null) {
235                 oldFile.close();
236             }
237             System.out.println("DONE");
238         }
239     }
240 
getLanguageCodeAndName(String code)241     public static String getLanguageCodeAndName(String code) {
242         if (code == null) return null;
243         return english.getName(code) + " [" + code + "]";
244     }
245 
getReplacement(String oldDefault, Set<String> defaultLocaleContent)246     private static String getReplacement(String oldDefault, Set<String> defaultLocaleContent) {
247         String parent = LocaleIDParser.getParent(oldDefault);
248         for (String replacement : defaultLocaleContent) {
249             if (replacement.startsWith(parent)) {
250                 if (parent.equals(LocaleIDParser.getParent(replacement))) {
251                     return replacement;
252                 }
253             }
254         }
255         return null;
256     }
257 
getLanguageScriptSpreadsheet(PrintWriter out)258     private static void getLanguageScriptSpreadsheet(PrintWriter out) {
259         out.println("#Lcode\tLanguageName\tStatus\tScode\tScriptName\tReferences");
260         Pair<String, String> languageScript = new Pair<>("", "");
261         for (String language : language_status_scripts.keySet()) {
262             Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
263             for (BasicLanguageData.Type status : status_scripts.keySet()) {
264                 for (String script : status_scripts.getAll(status)) {
265                     String reference = language_script_references.get(languageScript.setFirst(language).setSecond(
266                         script));
267                     out.println(language + "\t" + getLanguageName(language) + "\t" + status + "\t" + script + "\t"
268                         + getDisplayScript(script)
269                         + (reference == null ? "" : "\t" + reference));
270                 }
271             }
272         }
273     }
274 
275     /**
276      * Write data in format:
277      * <languageData>
278      * <language type="aa" scripts="Latn" territories="DJ ER ET"/>
279      *
280      * @param sortedInput
281      */
writeNewBasicData2(Set<RowData> sortedInput)282     private static void writeNewBasicData2(Set<RowData> sortedInput) {
283         double cutoff = 0.2; // 20%
284 
285         // Relation<String, BasicLanguageData> newLanguageData = new Relation(new TreeMap(), TreeSet.class);
286         LanguageTagParser ltp = new LanguageTagParser();
287         Map<String, Relation<BasicLanguageData.Type, String>> language_status_territories = new TreeMap<>();
288         //Map<String, Pair<String, String>> languageToBestCountry;
289         for (RowData rowData : sortedInput) {
290             if (rowData.countryCode.equals("ZZ")) continue;
291             ltp.set(rowData.languageCode);
292             String languageCode = ltp.getLanguage();
293             Relation<BasicLanguageData.Type, String> status_territories = language_status_territories.get(languageCode);
294             if (status_territories == null) {
295                 language_status_territories.put(languageCode, status_territories = Relation.of(
296                     new TreeMap<BasicLanguageData.Type, Set<String>>(),
297                     TreeSet.class));
298             }
299             if (rowData.officialStatus.isMajor()) {
300                 status_territories.put(BasicLanguageData.Type.primary, rowData.countryCode);
301             } else if (rowData.officialStatus.isOfficial()
302                 || rowData.getLanguagePopulation() >= cutoff * rowData.countryPopulation
303                 || rowData.getLanguagePopulation() >= 1000000) {
304                 status_territories.put(BasicLanguageData.Type.secondary, rowData.countryCode);
305             }
306         }
307 
308         Set<String> allLanguages = new TreeSet<>(language_status_territories.keySet());
309         allLanguages.addAll(language_status_scripts.keySet());
310         // now add all the remaining language-script info
311         // <language type="sv" scripts="Latn" territories="AX FI SE"/>
312         Set<String> warnings = new LinkedHashSet<>();
313         Log.println("\t<languageData>");
314         for (String languageSubtag : allLanguages) {
315             Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(languageSubtag);
316             Relation<BasicLanguageData.Type, String> status_territories = language_status_territories
317                 .get(languageSubtag);
318 
319             // check against old:
320             Map<BasicLanguageData.Type, BasicLanguageData> oldData = supplementalData
321                 .getBasicLanguageDataMap(languageSubtag);
322             if (oldData == null) {
323                 oldData = Collections.emptyMap();
324             }
325 
326             EnumMap<BasicLanguageData.Type, BasicLanguageData> newData = new EnumMap<>(
327                 BasicLanguageData.Type.class);
328             for (BasicLanguageData.Type status : BasicLanguageData.Type.values()) {
329                 Set<String> scripts = status_scripts == null ? null : status_scripts.getAll(status);
330                 Set<String> territories = status_territories == null ? null : status_territories.getAll(status);
331                 if (scripts == null && territories == null) continue;
332                 BasicLanguageData bld = new BasicLanguageData();
333                 bld.setTerritories(territories);
334                 bld.setScripts(scripts);
335                 bld.setType(status);
336                 bld.freeze();
337                 newData.put(status, bld);
338             }
339 
340             // compare
341             if (!CldrUtility.equals(oldData.entrySet(), newData.entrySet())) {
342                 for (String problem : compare(oldData, newData)) {
343                     warnings.add(BadItem.DETAIL.toString("changing <languageData>", languageSubtag
344                         + "\t" + english.getName(languageSubtag), problem));
345                 }
346             }
347 
348             for (BasicLanguageData bld : newData.values()) {
349                 Set<String> scripts = bld.getScripts();
350                 Set<String> territories = bld.getTerritories();
351                 BasicLanguageData.Type status = bld.getType();
352                 Log.println("\t\t<language type=\"" + languageSubtag + "\""
353                     + (scripts.isEmpty() ? "" : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"")
354                     + (territories.isEmpty() ? "" : " territories=\"" + CldrUtility.join(territories, " ") + "\"")
355                     + (status == BasicLanguageData.Type.primary ? "" : " alt=\"secondary\"")
356                     + "/>");
357             }
358         }
359         Log.println("\t</languageData>");
360         for (String s : warnings) {
361             if (s.contains("!")) {
362                 System.out.println(s);
363             }
364         }
365         for (String s : warnings) {
366             if (!s.contains("!")) {
367                 System.out.println(s);
368             }
369         }
370     }
371 
compare(Map<BasicLanguageData.Type, BasicLanguageData> oldData, Map<BasicLanguageData.Type, BasicLanguageData> newData)372     private static List<String> compare(Map<BasicLanguageData.Type, BasicLanguageData> oldData,
373         Map<BasicLanguageData.Type, BasicLanguageData> newData) {
374         Map<String, BasicLanguageData.Type> oldDataToType = getDataToType(oldData.values(), true);
375         Map<String, BasicLanguageData.Type> newDataToType = getDataToType(newData.values(), true);
376         List<String> result = new ArrayList<>();
377         StringBuilder temp = new StringBuilder();
378         for (String s : Builder.with(new LinkedHashSet<String>()).addAll(oldDataToType.keySet())
379             .addAll(newDataToType.keySet()).get()) {
380             BasicLanguageData.Type oldValue = oldDataToType.get(s);
381             BasicLanguageData.Type newValue = newDataToType.get(s);
382             if (!CldrUtility.equals(oldValue, newValue)) {
383                 temp.setLength(0);
384                 temp.append("[").append(s).append(":")
385                     .append(english.getName(s.length() == 4 ? "script" : "region", s)).append("] ");
386                 if (oldValue == null) {
387                     temp.append(" added as ").append(newValue);
388                 } else if (newValue == null) {
389                     temp.append(" REMOVED!");
390                 } else if (oldValue == BasicLanguageData.Type.primary) {
391                     temp.append(" DOWNGRADED TO! ").append(newValue);
392                 } else {
393                     temp.append(" upgraded to ").append(newValue);
394                 }
395                 result.add(temp.toString());
396             }
397         }
398         result.add(newData.toString());
399         return result;
400     }
401 
getDataToType( Collection<BasicLanguageData> collection, boolean script)402     private static Map<String, BasicLanguageData.Type> getDataToType(
403         Collection<BasicLanguageData> collection, boolean script) {
404         Map<String, BasicLanguageData.Type> result = new TreeMap<>();
405         for (BasicLanguageData i : collection) {
406             for (String s : i.getScripts()) {
407                 result.put(s, i.getType());
408             }
409             for (String s : i.getTerritories()) {
410                 result.put(s, i.getType());
411             }
412         }
413         return result;
414     }
415 
checkBasicData(Map<String, RowData> localeToRowData)416     private static void checkBasicData(Map<String, RowData> localeToRowData) {
417         // find languages with multiple scripts
418         Relation<String, String> languageToScripts = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
419         for (String languageSubtag : language2BasicLanguageData.keySet()) {
420             for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) {
421                 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), item.getScripts());
422             }
423         }
424         // get primary combinations
425         Set<String> primaryCombos = new TreeSet<>();
426         Set<String> basicCombos = new TreeSet<>();
427         for (String languageSubtag : language2BasicLanguageData.keySet()) {
428             for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) {
429                 Set<String> scripts = new TreeSet<>();
430                 scripts.addAll(item.getScripts());
431                 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), scripts);
432                 if (scripts.size() == 0) {
433                     scripts.add("Zzzz");
434                 }
435                 Set<String> territories = new TreeSet<>();
436                 territories.addAll(item.getTerritories());
437                 if (territories.size() == 0) {
438                     territories.add("ZZ");
439                     continue;
440                 }
441 
442                 for (String script : scripts) {
443                     for (String territory : territories) {
444                         String locale = StandardCodes.fixLanguageTag(languageSubtag)
445                             // + (script.equals("Zzzz") ? "" : languageToScripts.getAll(languageSubtag).size() <= 1 ? ""
446                             // : "_" + script)
447                             + (territories.equals("ZZ") ? "" : "_" + territory);
448                         if (item.getType() != BasicLanguageData.Type.secondary) {
449                             primaryCombos.add(locale);
450                         }
451                         basicCombos.add(locale);
452                     }
453                 }
454             }
455         }
456         Set<String> populationOver20 = new TreeSet<>();
457         Set<String> population = new TreeSet<>();
458         LanguageTagParser ltp = new LanguageTagParser();
459         for (String rawLocale : localeToRowData.keySet()) {
460             ltp.set(rawLocale);
461             String locale = ltp.getLanguage() + (ltp.getRegion().length() == 0 ? "" : "_" + ltp.getRegion());
462             population.add(locale);
463             RowData rowData = localeToRowData.get(rawLocale);
464             if (rowData.getLanguagePopulation() / rowData.countryPopulation >= 0.2
465             //|| rowData.getLanguagePopulation() > 900000
466             ) {
467                 populationOver20.add(locale);
468             } else {
469                 PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData(
470                     ltp.getLanguageScript(), ltp.getRegion());
471                 if (popData != null && popData.getOfficialStatus().isOfficial()) {
472                     populationOver20.add(locale);
473                 }
474             }
475         }
476         Set<String> inBasicButNotPopulation = new TreeSet<>(primaryCombos);
477 
478         inBasicButNotPopulation.removeAll(population);
479         for (String locale : inBasicButNotPopulation) {
480             ltp.set(locale);
481             String region = ltp.getRegion();
482             String language = ltp.getLanguage();
483             if (!sc.isModernLanguage(language)) continue;
484             PopulationData popData = supplementalData.getPopulationDataForTerritory(region);
485             // Afghanistan AF "29,928,987" 28.10% "21,500,000,000" Hazaragi haz "1,770,000" 28.10%
486             BadItem.WARNING.show("In Basic Data but not Population > 20%",
487                 getDisplayCountry(region)
488                     + "\t" + region
489                     + "\t\"" + formatNumber(popData.getPopulation(), 0, false) + "\""
490                     + "\t\"" + formatPercent(popData.getLiteratePopulation() / popData.getPopulation(), 0, false)
491                     + "\""
492                     + "\t\"" + formatPercent(popData.getGdp(), 0, false) + "\""
493                     + "\t" + ""
494                     + "\t" + getLanguageName(language)
495                     + "\t" + language
496                     + "\t" + -1
497                     + "\t\"" + formatPercent(popData.getLiteratePopulation() / popData.getPopulation(), 0, false)
498                     + "\"");
499         }
500 
501         Set<String> inPopulationButNotBasic = new TreeSet<>(populationOver20);
502         inPopulationButNotBasic.removeAll(basicCombos);
503         for (Iterator<String> it = inPopulationButNotBasic.iterator(); it.hasNext();) {
504             String locale = it.next();
505             if (locale.endsWith("_ZZ")) {
506                 it.remove();
507             }
508         }
509         for (String locale : inPopulationButNotBasic) {
510             BadItem.WARNING.show("In Population>20% but not Basic Data", locale + " " + getLanguageName(locale), localeToRowData.get(locale).toString());
511         }
512     }
513 
514     static class LanguageInfo {
515         static LanguageInfo INSTANCE = new LanguageInfo();
516 
517         Map<String, Set<String>> languageToScripts = new TreeMap<>();
518         Map<String, Set<String>> languageToRegions = new TreeMap<>();
519         Map<String, Comments> languageToComments = new TreeMap<>();
520 
521         Map<String, Set<String>> languageToScriptsAlt = new TreeMap<>();
522         Map<String, Set<String>> languageToRegionsAlt = new TreeMap<>();
523         Map<String, Comments> languageToCommentsAlt = new TreeMap<>();
524 
LanguageInfo()525         private LanguageInfo() {
526             cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
527             //Set<String> available = cldrFactory.getAvailable();
528             CLDRFile supplemental = cldrFactory.make("supplementalData", true);
529             for (Iterator<String> it = supplemental.iterator("//supplementalData/languageData/language"); it.hasNext();) {
530                 String xpath = it.next();
531                 XPathParts parts = XPathParts.getFrozenInstance(xpath);
532                 Map<String, String> x = parts.getAttributes(-1);
533                 boolean alt = x.containsKey("alt");
534                 String lang = x.get("type");
535                 List<String> scripts = getAttributeList(x, "scripts");
536                 if (scripts != null) {
537                     if (alt) {
538                         putAll(languageToScriptsAlt, lang, new LinkedHashSet<>(scripts));
539                     } else {
540                         putAll(languageToScripts, lang, new LinkedHashSet<>(scripts));
541                     }
542                 }
543                 List<String> regions = getAttributeList(x, "territories");
544                 if (regions != null) {
545                     if (alt) {
546                         putAll(languageToRegionsAlt, lang, new LinkedHashSet<>(regions));
547                     } else {
548                         putAll(languageToRegions, lang, new LinkedHashSet<>(regions));
549                     }
550                 }
551             }
552         }
553 
getAttributeList(Map<String, String> x, String attribute)554         private List<String> getAttributeList(Map<String, String> x, String attribute) {
555             List<String> scripts = null;
556             String scriptString = x.get(attribute);
557             if (scriptString != null) {
558                 scripts = Arrays.asList(scriptString.split("\\s+"));
559             }
560             return scripts;
561         }
562     }
563 
putUnique(Map<K, V> map, K key, V value)564     private static <K, V> void putUnique(Map<K, V> map, K key, V value) {
565         V oldValue = map.get(key);
566         if (oldValue != null && !oldValue.equals(value)) {
567             throw new IllegalArgumentException("Duplicate value for <" + key + ">: <" + oldValue + ">, <" + value + ">");
568         }
569         map.put(key, value);
570     }
571 
putAll(Map<K, Set<W>> map, K key, Set<W> values)572     private static <K, W> void putAll(Map<K, Set<W>> map, K key, Set<W> values) {
573         Set<W> oldValue = map.get(key);
574         if (oldValue == null) {
575             map.put(key, values);
576         } else {
577             oldValue.addAll(values);
578         }
579     }
580 
581     // public enum OfficialStatus {unknown, de_facto_official, official, official_regional, official_minority};
582 
583     static class RowData implements Comparable<Object> {
584         private final String countryCode;
585         private final double countryGdp;
586         private final double countryLiteracy;
587         private final double countryPopulation;
588         private final String languageCode;
589         private final OfficialStatus officialStatus;
590         private final double languagePopulation;
591         private final double languageLiteracy;
592         private final String comment;
593         private final String notes;
594         private final String badLanguageName;
595         private final boolean relativeLanguagePopulation;
596         // String badLanguageCode = "";
597         private final static Set<String> doneCountries = new HashSet<>();
598 
599         private final static Set<String> countryCodes = sc.getGoodAvailableCodes("territory");
600 
RowData(String country, String language)601         public RowData(String country, String language) {
602             this.countryCode = country;
603             this.languageCode = language;
604             badLanguageName = country = language = notes = comment = "";
605             officialStatus = OfficialStatus.unknown;
606             countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000);
607             countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d;
608             countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue();
609             languagePopulation = languageLiteracy = Double.NaN;
610             relativeLanguagePopulation = false;
611         }
612 
RowData(List<String> row)613         RowData(List<String> row) throws ParseException {
614             countryCode = fixCountryCode(row.get(COUNTRY_CODE), row);
615 
616             if (!countryCodes.contains(countryCode)) {
617                 System.err.println("WRONG COUNTRY CODE: " + row);
618             }
619 
620             double countryPopulation1 = parseDecimal(row.get(COUNTRY_POPULATION));
621             double countryLiteracy1 = parsePercent(row.get(COUNTRY_LITERACY), countryPopulation1);
622 
623             countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000);
624             countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d;
625             countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue();
626 
627             String officialStatusString = row.get(OFFICIAL_STATUS).trim().replace(' ', '_');
628             if (officialStatusString.equals("national")) {
629                 officialStatusString = "official";
630             } else if (officialStatusString.equals("regional_official")) {
631                 officialStatusString = "official_regional";
632             } else if (officialStatusString.length() == 0 || officialStatusString.equals("uninhabited")) {
633                 officialStatusString = "unknown";
634             }
635             try {
636                 officialStatus = OfficialStatus.valueOf(officialStatusString);
637             } catch (RuntimeException e) {
638                 throw new IllegalArgumentException("Can't interpret offical-status: " + officialStatusString);
639             }
640 
641             String languageCode1 = row.get(LANGUAGE_CODE);
642             if (languageCode1.startsWith("*") || languageCode1.startsWith("\u00A7")) {
643                 languageCode1 = languageCode1.substring(1);
644             }
645             languageCode = fixLanguageCode(languageCode1, row);
646 
647             if (doneCountries.contains(countryCode) == false) {
648                 // showDiff(countryGdp1, countryGdp);
649                 // showDiff(countryLiteracy1, countryLiteracy);
650                 if (SHOW_DIFF) showDiff(countryPopulation1, countryPopulation, 0.1, false);
651                 doneCountries.add(countryCode);
652             }
653 
654             double languagePopulation1 = parsePercent(row.get(LANGUAGE_POPULATION), countryPopulation1)
655                 * countryPopulation1;
656             if ((officialStatus.isMajor())
657                 && languagePopulation1 * 100 < countryPopulation && languagePopulation1 < 1000000) {
658                 BadItem.WARNING.show("official language has population < 1% of country & < 1,000,000", languageCode + ", " + Math.round(languagePopulation1),
659                     row);
660             }
661             if (languagePopulation1 < 0.999) {
662                 BadItem.WARNING.show("suspect language population, < 1", languageCode + ", " + Math.round(languagePopulation1), row);
663             }
664             if (languagePopulation1 > 10000) {
665                 relativeLanguagePopulation = true;
666                 languagePopulation1 = languagePopulation1 * countryPopulation / countryPopulation1; // correct the
667                 // values
668             } else {
669                 relativeLanguagePopulation = false;
670             }
671             if (isApproximatelyGreater(languagePopulation1, countryPopulation, 0.0001)) {
672                 BadItem.ERROR.show("language population > country population", Math.round(languagePopulation1) + " > " + countryPopulation, row);
673             }
674             languagePopulation = languagePopulation1 < countryPopulation ? languagePopulation1 : countryPopulation;
675 
676             if (SHOW_DIFF)
677                 showDiff(languagePopulation1 / countryPopulation1, languagePopulation / countryPopulation, 0.01, true);
678 
679             String stringLanguageLiteracy = row.size() <= LANGUAGE_LITERACY ? "" : row.get(LANGUAGE_LITERACY);
680             double languageLiteracy1 = stringLanguageLiteracy.length() == 0 ? countryLiteracy
681                 : parsePercent(stringLanguageLiteracy, languagePopulation);
682             if (isApproximatelyEqual(languageLiteracy1, countryLiteracy1, 0.001)) {
683                 languageLiteracy1 = countryLiteracy; // correct the values
684             }
685             languageLiteracy = languageLiteracy1;
686 
687             if (row.size() > COMMENT) {
688                 comment = row.get(COMMENT);
689             } else {
690                 comment = "";
691             }
692             if (row.size() > NOTES) {
693                 notes = row.get(NOTES);
694             } else {
695                 notes = "";
696             }
697             badLanguageName = row.get(BAD_LANGUAGE_NAME);
698         }
699 
showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang)700         private void showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang) {
701             final double diff = new_a / a - 1;
702             if (Math.abs(diff) > maxRelativeDiff) {
703                 System.out.println(formatPercent(diff, 0, false)
704                     + "\t" + countryCode + "\t" + getDisplayCountry(countryCode)
705                     + (showLang ? "\t" + languageCode + "\t" + getLanguageName(languageCode) : "")
706                     + "\t" + formatNumber(a, 0, false) + "\t=>\t" + formatNumber(new_a, 0, false));
707             }
708         }
709 
roundToPartsPer(double a, double whole)710         private double roundToPartsPer(double a, double whole) {
711             // break this out just to make it easier to follow.
712             double log10 = Math.log10(a / whole);
713             long digitsFound = (long) (log10);
714             long factor = (long) (Math.pow(10, digitsFound));
715             double rounded = Math.round(a / factor);
716             double result = rounded * factor;
717             // if (Math.abs(result - a) >= 1) {
718             // System.out.println("Rounding " + a + " => " + result);
719             // }
720             return result;
721         }
722 
isApproximatelyEqual(double a, double b, double epsilon)723         private static boolean isApproximatelyEqual(double a, double b, double epsilon) {
724             return a == b || Math.abs(a - b) < epsilon;
725         }
726 
isApproximatelyGreater(double a, double b, double epsilon)727         private static boolean isApproximatelyGreater(double a, double b, double epsilon) {
728             return a > b + epsilon;
729         }
730 
parseDecimal(String numericRepresentation)731         double parseDecimal(String numericRepresentation) throws ParseException {
732             try {
733                 // if (numericRepresentation == null || numericRepresentation.length() == 0) return Double.NaN;
734                 Number result = nf.parse(numericRepresentation);
735                 // if (result == null) return Double.NaN;
736                 return result.doubleValue();
737             } catch (ParseException e) {
738                 throw e;
739                 // (RuntimeException) new IllegalArgumentException("can't parse <" + numericRepresentation +
740                 // ">").initCause(e);
741             }
742         }
743 
parsePercent(String numericRepresentation, double baseValue)744         double parsePercent(String numericRepresentation, double baseValue) throws ParseException {
745             try {
746                 double result;
747                 if (numericRepresentation.contains("%")) {
748                     Number result0 = pf.parse(numericRepresentation);
749                     result = result0.doubleValue();
750                 } else {
751                     Number result0 = nf.parse(numericRepresentation);
752                     result = result0.doubleValue() / baseValue;
753                 }
754                 // if (numericRepresentation == null || numericRepresentation.length() == 0) return Double.NaN;
755                 // if (result == null) return Double.NaN;
756                 return result;
757             } catch (ParseException e) {
758                 throw e;
759                 // (RuntimeException) new IllegalArgumentException("can't parse <" + numericRepresentation +
760                 // ">").initCause(e);
761             }
762         }
763 
getLanguageLiteratePopulation()764         public double getLanguageLiteratePopulation() {
765             return languageLiteracy * languagePopulation;
766         }
767 
768         /**
769          * Get the weighted population
770          *
771          * @param weightIfNotOfficial
772          * @return
773          */
getLanguageLiteratePopulation(double weightIfNotOfficial)774         public double getLanguageLiteratePopulation(double weightIfNotOfficial) {
775             double result = languageLiteracy * languagePopulation;
776             if (!officialStatus.isMajor()) {
777                 result *= weightIfNotOfficial;
778             }
779             return result;
780         }
781 
782         @Override
compareTo(Object o)783         public int compareTo(Object o) {
784             RowData that = (RowData) o;
785             int result;
786             if (0 != (result = GENERAL_COLLATOR.compare(countryCode, that.countryCode))) return result;
787             if (languagePopulation > that.languagePopulation) return -1; // descending
788             if (languagePopulation < that.languagePopulation) return 1;
789             if (0 != (result = GENERAL_COLLATOR.compare(languageCode, that.languageCode))) return result;
790             return 0;
791         }
792 
toStringHeader()793         public static String toStringHeader() {
794             return "countryCode" + "\t" + "countryPopulation" + "\t" + "countryGdp"
795                 + "\t" + "countryLiteracy"
796                 + "\t" + "languagePopulation" + "\t" + "languageCode"
797                 + "\t" + "writingPopulation";
798         }
799 
800         @Override
toString()801         public String toString() {
802             return countryCode + "\t" + countryPopulation + "\t" + countryGdp
803                 + "\t" + countryLiteracy
804                 + "\t" + languagePopulation + "\t" + languageCode
805                 + "\t" + languageLiteracy;
806         }
807 
toString(boolean b)808         public String toString(boolean b) {
809             return "region:\t" + getCountryCodeAndName(countryCode)
810                 + "\tpop:\t" + countryPopulation
811                 + "\tgdp:\t" + countryGdp
812                 + "\tlit:\t" + countryLiteracy
813                 + "\tlang:\t" + getLanguageCodeAndName(languageCode)
814                 + "\tpop:\t" + languagePopulation
815                 + "\tlit:\t" + languageLiteracy;
816         }
817 
        // When true, output helpers prefix "*" to codes whose source is ISO 639-3 and to names
        // taken from the ISO 639 name list.
        static boolean MARK_OUTPUT = false;
819 
getRickLanguageCode()820         public String getRickLanguageCode() {
821             if (languageCode.contains("_")) return languageCode;
822             Source source = Iso639Data.getSource(languageCode);
823             if (source == null) {
824                 return "§" + languageCode;
825             }
826             if (MARK_OUTPUT) {
827                 if (source == Source.ISO_639_3) {
828                     return "*" + languageCode;
829                 }
830             }
831             return languageCode;
832         }
833 
        // Map from an old language name to its corrected name.
        // NOTE(review): the only use visible nearby is in commented-out code — confirm it is still needed.
        static Map<String, String> oldToFixed = new HashMap<>();
835 
getRickLanguageName()836         public String getRickLanguageName() {
837             String cldrResult = getExcelQuote(english.getName(languageCode, true));
838 //            String result = getRickLanguageName2();
839 //            if (!result.equalsIgnoreCase(cldrResult)) {
840 //                if (null == oldToFixed.put(result, cldrResult)) {
841 //                    System.out.println("## " + result + "!=" + cldrResult);
842 //                }
843 //            }
844             return cldrResult;
845         }
846 
getRickLanguageName2()847         public String getRickLanguageName2() {
848             String result = new ULocale(languageCode).getDisplayName();
849             if (!result.equals(languageCode)) return getExcelQuote(result);
850             Set<String> names = Iso639Data.getNames(languageCode);
851             if (names != null && names.size() != 0) {
852                 if (MARK_OUTPUT) {
853                     return getExcelQuote("*" + names.iterator().next());
854                 } else {
855                     return getExcelQuote(names.iterator().next());
856                 }
857             }
858             return getExcelQuote("§" + badLanguageName);
859         }
860 
getCountryName()861         public String getCountryName() {
862             return getExcelQuote(getDisplayCountry(countryCode));
863         }
864 
getCountryGdpString()865         public String getCountryGdpString() {
866             return getExcelQuote(formatNumber(countryGdp, 0, false));
867         }
868 
getCountryLiteracyString()869         public String getCountryLiteracyString() {
870             return formatPercent(countryLiteracy, 2, false);
871         }
872 
getCountryPopulationString()873         public String getCountryPopulationString() {
874             return getExcelQuote(formatNumber(countryPopulation, 0, false));
875         }
876 
getLanguageLiteracyString()877         public String getLanguageLiteracyString() {
878             return formatPercent(languageLiteracy, 2, false);
879         }
880 
getLanguagePopulationString()881         public String getLanguagePopulationString() {
882 
883             try {
884                 final double percent = languagePopulation / countryPopulation;
885                 return getExcelQuote(relativeLanguagePopulation
886                     && percent > 0.03
887                     && languagePopulation > 10000
888                         ? formatPercent(percent, 2, false)
889                         : formatNumber(languagePopulation, 3, false));
890             } catch (IllegalArgumentException e) {
891                 return "NaN";
892             }
893         }
894 
getLanguagePopulation()895         private double getLanguagePopulation() {
896             return languagePopulation;
897         }
898 
899     }
900 
getExcelQuote(String comment)901     public static String getExcelQuote(String comment) {
902         return comment == null || comment.length() == 0 ? ""
903             : comment.contains(",") ? '"' + comment + '"'
904                 : comment.contains("\"") ? '"' + comment.replace("\"", "\"\"") + '"'
905                     : comment;
906     }
907 
getCountryCodeAndName(String code)908     public static String getCountryCodeAndName(String code) {
909         if (code == null) return null;
910         return english.getName(CLDRFile.TERRITORY_NAME, code) + " [" + code + "]";
911     }
912 
913     static class RickComparator implements Comparator<RowData> {
914         @Override
compare(RowData me, RowData that)915         public int compare(RowData me, RowData that) {
916             int result;
917             if (0 != (result = GENERAL_COLLATOR.compare(me.getCountryName(), that.getCountryName()))) return result;
918             if (0 != (result = GENERAL_COLLATOR.compare(me.getRickLanguageName(), that.getRickLanguageName())))
919                 return result;
920             return me.compareTo(that);
921         }
922     }
923 
writeTerritoryLanguageData(List<String> failures, Set<RowData> sortedInput)924     private static void writeTerritoryLanguageData(List<String> failures, Set<RowData> sortedInput) {
925 
926         String lastCountryCode = "";
927         boolean first = true;
928         LanguageTagParser ltp = new LanguageTagParser();
929 
930         Log.println(" <!-- See http://unicode.org/cldr/data/diff/supplemental/territory_language_information.html for more information on territoryInfo. -->");
931         Log.println("\t<territoryInfo>");
932 
933         for (RowData row : sortedInput) {
934             String countryCode = row.countryCode;
935 
936             double countryPopulationRaw = row.countryPopulation;
937             double countryPopulation = countryPopulationRaw; // (long) Utility.roundToDecimals(countryPopulationRaw, 2);
938             double languageLiteracy = row.languageLiteracy;
939             double countryLiteracy = row.countryLiteracy;
940 
941             double countryGDPRaw = row.countryGdp;
942             long countryGDP = Math.round(countryGDPRaw / gdpFactor);
943 
944             String languageCode = row.languageCode;
945 
946             double languagePopulationRaw = row.getLanguagePopulation();
947             double languagePopulation = languagePopulationRaw; // (long) Utility.roundToDecimals(languagePopulationRaw,
948             // 2);
949 
950             double languagePopulationPercent = languagePopulation / countryPopulation;
951             // Utility.roundToDecimals(Math.min(100, Math.max(0,
952             // languagePopulation*100 / (double)countryPopulation)),3);
953 
954             if (!countryCode.equals(lastCountryCode)) {
955                 if (first) {
956                     first = false;
957                 } else {
958                     Log.println("\t\t</territory>");
959                 }
960                 Log.print("\t\t<territory type=\"" + countryCode + "\""
961                     + " gdp=\"" + formatNumber(countryGDP, 4, true) + "\""
962                     + " literacyPercent=\"" + formatPercent(countryLiteracy, 3, true) + "\""
963                     + " population=\"" + formatNumber(countryPopulation, 6, true) + "\">");
964                 lastCountryCode = countryCode;
965                 Log.println("\t<!--" + getDisplayCountry(countryCode) + "-->");
966             }
967 
968             if (languageCode.length() != 0
969                 && languagePopulationPercent > 0.0000
970                 && (ALLOW_SMALL_NUMBERS || languagePopulationPercent >= 1 || languagePopulationRaw > 100000
971                     || languageCode.equals("haw") || row.officialStatus.isOfficial())) {
972                 // add best case
973                 addBestRegion(languageCode, countryCode, languagePopulationRaw);
974                 String baseScriptLanguage = ltp.set(languageCode).getLanguageScript();
975                 if (!baseScriptLanguage.equals(languageCode)) {
976                     addBestRegion(baseScriptLanguage, countryCode, languagePopulationRaw);
977                 }
978                 String baseLanguage = ltp.set(baseScriptLanguage).getLanguage();
979                 if (!baseLanguage.equals(baseScriptLanguage)) {
980                     addBestRegion(baseLanguage, countryCode, languagePopulationRaw);
981                     addBestScript(baseLanguage, ltp.set(languageCode).getScript(), languagePopulationRaw);
982                 }
983 
984                 if (languageLiteracy != countryLiteracy) {
985                     int debug = 0;
986                 }
987                 Log.print("\t\t\t<languagePopulation type=\""
988                     + languageCode
989                     + "\""
990                     + (DoubleMath.fuzzyCompare(languageLiteracy, countryLiteracy, 0.0001) == 0 ? ""
991                         : (DoubleMath.fuzzyCompare(languageLiteracy, 0.05, 0.0001) == 0 ? " writingPercent=\"" : " literacyPercent=\"")
992                             + formatPercent(languageLiteracy, 2, true) + "\"")
993                     + " populationPercent=\"" + formatPercent(languagePopulationPercent, 2, true) + "\""
994                     + (row.officialStatus.isOfficial() ? " officialStatus=\"" + row.officialStatus + "\"" : "")
995                     + references.addReference(row.notes)
996                     + "/>");
997                 Log.println("\t<!--" + getLanguageName(languageCode) + "-->");
998             } else if (!row.countryCode.equals("ZZ")) {
999                 failures.add(BadItem.ERROR.toString("too few speakers: suspect line", languageCode, row.toString(true)));
1000             }
1001             // if (first) {
1002             if (false) System.out.print(
1003                 "countryCode: " + countryCode + "\t"
1004                     + "countryPopulation: " + countryPopulation + "\t"
1005                     + "countryGDP: " + countryGDP + "\t"
1006                     + "languageCode: " + languageCode + "\t"
1007                     + "languagePopulation: " + languagePopulation + CldrUtility.LINE_SEPARATOR);
1008             // }
1009         }
1010 
1011         Log.println("\t\t</territory>");
1012         Log.println("\t</territoryInfo>");
1013     }
1014 
getDisplayCountry(String countryCode)1015     private static String getDisplayCountry(String countryCode) {
1016         String result = getULocaleCountryName(countryCode);
1017         if (!result.equals(countryCode)) {
1018             return result;
1019         }
1020         result = sc.getData("territory", countryCode);
1021         if (result != null) {
1022             return result;
1023         }
1024         return countryCode;
1025         // new ULocale("und-" + countryCode).getDisplayCountry()
1026     }
1027 
getDisplayScript(String scriptCode)1028     private static String getDisplayScript(String scriptCode) {
1029         String result = getULocaleScriptName(scriptCode);
1030         if (!result.equals(scriptCode)) {
1031             return result;
1032         }
1033         result = sc.getData("territory", scriptCode);
1034         if (result != null) {
1035             return result;
1036         }
1037         return scriptCode;
1038         // new ULocale("und-" + countryCode).getDisplayCountry()
1039     }
1040 
getLanguageName(String languageCode)1041     private static String getLanguageName(String languageCode) {
1042         String result = getULocaleLocaleName(languageCode);
1043         if (!result.equals(languageCode)) return result;
1044         Set<String> names = Iso639Data.getNames(languageCode);
1045         if (names != null && names.size() != 0) {
1046             return names.iterator().next();
1047         }
1048         return languageCode;
1049     }
1050 
1051     static class References {
1052         Map<String, Pair<String, String>> Rxxx_to_reference = new TreeMap<>();
1053         Map<Pair<String, String>, String> reference_to_Rxxx = new TreeMap<>();
1054         Map<String, Pair<String, String>> Rxxx_to_oldReferences = supplementalData.getReferences();
1055         Map<Pair<String, String>, String> oldReferences_to_Rxxx = new TreeMap<>();
1056         {
1057             for (String Rxxx : Rxxx_to_oldReferences.keySet()) {
Rxxx_to_oldReferences.get(Rxxx)1058                 oldReferences_to_Rxxx.put(Rxxx_to_oldReferences.get(Rxxx), Rxxx);
1059             }
1060         }
1061         Matcher URI = PatternCache.get("([a-z]+\\://[\\S]+)\\s?(.*)").matcher("");
1062 
1063         static int referenceStart = 1000;
1064 
1065         /**
1066          * Returns " references=\"" + Rxxx + "\"" or "" if there is no reference.
1067          *
1068          * @param rawReferenceText
1069          * @return
1070          */
addReference(String rawReferenceText)1071         private String addReference(String rawReferenceText) {
1072             if (rawReferenceText == null || rawReferenceText.length() == 0) return "";
1073             Pair<String, String> p;
1074             if (URI.reset(rawReferenceText).matches()) {
1075                 p = new Pair<>(URI.group(1), URI.group(2) == null || URI.group(2).length() == 0 ? "[missing]"
1076                     : URI.group(2)).freeze();
1077             } else {
1078                 p = new Pair<String, String>(null, rawReferenceText).freeze();
1079             }
1080 
1081             String Rxxx = reference_to_Rxxx.get(p);
1082             if (Rxxx == null) { // add new
1083                 Rxxx = oldReferences_to_Rxxx.get(p);
1084                 if (Rxxx != null) { // if old, just keep number
1085                     p = Rxxx_to_oldReferences.get(Rxxx);
1086                 } else { // find an empty number
1087                     while (true) {
1088                         Rxxx = "R" + (referenceStart++);
1089                         if (Rxxx_to_reference.get(Rxxx) == null && Rxxx_to_oldReferences.get(Rxxx) == null) {
1090                             break;
1091                         }
1092                     }
1093                 }
1094                 // add to new references
1095                 reference_to_Rxxx.put(p, Rxxx);
1096                 Rxxx_to_reference.put(Rxxx, p);
1097             }
1098             // references="R034"
1099             return " references=\"" + Rxxx + "\"";
1100         }
1101 
getReferenceHTML(String Rxxx)1102         String getReferenceHTML(String Rxxx) {
1103             Pair<String, String> p = Rxxx_to_reference.get(Rxxx); // exception if fails.
1104             String uri = p.getFirst();
1105             String value = p.getSecond();
1106             uri = uri == null ? "" : " uri=\"" + TransliteratorUtilities.toHTML.transliterate(uri) + "\"";
1107             value = value == null ? "[missing]" : TransliteratorUtilities.toHTML.transliterate(value);
1108             return "\t\t<reference type=\"" + Rxxx + "\"" + uri + ">" + value + "</reference>";
1109         }
1110 
printReferences()1111         void printReferences() {
1112             // <reference type="R034" uri="isbn:0-321-18578-1">The Unicode Standard 4.0</reference>
1113             Log.println("\t<references>");
1114             for (String Rxxx : Rxxx_to_reference.keySet()) {
1115                 Log.println(getReferenceHTML(Rxxx));
1116             }
1117             Log.println("\t</references>");
1118         }
1119     }
1120 
    // Shared reference registry; used when writing languagePopulation elements.
    static References references = new References();
1122 
getExcelData(List<String> failures, Map<String, RowData> localeToRowData)1123     private static Set<RowData> getExcelData(List<String> failures, Map<String, RowData> localeToRowData)
1124         throws IOException {
1125 
1126         LanguageTagParser ltp = new LanguageTagParser();
1127 
1128         String dir = CLDRPaths.GEN_DIRECTORY + "supplemental/";
1129         final String ricksFile = "country_language_population_raw.txt";
1130         System.out.println("\n# Problems in " + ricksFile + "\n");
1131         List<List<String>> input = SpreadSheet.convert(CldrUtility.getUTF8Data(ricksFile));
1132 
1133         Set<String> languages = languagesNeeded; // sc.getGoodAvailableCodes("language");
1134 
1135         Set<String> territories = new TreeSet<>(sc.getGoodAvailableCodes("territory"));
1136         territories.removeAll(supplementalData.getContainers());
1137         territories.remove("EU");
1138         territories.remove("QO");
1139 
1140         Set<String> countriesNotFound = new TreeSet<>(territories);
1141         Set<OfficialStatus> statusFound = new TreeSet<>();
1142         Set<String> countriesWithoutOfficial = new TreeSet<>(territories);
1143         countriesWithoutOfficial.remove("ZZ");
1144 
1145         Map<String, Row.R2<String, Double>> countryToLargestOfficialLanguage = new HashMap<>();
1146 
1147         Set<String> languagesNotFound = new TreeSet<>(languages);
1148         Set<RowData> sortedInput = new TreeSet<>();
1149         int count = 0;
1150         for (List<String> row : input) {
1151             ++count;
1152             if (count == 1 || row.size() <= COUNTRY_GDP) {
1153                 failures.add(join(row, "\t") + "\tShort row");
1154                 continue;
1155             }
1156             try {
1157                 RowData x = new RowData(row);
1158                 if (x.officialStatus.isOfficial()) {
1159                     Row.R2<String, Double> largestOffical = countryToLargestOfficialLanguage.get(x.countryCode);
1160                     if (largestOffical == null) {
1161                         countryToLargestOfficialLanguage.put(x.countryCode,
1162                             Row.of(x.languageCode, x.languagePopulation));
1163                     } else if (largestOffical.get1() < x.languagePopulation) {
1164                         largestOffical.set0(x.languageCode);
1165                         largestOffical.set1(x.languagePopulation);
1166                     }
1167                 }
1168                 if (x.officialStatus.isMajor() || x.countryPopulation < 1000) {
1169                     countriesWithoutOfficial.remove(x.countryCode);
1170                 }
1171                 if (!checkCode(LstrType.region, x.countryCode, row)) continue;
1172                 statusFound.add(x.officialStatus);
1173                 countriesNotFound.remove(x.countryCode);
1174                 languagesNotFound.remove(x.languageCode);
1175                 if (x.languageCode.contains("_")) {
1176                     ltp.set(x.languageCode);
1177                     languagesNotFound.remove(ltp.getLanguage());
1178                     if (!checkCode(LstrType.language, ltp.getLanguage(), row)) continue;
1179                     if (!checkCode(LstrType.script, ltp.getScript(), row)) continue;
1180                 }
1181                 String locale = x.languageCode + "_" + x.countryCode;
1182                 if (localeToRowData.get(locale) != null) {
1183                     BadItem.ERROR.show("duplicate data", x.languageCode + " with " + x.countryCode, row);
1184                 }
1185                 localeToRowData.put(locale, x);
1186                 sortedInput.add(x);
1187             } catch (ParseException e) {
1188                 failures.add(join(row, "\t") + "\t" + e.getMessage() + "\t"
1189                     + join(Arrays.asList(e.getStackTrace()), ";\t"));
1190             } catch (RuntimeException e) {
1191                 throw (RuntimeException) new IllegalArgumentException("Failure on line " + count + ")\t" + row)
1192                     .initCause(e);
1193             }
1194         }
1195         // System.out.println("Note: the following Status values were found in the data: " +
1196         // CldrUtility.join(statusFound, " | "));
1197 
1198         // make sure we have something
1199         for (String country : countriesNotFound) {
1200             RowData x = new RowData(country, "und");
1201             sortedInput.add(x);
1202         }
1203         for (String language : languagesNotFound) {
1204             RowData x = new RowData("ZZ", language);
1205             sortedInput.add(x);
1206         }
1207 
1208         for (RowData row : sortedInput) {
1209             // see which countries have languages that are larger than any offical language
1210 
1211             if (!row.officialStatus.isOfficial()) {
1212                 //String country = row.countryCode;
1213                 Row.R2<String, Double> largestOffical = countryToLargestOfficialLanguage.get(row.countryCode);
1214                 if (largestOffical != null && largestOffical.get1() < row.languagePopulation) {
1215                     BadItem.WARNING.show("language population > all official languages", getLanguageCodeAndName(largestOffical.get0()), row.toString(true));
1216                 }
1217             }
1218 
1219             // see which countries are missing an official language
1220             if (!countriesWithoutOfficial.contains(row.countryCode)) continue;
1221             BadItem.ERROR.show("missing official language", row.getCountryName() + "\t" + row.countryCode, row.toString(true));
1222             countriesWithoutOfficial.remove(row.countryCode);
1223         }
1224 
1225         // write out file for rick
1226         PrintWriter log = FileUtilities.openUTF8Writer(dir, ricksFile);
1227         log.println(
1228             "*\tCName" +
1229                 "\tCCode" +
1230                 "\tCPopulation" +
1231                 "\tCLiteracy" +
1232                 "\tCGdp" +
1233                 "\tOfficialStatus" +
1234                 "\tLanguage" +
1235                 "\tLCode" +
1236                 "\tLPopulation" +
1237                 "\tWritingPop" +
1238                 "\tReferences" +
1239                 "\tNotes");
1240         RickComparator rickSorting = new RickComparator();
1241         Set<RowData> rickSorted = new TreeSet<>(rickSorting);
1242         rickSorted.addAll(sortedInput);
1243 
1244         for (RowData row : rickSorted) {
1245             final String langLit = row.getLanguageLiteracyString();
1246             final String countryLit = row.getCountryLiteracyString();
1247             log.println(
1248                 row.getCountryName()
1249                     + "\t" + row.countryCode
1250                     + "\t" + row.getCountryPopulationString()
1251                     + "\t" + countryLit
1252                     + "\t" + row.getCountryGdpString()
1253                     + "\t" + (row.officialStatus == OfficialStatus.unknown ? "" : row.officialStatus)
1254                     + "\t" + row.getRickLanguageName()
1255                     + "\t" + row.getRickLanguageCode()
1256                     + "\t" + row.getLanguagePopulationString()
1257                     + "\t" + (langLit.equals(countryLit) ? "" : langLit)
1258                     + "\t" + getExcelQuote(row.comment)
1259                     + "\t" + getExcelQuote(row.notes));
1260         }
1261         log.close();
1262         return sortedInput;
1263     }
1264 
getCldrParents(Set<String> available)1265     private static Set<String> getCldrParents(Set<String> available) {
1266         LanguageTagParser ltp2 = new LanguageTagParser();
1267         Set<String> cldrParents = new TreeSet<>();
1268         for (String locale : available) {
1269             if (skipLocales.contains(locale)) continue;
1270             try {
1271                 ltp2.set(locale);
1272             } catch (RuntimeException e) {
1273                 System.out.println("Skipping CLDR file: " + locale);
1274                 continue;
1275             }
1276             String locale2 = ltp2.getLanguageScript();
1277             if (locale2.equals("sh")) continue;
1278             // int lastPos = locale.lastIndexOf('_');
1279             // if (lastPos < 0) continue;
1280             // String locale2 = locale.substring(0,lastPos);
1281             cldrParents.add(locale2);
1282             languageToMaxCountry.put(locale2, null);
1283         }
1284         //System.out.println("CLDR Parents: " + cldrParents);
1285         return cldrParents;
1286     }
1287 
showFailures(List<String> failures)1288     private static void showFailures(List<String> failures) {
1289         if (failures.size() <= 1) {
1290             return;
1291         }
1292         System.out.println();
1293         System.out.println("Failures in Output");
1294         System.out.println();
1295 
1296         System.out.println(RowData.toStringHeader());
1297         for (String failure : failures) {
1298             System.out.println(failure);
1299         }
1300     }
1301 
getProcessedParent(String localeCode)1302     public static String getProcessedParent(String localeCode) {
1303         if (localeCode == null || localeCode.equals("root")) return null;
1304         int pos = localeCode.lastIndexOf('_');
1305         if (pos < 0) return "root";
1306         LanguageTagParser ltp = new LanguageTagParser();
1307         String script = ltp.set(localeCode).getScript();
1308         if (script.length() == 0) {
1309             return getFullyResolved(localeCode);
1310         }
1311         return localeCode.substring(0, pos);
1312     }
1313 
getFullyResolved(String languageCode)1314     private static String getFullyResolved(String languageCode) {
1315         String result = defaultContent.get(languageCode);
1316         if (result != null) return result;
1317         // we missed. Try taking parent and trying again
1318         int pos = languageCode.length() + 1;
1319         while (true) {
1320             pos = languageCode.lastIndexOf('_', pos - 1);
1321             if (pos < 0) {
1322                 return "***" + languageCode;
1323             }
1324             result = defaultContent.get(languageCode.substring(0, pos));
1325             if (result != null) {
1326                 LanguageTagParser ltp = new LanguageTagParser().set(languageCode);
1327                 LanguageTagParser ltp2 = new LanguageTagParser().set(result);
1328                 String region = ltp.getRegion();
1329                 if (region.length() == 0) {
1330                     ltp.setRegion(ltp2.getRegion());
1331                 }
1332                 String script = ltp.getScript();
1333                 if (script.length() == 0) {
1334                     ltp.setScript(ltp2.getScript());
1335                 }
1336                 return ltp.toString();
1337             }
1338         }
1339     }
1340 
1341     static Comparator<Iterable> firstElementComparator = new Comparator<Iterable>() {
1342         @Override
1343         public int compare(Iterable o1, Iterable o2) {
1344             int result = ((Comparable) o1.iterator().next()).compareTo((o2.iterator().next()));
1345             assert result != 0;
1346             return result;
1347         }
1348     };
1349 
    /**
     * Chooses one "default content" locale for each set of sibling locales and adds it to
     * {@code defaultLocaleContent}.
     *
     * <p>Outline of the live code:
     * <ol>
     * <li>Start from all locales the CLDR main factory reports, plus every key of
     * {@code localeToRowData} whose base language is already in that set.</li>
     * <li>Add parents (via {@code LocaleIDParser.getParent}) until no new ones appear.</li>
     * <li>Drop locales that have variants, that have neither script nor region, or that use a
     * deprecated language/region listed below; also drop script-less locales of languages that
     * have at least one scripted locale.</li>
     * <li>Group the remainder into sibling sets ({@code LocaleIDParser.getSiblings}) and, in each set,
     * pick the locale with the largest literate population. Any winner starting with {@code "en_"} is
     * replaced by {@code "en_US"}.</li>
     * </ol>
     *
     * <p>Diagnostics are printed only when {@code SHOW_OLD_DEFAULT_CONTENTS} is set, except for the
     * "Missing Data" warnings issued through {@code BadItem} at the end.
     *
     * @param cldrParents not read by the live code (referenced only in the commented-out block at the end)
     * @param nf not read by this method
     * @param defaultContent not read by the live code (referenced only in the commented-out block at the end)
     * @param localeToRowData locale ID to the row used for its literate population
     * @param defaultLocaleContent output: receives the chosen locale of every sibling set
     */
    private static void showDefaults(Set<String> cldrParents, NumberFormat nf, Map<String, String> defaultContent,
        Map<String, RowData> localeToRowData,
        Set<String> defaultLocaleContent) {

        if (SHOW_OLD_DEFAULT_CONTENTS) {
            System.out.println();
            System.out.println("Computing Defaults Contents");
            System.out.println();
        }

        Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
        Set<String> locales = new TreeSet<>(cldrFactory.getAvailable());
        LocaleIDParser lidp = new LocaleIDParser();

        // add all the combinations of language, script, and territory.
        // (only for row-data locales whose base language already has a CLDR locale)
        for (String locale : localeToRowData.keySet()) {
            String baseLanguage = lidp.set(locale).getLanguage();
            if (locales.contains(baseLanguage) && !locales.contains(locale)) {
                locales.add(locale);
                if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding: " + locale);
            }
        }

        // adding parents
        // Newly found parents are staged in toAdd so that 'locales' is not modified while it is being
        // iterated; the outer loop repeats until a pass finds nothing new.
        Set<String> toAdd = new TreeSet<>();
        while (true) {
            for (String locale : locales) {
                String newguy = LocaleIDParser.getParent(locale);
                if (newguy != null && !locales.contains(newguy) && !toAdd.contains(newguy)) {
                    toAdd.add(newguy);
                    if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding parent: " + newguy);
                }
            }
            if (toAdd.size() == 0) {
                break;
            }
            locales.addAll(toAdd);
            toAdd.clear();
        }

        // get sets of siblings
        // siblingSets is ordered by firstElementComparator, i.e. by the first element of each set.
        Set<Set<String>> siblingSets = new TreeSet<>(firstElementComparator);
        // Work list: locales that still have to be assigned to a sibling set.
        Set<String> needsADoin = new TreeSet<>(locales);

        Set<String> deprecatedLanguages = new TreeSet<>();
        deprecatedLanguages.add("sh");
        Set<String> deprecatedRegions = new TreeSet<>();
        deprecatedRegions.add("YU");
        deprecatedRegions.add("CS");
        deprecatedRegions.add("ZZ");

        // first find all the language subtags that have scripts, and those we need to skip. Those are aliased-only
        Set<String> skippingItems = new TreeSet<>();
        Set<String> hasAScript = new TreeSet<>();
        //Set<LocaleIDParser.Level> languageOnly = EnumSet.of(LocaleIDParser.Level.Language);
        for (String locale : locales) {
            lidp.set(locale);
            if (lidp.getScript().length() != 0) {
                hasAScript.add(lidp.getLanguage());
            }
            Set<LocaleIDParser.Level> levels = lidp.getLevels();
            // must have no variants, must have either script or region, no deprecated elements
            if (levels.contains(LocaleIDParser.Level.Variants) // no variants
                || !(levels.contains(LocaleIDParser.Level.Script)
                    || levels.contains(LocaleIDParser.Level.Region))
                || deprecatedLanguages.contains(lidp.getLanguage())
                || deprecatedRegions.contains(lidp.getRegion())) {
                // skip language-only locales, and ones with variants
                needsADoin.remove(locale);
                skippingItems.add(locale);
                if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tremoving: " + locale);
                continue;
            }
        }
        // walk through the locales, getting the ones we care about.
        // Maps a parent ID (result of getParent on a script+region locale) to the summed literate
        // population of that locale's sibling set.
        Map<String, Double> scriptLocaleToLanguageLiteratePopulation = new TreeMap<>();

        // Iterate over a snapshot: needsADoin itself shrinks inside the loop, and the first check
        // skips entries that were already consumed as part of an earlier sibling set.
        for (String locale : new TreeSet<>(needsADoin)) {
            if (!needsADoin.contains(locale)) continue;
            lidp.set(locale);
            Set<Level> level = lidp.getLevels();
            // skip locales that need scripts and don't have them
            if (!level.contains(LocaleIDParser.Level.Script) // no script
                && hasAScript.contains(lidp.getLanguage())) {
                needsADoin.remove(locale);
                skippingItems.add(locale);
                continue;
            }
            // get siblings
            Set<String> siblingSet = lidp.getSiblings(needsADoin);
            // if it has a script and region
            if (level.contains(LocaleIDParser.Level.Script) && level.contains(LocaleIDParser.Level.Region)) {
                double languageLiteratePopulation = 0;
                for (String localeID2 : siblingSet) {
                    RowData rowData = localeToRowData.get(localeID2);
                    if (rowData != null) {
                        languageLiteratePopulation += rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT);
                    }
                }
                String parentID = LocaleIDParser.getParent(locale);
                scriptLocaleToLanguageLiteratePopulation.put(parentID, languageLiteratePopulation);
            }

            // NOTE(review): a RuntimeException thrown while inserting (e.g. from the comparator) is only
            // printed; the sibling set is then not recorded but its members are still removed below.
            try {
                siblingSets.add(siblingSet);
            } catch (RuntimeException e) {
                e.printStackTrace();
            }
            needsADoin.removeAll(siblingSet);
        }
        if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("ConvertLanguageData Skipping: " + skippingItems);
        if (needsADoin.size() != 0) {
            if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("Missing: " + needsADoin);
        }

        // walk through the data
        Set<String> skippingSingletons = new TreeSet<>();

        Set<String> missingData = new TreeSet<>();
        for (Set<String> siblingSet : siblingSets) {
            if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("** From siblings: " + siblingSet);

            // NOTE(review): 'false &' makes this condition always false, so singleton sets are never
            // skipped and skippingSingletons stays empty; looks like a deliberately disabled branch.
            if (false & siblingSet.size() == 1) {
                skippingSingletons.add(siblingSet.iterator().next());
                continue;
            }
            // get best
            double best = Double.NEGATIVE_INFINITY;
            String bestLocale = "???";
            Set<Pair<Double, String>> data = new TreeSet<>();
            LanguageTagParser ltp = new LanguageTagParser();
            for (String locale : siblingSet) {
                RowData rowData = localeToRowData.get(locale);
                // -1 marks "no population data found" for this locale
                double languageLiteratePopulation = -1;
                if (rowData != null) {
                    languageLiteratePopulation = rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT);
                } else {
                    // fall back to the summed population recorded for script-level parents above
                    Double d = scriptLocaleToLanguageLiteratePopulation.get(locale);
                    if (d != null) {
                        languageLiteratePopulation = d;
                    } else {
                        // report as missing only when the region is empty or a country code
                        final String region = ltp.set(locale).getRegion();
                        if (region.isEmpty() || StandardCodes.isCountry(region)) {
                            missingData.add(locale);
                        }
                    }
                }
                data.add(new Pair<>(languageLiteratePopulation, locale));
                // strict '<': on a tie the locale seen first in iteration order is kept
                if (best < languageLiteratePopulation) {
                    best = languageLiteratePopulation;
                    bestLocale = locale;
                }
            }
            // show it
            for (Pair<Double, String> datum : data) {
                if (SHOW_OLD_DEFAULT_CONTENTS)
                    System.out.format(
                        "\tContenders: %s %f (based on literate population)" + CldrUtility.LINE_SEPARATOR,
                        datum.getSecond(), datum.getFirst());
            }
            // System.out.format("\tPicking default content: %s %f (based on literate population)" +
            // Utility.LINE_SEPARATOR, bestLocale, best);
            // Hack to fix English
            // TODO Generalize in the future for other locales with non-primary scripts
            if (bestLocale.startsWith("en_")) {
                defaultLocaleContent.add("en_US");
            } else {
                defaultLocaleContent.add(bestLocale);
            }
        }

        for (String singleton : skippingSingletons) {
            BadItem.WARNING.show("skipping Singletons", singleton);
        }
        for (String missing : missingData) {
            BadItem.WARNING.show("Missing Data", missing);
        }

        // LanguageTagParser ltp = new LanguageTagParser();
        // Set<String> warnings = new LinkedHashSet();
        // for (String languageCode : languageToMaxCountry.keySet()) {
        // CodeAndPopulation best = languageToMaxCountry.get(languageCode);
        // String languageSubtag = ltp.set(languageCode).getLanguage();
        // String countryCode = "ZZ";
        // double rawLanguagePopulation = -1;
        // if (best != null) {
        // countryCode = best.code;
        // rawLanguagePopulation = best.population;
        // Set<String> regions = LanguageInfo.INSTANCE.languageToRegions.get(languageSubtag);
        // if (regions == null || !regions.contains(countryCode)) {
        // Set<String> regions2 = LanguageInfo.INSTANCE.languageToRegionsAlt.get(languageSubtag);
        // if (regions2 == null || !regions2.contains(countryCode)) {
        // warnings.add("WARNING: " + languageCode + " => " + countryCode + ", not in " + regions + "/" + regions2);
        // }
        // }
        // }
        // String resolvedLanguageCode = languageCode + "_" + countryCode;
        // ltp.set(languageCode);
        // Set<String> scripts = LanguageInfo.INSTANCE.languageToScripts.get(languageCode);
        // String script = ltp.getScript();
        // if (script.length() == 0) {
        // CodeAndPopulation bestScript = languageToMaxScript.get(languageCode);
        // if (bestScript != null) {
        // script = bestScript.code;
        // if (scripts == null || !scripts.contains(script)) {
        // warnings.add("WARNING: " + languageCode + " => " + script + ", not in " + scripts);
        // }
        // } else {
        // script = "Zzzz";
        // if (scripts == null) {
        // scripts = LanguageInfo.INSTANCE.languageToScriptsAlt.get(languageCode);
        // }
        // if (scripts != null) {
        // script = scripts.iterator().next();
        // if (scripts.size() != 1) {
        // warnings.add("WARNING: " + languageCode + " => " + scripts);
        // }
        // }
        // }
        // if (scripts == null) {
        // warnings.add("Missing scripts for: " + languageCode);
        // } else if (scripts.size() == 1){
        // script = "";
        // }
        // resolvedLanguageCode = languageCode
        // + (script.length() == 0 ? "" : "_" + script)
        // + "_" + countryCode;
        // }
        //
        //
        // System.out.println(
        // resolvedLanguageCode
        // + "\t" + languageCode
        // + "\t" + ULocale.getDisplayName(languageCode, ULocale.ENGLISH)
        // + "\t" + countryCode
        // + "\t" + ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH)
        // + "\t" + formatNumber(rawLanguagePopulation)
        // + (cldrParents.contains(languageCode) ? "\tCLDR" : "")
        // );
        // if (languageCode.length() == 0) continue;
        // defaultContent.put(languageCode, resolvedLanguageCode);
        // }
        // for (String warning : warnings) {
        // System.out.println(warning);
        // }
    }
1596 
1597     // private static void printDefaultContent(Set<String> defaultLocaleContent) {
1598     // String sep = Utility.LINE_SEPARATOR + "\t\t\t";
1599     // String broken = Utility.breakLines(join(defaultLocaleContent," "), sep, PatternCache.get("(\\S)\\S*").matcher(""),
1600     // 80);
1601     //
1602     // Log.println("\t\t<defaultContent locales=\"" + broken + "\"");
1603     // Log.println("\t\t/>");
1604     // }
1605 
getSuppressScript(String languageCode)1606     private static Object getSuppressScript(String languageCode) {
1607         // TODO Auto-generated method stub
1608         return null;
1609     }
1610 
join(Collection c, String separator)1611     public static String join(Collection c, String separator) {
1612         StringBuffer result = new StringBuffer();
1613         boolean first = true;
1614         for (Object x : c) {
1615             if (first)
1616                 first = false;
1617             else
1618                 result.append(separator);
1619             result.append(x);
1620         }
1621         return result.toString();
1622     }
1623 
addBestRegion(String languageCode, String countryCode, double languagePopulationRaw)1624     private static void addBestRegion(String languageCode, String countryCode, double languagePopulationRaw) {
1625         addBest(languageCode, languagePopulationRaw, countryCode, languageToMaxCountry);
1626     }
1627 
addBestScript(String languageCode, String scriptCode, double languagePopulationRaw)1628     private static void addBestScript(String languageCode, String scriptCode, double languagePopulationRaw) {
1629         addBest(languageCode, languagePopulationRaw, scriptCode, languageToMaxScript);
1630     }
1631 
addBest(String languageCode, double languagePopulationRaw, String code, Map<String, CodeAndPopulation> languageToMaxCode)1632     private static void addBest(String languageCode, double languagePopulationRaw, String code,
1633         Map<String, CodeAndPopulation> languageToMaxCode) {
1634         if (languageCode.length() == 0) {
1635             throw new IllegalArgumentException();
1636         }
1637         CodeAndPopulation best = languageToMaxCode.get(languageCode);
1638         if (best == null) {
1639             languageToMaxCode.put(languageCode, best = new CodeAndPopulation());
1640         } else if (best.population >= languagePopulationRaw) {
1641             return;
1642         }
1643         best.population = languagePopulationRaw;
1644         best.code = code;
1645     }
1646 
1647     static class CodeAndPopulation {
1648         String code = null;
1649         double population = Double.NaN;
1650 
1651         @Override
toString()1652         public String toString() {
1653             return "{" + code + "," + population + "}";
1654         }
1655     }
1656 
1657     static public class GeneralCollator implements Comparator<String> {
1658         static UTF16.StringComparator cpCompare = new UTF16.StringComparator(true, false, 0);
1659         static RuleBasedCollator UCA = (RuleBasedCollator) Collator
1660             .getInstance(ULocale.ROOT);
1661         static {
1662             UCA.setNumericCollation(true);
1663         }
1664 
1665         @Override
compare(String s1, String s2)1666         public int compare(String s1, String s2) {
1667             if (s1 == null) {
1668                 return s2 == null ? 0 : -1;
1669             } else if (s2 == null) {
1670                 return 1;
1671             }
1672             int result = UCA.compare(s1, s2);
1673             if (result != 0) return result;
1674             return cpCompare.compare(s1, s2);
1675         }
1676     }
1677 
1678     public static class InverseComparator<T> implements Comparator<T> {
1679         private Comparator<T> other;
1680 
InverseComparator()1681         public InverseComparator() {
1682             this.other = null;
1683         }
1684 
InverseComparator(Comparator<T> other)1685         public InverseComparator(Comparator<T> other) {
1686             this.other = other;
1687         }
1688 
1689         @Override
compare(T a, T b)1690         public int compare(T a, T b) {
1691             return other == null
1692                 ? ((Comparable) b).compareTo(a)
1693                 : other.compare(b, a);
1694         }
1695     }
1696 
1697     static Set<String> languagesNeeded = new TreeSet<>(
1698         Arrays
1699             .asList("ab ba bh bi bo fj fy gd ha ht ik iu ks ku ky lg mi na nb rm sa sd sg si sm sn su tg tk to tw vo yi za lb dv chr syr kha sco gv"
1700                 .split("\\s")));
1701 
generateIso639_2Data()1702     static void generateIso639_2Data() {
1703         for (String languageSubtag : sc.getAvailableCodes("language")) {
1704             String alpha3 = Iso639Data.toAlpha3(languageSubtag);
1705             Type type = Iso639Data.getType(languageSubtag);
1706             Scope scope = Iso639Data.getScope(languageSubtag);
1707             if (type != null || alpha3 != null || scope != null) {
1708                 Log.println("\t\t<languageCode type=\"" + languageSubtag + "\"" +
1709                     (alpha3 == null ? "" : " iso639Alpha3=\"" + alpha3 + "\"") +
1710                     (type == null ? "" : " iso639Type=\"" + type + "\"") +
1711                     (scope == null ? "" : " iso639Scope=\"" + scope + "\"") +
1712                     "/>");
1713             }
1714 
1715         }
1716     }
1717 
    // Relation from a String key (a language code, per the name) to BasicLanguageData entries;
    // backed by a TreeMap whose value collections are TreeSets.
    static Relation<String, BasicLanguageData> language2BasicLanguageData = Relation.of(new TreeMap<String, Set<BasicLanguageData>>(), TreeSet.class);

    // Language code -> (script status -> script codes). Not initialized here: getLanguage2Scripts
    // assigns a fresh TreeMap each time it runs.
    static Map<String, Relation<BasicLanguageData.Type, String>> language_status_scripts;
    // (language, script) pair -> reference text; entries are added by getLanguage2Scripts.
    static Map<Pair<String, String>, String> language_script_references = new TreeMap<>();

    // Locale alias information obtained once from SupplementalDataInfo. getLanguage2Scripts looks up
    // the "language" entry and treats any language code present there as deprecated.
    static final Map<String, Map<String, R2<List<String>, String>>> LOCALE_ALIAS_INFO = SupplementalDataInfo
        .getInstance().getLocaleAliasInfo();
1725 
getLanguage2Scripts(Set<RowData> sortedInput)1726     static void getLanguage2Scripts(Set<RowData> sortedInput) throws IOException {
1727         language_status_scripts = new TreeMap<>();
1728 
1729         // // get current scripts
1730         // Relation<String,String> languageToDefaultScript = new Relation(new TreeMap(), TreeSet.class);
1731         // Relation<String,String> secondaryLanguageToDefaultScript = new Relation(new TreeMap(), TreeSet.class);
1732         // for (String languageSubtag : language2BasicLanguageData.keySet()) {
1733         // for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) {
1734         // for (String script : item.getScripts()) {
1735         // addLanguage2Script(languageSubtag, item.getType(), script);
1736         // }
1737         // }
1738         // }
1739         // System.out.println("Language 2 scripts: " + language_status_scripts);
1740 
1741         // #Lcode LanguageName Status Scode ScriptName References
1742         List<List<String>> input = SpreadSheet.convert(CldrUtility.getUTF8Data("language_script_raw.txt"));
1743         System.out.println(CldrUtility.LINE_SEPARATOR + "# Problems in language_script_raw.txt"
1744             + CldrUtility.LINE_SEPARATOR);
1745         //int count = -1;
1746         for (List<String> row : input) {
1747             try {
1748                 if (row.size() == 0) continue;
1749                 //++count;
1750                 String language = row.get(0).trim();
1751                 if (language.length() == 0 || language.startsWith("#")) continue;
1752                 BasicLanguageData.Type status = BasicLanguageData.Type.valueOf(row.get(2));
1753                 String scripts = row.get(3);
1754                 if (!checkCode(LstrType.language, language, row)) continue;
1755                 for (String script : scripts.split("\\s+")) {
1756                     if (!checkCode(LstrType.script, script, row)) continue;
1757                     // if the script is not modern, demote
1758                     Info scriptInfo = ScriptMetadata.getInfo(script);
1759                     if (scriptInfo == null) {
1760                         BadItem.ERROR.toString("illegal script; must be represented in Unicode, remove line or fix", script, row);
1761                         continue;
1762                     }
1763                     IdUsage idUsage = scriptInfo.idUsage;
1764                     if (status == BasicLanguageData.Type.primary && idUsage != IdUsage.RECOMMENDED) {
1765                         if (idUsage == IdUsage.ASPIRATIONAL || idUsage == IdUsage.LIMITED_USE) {
1766                             BadItem.WARNING.toString("Script has unexpected usage; make secondary if a Recommended script is used widely for the langauge",
1767                                 idUsage + ", " + script + "=" + getULocaleScriptName(script), row);
1768                         } else {
1769                             BadItem.ERROR.toString("Script is not modern; make secondary", idUsage + ", " + script + "=" + getULocaleScriptName(script), row);
1770                             status = BasicLanguageData.Type.secondary;
1771                         }
1772                     }
1773 
1774                     // if the language is not modern, demote
1775                     if (LOCALE_ALIAS_INFO.get("language").containsKey(language)) {
1776                         BadItem.ERROR.toString("Remove/Change deprecated language", language + " "
1777                             + getLanguageName(language) + "; " + LOCALE_ALIAS_INFO.get("language").get(language), row);
1778                         continue;
1779                     }
1780                     if (status == BasicLanguageData.Type.primary && !sc.isModernLanguage(language)) {
1781                         BadItem.ERROR.toString("Should be secondary, language is not modern", language + " " + getLanguageName(language), row);
1782                         status = BasicLanguageData.Type.secondary;
1783                     }
1784 
1785                     addLanguage2Script(language, status, script);
1786                     if (row.size() > 5) {
1787                         String reference = row.get(5);
1788                         if (reference != null && reference.length() == 0) {
1789                             language_script_references.put(new Pair<>(language, script), reference);
1790                         }
1791                     }
1792                 }
1793             } catch (RuntimeException e) {
1794                 System.err.println(row);
1795                 throw e;
1796             }
1797         }
1798 
1799         // System.out.println("Language 2 scripts: " + language_status_scripts);
1800 
1801         for (String language : sc.getGoodAvailableCodes("language")) {
1802             if (supplementalData.getDeprecatedInfo("language", language) != null) {
1803                 continue;
1804             }
1805             Map<String, String> registryData = sc.getLangData("language", language);
1806             if (registryData != null) {
1807                 String suppressScript = registryData.get("Suppress-Script");
1808                 if (suppressScript == null) continue;
1809                 if (ScriptMetadata.getInfo(suppressScript) == null) {
1810                     // skip, not represented in Unicode
1811                     continue;
1812                 }
1813                 // if there is something already there, we have a problem.
1814                 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
1815                 if (status_scripts == null) {
1816                     System.out
1817                         .println("Missing Suppress-Script: " + language + "\tSuppress-Script:\t" + suppressScript);
1818                 } else if (!status_scripts.values().contains(suppressScript)) {
1819                     System.out.println("Missing Suppress-Script: " + language + "\tSuppress-Script:\t" + suppressScript
1820                         + "\tall:\t" + status_scripts.values());
1821                 } else {
1822                     // at this point, the suppressScript is in the union of the primary and secondary.
1823                     Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary);
1824                     if (primaryScripts != null && !primaryScripts.contains(suppressScript)) {
1825                         System.out.println("Suppress-Script is not in primary: " + language + "\tSuppress-Script:\t"
1826                             + suppressScript + "\tprimary:\t"
1827                             + primaryScripts);
1828                     }
1829                 }
1830                 addLanguage2Script(language, BasicLanguageData.Type.primary, suppressScript);
1831             }
1832         }
1833 
1834         // remove primaries from secondaries
1835         // check for primaries for scripts
1836         for (String language : language_status_scripts.keySet()) {
1837             Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
1838             Set<String> secondaryScripts = status_scripts.getAll(BasicLanguageData.Type.secondary);
1839             if (secondaryScripts == null) continue;
1840             Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary);
1841             if (primaryScripts == null) {
1842                 // status_scripts.putAll(BasicLanguageData.Type.primary, secondaryScripts);
1843                 // status_scripts.removeAll(BasicLanguageData.Type.secondary);
1844                 if (sc.isModernLanguage(language)) {
1845                     BadItem.ERROR.show("modern language without primary script, might need to edit moribund_languages.txt", language + " "
1846                         + getLanguageName(language));
1847                 }
1848             } else {
1849                 status_scripts.removeAll(BasicLanguageData.Type.secondary, primaryScripts);
1850             }
1851         }
1852 
1853         // check that every living language in the row data has a script
1854         Set<String> livingLanguagesWithTerritories = new TreeSet<>();
1855         for (RowData rowData : sortedInput) {
1856             String language = rowData.languageCode;
1857             if (sc.isModernLanguage(language) && Iso639Data.getSource(language) != Iso639Data.Source.ISO_639_3) {
1858                 livingLanguagesWithTerritories.add(language);
1859             }
1860         }
1861         for (String language : livingLanguagesWithTerritories) {
1862             Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
1863             if (status_scripts != null) {
1864                 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary);
1865                 if (primaryScripts != null && primaryScripts.size() > 0) {
1866                     continue;
1867                 }
1868             }
1869             if (language.equals("tw")) continue; // TODO load aliases and check...
1870             BadItem.WARNING.show("ISO 639-1/2 language in language-territory list without primary script", language + "\t" + getLanguageName(language));
1871         }
1872 
1873         // System.out.println("Language 2 scripts: " + language_status_scripts);
1874     }
1875 
checkScript(String script)1876     private static boolean checkScript(String script) {
1877         // TODO Auto-generated method stub
1878         return false;
1879     }
1880 
1881     static Validity VALIDITY = Validity.getInstance();
1882 
checkCode(LstrType type, String code, List<String> sourceLine)1883     private static boolean checkCode(LstrType type, String code, List<String> sourceLine) {
1884         Status validity = VALIDITY.getCodeToStatus(type).get(code);
1885         if (validity == Status.regular) {
1886             if (type == LstrType.language && code.equals("no")) {
1887                 validity = Status.invalid;
1888             } else {
1889                 return true;
1890             }
1891         } else if (validity == Status.unknown && type == LstrType.region) {
1892             return true;
1893         }
1894         BadItem.ERROR.show("Illegitimate Code", type + ": " + code + " = " + validity, sourceLine);
1895         return false;
1896     }
1897 
addLanguage2Script(String language, BasicLanguageData.Type type, String script)1898     private static void addLanguage2Script(String language, BasicLanguageData.Type type, String script) {
1899         Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
1900         if (status_scripts == null)
1901             language_status_scripts.put(language, status_scripts = Relation.of(new TreeMap<BasicLanguageData.Type, Set<String>>(), TreeSet.class));
1902         status_scripts.put(type, script);
1903     }
1904 
addLanguageScriptData()1905     static void addLanguageScriptData() throws IOException {
1906         // check to make sure that every language subtag is in 639-3
1907         Set<String> langRegistryCodes = sc.getGoodAvailableCodes("language");
1908         // Set<String> iso639_2_missing = new TreeSet(langRegistryCodes);
1909         // iso639_2_missing.removeAll(Iso639Data.getAvailable());
1910         // iso639_2_missing.remove("root");
1911         // if (iso639_2_missing.size() != 0) {
1912         // for (String missing : iso639_2_missing){
1913         // System.out.println("*ERROR in StandardCodes* Missing Lang/Script data:\t" + missing + ", " +
1914         // sc.getData("language", missing));
1915         // }
1916         // }
1917 
1918         // Map<String, String> nameToTerritoryCode = new TreeMap();
1919         // for (String territoryCode : sc.getGoodAvailableCodes("territory")) {
1920         // nameToTerritoryCode.put(sc.getData("territory", territoryCode).toLowerCase(), territoryCode);
1921         // }
1922         // nameToTerritoryCode.put("iran", nameToTerritoryCode.get("iran, islamic republic of")); //
1923 
1924         //BasicLanguageData languageData = new BasicLanguageData();
1925 
1926         BufferedReader in = CldrUtility.getUTF8Data("extraLanguagesAndScripts.txt");
1927         while (true) {
1928             String line = in.readLine();
1929             if (line == null) break;
1930             String[] parts = line.split("\\t");
1931             String alpha3 = parts[0];
1932             alpha3 = stripBrackets(alpha3);
1933             String languageSubtag = Iso639Data.fromAlpha3(alpha3);
1934             if (languageSubtag == null) {
1935                 if (langRegistryCodes.contains(alpha3)) {
1936                     languageSubtag = alpha3;
1937                 } else {
1938                     BadItem.WARNING.show("Language subtag not found on line", alpha3, line);
1939                     continue;
1940                 }
1941             }
1942             //String name = parts[1];
1943             Set<String> names = Iso639Data.getNames(languageSubtag);
1944             if (names == null) {
1945                 Map<String, String> name2 = sc.getLangData("language", languageSubtag);
1946                 if (name2 != null) {
1947                     String name3 = name2.get("Description");
1948                     if (name3 != null) {
1949                         names = new TreeSet<>();
1950                         names.add(name3);
1951                     }
1952                 }
1953             }
1954             // if (names == null || !names.contains(name)) {
1955             // System.out.println("Name <" + name + "> for <" + languageSubtag + "> not found in " + names);
1956             // }
1957 
1958             // names all straight, now get scripts and territories
1959             // [Cyrl]; [Latn]
1960             Set<String> fullScriptList = sc.getGoodAvailableCodes("script");
1961 
1962             String[] scriptList = parts[2].split("[;,]\\s*");
1963             Set<String> scripts = new TreeSet<>();
1964             Set<String> scriptsAlt = new TreeSet<>();
1965             for (String script : scriptList) {
1966                 if (script.length() == 0) continue;
1967                 boolean alt = false;
1968                 if (script.endsWith("*")) {
1969                     alt = true;
1970                     script = script.substring(0, script.length() - 1);
1971                 }
1972                 script = stripBrackets(script);
1973                 if (!fullScriptList.contains(script)) {
1974                     System.out.println("Script <" + script + "> for <" + languageSubtag + "> not found in "
1975                         + fullScriptList);
1976                 } else if (alt) {
1977                     scriptsAlt.add(script);
1978                 } else {
1979                     scripts.add(script);
1980                 }
1981             }
1982             // now territories
1983             Set<String> territories = new TreeSet<>();
1984             if (parts.length > 4) {
1985                 String[] territoryList = parts[4].split("\\s*[;,-]\\s*");
1986                 for (String territoryName : territoryList) {
1987                     if (territoryName.equals("ISO/DIS 639") || territoryName.equals("3")) continue;
1988                     String territoryCode = CountryCodeConverter.getCodeFromName(territoryName, true);
1989                     if (territoryCode == null) {
1990                         BadItem.ERROR.show("no name found for territory", "<" + territoryName + ">", languageSubtag);
1991                     } else {
1992                         territories.add(territoryCode);
1993                     }
1994                 }
1995             }
1996             // <language type="de" scripts="Latn" territories="IT" alt="secondary"/>
1997             // we're going to go ahead and set these all to secondary.
1998             if (scripts.size() != 0) {
1999                 language2BasicLanguageData.put(languageSubtag,
2000                     new BasicLanguageData().setType(BasicLanguageData.Type.secondary).setScripts(scripts)
2001                         .setTerritories(territories));
2002             }
2003             if (scriptsAlt.size() != 0) {
2004                 language2BasicLanguageData.put(languageSubtag,
2005                     new BasicLanguageData().setType(BasicLanguageData.Type.secondary).setScripts(scriptsAlt)
2006                         .setTerritories(territories));
2007             }
2008         }
2009         in.close();
2010 
2011         // add other data
2012         for (String languageSubtag : supplementalData.getBasicLanguageDataLanguages()) {
2013             Set<BasicLanguageData> otherData = supplementalData.getBasicLanguageData(languageSubtag);
2014             language2BasicLanguageData.putAll(languageSubtag, otherData);
2015         }
2016     }
2017 
2018     // private static void showAllBasicLanguageData(Relation<String, BasicLanguageData> language2basicData, String
2019     // comment) {
2020     // // now print
2021     // Relation<String, String> primaryCombos = new Relation(new TreeMap(), TreeSet.class);
2022     // Relation<String, String> secondaryCombos = new Relation(new TreeMap(), TreeSet.class);
2023     //
2024     // Log.println("\t<languageData>" + (comment == null ? "" : " <!-- " + comment + " -->"));
2025     //
2026     // for (String languageSubtag : language2basicData.keySet()) {
2027     // String duplicate = "";
2028     // // script,territory
2029     // primaryCombos.clear();
2030     // secondaryCombos.clear();
2031     //
2032     // for (BasicLanguageData item : language2basicData.getAll(languageSubtag)) {
2033     // Set<String> scripts = item.getScripts();
2034     // if (scripts.size() == 0) scripts = new TreeSet(Arrays.asList(new String[] { "Zzzz" }));
2035     // for (String script : scripts) {
2036     // Set<String> territories = item.getTerritories();
2037     // if (territories.size() == 0) territories = new TreeSet(Arrays.asList(new String[] { "ZZ" }));
2038     // for (String territory : territories) {
2039     // if (item.getType().equals(BasicLanguageData.Type.primary)) {
2040     // primaryCombos.put(script, territory);
2041     // } else {
2042     // secondaryCombos.put(script, territory);
2043     // }
2044     // }
2045     // }
2046     // }
2047     // secondaryCombos.removeAll(primaryCombos);
2048     // showBasicLanguageData(languageSubtag, primaryCombos, null, BasicLanguageData.Type.primary);
2049     // showBasicLanguageData(languageSubtag, secondaryCombos, primaryCombos.keySet(),
2050     // BasicLanguageData.Type.secondary);
2051     // // System.out.println(item.toString(languageSubtag) + duplicate);
2052     // // duplicate = " <!-- " + "**" + " -->";
2053     // }
2054     // Log.println("\t</languageData>");
2055     // }
2056 
showBasicLanguageData(String languageSubtag, Relation<String, String> primaryCombos, Set<String> suppressEmptyScripts, BasicLanguageData.Type type)2057     private static void showBasicLanguageData(String languageSubtag, Relation<String, String> primaryCombos,
2058         Set<String> suppressEmptyScripts, BasicLanguageData.Type type) {
2059         Set<String> scriptsWithSameTerritories = new TreeSet<>();
2060         Set<String> lastTerritories = Collections.emptySet();
2061         for (String script : primaryCombos.keySet()) {
2062             Set<String> territories = primaryCombos.getAll(script);
2063             if (lastTerritories == Collections.EMPTY_SET) {
2064                 // skip first
2065             } else if (lastTerritories.equals(territories)) {
2066                 scriptsWithSameTerritories.add(script);
2067             } else {
2068                 showBasicLanguageData2(languageSubtag, scriptsWithSameTerritories, suppressEmptyScripts,
2069                     lastTerritories, type);
2070                 scriptsWithSameTerritories.clear();
2071             }
2072             lastTerritories = territories;
2073             scriptsWithSameTerritories.add(script);
2074         }
2075         showBasicLanguageData2(languageSubtag, scriptsWithSameTerritories, suppressEmptyScripts, lastTerritories, type);
2076     }
2077 
showBasicLanguageData2(String languageSubtag, Set<String> scripts, Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type)2078     private static void showBasicLanguageData2(String languageSubtag, Set<String> scripts,
2079         Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type) {
2080         scripts.remove("Zzzz");
2081         territories.remove("ZZ");
2082         if (territories.size() == 0 && suppressEmptyScripts != null) {
2083             scripts.removeAll(suppressEmptyScripts);
2084         }
2085         if (scripts.size() == 0 && territories.size() == 0) return;
2086         Log.println("\t\t<language type=\"" + languageSubtag + "\"" +
2087             (scripts.size() == 0 ? "" : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"") +
2088             (territories.size() == 0 ? "" : " territories=\"" + CldrUtility.join(territories, " ") + "\"") +
2089             (type == BasicLanguageData.Type.primary ? "" : " alt=\"" + type + "\"") +
2090             "/>");
2091     }
2092 
2093     /*
2094      * System.out.println(
2095      * "\t\t<language type=\"" + languageSubtag + "\"" +
2096      * " scripts=\"" + Utility.join(scripts," ") + "\"" +
2097      * (territories.size() == 0 ? "" : " territories=\"" + Utility.join(territories," ") + "\"") +
2098      * "/>"
2099      * );
2100      */
2101 
stripBrackets(String alpha3)2102     private static String stripBrackets(String alpha3) {
2103         if (alpha3.startsWith("[") && alpha3.endsWith("]")) {
2104             alpha3 = alpha3.substring(1, alpha3.length() - 1);
2105         }
2106         return alpha3;
2107     }
2108 
2109     static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH);
2110     static NumberFormat nf_no_comma = NumberFormat.getInstance(ULocale.ENGLISH);
2111     static {
2112         nf_no_comma.setGroupingUsed(false);
2113     }
2114     static NumberFormat pf = NumberFormat.getPercentInstance(ULocale.ENGLISH);
2115 
formatNumber(double original, int roundDigits, boolean xml)2116     public static String formatNumber(double original, int roundDigits, boolean xml) {
2117         double d = original;
2118         if (roundDigits != 0) {
2119             d = CldrUtility.roundToDecimals(original, roundDigits);
2120         }
2121         if (Double.isNaN(d)) {
2122             d = CldrUtility.roundToDecimals(original, roundDigits);
2123             throw new IllegalArgumentException("Double is NaN");
2124         }
2125         if (xml) {
2126             return nf_no_comma.format(d);
2127         }
2128         return nf.format(d);
2129     }
2130 
formatPercent(double d, int roundDigits, boolean xml)2131     public static String formatPercent(double d, int roundDigits, boolean xml) {
2132         if (roundDigits != 0) {
2133             d = CldrUtility.roundToDecimals(d, roundDigits);
2134         }
2135         if (xml) {
2136             nf_no_comma.setMaximumFractionDigits(roundDigits + 2);
2137             return nf_no_comma.format(d * 100.0);
2138         }
2139         pf.setMaximumFractionDigits(roundDigits + 2);
2140         return pf.format(d);
2141     }
2142 
2143     static final LanguageTagCanonicalizer languageTagCanonicalizer = new LanguageTagCanonicalizer();
2144 
fixLanguageCode(String languageCodeRaw, List<String> row)2145     private static String fixLanguageCode(String languageCodeRaw, List<String> row) {
2146         String languageCode = languageTagCanonicalizer.transform(languageCodeRaw);
2147         if (DEBUG && !languageCode.equals(languageCodeRaw)) {
2148             System.out.println("## " + languageCodeRaw + " => " + languageCode);
2149         }
2150         int bar = languageCode.indexOf('_');
2151         String script = "";
2152         if (bar >= 0) {
2153             script = languageCode.substring(bar);
2154             languageCode = languageCode.substring(0, bar);
2155         }
2156         R2<List<String>, String> replacement = supplementalData.getLocaleAliasInfo().get("language").get(languageCode);
2157         if (replacement != null) {
2158             String replacementCode = replacement.get0().get(0);
2159             BadItem.ERROR.show("deprecated language code", languageCode + " => " + replacementCode, row);
2160             languageCode = replacementCode;
2161         }
2162         if (!sc.getAvailableCodes("language").contains(languageCode)) {
2163             BadItem.ERROR.show("bad language code", languageCode, row);
2164         }
2165         return languageCode + script;
2166     }
2167 
2168     enum BadItem {
2169         ERROR, WARNING, DETAIL;
2170 
show(String problem, String details, String... items)2171         void show(String problem, String details, String... items) {
2172             System.out.println(toString(problem, details, items));
2173         }
2174 
show(String problem, String details, List<String> row)2175         void show(String problem, String details, List<String> row) {
2176             System.out.println(toString(problem, details, row));
2177         }
2178 
toString(String problem, String details, String... items)2179         private String toString(String problem, String details, String... items) {
2180             return toString(problem, details, Arrays.asList(items));
2181         }
2182 
toString(String problem, String details, List<String> row)2183         private String toString(String problem, String details, List<String> row) {
2184             return "* " + this
2185                 + " *\t" + problem + ":"
2186                 + "\t" + details
2187                 + (row != null && row.size() > 0 ? "\t" + Joiner.on("\t").join(row) : "");
2188         }
2189     }
2190 
fixCountryCode(String countryCode, List<String> row)2191     private static String fixCountryCode(String countryCode, List<String> row) {
2192         R2<List<String>, String> replacement = supplementalData.getLocaleAliasInfo().get("territory").get(countryCode);
2193         if (replacement != null) {
2194             String replacementCode = replacement.get0().get(0);
2195             BadItem.ERROR.show("deprecated territory code", countryCode + " => " + replacementCode, row);
2196             countryCode = replacementCode;
2197         }
2198         if (!sc.getAvailableCodes("territory").contains(countryCode)) {
2199             BadItem.ERROR.show("bad territory code", countryCode, row);
2200         }
2201         return countryCode;
2202     }
2203 
getULocaleLocaleName(String languageCode)2204     private static String getULocaleLocaleName(String languageCode) {
2205         return english.getName(languageCode, true);
2206         //return new ULocale(languageCode).getDisplayName();
2207     }
2208 
getULocaleScriptName(String scriptCode)2209     private static String getULocaleScriptName(String scriptCode) {
2210         return english.getName(CLDRFile.SCRIPT_NAME, scriptCode);
2211         // return ULocale.getDisplayScript("und_" + scriptCode, ULocale.ENGLISH);
2212     }
2213 
getULocaleCountryName(String countryCode)2214     private static String getULocaleCountryName(String countryCode) {
2215         return english.getName(CLDRFile.TERRITORY_NAME, countryCode);
2216         //return ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH);
2217     }
2218 }
2219