1 package org.unicode.cldr.tool;
2 
3 import java.io.BufferedReader;
4 import java.io.IOException;
5 import java.io.PrintWriter;
6 import java.text.ParseException;
7 import java.util.ArrayList;
8 import java.util.Arrays;
9 import java.util.Collection;
10 import java.util.Collections;
11 import java.util.Comparator;
12 import java.util.EnumMap;
13 import java.util.HashMap;
14 import java.util.HashSet;
15 import java.util.Iterator;
16 import java.util.LinkedHashSet;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.Set;
20 import java.util.TreeMap;
21 import java.util.TreeSet;
22 import java.util.regex.Matcher;
23 
24 import org.unicode.cldr.draft.FileUtilities;
25 import org.unicode.cldr.draft.ScriptMetadata;
26 import org.unicode.cldr.draft.ScriptMetadata.IdUsage;
27 import org.unicode.cldr.draft.ScriptMetadata.Info;
28 import org.unicode.cldr.util.Builder;
29 import org.unicode.cldr.util.CLDRFile;
30 import org.unicode.cldr.util.CLDRPaths;
31 import org.unicode.cldr.util.CldrUtility;
32 import org.unicode.cldr.util.Factory;
33 import org.unicode.cldr.util.Iso639Data;
34 import org.unicode.cldr.util.Iso639Data.Scope;
35 import org.unicode.cldr.util.Iso639Data.Source;
36 import org.unicode.cldr.util.Iso639Data.Type;
37 import org.unicode.cldr.util.LanguageTagCanonicalizer;
38 import org.unicode.cldr.util.LanguageTagParser;
39 import org.unicode.cldr.util.LocaleIDParser;
40 import org.unicode.cldr.util.LocaleIDParser.Level;
41 import org.unicode.cldr.util.Log;
42 import org.unicode.cldr.util.Pair;
43 import org.unicode.cldr.util.PatternCache;
44 import org.unicode.cldr.util.SpreadSheet;
45 import org.unicode.cldr.util.StandardCodes;
46 import org.unicode.cldr.util.StandardCodes.LstrType;
47 import org.unicode.cldr.util.SupplementalDataInfo;
48 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
49 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus;
50 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
51 import org.unicode.cldr.util.TransliteratorUtilities;
52 import org.unicode.cldr.util.Validity;
53 import org.unicode.cldr.util.Validity.Status;
54 import org.unicode.cldr.util.XPathParts;
55 import org.unicode.cldr.util.XPathParts.Comments;
56 
57 import com.google.common.collect.ImmutableSet;
58 import com.google.common.math.DoubleMath;
59 import com.ibm.icu.dev.util.CollectionUtilities;
60 import com.ibm.icu.impl.Relation;
61 import com.ibm.icu.impl.Row;
62 import com.ibm.icu.impl.Row.R2;
63 import com.ibm.icu.text.Collator;
64 import com.ibm.icu.text.NumberFormat;
65 import com.ibm.icu.text.RuleBasedCollator;
66 import com.ibm.icu.text.UTF16;
67 import com.ibm.icu.util.ULocale;
68 
69 /**
70  * @author markdavis
71  *
72  */
73 public class ConvertLanguageData {
74 
    private static final boolean DEBUG = false;
    // Locales forced into the generated default-content set: main() drops the generated sibling
    // that shares the same parent and adds these instead.
    // Change this if you need to override what is generated for the default contents.
    private static final List<String> defaultOverrides = Arrays.asList("es_ES".split("\\s+")); // und_ZZ

    // When true, the RowData(List) constructor reports differences between the spreadsheet
    // country population and the AddPopulationData value.
    public static final boolean SHOW_DIFF = false;

    // NOTE(review): not referenced in the visible part of the file; presumably relaxes a
    // minimum-population check during parsing — confirm.
    private static final boolean ALLOW_SMALL_NUMBERS = true;

    // String comparators shared by the tool; INVERSE_GENERAL wraps GENERAL_COLLATOR in an InverseComparator.
    static final Comparator<String> GENERAL_COLLATOR = new GeneralCollator();
    static final Comparator<String> INVERSE_GENERAL = new InverseComparator<String>(GENERAL_COLLATOR);

    private static StandardCodes sc = StandardCodes.make();

    static final double populationFactor = 1;
    static final double gdpFactor = 1;
    // Column indexes into a spreadsheet row (a List<String>), as read by the RowData(List) constructor.
    static final int BAD_COUNTRY_NAME = 0, COUNTRY_CODE = 1, COUNTRY_POPULATION = 2, COUNTRY_LITERACY = 3,
        COUNTRY_GDP = 4, OFFICIAL_STATUS = 5, BAD_LANGUAGE_NAME = 6, LANGUAGE_CODE = 7, LANGUAGE_POPULATION = 8,
        LANGUAGE_LITERACY = 9, COMMENT = 10, NOTES = 11;
    // NOTE(review): populated outside the visible part of the file; by name, the highest-population
    // country / script recorded per language — confirm.
    static final Map<String, CodeAndPopulation> languageToMaxCountry = new TreeMap<String, CodeAndPopulation>();
    static final Map<String, CodeAndPopulation> languageToMaxScript = new TreeMap<String, CodeAndPopulation>();

    // NOTE(review): not referenced in the visible part of the file; presumably a weighting applied
    // to populations of non-official languages — confirm.
    private static final double NON_OFFICIAL_WEIGHT = 0.40;

    private static final boolean SHOW_OLD_DEFAULT_CONTENTS = false;

    // CLDR locales that main() exempts from the "missing language/population data" check.
    private static final ImmutableSet<String> scriptAssumedLocales = ImmutableSet.of(
        "bm_ML", "ha_GH", "ha_NE", "ha_NG", "kk_KZ", "ks_IN", "ky_KG", "mn_MN", "ms_BN", "ms_MY", "ms_SG", "tk_TM", "tzm_MA", "ug_CN");

    // File/locale names to be ignored. NOTE(review): the consumer of this set is outside the
    // visible part of the file — confirm where it is applied.
    static Set<String> skipLocales = new HashSet<String>(
        Arrays
            .asList(
                "sh sh_BA sh_CS sh_YU characters supplementalData supplementalData-old supplementalData-old2 supplementalData-old3 supplementalMetadata root"
                    .split("\\s")));

    // Passed to showDefaults() by main(); the contents are filled in outside the visible part of the file.
    static Map<String, String> defaultContent = new TreeMap<String, String>();

    // Factory over every file in the CLDR main directory, and the resolved English file used for display names.
    static Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
    static CLDRFile english = cldrFactory.make("en", true);

    // Existing supplemental data, used as the baseline that newly generated data is compared against.
    static SupplementalDataInfo supplementalData = SupplementalDataInfo
        .getInstance(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY);
116 
    /**
     * Tool entry point. Opens a log for {@code supplementalData.xml} under
     * {@code CLDRPaths.GEN_DIRECTORY + "/supplemental"}, reads the existing
     * {@code supplementalData.xml} from {@code CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY}, and
     * interleaves copied sections of the old file with freshly generated language and
     * territory data and references. Afterwards it writes {@code language_script_raw.txt}
     * to the same output directory.
     *
     * <p>Along the way it reports CLDR locales for which no language/population data was
     * loaded, and computes the default-content locale set with {@link #defaultOverrides}
     * applied.
     *
     * <p>NOTE(review): any exception is caught and only printed via
     * {@code printStackTrace()}, so the process still prints "DONE" and returns normally
     * after a failure; in that case the log opened here is not closed by this method.
     *
     * @param args command-line arguments; not read by this method
     * @throws IOException if closing the old supplemental file in the {@code finally} block fails
     * @throws ParseException declared, but exceptions raised inside the {@code try} block are caught here
     */
    public static void main(String[] args) throws IOException, ParseException {
        BufferedReader oldFile = null;
        try {
            // load elements we care about
            Log.setLogNoBOM(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalData.xml");
            // Log.println("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>");
            // Log.println("<!DOCTYPE supplementalData SYSTEM \"http://www.unicode.org/cldr/data/dtd/ldmlSupplemental.dtd\">");
            // Log.println("<supplementalData version=\"1.5\">");

            // Presumably copies the old file into the log up to the <languageData> start tag — confirm
            // the exact semantics of copyUpTo (whether the matching line itself is copied).
            oldFile = FileUtilities.openUTF8Reader(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY, "supplementalData.xml");
            CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<languageData>\\s*"), Log.getLog(), false);

            Set<String> available = cldrFactory.getAvailable();

            Set<String> cldrParents = getCldrParents(available);

            List<String> failures = new ArrayList<String>();
            Map<String, RowData> localeToRowData = new TreeMap<String, RowData>();

            // Fills localeToRowData and failures as a side effect and returns the parsed rows.
            Set<RowData> sortedInput = getExcelData(failures, localeToRowData);

            // get the locales (including parents)
            Set<String> localesWithData = new TreeSet<String>(localeToRowData.keySet());
            for (String locale : localeToRowData.keySet()) {
                // Walk up the parent chain until getParent() reports no parent.
                while (true) {
                    String parent = LocaleIDParser.getParent(locale);
                    if (parent == null) break;
                    localesWithData.add(parent);
                    locale = parent;
                }
            }

            final LanguageTagParser languageTagParser = new LanguageTagParser();

            // Report every available CLDR locale that has no row data, skipping top-level aliases,
            // the scriptAssumedLocales exemptions, and locales with variants.
            for (String localeRaw : available) {
                String locale = languageTagCanonicalizer.transform(localeRaw);
                if (!localesWithData.contains(locale)) {
                    CLDRFile locFile = cldrFactory.make(localeRaw, false);
                    if (locFile.isAliasedAtTopLevel()) {
                        continue;
                    }
                    if (scriptAssumedLocales.contains(locale)) {
                        continue;
                    }
                    languageTagParser.set(locale);
                    if (languageTagParser.getVariants().size() != 0) {
                        continue;
                    }
                    // Retry the lookup with the script subtag removed.
                    String withoutScript = languageTagParser.setScript("").toString();
                    if (!localesWithData.contains(withoutScript)) {
                        // Only an error when the region is a real country code.
                        String region = new LanguageTagParser().set(locale).getRegion();
                        if (StandardCodes.isCountry(region)) {
                            BadItem.ERROR.show("missing language/population data for CLDR locale", locale + " = " + getLanguageCodeAndName(locale));
                        }
                    } else {
                        // These exceptions are OK, because these locales by default use the non-default script
                        Set<String> OKExceptions = ImmutableSet.of("sr_Cyrl_ME", "zh_Hans_HK", "zh_Hans_MO");
                        if (OKExceptions.contains(locale)) {
                            continue;
                        }
                        BadItem.ERROR.show("missing language/population data for CLDR locale", locale + " = " + getLanguageCodeAndName(locale)
                            + " but have data for " + getLanguageCodeAndName(withoutScript));
                    }
                }
            }

            // TODO sort by country code, then functionalPopulation, then language code
            // and keep the top country for each language code (even if < 1%)

            addLanguageScriptData();

            // showAllBasicLanguageData(allLanguageData, "old");
            getLanguage2Scripts(sortedInput);

            // Emits the <languageData> element to the log.
            writeNewBasicData2(sortedInput);
            // writeNewBasicData(sortedInput);

            writeTerritoryLanguageData(failures, sortedInput);

            checkBasicData(localeToRowData);

            Set<String> defaultLocaleContent = new TreeSet<String>();

            showDefaults(cldrParents, nf, defaultContent, localeToRowData, defaultLocaleContent);

            // showContent(available);

            // certain items are overridden

            // For each override, drop the generated default that shares its parent, then add the override.
            List<String> toRemove = new ArrayList<String>();
            for (String override : defaultOverrides) {
                String replacement = getReplacement(override, defaultLocaleContent);
                if (replacement != null) {
                    toRemove.add(replacement);
                }
            }
            defaultLocaleContent.removeAll(toRemove);
            defaultLocaleContent.addAll(defaultOverrides);

            showFailures(failures);

            // A null writer is passed for the spans whose content was regenerated above, and the log
            // writer for the spans kept from the old file; presumably null means "skip" — confirm.
            CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</territoryInfo>\\s*"), null, false);
            CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<references>\\s*"), Log.getLog(), false);
            // generateIso639_2Data();
            references.printReferences();
            CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</references>\\s*"), null, false);
            CldrUtility.copyUpTo(oldFile, null, Log.getLog(), false);
            // Log.println("</supplementalData>");
            Log.close();
            // NOTE(review): the finally block closes oldFile again; this call is redundant.
            oldFile.close();

            // Second output file: the raw language/script table.
            Log.setLog(CLDRPaths.GEN_DIRECTORY + "/supplemental", "language_script_raw.txt");
            getLanguageScriptSpreadsheet(Log.getLog());
            Log.close();
        } catch (Exception e) {
            // Failures are reported on stderr only; execution continues into the finally block.
            e.printStackTrace();
        } finally {
            if (oldFile != null) {
                oldFile.close();
            }
            System.out.println("DONE");
        }
    }
240 
getLanguageCodeAndName(String code)241     public static String getLanguageCodeAndName(String code) {
242         if (code == null) return null;
243         return english.getName(code) + " [" + code + "]";
244     }
245 
getReplacement(String oldDefault, Set<String> defaultLocaleContent)246     private static String getReplacement(String oldDefault, Set<String> defaultLocaleContent) {
247         String parent = LocaleIDParser.getParent(oldDefault);
248         for (String replacement : defaultLocaleContent) {
249             if (replacement.startsWith(parent)) {
250                 if (parent.equals(LocaleIDParser.getParent(replacement))) {
251                     return replacement;
252                 }
253             }
254         }
255         return null;
256     }
257 
getLanguageScriptSpreadsheet(PrintWriter out)258     private static void getLanguageScriptSpreadsheet(PrintWriter out) {
259         out.println("#Lcode LanguageName  Status  Scode ScriptName  References");
260         Pair<String, String> languageScript = new Pair<String, String>("", "");
261         for (String language : language_status_scripts.keySet()) {
262             Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
263             for (BasicLanguageData.Type status : status_scripts.keySet()) {
264                 for (String script : status_scripts.getAll(status)) {
265                     String reference = language_script_references.get(languageScript.setFirst(language).setSecond(
266                         script));
267                     out.println(language + "\t" + getLanguageName(language) + "\t" + status + "\t" + script + "\t"
268                         + getDisplayScript(script)
269                         + (reference == null ? "" : "\t" + reference));
270                 }
271             }
272         }
273     }
274 
275     /**
276      * Write data in format:
277      * <languageData>
278      * <language type="aa" scripts="Latn" territories="DJ ER ET"/>
279      *
280      * @param sortedInput
281      */
writeNewBasicData2(Set<RowData> sortedInput)282     private static void writeNewBasicData2(Set<RowData> sortedInput) {
283         double cutoff = 0.2; // 20%
284 
285         // Relation<String, BasicLanguageData> newLanguageData = new Relation(new TreeMap(), TreeSet.class);
286         LanguageTagParser ltp = new LanguageTagParser();
287         Map<String, Relation<BasicLanguageData.Type, String>> language_status_territories = new TreeMap<String, Relation<BasicLanguageData.Type, String>>();
288         //Map<String, Pair<String, String>> languageToBestCountry;
289         for (RowData rowData : sortedInput) {
290             if (rowData.countryCode.equals("ZZ")) continue;
291             ltp.set(rowData.languageCode);
292             String languageCode = ltp.getLanguage();
293             Relation<BasicLanguageData.Type, String> status_territories = language_status_territories.get(languageCode);
294             if (status_territories == null) {
295                 language_status_territories.put(languageCode, status_territories = Relation.of(
296                     new TreeMap<BasicLanguageData.Type, Set<String>>(),
297                     TreeSet.class));
298             }
299             if (rowData.officialStatus.isMajor()) {
300                 status_territories.put(BasicLanguageData.Type.primary, rowData.countryCode);
301             } else if (rowData.officialStatus.isOfficial()
302                 || rowData.getLanguagePopulation() >= cutoff * rowData.countryPopulation
303                 || rowData.getLanguagePopulation() >= 1000000) {
304                 status_territories.put(BasicLanguageData.Type.secondary, rowData.countryCode);
305             }
306         }
307 
308         Set<String> allLanguages = new TreeSet<String>(language_status_territories.keySet());
309         allLanguages.addAll(language_status_scripts.keySet());
310         // now add all the remaining language-script info
311         // <language type="sv" scripts="Latn" territories="AX FI SE"/>
312         Set<String> warnings = new LinkedHashSet<String>();
313         Log.println("\t<languageData>");
314         for (String languageSubtag : allLanguages) {
315             Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(languageSubtag);
316             Relation<BasicLanguageData.Type, String> status_territories = language_status_territories
317                 .get(languageSubtag);
318 
319             // check against old:
320             Map<BasicLanguageData.Type, BasicLanguageData> oldData = supplementalData
321                 .getBasicLanguageDataMap(languageSubtag);
322             if (oldData == null) {
323                 oldData = Collections.emptyMap();
324             }
325 
326             EnumMap<BasicLanguageData.Type, BasicLanguageData> newData = new EnumMap<BasicLanguageData.Type, BasicLanguageData>(
327                 BasicLanguageData.Type.class);
328             for (BasicLanguageData.Type status : BasicLanguageData.Type.values()) {
329                 Set<String> scripts = status_scripts == null ? null : status_scripts.getAll(status);
330                 Set<String> territories = status_territories == null ? null : status_territories.getAll(status);
331                 if (scripts == null && territories == null) continue;
332                 BasicLanguageData bld = new BasicLanguageData();
333                 bld.setTerritories(territories);
334                 bld.setScripts(scripts);
335                 bld.setType(status);
336                 bld.freeze();
337                 newData.put(status, bld);
338             }
339 
340             // compare
341             if (!CldrUtility.equals(oldData.entrySet(), newData.entrySet())) {
342                 for (String problem : compare(oldData, newData)) {
343                     warnings.add(BadItem.DETAIL.toString("changing <languageData>", languageSubtag
344                         + "\t" + english.getName(languageSubtag), problem));
345                 }
346             }
347 
348             for (BasicLanguageData bld : newData.values()) {
349                 Set<String> scripts = bld.getScripts();
350                 Set<String> territories = bld.getTerritories();
351                 BasicLanguageData.Type status = bld.getType();
352                 Log.println("\t\t<language type=\"" + languageSubtag + "\""
353                     + (scripts.isEmpty() ? "" : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"")
354                     + (territories.isEmpty() ? "" : " territories=\"" + CldrUtility.join(territories, " ") + "\"")
355                     + (status == BasicLanguageData.Type.primary ? "" : " alt=\"secondary\"")
356                     + "/>");
357             }
358         }
359         Log.println("\t</languageData>");
360         for (String s : warnings) {
361             if (s.contains("!")) {
362                 System.out.println(s);
363             }
364         }
365         for (String s : warnings) {
366             if (!s.contains("!")) {
367                 System.out.println(s);
368             }
369         }
370     }
371 
compare(Map<BasicLanguageData.Type, BasicLanguageData> oldData, Map<BasicLanguageData.Type, BasicLanguageData> newData)372     private static List<String> compare(Map<BasicLanguageData.Type, BasicLanguageData> oldData,
373         Map<BasicLanguageData.Type, BasicLanguageData> newData) {
374         Map<String, BasicLanguageData.Type> oldDataToType = getDataToType(oldData.values(), true);
375         Map<String, BasicLanguageData.Type> newDataToType = getDataToType(newData.values(), true);
376         List<String> result = new ArrayList<>();
377         StringBuilder temp = new StringBuilder();
378         for (String s : Builder.with(new LinkedHashSet<String>()).addAll(oldDataToType.keySet())
379             .addAll(newDataToType.keySet()).get()) {
380             BasicLanguageData.Type oldValue = oldDataToType.get(s);
381             BasicLanguageData.Type newValue = newDataToType.get(s);
382             if (!CldrUtility.equals(oldValue, newValue)) {
383                 temp.setLength(0);
384                 temp.append("[").append(s).append(":")
385                     .append(english.getName(s.length() == 4 ? "script" : "region", s)).append("] ");
386                 if (oldValue == null) {
387                     temp.append(" added as ").append(newValue);
388                 } else if (newValue == null) {
389                     temp.append(" REMOVED!");
390                 } else if (oldValue == BasicLanguageData.Type.primary) {
391                     temp.append(" DOWNGRADED TO! ").append(newValue);
392                 } else {
393                     temp.append(" upgraded to ").append(newValue);
394                 }
395                 result.add(temp.toString());
396             }
397         }
398         result.add(newData.toString());
399         return result;
400     }
401 
getDataToType( Collection<BasicLanguageData> collection, boolean script)402     private static Map<String, BasicLanguageData.Type> getDataToType(
403         Collection<BasicLanguageData> collection, boolean script) {
404         Map<String, BasicLanguageData.Type> result = new TreeMap<String, BasicLanguageData.Type>();
405         for (BasicLanguageData i : collection) {
406             for (String s : i.getScripts()) {
407                 result.put(s, i.getType());
408             }
409             for (String s : i.getTerritories()) {
410                 result.put(s, i.getType());
411             }
412         }
413         return result;
414     }
415 
checkBasicData(Map<String, RowData> localeToRowData)416     private static void checkBasicData(Map<String, RowData> localeToRowData) {
417         // find languages with multiple scripts
418         Relation<String, String> languageToScripts = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
419         for (String languageSubtag : language2BasicLanguageData.keySet()) {
420             for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) {
421                 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), item.getScripts());
422             }
423         }
424         // get primary combinations
425         Set<String> primaryCombos = new TreeSet<String>();
426         Set<String> basicCombos = new TreeSet<String>();
427         for (String languageSubtag : language2BasicLanguageData.keySet()) {
428             for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) {
429                 Set<String> scripts = new TreeSet<String>();
430                 scripts.addAll(item.getScripts());
431                 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), scripts);
432                 if (scripts.size() == 0) {
433                     scripts.add("Zzzz");
434                 }
435                 Set<String> territories = new TreeSet<String>();
436                 territories.addAll(item.getTerritories());
437                 if (territories.size() == 0) {
438                     territories.add("ZZ");
439                     continue;
440                 }
441 
442                 for (String script : scripts) {
443                     for (String territory : territories) {
444                         String locale = StandardCodes.fixLanguageTag(languageSubtag)
445                             // + (script.equals("Zzzz") ? "" : languageToScripts.getAll(languageSubtag).size() <= 1 ? ""
446                             // : "_" + script)
447                             + (territories.equals("ZZ") ? "" : "_" + territory);
448                         if (item.getType() != BasicLanguageData.Type.secondary) {
449                             primaryCombos.add(locale);
450                         }
451                         basicCombos.add(locale);
452                     }
453                 }
454             }
455         }
456         Set<String> populationOver20 = new TreeSet<String>();
457         Set<String> population = new TreeSet<String>();
458         LanguageTagParser ltp = new LanguageTagParser();
459         for (String rawLocale : localeToRowData.keySet()) {
460             ltp.set(rawLocale);
461             String locale = ltp.getLanguage() + (ltp.getRegion().length() == 0 ? "" : "_" + ltp.getRegion());
462             population.add(locale);
463             RowData rowData = localeToRowData.get(rawLocale);
464             if (rowData.getLanguagePopulation() / rowData.countryPopulation >= 0.2
465             //|| rowData.getLanguagePopulation() > 900000
466             ) {
467                 populationOver20.add(locale);
468             } else {
469                 PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData(
470                     ltp.getLanguageScript(), ltp.getRegion());
471                 if (popData != null && popData.getOfficialStatus().isOfficial()) {
472                     populationOver20.add(locale);
473                 }
474             }
475         }
476         Set<String> inBasicButNotPopulation = new TreeSet<String>(primaryCombos);
477 
478         inBasicButNotPopulation.removeAll(population);
479         for (String locale : inBasicButNotPopulation) {
480             ltp.set(locale);
481             String region = ltp.getRegion();
482             String language = ltp.getLanguage();
483             if (!sc.isModernLanguage(language)) continue;
484             PopulationData popData = supplementalData.getPopulationDataForTerritory(region);
485             // Afghanistan AF "29,928,987" 28.10% "21,500,000,000" Hazaragi haz "1,770,000" 28.10%
486             BadItem.WARNING.show("In Basic Data but not Population > 20%",
487                 getDisplayCountry(region)
488                     + "\t" + region
489                     + "\t\"" + formatNumber(popData.getPopulation(), 0, false) + "\""
490                     + "\t\"" + formatPercent(popData.getLiteratePopulation() / popData.getPopulation(), 0, false)
491                     + "\""
492                     + "\t\"" + formatPercent(popData.getGdp(), 0, false) + "\""
493                     + "\t" + ""
494                     + "\t" + getLanguageName(language)
495                     + "\t" + language
496                     + "\t" + -1
497                     + "\t\"" + formatPercent(popData.getLiteratePopulation() / popData.getPopulation(), 0, false)
498                     + "\"");
499         }
500 
501         Set<String> inPopulationButNotBasic = new TreeSet<String>(populationOver20);
502         inPopulationButNotBasic.removeAll(basicCombos);
503         for (Iterator<String> it = inPopulationButNotBasic.iterator(); it.hasNext();) {
504             String locale = it.next();
505             if (locale.endsWith("_ZZ")) {
506                 it.remove();
507             }
508         }
509         for (String locale : inPopulationButNotBasic) {
510             BadItem.WARNING.show("In Population>20% but not Basic Data", locale + " " + getLanguageName(locale), localeToRowData.get(locale).toString());
511         }
512     }
513 
514     static class LanguageInfo {
515         static LanguageInfo INSTANCE = new LanguageInfo();
516 
517         Map<String, Set<String>> languageToScripts = new TreeMap<String, Set<String>>();
518         Map<String, Set<String>> languageToRegions = new TreeMap<String, Set<String>>();
519         Map<String, Comments> languageToComments = new TreeMap<String, Comments>();
520 
521         Map<String, Set<String>> languageToScriptsAlt = new TreeMap<String, Set<String>>();
522         Map<String, Set<String>> languageToRegionsAlt = new TreeMap<String, Set<String>>();
523         Map<String, Comments> languageToCommentsAlt = new TreeMap<String, Comments>();
524 
LanguageInfo()525         private LanguageInfo() {
526             cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
527             //Set<String> available = cldrFactory.getAvailable();
528             CLDRFile supplemental = cldrFactory.make("supplementalData", true);
529             XPathParts parts = new XPathParts();
530             for (Iterator<String> it = supplemental.iterator("//supplementalData/languageData/language"); it.hasNext();) {
531                 String xpath = it.next();
532                 Map<String, String> x = parts.set(xpath).getAttributes(-1);
533                 boolean alt = x.containsKey("alt");
534                 String lang = x.get("type");
535                 List<String> scripts = getAttributeList(x, "scripts");
536                 if (scripts != null) {
537                     if (alt) {
538                         putAll(languageToScriptsAlt, lang, new LinkedHashSet<String>(scripts));
539                     } else {
540                         putAll(languageToScripts, lang, new LinkedHashSet<String>(scripts));
541                     }
542                 }
543                 List<String> regions = getAttributeList(x, "territories");
544                 if (regions != null) {
545                     if (alt) {
546                         putAll(languageToRegionsAlt, lang, new LinkedHashSet<String>(regions));
547                     } else {
548                         putAll(languageToRegions, lang, new LinkedHashSet<String>(regions));
549                     }
550                 }
551             }
552         }
553 
getAttributeList(Map<String, String> x, String attribute)554         private List<String> getAttributeList(Map<String, String> x, String attribute) {
555             List<String> scripts = null;
556             String scriptString = x.get(attribute);
557             if (scriptString != null) {
558                 scripts = Arrays.asList(scriptString.split("\\s+"));
559             }
560             return scripts;
561         }
562     }
563 
putUnique(Map<K, V> map, K key, V value)564     private static <K, V> void putUnique(Map<K, V> map, K key, V value) {
565         V oldValue = map.get(key);
566         if (oldValue != null && !oldValue.equals(value)) {
567             throw new IllegalArgumentException("Duplicate value for <" + key + ">: <" + oldValue + ">, <" + value + ">");
568         }
569         map.put(key, value);
570     }
571 
putAll(Map<K, Set<W>> map, K key, Set<W> values)572     private static <K, W> void putAll(Map<K, Set<W>> map, K key, Set<W> values) {
573         Set<W> oldValue = map.get(key);
574         if (oldValue == null) {
575             map.put(key, values);
576         } else {
577             oldValue.addAll(values);
578         }
579     }
580 
581     // public enum OfficialStatus {unknown, de_facto_official, official, official_regional, official_minority};
582 
583     static class RowData implements Comparable<Object> {
584         private final String countryCode;
585         private final double countryGdp;
586         private final double countryLiteracy;
587         private final double countryPopulation;
588         private final String languageCode;
589         private final OfficialStatus officialStatus;
590         private final double languagePopulation;
591         private final double languageLiteracy;
592         private final String comment;
593         private final String notes;
594         private final String badLanguageName;
595         private final boolean relativeLanguagePopulation;
596         // String badLanguageCode = "";
597         private final static Set<String> doneCountries = new HashSet<String>();
598 
599         private final static Set<String> countryCodes = sc.getGoodAvailableCodes("territory");
600 
RowData(String country, String language)601         public RowData(String country, String language) {
602             this.countryCode = country;
603             this.languageCode = language;
604             badLanguageName = country = language = notes = comment = "";
605             officialStatus = OfficialStatus.unknown;
606             countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000);
607             countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d;
608             countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue();
609             languagePopulation = languageLiteracy = Double.NaN;
610             relativeLanguagePopulation = false;
611         }
612 
RowData(List<String> row)613         RowData(List<String> row) throws ParseException {
614             countryCode = fixCountryCode(row.get(COUNTRY_CODE), row);
615 
616             if (!countryCodes.contains(countryCode)) {
617                 System.err.println("WRONG COUNTRY CODE: " + row);
618             }
619 
620             double countryPopulation1 = parseDecimal(row.get(COUNTRY_POPULATION));
621             double countryLiteracy1 = parsePercent(row.get(COUNTRY_LITERACY), countryPopulation1);
622 
623             countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000);
624             countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d;
625             countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue();
626 
627             String officialStatusString = row.get(OFFICIAL_STATUS).trim().replace(' ', '_');
628             if (officialStatusString.equals("national")) {
629                 officialStatusString = "official";
630             } else if (officialStatusString.equals("regional_official")) {
631                 officialStatusString = "official_regional";
632             } else if (officialStatusString.length() == 0 || officialStatusString.equals("uninhabited")) {
633                 officialStatusString = "unknown";
634             }
635             try {
636                 officialStatus = OfficialStatus.valueOf(officialStatusString);
637             } catch (RuntimeException e) {
638                 throw new IllegalArgumentException("Can't interpret offical-status: " + officialStatusString);
639             }
640 
641             String languageCode1 = row.get(LANGUAGE_CODE);
642             if (languageCode1.startsWith("*") || languageCode1.startsWith("\u00A7")) {
643                 languageCode1 = languageCode1.substring(1);
644             }
645             languageCode = fixLanguageCode(languageCode1, row);
646 
647             if (doneCountries.contains(countryCode) == false) {
648                 // showDiff(countryGdp1, countryGdp);
649                 // showDiff(countryLiteracy1, countryLiteracy);
650                 if (SHOW_DIFF) showDiff(countryPopulation1, countryPopulation, 0.1, false);
651                 doneCountries.add(countryCode);
652             }
653 
654             double languagePopulation1 = parsePercent(row.get(LANGUAGE_POPULATION), countryPopulation1)
655                 * countryPopulation1;
656             if ((officialStatus.isMajor())
657                 && languagePopulation1 * 100 < countryPopulation && languagePopulation1 < 1000000) {
658                 BadItem.WARNING.show("official language has population < 1% of country & < 1,000,000", languageCode + ", " + Math.round(languagePopulation1),
659                     row);
660             }
661             if (languagePopulation1 < 0.999) {
662                 BadItem.WARNING.show("suspect language population, < 1", languageCode + ", " + Math.round(languagePopulation1), row);
663             }
664             if (languagePopulation1 > 10000) {
665                 relativeLanguagePopulation = true;
666                 languagePopulation1 = languagePopulation1 * countryPopulation / countryPopulation1; // correct the
667                 // values
668             } else {
669                 relativeLanguagePopulation = false;
670             }
671             if (isApproximatelyGreater(languagePopulation1, countryPopulation, 0.0001)) {
672                 BadItem.ERROR.show("language population > country population", Math.round(languagePopulation1) + " > " + countryPopulation, row);
673             }
674             languagePopulation = languagePopulation1 < countryPopulation ? languagePopulation1 : countryPopulation;
675 
676             if (SHOW_DIFF)
677                 showDiff(languagePopulation1 / countryPopulation1, languagePopulation / countryPopulation, 0.01, true);
678 
679             String stringLanguageLiteracy = row.size() <= LANGUAGE_LITERACY ? "" : row.get(LANGUAGE_LITERACY);
680             double languageLiteracy1 = stringLanguageLiteracy.length() == 0 ? countryLiteracy
681                 : parsePercent(stringLanguageLiteracy, languagePopulation);
682             if (isApproximatelyEqual(languageLiteracy1, countryLiteracy1, 0.001)) {
683                 languageLiteracy1 = countryLiteracy; // correct the values
684             }
685             languageLiteracy = languageLiteracy1;
686 
687             if (row.size() > COMMENT) {
688                 comment = row.get(COMMENT);
689             } else {
690                 comment = "";
691             }
692             if (row.size() > NOTES) {
693                 notes = row.get(NOTES);
694             } else {
695                 notes = "";
696             }
697             badLanguageName = row.get(BAD_LANGUAGE_NAME);
698         }
699 
showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang)700         private void showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang) {
701             final double diff = new_a / a - 1;
702             if (Math.abs(diff) > maxRelativeDiff) {
703                 System.out.println(formatPercent(diff, 0, false)
704                     + "\t" + countryCode + "\t" + getDisplayCountry(countryCode)
705                     + (showLang ? "\t" + languageCode + "\t" + getLanguageName(languageCode) : "")
706                     + "\t" + formatNumber(a, 0, false) + "\t=>\t" + formatNumber(new_a, 0, false));
707             }
708         }
709 
roundToPartsPer(double a, double whole)710         private double roundToPartsPer(double a, double whole) {
711             // break this out just to make it easier to follow.
712             double log10 = Math.log10(a / whole);
713             long digitsFound = (long) (log10);
714             long factor = (long) (Math.pow(10, digitsFound));
715             double rounded = Math.round(a / factor);
716             double result = rounded * factor;
717             // if (Math.abs(result - a) >= 1) {
718             // System.out.println("Rounding " + a + " => " + result);
719             // }
720             return result;
721         }
722 
isApproximatelyEqual(double a, double b, double epsilon)723         private static boolean isApproximatelyEqual(double a, double b, double epsilon) {
724             return a == b || Math.abs(a - b) < epsilon;
725         }
726 
isApproximatelyGreater(double a, double b, double epsilon)727         private static boolean isApproximatelyGreater(double a, double b, double epsilon) {
728             return a > b + epsilon;
729         }
730 
parseDecimal(String numericRepresentation)731         double parseDecimal(String numericRepresentation) throws ParseException {
732             try {
733                 // if (numericRepresentation == null || numericRepresentation.length() == 0) return Double.NaN;
734                 Number result = nf.parse(numericRepresentation);
735                 // if (result == null) return Double.NaN;
736                 return result.doubleValue();
737             } catch (ParseException e) {
738                 throw e;
739                 // (RuntimeException) new IllegalArgumentException("can't parse <" + numericRepresentation +
740                 // ">").initCause(e);
741             }
742         }
743 
parsePercent(String numericRepresentation, double baseValue)744         double parsePercent(String numericRepresentation, double baseValue) throws ParseException {
745             try {
746                 double result;
747                 if (numericRepresentation.contains("%")) {
748                     Number result0 = pf.parse(numericRepresentation);
749                     result = result0.doubleValue();
750                 } else {
751                     Number result0 = nf.parse(numericRepresentation);
752                     result = result0.doubleValue() / baseValue;
753                 }
754                 // if (numericRepresentation == null || numericRepresentation.length() == 0) return Double.NaN;
755                 // if (result == null) return Double.NaN;
756                 return result;
757             } catch (ParseException e) {
758                 throw e;
759                 // (RuntimeException) new IllegalArgumentException("can't parse <" + numericRepresentation +
760                 // ">").initCause(e);
761             }
762         }
763 
getLanguageLiteratePopulation()764         public double getLanguageLiteratePopulation() {
765             return languageLiteracy * languagePopulation;
766         }
767 
768         /**
769          * Get the weighted population
770          *
771          * @param weightIfNotOfficial
772          * @return
773          */
getLanguageLiteratePopulation(double weightIfNotOfficial)774         public double getLanguageLiteratePopulation(double weightIfNotOfficial) {
775             double result = languageLiteracy * languagePopulation;
776             if (!officialStatus.isMajor()) {
777                 result *= weightIfNotOfficial;
778             }
779             return result;
780         }
781 
compareTo(Object o)782         public int compareTo(Object o) {
783             RowData that = (RowData) o;
784             int result;
785             if (0 != (result = GENERAL_COLLATOR.compare(countryCode, that.countryCode))) return result;
786             if (languagePopulation > that.languagePopulation) return -1; // descending
787             if (languagePopulation < that.languagePopulation) return 1;
788             if (0 != (result = GENERAL_COLLATOR.compare(languageCode, that.languageCode))) return result;
789             return 0;
790         }
791 
toStringHeader()792         public static String toStringHeader() {
793             return "countryCode" + "\t" + "countryPopulation" + "\t" + "countryGdp"
794                 + "\t" + "countryLiteracy"
795                 + "\t" + "languagePopulation" + "\t" + "languageCode"
796                 + "\t" + "writingPopulation";
797         }
798 
toString()799         public String toString() {
800             return countryCode + "\t" + countryPopulation + "\t" + countryGdp
801                 + "\t" + countryLiteracy
802                 + "\t" + languagePopulation + "\t" + languageCode
803                 + "\t" + languageLiteracy;
804         }
805 
toString(boolean b)806         public String toString(boolean b) {
807             return "region:\t" + getCountryCodeAndName(countryCode)
808                 + "\tpop:\t" + countryPopulation
809                 + "\tgdp:\t" + countryGdp
810                 + "\tlit:\t" + countryLiteracy
811                 + "\tlang:\t" + getLanguageCodeAndName(languageCode)
812                 + "\tpop:\t" + languagePopulation
813                 + "\tlit:\t" + languageLiteracy;
814         }
815 
816         static boolean MARK_OUTPUT = false;
817 
getRickLanguageCode()818         public String getRickLanguageCode() {
819             if (languageCode.contains("_")) return languageCode;
820             Source source = Iso639Data.getSource(languageCode);
821             if (source == null) {
822                 return "§" + languageCode;
823             }
824             if (MARK_OUTPUT) {
825                 if (source == Source.ISO_639_3) {
826                     return "*" + languageCode;
827                 }
828             }
829             return languageCode;
830         }
831 
832         static Map<String, String> oldToFixed = new HashMap<>();
833 
getRickLanguageName()834         public String getRickLanguageName() {
835             String cldrResult = getExcelQuote(english.getName(languageCode, true));
836 //            String result = getRickLanguageName2();
837 //            if (!result.equalsIgnoreCase(cldrResult)) {
838 //                if (null == oldToFixed.put(result, cldrResult)) {
839 //                    System.out.println("## " + result + "!=" + cldrResult);
840 //                }
841 //            }
842             return cldrResult;
843         }
844 
getRickLanguageName2()845         public String getRickLanguageName2() {
846             String result = new ULocale(languageCode).getDisplayName();
847             if (!result.equals(languageCode)) return getExcelQuote(result);
848             Set<String> names = Iso639Data.getNames(languageCode);
849             if (names != null && names.size() != 0) {
850                 if (MARK_OUTPUT) {
851                     return getExcelQuote("*" + names.iterator().next());
852                 } else {
853                     return getExcelQuote(names.iterator().next());
854                 }
855             }
856             return getExcelQuote("§" + badLanguageName);
857         }
858 
getCountryName()859         public String getCountryName() {
860             return getExcelQuote(getDisplayCountry(countryCode));
861         }
862 
getCountryGdpString()863         public String getCountryGdpString() {
864             return getExcelQuote(formatNumber(countryGdp, 0, false));
865         }
866 
getCountryLiteracyString()867         public String getCountryLiteracyString() {
868             return formatPercent(countryLiteracy, 2, false);
869         }
870 
getCountryPopulationString()871         public String getCountryPopulationString() {
872             return getExcelQuote(formatNumber(countryPopulation, 0, false));
873         }
874 
getLanguageLiteracyString()875         public String getLanguageLiteracyString() {
876             return formatPercent(languageLiteracy, 2, false);
877         }
878 
getLanguagePopulationString()879         public String getLanguagePopulationString() {
880 
881             try {
882                 final double percent = languagePopulation / countryPopulation;
883                 return getExcelQuote(relativeLanguagePopulation
884                     && percent > 0.03
885                     && languagePopulation > 10000
886                         ? formatPercent(percent, 2, false)
887                         : formatNumber(languagePopulation, 3, false));
888             } catch (IllegalArgumentException e) {
889                 return "NaN";
890             }
891         }
892 
getLanguagePopulation()893         private double getLanguagePopulation() {
894             return languagePopulation;
895         }
896 
897     }
898 
getExcelQuote(String comment)899     public static String getExcelQuote(String comment) {
900         return comment == null || comment.length() == 0 ? ""
901             : comment.contains(",") ? '"' + comment + '"'
902                 : comment.contains("\"") ? '"' + comment.replace("\"", "\"\"") + '"'
903                     : comment;
904     }
905 
getCountryCodeAndName(String code)906     public static String getCountryCodeAndName(String code) {
907         if (code == null) return null;
908         return english.getName(CLDRFile.TERRITORY_NAME, code) + " [" + code + "]";
909     }
910 
911     static class RickComparator implements Comparator<RowData> {
compare(RowData me, RowData that)912         public int compare(RowData me, RowData that) {
913             int result;
914             if (0 != (result = GENERAL_COLLATOR.compare(me.getCountryName(), that.getCountryName()))) return result;
915             if (0 != (result = GENERAL_COLLATOR.compare(me.getRickLanguageName(), that.getRickLanguageName())))
916                 return result;
917             return me.compareTo(that);
918         }
919     }
920 
writeTerritoryLanguageData(List<String> failures, Set<RowData> sortedInput)921     private static void writeTerritoryLanguageData(List<String> failures, Set<RowData> sortedInput) {
922 
923         String lastCountryCode = "";
924         boolean first = true;
925         LanguageTagParser ltp = new LanguageTagParser();
926 
927         Log.println(" <!-- See http://unicode.org/cldr/data/diff/supplemental/territory_language_information.html for more information on territoryInfo. -->");
928         Log.println("\t<territoryInfo>");
929 
930         for (RowData row : sortedInput) {
931             String countryCode = row.countryCode;
932 
933             double countryPopulationRaw = row.countryPopulation;
934             double countryPopulation = countryPopulationRaw; // (long) Utility.roundToDecimals(countryPopulationRaw, 2);
935             double languageLiteracy = row.languageLiteracy;
936             double countryLiteracy = row.countryLiteracy;
937 
938             double countryGDPRaw = row.countryGdp;
939             long countryGDP = Math.round(countryGDPRaw / gdpFactor);
940 
941             String languageCode = row.languageCode;
942 
943             double languagePopulationRaw = row.getLanguagePopulation();
944             double languagePopulation = languagePopulationRaw; // (long) Utility.roundToDecimals(languagePopulationRaw,
945             // 2);
946 
947             double languagePopulationPercent = languagePopulation / countryPopulation;
948             // Utility.roundToDecimals(Math.min(100, Math.max(0,
949             // languagePopulation*100 / (double)countryPopulation)),3);
950 
951             if (!countryCode.equals(lastCountryCode)) {
952                 if (first) {
953                     first = false;
954                 } else {
955                     Log.println("\t\t</territory>");
956                 }
957                 Log.print("\t\t<territory type=\"" + countryCode + "\""
958                     + " gdp=\"" + formatNumber(countryGDP, 4, true) + "\""
959                     + " literacyPercent=\"" + formatPercent(countryLiteracy, 3, true) + "\""
960                     + " population=\"" + formatNumber(countryPopulation, 6, true) + "\">");
961                 lastCountryCode = countryCode;
962                 Log.println("\t<!--" + getDisplayCountry(countryCode) + "-->");
963             }
964 
965             if (languageCode.length() != 0
966                 && languagePopulationPercent > 0.0000
967                 && (ALLOW_SMALL_NUMBERS || languagePopulationPercent >= 1 || languagePopulationRaw > 100000
968                     || languageCode.equals("haw") || row.officialStatus.isOfficial())) {
969                 // add best case
970                 addBestRegion(languageCode, countryCode, languagePopulationRaw);
971                 String baseScriptLanguage = ltp.set(languageCode).getLanguageScript();
972                 if (!baseScriptLanguage.equals(languageCode)) {
973                     addBestRegion(baseScriptLanguage, countryCode, languagePopulationRaw);
974                 }
975                 String baseLanguage = ltp.set(baseScriptLanguage).getLanguage();
976                 if (!baseLanguage.equals(baseScriptLanguage)) {
977                     addBestRegion(baseLanguage, countryCode, languagePopulationRaw);
978                     addBestScript(baseLanguage, ltp.set(languageCode).getScript(), languagePopulationRaw);
979                 }
980 
981                 if (languageLiteracy != countryLiteracy) {
982                     int debug = 0;
983                 }
984                 Log.print("\t\t\t<languagePopulation type=\""
985                     + languageCode
986                     + "\""
987                     + (DoubleMath.fuzzyCompare(languageLiteracy, countryLiteracy, 0.0001) == 0 ? ""
988                         : (DoubleMath.fuzzyCompare(languageLiteracy, 0.05, 0.0001) == 0 ? " writingPercent=\"" : " literacyPercent=\"")
989                             + formatPercent(languageLiteracy, 2, true) + "\"")
990                     + " populationPercent=\"" + formatPercent(languagePopulationPercent, 2, true) + "\""
991                     + (row.officialStatus.isOfficial() ? " officialStatus=\"" + row.officialStatus + "\"" : "")
992                     + references.addReference(row.notes)
993                     + "/>");
994                 Log.println("\t<!--" + getLanguageName(languageCode) + "-->");
995             } else if (!row.countryCode.equals("ZZ")) {
996                 failures.add(BadItem.ERROR.toString("too few speakers: suspect line", languageCode, row.toString(true)));
997             }
998             // if (first) {
999             if (false) System.out.print(
1000                 "countryCode: " + countryCode + "\t"
1001                     + "countryPopulation: " + countryPopulation + "\t"
1002                     + "countryGDP: " + countryGDP + "\t"
1003                     + "languageCode: " + languageCode + "\t"
1004                     + "languagePopulation: " + languagePopulation + CldrUtility.LINE_SEPARATOR);
1005             // }
1006         }
1007 
1008         Log.println("\t\t</territory>");
1009         Log.println("\t</territoryInfo>");
1010     }
1011 
getDisplayCountry(String countryCode)1012     private static String getDisplayCountry(String countryCode) {
1013         String result = getULocaleCountryName(countryCode);
1014         if (!result.equals(countryCode)) {
1015             return result;
1016         }
1017         result = sc.getData("territory", countryCode);
1018         if (result != null) {
1019             return result;
1020         }
1021         return countryCode;
1022         // new ULocale("und-" + countryCode).getDisplayCountry()
1023     }
1024 
getDisplayScript(String scriptCode)1025     private static String getDisplayScript(String scriptCode) {
1026         String result = getULocaleScriptName(scriptCode);
1027         if (!result.equals(scriptCode)) {
1028             return result;
1029         }
1030         result = sc.getData("territory", scriptCode);
1031         if (result != null) {
1032             return result;
1033         }
1034         return scriptCode;
1035         // new ULocale("und-" + countryCode).getDisplayCountry()
1036     }
1037 
getLanguageName(String languageCode)1038     private static String getLanguageName(String languageCode) {
1039         String result = getULocaleLocaleName(languageCode);
1040         if (!result.equals(languageCode)) return result;
1041         Set<String> names = Iso639Data.getNames(languageCode);
1042         if (names != null && names.size() != 0) {
1043             return names.iterator().next();
1044         }
1045         return languageCode;
1046     }
1047 
1048     static class References {
1049         Map<String, Pair<String, String>> Rxxx_to_reference = new TreeMap<String, Pair<String, String>>();
1050         Map<Pair<String, String>, String> reference_to_Rxxx = new TreeMap<Pair<String, String>, String>();
1051         Map<String, Pair<String, String>> Rxxx_to_oldReferences = supplementalData.getReferences();
1052         Map<Pair<String, String>, String> oldReferences_to_Rxxx = new TreeMap<Pair<String, String>, String>();
1053         {
1054             for (String Rxxx : Rxxx_to_oldReferences.keySet()) {
Rxxx_to_oldReferences.get(Rxxx)1055                 oldReferences_to_Rxxx.put(Rxxx_to_oldReferences.get(Rxxx), Rxxx);
1056             }
1057         }
1058         Matcher URI = PatternCache.get("([a-z]+\\://[\\S]+)\\s?(.*)").matcher("");
1059 
1060         static int referenceStart = 1000;
1061 
1062         /**
1063          * Returns " references=\"" + Rxxx + "\"" or "" if there is no reference.
1064          *
1065          * @param rawReferenceText
1066          * @return
1067          */
addReference(String rawReferenceText)1068         private String addReference(String rawReferenceText) {
1069             if (rawReferenceText == null || rawReferenceText.length() == 0) return "";
1070             Pair<String, String> p;
1071             if (URI.reset(rawReferenceText).matches()) {
1072                 p = new Pair<String, String>(URI.group(1), URI.group(2) == null || URI.group(2).length() == 0 ? "[missing]"
1073                     : URI.group(2)).freeze();
1074             } else {
1075                 p = new Pair<String, String>(null, rawReferenceText).freeze();
1076             }
1077 
1078             String Rxxx = reference_to_Rxxx.get(p);
1079             if (Rxxx == null) { // add new
1080                 Rxxx = oldReferences_to_Rxxx.get(p);
1081                 if (Rxxx != null) { // if old, just keep number
1082                     p = Rxxx_to_oldReferences.get(Rxxx);
1083                 } else { // find an empty number
1084                     while (true) {
1085                         Rxxx = "R" + (referenceStart++);
1086                         if (Rxxx_to_reference.get(Rxxx) == null && Rxxx_to_oldReferences.get(Rxxx) == null) {
1087                             break;
1088                         }
1089                     }
1090                 }
1091                 // add to new references
1092                 reference_to_Rxxx.put(p, Rxxx);
1093                 Rxxx_to_reference.put(Rxxx, p);
1094             }
1095             // references="R034"
1096             return " references=\"" + Rxxx + "\"";
1097         }
1098 
getReferenceHTML(String Rxxx)1099         String getReferenceHTML(String Rxxx) {
1100             Pair<String, String> p = Rxxx_to_reference.get(Rxxx); // exception if fails.
1101             String uri = p.getFirst();
1102             String value = p.getSecond();
1103             uri = uri == null ? "" : " uri=\"" + TransliteratorUtilities.toHTML.transliterate(uri) + "\"";
1104             value = value == null ? "[missing]" : TransliteratorUtilities.toHTML.transliterate(value);
1105             return "\t\t<reference type=\"" + Rxxx + "\"" + uri + ">" + value + "</reference>";
1106         }
1107 
printReferences()1108         void printReferences() {
1109             // <reference type="R034" uri="isbn:0-321-18578-1">The Unicode Standard 4.0</reference>
1110             Log.println("\t<references>");
1111             for (String Rxxx : Rxxx_to_reference.keySet()) {
1112                 Log.println(getReferenceHTML(Rxxx));
1113             }
1114             Log.println("\t</references>");
1115         }
1116     }
1117 
    // Shared reference registry; writeTerritoryLanguageData() registers each row's notes through it.
    static References references = new References();
1119 
getExcelData(List<String> failures, Map<String, RowData> localeToRowData)1120     private static Set<RowData> getExcelData(List<String> failures, Map<String, RowData> localeToRowData)
1121         throws IOException {
1122 
1123         LanguageTagParser ltp = new LanguageTagParser();
1124 
1125         String dir = CLDRPaths.GEN_DIRECTORY + "supplemental/";
1126         final String ricksFile = "country_language_population_raw.txt";
1127         System.out.println("\n# Problems in " + ricksFile + "\n");
1128         List<List<String>> input = SpreadSheet.convert(CldrUtility.getUTF8Data(ricksFile));
1129 
1130         Set<String> languages = languagesNeeded; // sc.getGoodAvailableCodes("language");
1131 
1132         Set<String> territories = new TreeSet<String>(sc.getGoodAvailableCodes("territory"));
1133         territories.removeAll(supplementalData.getContainers());
1134         territories.remove("EU");
1135         territories.remove("QO");
1136 
1137         Set<String> countriesNotFound = new TreeSet<String>(territories);
1138         Set<OfficialStatus> statusFound = new TreeSet<OfficialStatus>();
1139         Set<String> countriesWithoutOfficial = new TreeSet<String>(territories);
1140         countriesWithoutOfficial.remove("ZZ");
1141 
1142         Map<String, Row.R2<String, Double>> countryToLargestOfficialLanguage = new HashMap<String, Row.R2<String, Double>>();
1143 
1144         Set<String> languagesNotFound = new TreeSet<String>(languages);
1145         Set<RowData> sortedInput = new TreeSet<RowData>();
1146         int count = 0;
1147         for (List<String> row : input) {
1148             ++count;
1149             if (count == 1 || row.size() <= COUNTRY_GDP) {
1150                 failures.add(join(row, "\t") + "\tShort row");
1151                 continue;
1152             }
1153             try {
1154                 RowData x = new RowData(row);
1155                 if (x.officialStatus.isOfficial()) {
1156                     Row.R2<String, Double> largestOffical = countryToLargestOfficialLanguage.get(x.countryCode);
1157                     if (largestOffical == null) {
1158                         countryToLargestOfficialLanguage.put(x.countryCode,
1159                             Row.of(x.languageCode, x.languagePopulation));
1160                     } else if (largestOffical.get1() < x.languagePopulation) {
1161                         largestOffical.set0(x.languageCode);
1162                         largestOffical.set1(x.languagePopulation);
1163                     }
1164                 }
1165                 if (x.officialStatus.isMajor() || x.countryPopulation < 1000) {
1166                     countriesWithoutOfficial.remove(x.countryCode);
1167                 }
1168                 if (!checkCode(LstrType.region, x.countryCode, row)) continue;
1169                 statusFound.add(x.officialStatus);
1170                 countriesNotFound.remove(x.countryCode);
1171                 languagesNotFound.remove(x.languageCode);
1172                 if (x.languageCode.contains("_")) {
1173                     ltp.set(x.languageCode);
1174                     languagesNotFound.remove(ltp.getLanguage());
1175                     if (!checkCode(LstrType.language, ltp.getLanguage(), row)) continue;
1176                     if (!checkCode(LstrType.script, ltp.getScript(), row)) continue;
1177                 }
1178                 String locale = x.languageCode + "_" + x.countryCode;
1179                 if (localeToRowData.get(locale) != null) {
1180                     BadItem.ERROR.show("duplicate data", x.languageCode + " with " + x.countryCode, row);
1181                 }
1182                 localeToRowData.put(locale, x);
1183                 sortedInput.add(x);
1184             } catch (ParseException e) {
1185                 failures.add(join(row, "\t") + "\t" + e.getMessage() + "\t"
1186                     + join(Arrays.asList(e.getStackTrace()), ";\t"));
1187             } catch (RuntimeException e) {
1188                 throw (RuntimeException) new IllegalArgumentException("Failure on line " + count + ")\t" + row)
1189                     .initCause(e);
1190             }
1191         }
1192         // System.out.println("Note: the following Status values were found in the data: " +
1193         // CldrUtility.join(statusFound, " | "));
1194 
1195         // make sure we have something
1196         for (String country : countriesNotFound) {
1197             RowData x = new RowData(country, "und");
1198             sortedInput.add(x);
1199         }
1200         for (String language : languagesNotFound) {
1201             RowData x = new RowData("ZZ", language);
1202             sortedInput.add(x);
1203         }
1204 
1205         for (RowData row : sortedInput) {
1206             // see which countries have languages that are larger than any offical language
1207 
1208             if (!row.officialStatus.isOfficial()) {
1209                 //String country = row.countryCode;
1210                 Row.R2<String, Double> largestOffical = countryToLargestOfficialLanguage.get(row.countryCode);
1211                 if (largestOffical != null && largestOffical.get1() < row.languagePopulation) {
1212                     BadItem.WARNING.show("language population > all official languages", getLanguageCodeAndName(largestOffical.get0()), row.toString(true));
1213                 }
1214             }
1215 
1216             // see which countries are missing an official language
1217             if (!countriesWithoutOfficial.contains(row.countryCode)) continue;
1218             BadItem.ERROR.show("missing official language", row.getCountryName() + "\t" + row.countryCode, row.toString(true));
1219             countriesWithoutOfficial.remove(row.countryCode);
1220         }
1221 
1222         // write out file for rick
1223         PrintWriter log = FileUtilities.openUTF8Writer(dir, ricksFile);
1224         log.println(
1225             "*\tCName" +
1226                 "\tCCode" +
1227                 "\tCPopulation" +
1228                 "\tCLiteracy" +
1229                 "\tCGdp" +
1230                 "\tOfficialStatus" +
1231                 "\tLanguage" +
1232                 "\tLCode" +
1233                 "\tLPopulation" +
1234                 "\tWritingPop" +
1235                 "\tReferences" +
1236                 "\tNotes");
1237         RickComparator rickSorting = new RickComparator();
1238         Set<RowData> rickSorted = new TreeSet<RowData>(rickSorting);
1239         rickSorted.addAll(sortedInput);
1240 
1241         for (RowData row : rickSorted) {
1242             final String langLit = row.getLanguageLiteracyString();
1243             final String countryLit = row.getCountryLiteracyString();
1244             log.println(
1245                 row.getCountryName()
1246                     + "\t" + row.countryCode
1247                     + "\t" + row.getCountryPopulationString()
1248                     + "\t" + countryLit
1249                     + "\t" + row.getCountryGdpString()
1250                     + "\t" + (row.officialStatus == OfficialStatus.unknown ? "" : row.officialStatus)
1251                     + "\t" + row.getRickLanguageName()
1252                     + "\t" + row.getRickLanguageCode()
1253                     + "\t" + row.getLanguagePopulationString()
1254                     + "\t" + (langLit.equals(countryLit) ? "" : langLit)
1255                     + "\t" + getExcelQuote(row.comment)
1256                     + "\t" + getExcelQuote(row.notes));
1257         }
1258         log.close();
1259         return sortedInput;
1260     }
1261 
getCldrParents(Set<String> available)1262     private static Set<String> getCldrParents(Set<String> available) {
1263         LanguageTagParser ltp2 = new LanguageTagParser();
1264         Set<String> cldrParents = new TreeSet<String>();
1265         for (String locale : available) {
1266             if (skipLocales.contains(locale)) continue;
1267             try {
1268                 ltp2.set(locale);
1269             } catch (RuntimeException e) {
1270                 System.out.println("Skipping CLDR file: " + locale);
1271                 continue;
1272             }
1273             String locale2 = ltp2.getLanguageScript();
1274             if (locale2.equals("sh")) continue;
1275             // int lastPos = locale.lastIndexOf('_');
1276             // if (lastPos < 0) continue;
1277             // String locale2 = locale.substring(0,lastPos);
1278             cldrParents.add(locale2);
1279             languageToMaxCountry.put(locale2, null);
1280         }
1281         //System.out.println("CLDR Parents: " + cldrParents);
1282         return cldrParents;
1283     }
1284 
showFailures(List<String> failures)1285     private static void showFailures(List<String> failures) {
1286         if (failures.size() <= 1) {
1287             return;
1288         }
1289         System.out.println();
1290         System.out.println("Failures in Output");
1291         System.out.println();
1292 
1293         System.out.println(RowData.toStringHeader());
1294         for (String failure : failures) {
1295             System.out.println(failure);
1296         }
1297     }
1298 
showContent(Set<String> available)1299     private static void showContent(Set<String> available) {
1300         System.out.println();
1301         System.out.println("CLDR Content");
1302         System.out.println();
1303         Set<String> languagesLeft = new TreeSet<String>(defaultContent.keySet());
1304         languagesLeft.remove("und");
1305         for (String languageLeft : languagesLeft) {
1306             Log.println("\t\t<defaultContent type=\"" + languageLeft + "\" content=\""
1307                 + defaultContent.get(languageLeft) + "\"/>");
1308         }
1309         // Set<String> warnings = new LinkedHashSet<String>();
1310         //
1311         // CLDRFile supplemental = cldrFactory.make("supplementalData", true);
1312         // Comments tempComments = supplemental.getXpath_comments();
1313         // PrintWriter pw = new PrintWriter(System.out);
1314         // Comparator attributeOrdering = supplemental.getAttributeComparator();
1315         // Map defaultSuppressionMap = supplemental.getDefaultSuppressionMap();
1316         //
1317         // XPathParts last = new XPathParts(attributeOrdering, defaultSuppressionMap);
1318         // XPathParts current = new XPathParts(attributeOrdering, defaultSuppressionMap);
1319         // XPathParts lastFiltered = new XPathParts(attributeOrdering, defaultSuppressionMap);
1320         // XPathParts currentFiltered = new XPathParts(attributeOrdering, defaultSuppressionMap);
1321         //
1322         // Set orderedSet = new TreeSet(supplemental.ldmlComparator);
1323         // CollectionUtilities.addAll(supplemental.iterator("//supplementalData/languageData/language"), orderedSet);
1324         // Set<String> languagesLeft = new TreeSet<String>(defaultContent.keySet());
1325         //
1326         // for (Iterator it2 = orderedSet.iterator(); it2.hasNext();) {
1327         // String xpath = (String)it2.next();
1328         // currentFiltered.set(xpath);
1329         // current.set(xpath);
1330         //
1331         // Map x = current.set(xpath).getAttributes(-1);
1332         // boolean alt = x.containsKey("alt");
1333         // String lang = (String) x.get("type");
1334         // String defaultValue = defaultContent.get(lang);
1335         // if (alt) {
1336         // // skip
1337         // } else if (defaultValue == null) {
1338         // warnings.add("Missing default value for " + lang);
1339         // } else if (!defaultValue.equals(lang)) {
1340         // x.put("defaultContent", defaultValue);
1341         // languagesLeft.remove(lang);
1342         // }
1343         //
1344         // current.writeDifference(pw, currentFiltered, last, lastFiltered, "", tempComments);
1345         // // exchange pairs of parts
1346         // XPathParts temp = current;
1347         // current = last;
1348         // last = temp;
1349         // temp = currentFiltered;
1350         // currentFiltered = lastFiltered;
1351         // lastFiltered = temp;
1352         // }
1353         // pw.flush();
1354 
1355         // for (String warning : warnings) {
1356         // System.out.println(warning);
1357         // }
1358 
1359         // for (String localeCode : available) {
1360         // if (skipLocales.contains(localeCode)) continue;
1361         // String resolvedLanguageCode = getFullyResolved(localeCode);
1362         // // a locale will be empty if its parent has the same resolved code
1363         // String parent = getProcessedParent(localeCode);
1364         // String resolvedParent = getFullyResolved(parent);
1365         // System.out.println(
1366         // (resolvedLanguageCode.equals(resolvedParent) ? "empty" : "")
1367         // + "\t" + localeCode
1368         // + "\t" + resolvedLanguageCode
1369         // + "\t" + parent
1370         // + "\t" + ULocale.getDisplayName(localeCode, ULocale.ENGLISH));
1371         // }
1372     }
1373 
getProcessedParent(String localeCode)1374     public static String getProcessedParent(String localeCode) {
1375         if (localeCode == null || localeCode.equals("root")) return null;
1376         int pos = localeCode.lastIndexOf('_');
1377         if (pos < 0) return "root";
1378         LanguageTagParser ltp = new LanguageTagParser();
1379         String script = ltp.set(localeCode).getScript();
1380         if (script.length() == 0) {
1381             return getFullyResolved(localeCode);
1382         }
1383         return localeCode.substring(0, pos);
1384     }
1385 
getFullyResolved(String languageCode)1386     private static String getFullyResolved(String languageCode) {
1387         String result = defaultContent.get(languageCode);
1388         if (result != null) return result;
1389         // we missed. Try taking parent and trying again
1390         int pos = languageCode.length() + 1;
1391         while (true) {
1392             pos = languageCode.lastIndexOf('_', pos - 1);
1393             if (pos < 0) {
1394                 return "***" + languageCode;
1395             }
1396             result = defaultContent.get(languageCode.substring(0, pos));
1397             if (result != null) {
1398                 LanguageTagParser ltp = new LanguageTagParser().set(languageCode);
1399                 LanguageTagParser ltp2 = new LanguageTagParser().set(result);
1400                 String region = ltp.getRegion();
1401                 if (region.length() == 0) {
1402                     ltp.setRegion(ltp2.getRegion());
1403                 }
1404                 String script = ltp.getScript();
1405                 if (script.length() == 0) {
1406                     ltp.setScript(ltp2.getScript());
1407                 }
1408                 return ltp.toString();
1409             }
1410         }
1411     }
1412 
1413     static Comparator<Iterable> firstElementComparator = new Comparator<Iterable>() {
1414         public int compare(Iterable o1, Iterable o2) {
1415             int result = ((Comparable) o1.iterator().next()).compareTo((o2.iterator().next()));
1416             assert result != 0;
1417             return result;
1418         }
1419     };
1420 
showDefaults(Set<String> cldrParents, NumberFormat nf, Map<String, String> defaultContent, Map<String, RowData> localeToRowData, Set<String> defaultLocaleContent)1421     private static void showDefaults(Set<String> cldrParents, NumberFormat nf, Map<String, String> defaultContent,
1422         Map<String, RowData> localeToRowData,
1423         Set<String> defaultLocaleContent) {
1424 
1425         if (SHOW_OLD_DEFAULT_CONTENTS) {
1426             System.out.println();
1427             System.out.println("Computing Defaults Contents");
1428             System.out.println();
1429         }
1430 
1431         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
1432         Set<String> locales = new TreeSet<String>(cldrFactory.getAvailable());
1433         LocaleIDParser lidp = new LocaleIDParser();
1434 
1435         // add all the combinations of language, script, and territory.
1436         for (String locale : localeToRowData.keySet()) {
1437             String baseLanguage = lidp.set(locale).getLanguage();
1438             if (locales.contains(baseLanguage) && !locales.contains(locale)) {
1439                 locales.add(locale);
1440                 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding: " + locale);
1441             }
1442         }
1443 
1444         // adding parents
1445         Set<String> toAdd = new TreeSet<String>();
1446         while (true) {
1447             for (String locale : locales) {
1448                 String newguy = LocaleIDParser.getParent(locale);
1449                 if (newguy != null && !locales.contains(newguy) && !toAdd.contains(newguy)) {
1450                     toAdd.add(newguy);
1451                     if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding parent: " + newguy);
1452                 }
1453             }
1454             if (toAdd.size() == 0) {
1455                 break;
1456             }
1457             locales.addAll(toAdd);
1458             toAdd.clear();
1459         }
1460 
1461         // get sets of siblings
1462         Set<Set<String>> siblingSets = new TreeSet<Set<String>>(firstElementComparator);
1463         Set<String> needsADoin = new TreeSet<String>(locales);
1464 
1465         Set<String> deprecatedLanguages = new TreeSet<String>();
1466         deprecatedLanguages.add("sh");
1467         Set<String> deprecatedRegions = new TreeSet<String>();
1468         deprecatedRegions.add("YU");
1469         deprecatedRegions.add("CS");
1470         deprecatedRegions.add("ZZ");
1471 
1472         // first find all the language subtags that have scripts, and those we need to skip. Those are aliased-only
1473         Set<String> skippingItems = new TreeSet<String>();
1474         Set<String> hasAScript = new TreeSet<String>();
1475         //Set<LocaleIDParser.Level> languageOnly = EnumSet.of(LocaleIDParser.Level.Language);
1476         for (String locale : locales) {
1477             lidp.set(locale);
1478             if (lidp.getScript().length() != 0) {
1479                 hasAScript.add(lidp.getLanguage());
1480             }
1481             Set<LocaleIDParser.Level> levels = lidp.getLevels();
1482             // must have no variants, must have either script or region, no deprecated elements
1483             if (levels.contains(LocaleIDParser.Level.Variants) // no variants
1484                 || !(levels.contains(LocaleIDParser.Level.Script)
1485                     || levels.contains(LocaleIDParser.Level.Region))
1486                 || deprecatedLanguages.contains(lidp.getLanguage())
1487                 || deprecatedRegions.contains(lidp.getRegion())) {
1488                 // skip language-only locales, and ones with variants
1489                 needsADoin.remove(locale);
1490                 skippingItems.add(locale);
1491                 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tremoving: " + locale);
1492                 continue;
1493             }
1494         }
1495         // walk through the locales, getting the ones we care about.
1496         Map<String, Double> scriptLocaleToLanguageLiteratePopulation = new TreeMap<String, Double>();
1497 
1498         for (String locale : new TreeSet<String>(needsADoin)) {
1499             if (!needsADoin.contains(locale)) continue;
1500             lidp.set(locale);
1501             Set<Level> level = lidp.getLevels();
1502             // skip locales that need scripts and don't have them
1503             if (!level.contains(LocaleIDParser.Level.Script) // no script
1504                 && hasAScript.contains(lidp.getLanguage())) {
1505                 needsADoin.remove(locale);
1506                 skippingItems.add(locale);
1507                 continue;
1508             }
1509             // get siblings
1510             Set<String> siblingSet = lidp.getSiblings(needsADoin);
1511             // if it has a script and region
1512             if (level.contains(LocaleIDParser.Level.Script) && level.contains(LocaleIDParser.Level.Region)) {
1513                 double languageLiteratePopulation = 0;
1514                 for (String localeID2 : siblingSet) {
1515                     RowData rowData = localeToRowData.get(localeID2);
1516                     if (rowData != null) {
1517                         languageLiteratePopulation += rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT);
1518                     }
1519                 }
1520                 String parentID = LocaleIDParser.getParent(locale);
1521                 scriptLocaleToLanguageLiteratePopulation.put(parentID, languageLiteratePopulation);
1522             }
1523 
1524             try {
1525                 siblingSets.add(siblingSet);
1526             } catch (RuntimeException e) {
1527                 e.printStackTrace();
1528             }
1529             needsADoin.removeAll(siblingSet);
1530         }
1531         if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("ConvertLanguageData Skipping: " + skippingItems);
1532         if (needsADoin.size() != 0) {
1533             if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("Missing: " + needsADoin);
1534         }
1535 
1536         // walk through the data
1537         Set<String> skippingSingletons = new TreeSet<String>();
1538 
1539         Set<String> missingData = new TreeSet<String>();
1540         for (Set<String> siblingSet : siblingSets) {
1541             if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("** From siblings: " + siblingSet);
1542 
1543             if (false & siblingSet.size() == 1) {
1544                 skippingSingletons.add(siblingSet.iterator().next());
1545                 continue;
1546             }
1547             // get best
1548             double best = Double.NEGATIVE_INFINITY;
1549             String bestLocale = "???";
1550             Set<Pair<Double, String>> data = new TreeSet<>();
1551             LanguageTagParser ltp = new LanguageTagParser();
1552             for (String locale : siblingSet) {
1553                 RowData rowData = localeToRowData.get(locale);
1554                 double languageLiteratePopulation = -1;
1555                 if (rowData != null) {
1556                     languageLiteratePopulation = rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT);
1557                 } else {
1558                     Double d = scriptLocaleToLanguageLiteratePopulation.get(locale);
1559                     if (d != null) {
1560                         languageLiteratePopulation = d;
1561                     } else {
1562                         final String region = ltp.set(locale).getRegion();
1563                         if (region.isEmpty() || StandardCodes.isCountry(region)) {
1564                             missingData.add(locale);
1565                         }
1566                     }
1567                 }
1568                 data.add(new Pair<Double, String>(languageLiteratePopulation, locale));
1569                 if (best < languageLiteratePopulation) {
1570                     best = languageLiteratePopulation;
1571                     bestLocale = locale;
1572                 }
1573             }
1574             // show it
1575             for (Pair<Double, String> datum : data) {
1576                 if (SHOW_OLD_DEFAULT_CONTENTS)
1577                     System.out.format(
1578                         "\tContenders: %s %f (based on literate population)" + CldrUtility.LINE_SEPARATOR,
1579                         datum.getSecond(), datum.getFirst());
1580             }
1581             // System.out.format("\tPicking default content: %s %f (based on literate population)" +
1582             // Utility.LINE_SEPARATOR, bestLocale, best);
1583             // Hack to fix English
1584             // TODO Generalize in the future for other locales with non-primary scripts
1585             if (bestLocale.startsWith("en_")) {
1586                 defaultLocaleContent.add("en_US");
1587             } else {
1588                 defaultLocaleContent.add(bestLocale);
1589             }
1590         }
1591 
1592         for (String singleton : skippingSingletons) {
1593             BadItem.WARNING.show("skipping Singletons", singleton);
1594         }
1595         for (String missing : missingData) {
1596             BadItem.WARNING.show("Missing Data", missing);
1597         }
1598 
1599         // LanguageTagParser ltp = new LanguageTagParser();
1600         // Set<String> warnings = new LinkedHashSet();
1601         // for (String languageCode : languageToMaxCountry.keySet()) {
1602         // CodeAndPopulation best = languageToMaxCountry.get(languageCode);
1603         // String languageSubtag = ltp.set(languageCode).getLanguage();
1604         // String countryCode = "ZZ";
1605         // double rawLanguagePopulation = -1;
1606         // if (best != null) {
1607         // countryCode = best.code;
1608         // rawLanguagePopulation = best.population;
1609         // Set<String> regions = LanguageInfo.INSTANCE.languageToRegions.get(languageSubtag);
1610         // if (regions == null || !regions.contains(countryCode)) {
1611         // Set<String> regions2 = LanguageInfo.INSTANCE.languageToRegionsAlt.get(languageSubtag);
1612         // if (regions2 == null || !regions2.contains(countryCode)) {
1613         // warnings.add("WARNING: " + languageCode + " => " + countryCode + ", not in " + regions + "/" + regions2);
1614         // }
1615         // }
1616         // }
1617         // String resolvedLanguageCode = languageCode + "_" + countryCode;
1618         // ltp.set(languageCode);
1619         // Set<String> scripts = LanguageInfo.INSTANCE.languageToScripts.get(languageCode);
1620         // String script = ltp.getScript();
1621         // if (script.length() == 0) {
1622         // CodeAndPopulation bestScript = languageToMaxScript.get(languageCode);
1623         // if (bestScript != null) {
1624         // script = bestScript.code;
1625         // if (scripts == null || !scripts.contains(script)) {
1626         // warnings.add("WARNING: " + languageCode + " => " + script + ", not in " + scripts);
1627         // }
1628         // } else {
1629         // script = "Zzzz";
1630         // if (scripts == null) {
1631         // scripts = LanguageInfo.INSTANCE.languageToScriptsAlt.get(languageCode);
1632         // }
1633         // if (scripts != null) {
1634         // script = scripts.iterator().next();
1635         // if (scripts.size() != 1) {
1636         // warnings.add("WARNING: " + languageCode + " => " + scripts);
1637         // }
1638         // }
1639         // }
1640         // if (scripts == null) {
1641         // warnings.add("Missing scripts for: " + languageCode);
1642         // } else if (scripts.size() == 1){
1643         // script = "";
1644         // }
1645         // resolvedLanguageCode = languageCode
1646         // + (script.length() == 0 ? "" : "_" + script)
1647         // + "_" + countryCode;
1648         // }
1649         //
1650         //
1651         // System.out.println(
1652         // resolvedLanguageCode
1653         // + "\t" + languageCode
1654         // + "\t" + ULocale.getDisplayName(languageCode, ULocale.ENGLISH)
1655         // + "\t" + countryCode
1656         // + "\t" + ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH)
1657         // + "\t" + formatNumber(rawLanguagePopulation)
1658         // + (cldrParents.contains(languageCode) ? "\tCLDR" : "")
1659         // );
1660         // if (languageCode.length() == 0) continue;
1661         // defaultContent.put(languageCode, resolvedLanguageCode);
1662         // }
1663         // for (String warning : warnings) {
1664         // System.out.println(warning);
1665         // }
1666     }
1667 
1668     // private static void printDefaultContent(Set<String> defaultLocaleContent) {
1669     // String sep = Utility.LINE_SEPARATOR + "\t\t\t";
1670     // String broken = Utility.breakLines(join(defaultLocaleContent," "), sep, PatternCache.get("(\\S)\\S*").matcher(""),
1671     // 80);
1672     //
1673     // Log.println("\t\t<defaultContent locales=\"" + broken + "\"");
1674     // Log.println("\t\t/>");
1675     // }
1676 
getSuppressScript(String languageCode)1677     private static Object getSuppressScript(String languageCode) {
1678         // TODO Auto-generated method stub
1679         return null;
1680     }
1681 
join(Collection c, String separator)1682     public static String join(Collection c, String separator) {
1683         StringBuffer result = new StringBuffer();
1684         boolean first = true;
1685         for (Object x : c) {
1686             if (first)
1687                 first = false;
1688             else
1689                 result.append(separator);
1690             result.append(x);
1691         }
1692         return result.toString();
1693     }
1694 
addBestRegion(String languageCode, String countryCode, double languagePopulationRaw)1695     private static void addBestRegion(String languageCode, String countryCode, double languagePopulationRaw) {
1696         addBest(languageCode, languagePopulationRaw, countryCode, languageToMaxCountry);
1697     }
1698 
addBestScript(String languageCode, String scriptCode, double languagePopulationRaw)1699     private static void addBestScript(String languageCode, String scriptCode, double languagePopulationRaw) {
1700         addBest(languageCode, languagePopulationRaw, scriptCode, languageToMaxScript);
1701     }
1702 
addBest(String languageCode, double languagePopulationRaw, String code, Map<String, CodeAndPopulation> languageToMaxCode)1703     private static void addBest(String languageCode, double languagePopulationRaw, String code,
1704         Map<String, CodeAndPopulation> languageToMaxCode) {
1705         if (languageCode.length() == 0) {
1706             throw new IllegalArgumentException();
1707         }
1708         CodeAndPopulation best = languageToMaxCode.get(languageCode);
1709         if (best == null) {
1710             languageToMaxCode.put(languageCode, best = new CodeAndPopulation());
1711         } else if (best.population >= languagePopulationRaw) {
1712             return;
1713         }
1714         best.population = languagePopulationRaw;
1715         best.code = code;
1716     }
1717 
1718     static class CodeAndPopulation {
1719         String code = null;
1720         double population = Double.NaN;
1721 
toString()1722         public String toString() {
1723             return "{" + code + "," + population + "}";
1724         }
1725     }
1726 
1727     static public class GeneralCollator implements Comparator<String> {
1728         static UTF16.StringComparator cpCompare = new UTF16.StringComparator(true, false, 0);
1729         static RuleBasedCollator UCA = (RuleBasedCollator) Collator
1730             .getInstance(ULocale.ROOT);
1731         static {
1732             UCA.setNumericCollation(true);
1733         }
1734 
compare(String s1, String s2)1735         public int compare(String s1, String s2) {
1736             if (s1 == null) {
1737                 return s2 == null ? 0 : -1;
1738             } else if (s2 == null) {
1739                 return 1;
1740             }
1741             int result = UCA.compare(s1, s2);
1742             if (result != 0) return result;
1743             return cpCompare.compare(s1, s2);
1744         }
1745     };
1746 
1747     public static class InverseComparator<T> implements Comparator<T> {
1748         private Comparator<T> other;
1749 
InverseComparator()1750         public InverseComparator() {
1751             this.other = null;
1752         }
1753 
InverseComparator(Comparator<T> other)1754         public InverseComparator(Comparator<T> other) {
1755             this.other = other;
1756         }
1757 
compare(T a, T b)1758         public int compare(T a, T b) {
1759             return other == null
1760                 ? ((Comparable) b).compareTo(a)
1761                 : other.compare(b, a);
1762         }
1763     }
1764 
1765     static Set<String> languagesNeeded = new TreeSet<String>(
1766         Arrays
1767             .asList("ab ba bh bi bo fj fy gd ha ht ik iu ks ku ky lg mi na nb rm sa sd sg si sm sn su tg tk to tw vo yi za lb dv chr syr kha sco gv"
1768                 .split("\\s")));
1769 
generateIso639_2Data()1770     static void generateIso639_2Data() {
1771         for (String languageSubtag : sc.getAvailableCodes("language")) {
1772             String alpha3 = Iso639Data.toAlpha3(languageSubtag);
1773             Type type = Iso639Data.getType(languageSubtag);
1774             Scope scope = Iso639Data.getScope(languageSubtag);
1775             if (type != null || alpha3 != null || scope != null) {
1776                 Log.println("\t\t<languageCode type=\"" + languageSubtag + "\"" +
1777                     (alpha3 == null ? "" : " iso639Alpha3=\"" + alpha3 + "\"") +
1778                     (type == null ? "" : " iso639Type=\"" + type + "\"") +
1779                     (scope == null ? "" : " iso639Scope=\"" + scope + "\"") +
1780                     "/>");
1781             }
1782 
1783         }
1784     }
1785 
1786     static Relation<String, BasicLanguageData> language2BasicLanguageData = Relation.of(new TreeMap<String, Set<BasicLanguageData>>(), TreeSet.class);
1787 
1788     static Map<String, Relation<BasicLanguageData.Type, String>> language_status_scripts;
1789     static Map<Pair<String, String>, String> language_script_references = new TreeMap<Pair<String, String>, String>();
1790 
1791     static final Map<String, Map<String, R2<List<String>, String>>> LOCALE_ALIAS_INFO = SupplementalDataInfo
1792         .getInstance().getLocaleAliasInfo();
1793 
getLanguage2Scripts(Set<RowData> sortedInput)1794     static void getLanguage2Scripts(Set<RowData> sortedInput) throws IOException {
1795         language_status_scripts = new TreeMap<String, Relation<BasicLanguageData.Type, String>>();
1796 
1797         // // get current scripts
1798         // Relation<String,String> languageToDefaultScript = new Relation(new TreeMap(), TreeSet.class);
1799         // Relation<String,String> secondaryLanguageToDefaultScript = new Relation(new TreeMap(), TreeSet.class);
1800         // for (String languageSubtag : language2BasicLanguageData.keySet()) {
1801         // for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) {
1802         // for (String script : item.getScripts()) {
1803         // addLanguage2Script(languageSubtag, item.getType(), script);
1804         // }
1805         // }
1806         // }
1807         // System.out.println("Language 2 scripts: " + language_status_scripts);
1808 
1809         // #Lcode LanguageName Status Scode ScriptName References
1810         List<List<String>> input = SpreadSheet.convert(CldrUtility.getUTF8Data("language_script_raw.txt"));
1811         System.out.println(CldrUtility.LINE_SEPARATOR + "# Problems in language_script_raw.txt"
1812             + CldrUtility.LINE_SEPARATOR);
1813         //int count = -1;
1814         for (List<String> row : input) {
1815             try {
1816                 if (row.size() == 0) continue;
1817                 //++count;
1818                 String language = row.get(0).trim();
1819                 if (language.length() == 0 || language.startsWith("#")) continue;
1820                 BasicLanguageData.Type status = BasicLanguageData.Type.valueOf(row.get(2));
1821                 String scripts = row.get(3);
1822                 if (!checkCode(LstrType.language, language, row)) continue;
1823                 for (String script : scripts.split("\\s+")) {
1824                     if (!checkCode(LstrType.script, script, row)) continue;
1825                     // if the script is not modern, demote
1826                     Info scriptInfo = ScriptMetadata.getInfo(script);
1827                     if (scriptInfo == null) {
1828                         BadItem.ERROR.toString("illegal script; must be represented in Unicode, remove line or fix", script, row);
1829                         continue;
1830                     }
1831                     IdUsage idUsage = scriptInfo.idUsage;
1832                     if (status == BasicLanguageData.Type.primary && idUsage != IdUsage.RECOMMENDED) {
1833                         if (idUsage == IdUsage.ASPIRATIONAL || idUsage == IdUsage.LIMITED_USE) {
1834                             BadItem.WARNING.toString("Script has unexpected usage; make secondary if a Recommended script is used widely for the langauge",
1835                                 idUsage + ", " + script + "=" + getULocaleScriptName(script), row);
1836                         } else {
1837                             BadItem.ERROR.toString("Script is not modern; make secondary", idUsage + ", " + script + "=" + getULocaleScriptName(script), row);
1838                             status = BasicLanguageData.Type.secondary;
1839                         }
1840                     }
1841 
1842                     // if the language is not modern, demote
1843                     if (LOCALE_ALIAS_INFO.get("language").containsKey(language)) {
1844                         BadItem.ERROR.toString("Remove/Change deprecated language", language + " "
1845                             + getLanguageName(language) + "; " + LOCALE_ALIAS_INFO.get("language").get(language), row);
1846                         continue;
1847                     }
1848                     if (status == BasicLanguageData.Type.primary && !sc.isModernLanguage(language)) {
1849                         BadItem.ERROR.toString("Should be secondary, language is not modern", language + " " + getLanguageName(language), row);
1850                         status = BasicLanguageData.Type.secondary;
1851                     }
1852 
1853                     addLanguage2Script(language, status, script);
1854                     if (row.size() > 5) {
1855                         String reference = row.get(5);
1856                         if (reference != null && reference.length() == 0) {
1857                             language_script_references.put(new Pair<String, String>(language, script), reference);
1858                         }
1859                     }
1860                 }
1861             } catch (RuntimeException e) {
1862                 System.err.println(row);
1863                 throw e;
1864             }
1865         }
1866 
1867         // System.out.println("Language 2 scripts: " + language_status_scripts);
1868 
1869         for (String language : sc.getGoodAvailableCodes("language")) {
1870             if (supplementalData.getDeprecatedInfo("language", language) != null) {
1871                 continue;
1872             }
1873             Map<String, String> registryData = sc.getLangData("language", language);
1874             if (registryData != null) {
1875                 String suppressScript = registryData.get("Suppress-Script");
1876                 if (suppressScript == null) continue;
1877                 if (ScriptMetadata.getInfo(suppressScript) == null) {
1878                     // skip, not represented in Unicode
1879                     continue;
1880                 }
1881                 // if there is something already there, we have a problem.
1882                 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
1883                 if (status_scripts == null) {
1884                     System.out
1885                         .println("Missing Suppress-Script: " + language + "\tSuppress-Script:\t" + suppressScript);
1886                 } else if (!status_scripts.values().contains(suppressScript)) {
1887                     System.out.println("Missing Suppress-Script: " + language + "\tSuppress-Script:\t" + suppressScript
1888                         + "\tall:\t" + status_scripts.values());
1889                 } else {
1890                     // at this point, the suppressScript is in the union of the primary and secondary.
1891                     Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary);
1892                     if (primaryScripts != null && !primaryScripts.contains(suppressScript)) {
1893                         System.out.println("Suppress-Script is not in primary: " + language + "\tSuppress-Script:\t"
1894                             + suppressScript + "\tprimary:\t"
1895                             + primaryScripts);
1896                     }
1897                 }
1898                 addLanguage2Script(language, BasicLanguageData.Type.primary, suppressScript);
1899             }
1900         }
1901 
1902         // remove primaries from secondaries
1903         // check for primaries for scripts
1904         for (String language : language_status_scripts.keySet()) {
1905             Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
1906             Set<String> secondaryScripts = status_scripts.getAll(BasicLanguageData.Type.secondary);
1907             if (secondaryScripts == null) continue;
1908             Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary);
1909             if (primaryScripts == null) {
1910                 // status_scripts.putAll(BasicLanguageData.Type.primary, secondaryScripts);
1911                 // status_scripts.removeAll(BasicLanguageData.Type.secondary);
1912                 if (sc.isModernLanguage(language)) {
1913                     BadItem.ERROR.show("modern language without primary script, might need to edit moribund_languages.txt", language + " "
1914                         + getLanguageName(language));
1915                 }
1916             } else {
1917                 status_scripts.removeAll(BasicLanguageData.Type.secondary, primaryScripts);
1918             }
1919         }
1920 
1921         // check that every living language in the row data has a script
1922         Set<String> livingLanguagesWithTerritories = new TreeSet<String>();
1923         for (RowData rowData : sortedInput) {
1924             String language = rowData.languageCode;
1925             if (sc.isModernLanguage(language) && Iso639Data.getSource(language) != Iso639Data.Source.ISO_639_3) {
1926                 livingLanguagesWithTerritories.add(language);
1927             }
1928         }
1929         for (String language : livingLanguagesWithTerritories) {
1930             Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
1931             if (status_scripts != null) {
1932                 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary);
1933                 if (primaryScripts != null && primaryScripts.size() > 0) {
1934                     continue;
1935                 }
1936             }
1937             if (language.equals("tw")) continue; // TODO load aliases and check...
1938             BadItem.WARNING.show("ISO 639-1/2 language in language-territory list without primary script", language + "\t" + getLanguageName(language));
1939         }
1940 
1941         // System.out.println("Language 2 scripts: " + language_status_scripts);
1942     }
1943 
    /**
     * Placeholder script check. It is not implemented: the argument is ignored and the
     * method always returns {@code false}.
     *
     * @param script the script code to check (currently unused)
     * @return always {@code false}
     */
    private static boolean checkScript(String script) {
        // TODO Auto-generated method stub
        return false;
    }
1948 
    // Shared Validity instance; checkCode looks up the status of a code through it.
    static Validity VALIDITY = Validity.getInstance();
1950 
checkCode(LstrType type, String code, List<String> sourceLine)1951     private static boolean checkCode(LstrType type, String code, List<String> sourceLine) {
1952         Status validity = VALIDITY.getCodeToStatus(type).get(code);
1953         if (validity == Status.regular) {
1954             if (type == LstrType.language && code.equals("no")) {
1955                 validity = Status.invalid;
1956             } else {
1957                 return true;
1958             }
1959         } else if (validity == Status.unknown && type == LstrType.region) {
1960             return true;
1961         }
1962         BadItem.ERROR.show("Illegitimate Code", type + ": " + code + " = " + validity, sourceLine);
1963         return false;
1964     }
1965 
addLanguage2Script(String language, BasicLanguageData.Type type, String script)1966     private static void addLanguage2Script(String language, BasicLanguageData.Type type, String script) {
1967         Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language);
1968         if (status_scripts == null)
1969             language_status_scripts.put(language, status_scripts = Relation.of(new TreeMap<BasicLanguageData.Type, Set<String>>(), TreeSet.class));
1970         status_scripts.put(type, script);
1971     }
1972 
addLanguageScriptData()1973     static void addLanguageScriptData() throws IOException {
1974         // check to make sure that every language subtag is in 639-3
1975         Set<String> langRegistryCodes = sc.getGoodAvailableCodes("language");
1976         // Set<String> iso639_2_missing = new TreeSet(langRegistryCodes);
1977         // iso639_2_missing.removeAll(Iso639Data.getAvailable());
1978         // iso639_2_missing.remove("root");
1979         // if (iso639_2_missing.size() != 0) {
1980         // for (String missing : iso639_2_missing){
1981         // System.out.println("*ERROR in StandardCodes* Missing Lang/Script data:\t" + missing + ", " +
1982         // sc.getData("language", missing));
1983         // }
1984         // }
1985 
1986         // Map<String, String> nameToTerritoryCode = new TreeMap();
1987         // for (String territoryCode : sc.getGoodAvailableCodes("territory")) {
1988         // nameToTerritoryCode.put(sc.getData("territory", territoryCode).toLowerCase(), territoryCode);
1989         // }
1990         // nameToTerritoryCode.put("iran", nameToTerritoryCode.get("iran, islamic republic of")); //
1991 
1992         //BasicLanguageData languageData = new BasicLanguageData();
1993 
1994         BufferedReader in = CldrUtility.getUTF8Data("extraLanguagesAndScripts.txt");
1995         while (true) {
1996             String line = in.readLine();
1997             if (line == null) break;
1998             String[] parts = line.split("\\t");
1999             String alpha3 = parts[0];
2000             alpha3 = stripBrackets(alpha3);
2001             String languageSubtag = Iso639Data.fromAlpha3(alpha3);
2002             if (languageSubtag == null) {
2003                 if (langRegistryCodes.contains(alpha3)) {
2004                     languageSubtag = alpha3;
2005                 } else {
2006                     BadItem.WARNING.show("Language subtag not found on line", alpha3, line);
2007                     continue;
2008                 }
2009             }
2010             //String name = parts[1];
2011             Set<String> names = Iso639Data.getNames(languageSubtag);
2012             if (names == null) {
2013                 Map<String, String> name2 = sc.getLangData("language", languageSubtag);
2014                 if (name2 != null) {
2015                     String name3 = name2.get("Description");
2016                     if (name3 != null) {
2017                         names = new TreeSet<String>();
2018                         names.add(name3);
2019                     }
2020                 }
2021             }
2022             // if (names == null || !names.contains(name)) {
2023             // System.out.println("Name <" + name + "> for <" + languageSubtag + "> not found in " + names);
2024             // }
2025 
2026             // names all straight, now get scripts and territories
2027             // [Cyrl]; [Latn]
2028             Set<String> fullScriptList = sc.getGoodAvailableCodes("script");
2029 
2030             String[] scriptList = parts[2].split("[;,]\\s*");
2031             Set<String> scripts = new TreeSet<String>();
2032             Set<String> scriptsAlt = new TreeSet<String>();
2033             for (String script : scriptList) {
2034                 if (script.length() == 0) continue;
2035                 boolean alt = false;
2036                 if (script.endsWith("*")) {
2037                     alt = true;
2038                     script = script.substring(0, script.length() - 1);
2039                 }
2040                 script = stripBrackets(script);
2041                 if (!fullScriptList.contains(script)) {
2042                     System.out.println("Script <" + script + "> for <" + languageSubtag + "> not found in "
2043                         + fullScriptList);
2044                 } else if (alt) {
2045                     scriptsAlt.add(script);
2046                 } else {
2047                     scripts.add(script);
2048                 }
2049             }
2050             // now territories
2051             Set<String> territories = new TreeSet<String>();
2052             if (parts.length > 4) {
2053                 String[] territoryList = parts[4].split("\\s*[;,-]\\s*");
2054                 for (String territoryName : territoryList) {
2055                     if (territoryName.equals("ISO/DIS 639") || territoryName.equals("3")) continue;
2056                     String territoryCode = CountryCodeConverter.getCodeFromName(territoryName);
2057                     if (territoryCode == null) {
2058                         BadItem.ERROR.show("no name found for territory", "<" + territoryName + ">", languageSubtag);
2059                     } else {
2060                         territories.add(territoryCode);
2061                     }
2062                 }
2063             }
2064             // <language type="de" scripts="Latn" territories="IT" alt="secondary"/>
2065             // we're going to go ahead and set these all to secondary.
2066             if (scripts.size() != 0) {
2067                 language2BasicLanguageData.put(languageSubtag,
2068                     new BasicLanguageData().setType(BasicLanguageData.Type.secondary).setScripts(scripts)
2069                         .setTerritories(territories));
2070             }
2071             if (scriptsAlt.size() != 0) {
2072                 language2BasicLanguageData.put(languageSubtag,
2073                     new BasicLanguageData().setType(BasicLanguageData.Type.secondary).setScripts(scriptsAlt)
2074                         .setTerritories(territories));
2075             }
2076         }
2077         in.close();
2078 
2079         // add other data
2080         for (String languageSubtag : supplementalData.getBasicLanguageDataLanguages()) {
2081             Set<BasicLanguageData> otherData = supplementalData.getBasicLanguageData(languageSubtag);
2082             language2BasicLanguageData.putAll(languageSubtag, otherData);
2083         }
2084     }
2085 
2086     // private static void showAllBasicLanguageData(Relation<String, BasicLanguageData> language2basicData, String
2087     // comment) {
2088     // // now print
2089     // Relation<String, String> primaryCombos = new Relation(new TreeMap(), TreeSet.class);
2090     // Relation<String, String> secondaryCombos = new Relation(new TreeMap(), TreeSet.class);
2091     //
2092     // Log.println("\t<languageData>" + (comment == null ? "" : " <!-- " + comment + " -->"));
2093     //
2094     // for (String languageSubtag : language2basicData.keySet()) {
2095     // String duplicate = "";
2096     // // script,territory
2097     // primaryCombos.clear();
2098     // secondaryCombos.clear();
2099     //
2100     // for (BasicLanguageData item : language2basicData.getAll(languageSubtag)) {
2101     // Set<String> scripts = item.getScripts();
2102     // if (scripts.size() == 0) scripts = new TreeSet(Arrays.asList(new String[] { "Zzzz" }));
2103     // for (String script : scripts) {
2104     // Set<String> territories = item.getTerritories();
2105     // if (territories.size() == 0) territories = new TreeSet(Arrays.asList(new String[] { "ZZ" }));
2106     // for (String territory : territories) {
2107     // if (item.getType().equals(BasicLanguageData.Type.primary)) {
2108     // primaryCombos.put(script, territory);
2109     // } else {
2110     // secondaryCombos.put(script, territory);
2111     // }
2112     // }
2113     // }
2114     // }
2115     // secondaryCombos.removeAll(primaryCombos);
2116     // showBasicLanguageData(languageSubtag, primaryCombos, null, BasicLanguageData.Type.primary);
2117     // showBasicLanguageData(languageSubtag, secondaryCombos, primaryCombos.keySet(),
2118     // BasicLanguageData.Type.secondary);
2119     // // System.out.println(item.toString(languageSubtag) + duplicate);
2120     // // duplicate = " <!-- " + "**" + " -->";
2121     // }
2122     // Log.println("\t</languageData>");
2123     // }
2124 
showBasicLanguageData(String languageSubtag, Relation<String, String> primaryCombos, Set<String> suppressEmptyScripts, BasicLanguageData.Type type)2125     private static void showBasicLanguageData(String languageSubtag, Relation<String, String> primaryCombos,
2126         Set<String> suppressEmptyScripts, BasicLanguageData.Type type) {
2127         Set<String> scriptsWithSameTerritories = new TreeSet<String>();
2128         Set<String> lastTerritories = Collections.emptySet();
2129         for (String script : primaryCombos.keySet()) {
2130             Set<String> territories = primaryCombos.getAll(script);
2131             if (lastTerritories == Collections.EMPTY_SET) {
2132                 // skip first
2133             } else if (lastTerritories.equals(territories)) {
2134                 scriptsWithSameTerritories.add(script);
2135             } else {
2136                 showBasicLanguageData2(languageSubtag, scriptsWithSameTerritories, suppressEmptyScripts,
2137                     lastTerritories, type);
2138                 scriptsWithSameTerritories.clear();
2139             }
2140             lastTerritories = territories;
2141             scriptsWithSameTerritories.add(script);
2142         }
2143         showBasicLanguageData2(languageSubtag, scriptsWithSameTerritories, suppressEmptyScripts, lastTerritories, type);
2144     }
2145 
showBasicLanguageData2(String languageSubtag, Set<String> scripts, Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type)2146     private static void showBasicLanguageData2(String languageSubtag, Set<String> scripts,
2147         Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type) {
2148         scripts.remove("Zzzz");
2149         territories.remove("ZZ");
2150         if (territories.size() == 0 && suppressEmptyScripts != null) {
2151             scripts.removeAll(suppressEmptyScripts);
2152         }
2153         if (scripts.size() == 0 && territories.size() == 0) return;
2154         Log.println("\t\t<language type=\"" + languageSubtag + "\"" +
2155             (scripts.size() == 0 ? "" : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"") +
2156             (territories.size() == 0 ? "" : " territories=\"" + CldrUtility.join(territories, " ") + "\"") +
2157             (type == BasicLanguageData.Type.primary ? "" : " alt=\"" + type + "\"") +
2158             "/>");
2159     }
2160 
2161     /*
2162      * System.out.println(
2163      * "\t\t<language type=\"" + languageSubtag + "\"" +
2164      * " scripts=\"" + Utility.join(scripts," ") + "\"" +
2165      * (territories.size() == 0 ? "" : " territories=\"" + Utility.join(territories," ") + "\"") +
2166      * "/>"
2167      * );
2168      */
2169 
stripBrackets(String alpha3)2170     private static String stripBrackets(String alpha3) {
2171         if (alpha3.startsWith("[") && alpha3.endsWith("]")) {
2172             alpha3 = alpha3.substring(1, alpha3.length() - 1);
2173         }
2174         return alpha3;
2175     }
2176 
    // Shared formatters for ULocale.ENGLISH, used by formatNumber and formatPercent.
    // nf: number format with its default settings.
    static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH);
    // nf_no_comma: same kind of number format, but with grouping switched off (see the static block).
    static NumberFormat nf_no_comma = NumberFormat.getInstance(ULocale.ENGLISH);
    static {
        nf_no_comma.setGroupingUsed(false);
    }
    // pf: percent format.
    static NumberFormat pf = NumberFormat.getPercentInstance(ULocale.ENGLISH);
2183 
formatNumber(double original, int roundDigits, boolean xml)2184     public static String formatNumber(double original, int roundDigits, boolean xml) {
2185         double d = original;
2186         if (roundDigits != 0) {
2187             d = CldrUtility.roundToDecimals(original, roundDigits);
2188         }
2189         if (Double.isNaN(d)) {
2190             d = CldrUtility.roundToDecimals(original, roundDigits);
2191             throw new IllegalArgumentException("Double is NaN");
2192         }
2193         if (xml) {
2194             return nf_no_comma.format(d);
2195         }
2196         return nf.format(d);
2197     }
2198 
formatPercent(double d, int roundDigits, boolean xml)2199     public static String formatPercent(double d, int roundDigits, boolean xml) {
2200         if (roundDigits != 0) {
2201             d = CldrUtility.roundToDecimals(d, roundDigits);
2202         }
2203         if (xml) {
2204             nf_no_comma.setMaximumFractionDigits(roundDigits + 2);
2205             return nf_no_comma.format(d * 100.0);
2206         }
2207         pf.setMaximumFractionDigits(roundDigits + 2);
2208         return pf.format(d);
2209     }
2210 
    // Canonicalizer that fixLanguageCode applies to raw language codes before validating them.
    static final LanguageTagCanonicalizer languageTagCanonicalizer = new LanguageTagCanonicalizer();
2212 
fixLanguageCode(String languageCodeRaw, List<String> row)2213     private static String fixLanguageCode(String languageCodeRaw, List<String> row) {
2214         String languageCode = languageTagCanonicalizer.transform(languageCodeRaw);
2215         if (DEBUG && !languageCode.equals(languageCodeRaw)) {
2216             System.out.println("## " + languageCodeRaw + " => " + languageCode);
2217         }
2218         int bar = languageCode.indexOf('_');
2219         String script = "";
2220         if (bar >= 0) {
2221             script = languageCode.substring(bar);
2222             languageCode = languageCode.substring(0, bar);
2223         }
2224         R2<List<String>, String> replacement = supplementalData.getLocaleAliasInfo().get("language").get(languageCode);
2225         if (replacement != null) {
2226             String replacementCode = replacement.get0().get(0);
2227             BadItem.ERROR.show("deprecated language code", languageCode + " => " + replacementCode, row);
2228             languageCode = replacementCode;
2229         }
2230         if (!sc.getAvailableCodes("language").contains(languageCode)) {
2231             BadItem.ERROR.show("bad language code", languageCode, row);
2232         }
2233         return languageCode + script;
2234     }
2235 
2236     enum BadItem {
2237         ERROR, WARNING, DETAIL;
2238 
show(String problem, String details, String... items)2239         void show(String problem, String details, String... items) {
2240             System.out.println(toString(problem, details, items));
2241         }
2242 
show(String problem, String details, List<String> row)2243         void show(String problem, String details, List<String> row) {
2244             System.out.println(toString(problem, details, row));
2245         }
2246 
toString(String problem, String details, String... items)2247         private String toString(String problem, String details, String... items) {
2248             return toString(problem, details, Arrays.asList(items));
2249         }
2250 
toString(String problem, String details, List<String> row)2251         private String toString(String problem, String details, List<String> row) {
2252             return "* " + this
2253                 + " *\t" + problem + ":"
2254                 + "\t" + details
2255                 + (row != null && row.size() > 0 ? "\t" + CollectionUtilities.join(row, "\t") : "");
2256         }
2257     }
2258 
fixCountryCode(String countryCode, List<String> row)2259     private static String fixCountryCode(String countryCode, List<String> row) {
2260         R2<List<String>, String> replacement = supplementalData.getLocaleAliasInfo().get("territory").get(countryCode);
2261         if (replacement != null) {
2262             String replacementCode = replacement.get0().get(0);
2263             BadItem.ERROR.show("deprecated territory code", countryCode + " => " + replacementCode, row);
2264             countryCode = replacementCode;
2265         }
2266         if (!sc.getAvailableCodes("territory").contains(countryCode)) {
2267             BadItem.ERROR.show("bad territory code", countryCode, row);
2268         }
2269         return countryCode;
2270     }
2271 
    /**
     * Returns the name of a locale/language code as given by {@code english.getName(languageCode, true)}.
     * NOTE(review): {@code english} is declared elsewhere in this class (presumably the English
     * CLDRFile), and the meaning of the {@code true} flag is not visible here; confirm against
     * CLDRFile.getName.
     */
    private static String getULocaleLocaleName(String languageCode) {
        return english.getName(languageCode, true);
        // Earlier ICU-based alternative, kept for reference:
        //return new ULocale(languageCode).getDisplayName();
    }
2276 
    /**
     * Returns the name of a script code, looked up with
     * {@code english.getName(CLDRFile.SCRIPT_NAME, scriptCode)}.
     * NOTE(review): {@code english} is declared elsewhere in this class (presumably the English CLDRFile).
     */
    private static String getULocaleScriptName(String scriptCode) {
        return english.getName(CLDRFile.SCRIPT_NAME, scriptCode);
        // Earlier ICU-based alternative, kept for reference:
        // return ULocale.getDisplayScript("und_" + scriptCode, ULocale.ENGLISH);
    }
2281 
    /**
     * Returns the name of a territory code, looked up with
     * {@code english.getName(CLDRFile.TERRITORY_NAME, countryCode)}.
     * NOTE(review): {@code english} is declared elsewhere in this class (presumably the English CLDRFile).
     */
    private static String getULocaleCountryName(String countryCode) {
        return english.getName(CLDRFile.TERRITORY_NAME, countryCode);
        // Earlier ICU-based alternative, kept for reference:
        //return ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH);
    }
2286 }
2287