1 package org.unicode.cldr.tool; 2 3 import java.io.BufferedReader; 4 import java.io.IOException; 5 import java.io.PrintWriter; 6 import java.text.ParseException; 7 import java.util.ArrayList; 8 import java.util.Arrays; 9 import java.util.Collection; 10 import java.util.Collections; 11 import java.util.Comparator; 12 import java.util.EnumMap; 13 import java.util.HashMap; 14 import java.util.HashSet; 15 import java.util.Iterator; 16 import java.util.LinkedHashSet; 17 import java.util.List; 18 import java.util.Map; 19 import java.util.Set; 20 import java.util.TreeMap; 21 import java.util.TreeSet; 22 import java.util.regex.Matcher; 23 24 import org.unicode.cldr.draft.FileUtilities; 25 import org.unicode.cldr.draft.ScriptMetadata; 26 import org.unicode.cldr.draft.ScriptMetadata.IdUsage; 27 import org.unicode.cldr.draft.ScriptMetadata.Info; 28 import org.unicode.cldr.util.Builder; 29 import org.unicode.cldr.util.CLDRFile; 30 import org.unicode.cldr.util.CLDRPaths; 31 import org.unicode.cldr.util.CldrUtility; 32 import org.unicode.cldr.util.Factory; 33 import org.unicode.cldr.util.Iso639Data; 34 import org.unicode.cldr.util.Iso639Data.Scope; 35 import org.unicode.cldr.util.Iso639Data.Source; 36 import org.unicode.cldr.util.Iso639Data.Type; 37 import org.unicode.cldr.util.LanguageTagCanonicalizer; 38 import org.unicode.cldr.util.LanguageTagParser; 39 import org.unicode.cldr.util.LocaleIDParser; 40 import org.unicode.cldr.util.LocaleIDParser.Level; 41 import org.unicode.cldr.util.Log; 42 import org.unicode.cldr.util.Pair; 43 import org.unicode.cldr.util.PatternCache; 44 import org.unicode.cldr.util.SpreadSheet; 45 import org.unicode.cldr.util.StandardCodes; 46 import org.unicode.cldr.util.StandardCodes.LstrType; 47 import org.unicode.cldr.util.SupplementalDataInfo; 48 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; 49 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; 50 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; 51 import org.unicode.cldr.util.TransliteratorUtilities; 52 import org.unicode.cldr.util.Validity; 53 import org.unicode.cldr.util.Validity.Status; 54 import org.unicode.cldr.util.XPathParts; 55 import org.unicode.cldr.util.XPathParts.Comments; 56 57 import com.google.common.base.Joiner; 58 import com.google.common.collect.ImmutableSet; 59 import com.google.common.math.DoubleMath; 60 import com.ibm.icu.impl.Relation; 61 import com.ibm.icu.impl.Row; 62 import com.ibm.icu.impl.Row.R2; 63 import com.ibm.icu.text.Collator; 64 import com.ibm.icu.text.NumberFormat; 65 import com.ibm.icu.text.RuleBasedCollator; 66 import com.ibm.icu.text.UTF16; 67 import com.ibm.icu.util.ULocale; 68 69 /** 70 * @author markdavis 71 * 72 */ 73 public class ConvertLanguageData { 74 75 private static final boolean DEBUG = false; 76 // change this if you need to override what is generated for the default contents. 77 private static final List<String> defaultOverrides = Arrays.asList("es_ES".split("\\s+")); // und_ZZ 78 79 public static final boolean SHOW_DIFF = false; 80 81 private static final boolean ALLOW_SMALL_NUMBERS = true; 82 83 static final Comparator<String> GENERAL_COLLATOR = new GeneralCollator(); 84 static final Comparator<String> INVERSE_GENERAL = new InverseComparator<>(GENERAL_COLLATOR); 85 86 private static StandardCodes sc = StandardCodes.make(); 87 88 static final double populationFactor = 1; 89 static final double gdpFactor = 1; 90 static final int BAD_COUNTRY_NAME = 0, COUNTRY_CODE = 1, COUNTRY_POPULATION = 2, COUNTRY_LITERACY = 3, 91 COUNTRY_GDP = 4, OFFICIAL_STATUS = 5, BAD_LANGUAGE_NAME = 6, LANGUAGE_CODE = 7, LANGUAGE_POPULATION = 8, 92 LANGUAGE_LITERACY = 9, COMMENT = 10, NOTES = 11; 93 static final Map<String, CodeAndPopulation> languageToMaxCountry = new TreeMap<>(); 94 static final Map<String, CodeAndPopulation> languageToMaxScript = new TreeMap<>(); 95 96 private static final double NON_OFFICIAL_WEIGHT = 0.40; 97 98 private static final boolean SHOW_OLD_DEFAULT_CONTENTS = false; 99 100 private static final ImmutableSet<String> scriptAssumedLocales = ImmutableSet.of( 101 "bm_ML", "ha_GH", "ha_NE", "ha_NG", "kk_KZ", "ks_IN", "ky_KG", "mn_MN", "ms_BN", "ms_MY", "ms_SG", "tk_TM", "tzm_MA", "ug_CN"); 102 103 static Set<String> skipLocales = new HashSet<>( 104 Arrays 105 .asList( 106 "sh sh_BA sh_CS sh_YU characters supplementalData supplementalData-old supplementalData-old2 supplementalData-old3 supplementalMetadata root" 107 .split("\\s"))); 108 109 static Map<String, String> defaultContent = new TreeMap<>(); 110 111 static Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 112 static CLDRFile english = cldrFactory.make("en", true); 113 114 static SupplementalDataInfo supplementalData = SupplementalDataInfo 115 .getInstance(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY); 116 main(String[] args)117 public static void main(String[] args) throws IOException, ParseException { 118 BufferedReader oldFile = null; 119 try { 120 // load elements we care about 121 Log.setLogNoBOM(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalData.xml"); 122 // Log.println("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"); 123 // Log.println("<!DOCTYPE supplementalData SYSTEM \"http://www.unicode.org/cldr/data/dtd/ldmlSupplemental.dtd\">"); 124 // Log.println("<supplementalData version=\"1.5\">"); 125 126 oldFile = FileUtilities.openUTF8Reader(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY, "supplementalData.xml"); 127 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<languageData>\\s*"), Log.getLog(), false); 128 129 Set<String> available = cldrFactory.getAvailable(); 130 131 Set<String> cldrParents = getCldrParents(available); 132 133 List<String> failures = new ArrayList<>(); 134 Map<String, RowData> localeToRowData = new TreeMap<>(); 135 136 Set<RowData> sortedInput = getExcelData(failures, localeToRowData); 137 138 // get the locales (including parents) 139 Set<String> localesWithData = new TreeSet<>(localeToRowData.keySet()); 140 for (String locale : localeToRowData.keySet()) { 141 while (true) { 142 String parent = LocaleIDParser.getParent(locale); 143 if (parent == null) break; 144 localesWithData.add(parent); 145 locale = parent; 146 } 147 } 148 149 final LanguageTagParser languageTagParser = new LanguageTagParser(); 150 151 for (String localeRaw : available) { 152 String locale = languageTagCanonicalizer.transform(localeRaw); 153 if (!localesWithData.contains(locale)) { 154 CLDRFile locFile = cldrFactory.make(localeRaw, false); 155 if (locFile.isAliasedAtTopLevel()) { 156 continue; 157 } 158 if (scriptAssumedLocales.contains(locale)) { 159 continue; 160 } 161 languageTagParser.set(locale); 162 if (languageTagParser.getVariants().size() != 0) { 163 continue; 164 } 165 String withoutScript = languageTagParser.setScript("").toString(); 166 if (!localesWithData.contains(withoutScript)) { 167 String region = new LanguageTagParser().set(locale).getRegion(); 168 if (StandardCodes.isCountry(region)) { 169 BadItem.ERROR.show("missing language/population data for CLDR locale", locale + " = " + getLanguageCodeAndName(locale)); 170 } 171 } else { 172 // These exceptions are OK, because these locales by default use the non-default script 173 Set<String> OKExceptions = ImmutableSet.of("sr_Cyrl_ME", "zh_Hans_HK", "zh_Hans_MO"); 174 if (OKExceptions.contains(locale)) { 175 continue; 176 } 177 BadItem.ERROR.show("missing language/population data for CLDR locale", locale + " = " + getLanguageCodeAndName(locale) 178 + " but have data for " + getLanguageCodeAndName(withoutScript)); 179 } 180 } 181 } 182 183 // TODO sort by country code, then functionalPopulation, then language code 184 // and keep the top country for each language code (even if < 1%) 185 186 addLanguageScriptData(); 187 188 // showAllBasicLanguageData(allLanguageData, "old"); 189 getLanguage2Scripts(sortedInput); 190 191 writeNewBasicData2(sortedInput); 192 // writeNewBasicData(sortedInput); 193 194 writeTerritoryLanguageData(failures, sortedInput); 195 196 checkBasicData(localeToRowData); 197 198 Set<String> defaultLocaleContent = new TreeSet<>(); 199 200 showDefaults(cldrParents, nf, defaultContent, localeToRowData, defaultLocaleContent); 201 202 // showContent(available); 203 204 // certain items are overridden 205 206 List<String> toRemove = new ArrayList<>(); 207 for (String override : defaultOverrides) { 208 String replacement = getReplacement(override, defaultLocaleContent); 209 if (replacement != null) { 210 toRemove.add(replacement); 211 } 212 } 213 defaultLocaleContent.removeAll(toRemove); 214 defaultLocaleContent.addAll(defaultOverrides); 215 216 showFailures(failures); 217 218 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</territoryInfo>\\s*"), null, false); 219 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<references>\\s*"), Log.getLog(), false); 220 // generateIso639_2Data(); 221 references.printReferences(); 222 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</references>\\s*"), null, false); 223 CldrUtility.copyUpTo(oldFile, null, Log.getLog(), false); 224 // Log.println("</supplementalData>"); 225 Log.close(); 226 oldFile.close(); 227 228 Log.setLogNoBOM(CLDRPaths.GEN_DIRECTORY + "/supplemental", "language_script_raw.txt"); 229 getLanguageScriptSpreadsheet(Log.getLog()); 230 Log.close(); 231 } catch (Exception e) { 232 e.printStackTrace(); 233 } finally { 234 if (oldFile != null) { 235 oldFile.close(); 236 } 237 System.out.println("DONE"); 238 } 239 } 240 getLanguageCodeAndName(String code)241 public static String getLanguageCodeAndName(String code) { 242 if (code == null) return null; 243 return english.getName(code) + " [" + code + "]"; 244 } 245 getReplacement(String oldDefault, Set<String> defaultLocaleContent)246 private static String getReplacement(String oldDefault, Set<String> defaultLocaleContent) { 247 String parent = LocaleIDParser.getParent(oldDefault); 248 for (String replacement : defaultLocaleContent) { 249 if (replacement.startsWith(parent)) { 250 if (parent.equals(LocaleIDParser.getParent(replacement))) { 251 return replacement; 252 } 253 } 254 } 255 return null; 256 } 257 getLanguageScriptSpreadsheet(PrintWriter out)258 private static void getLanguageScriptSpreadsheet(PrintWriter out) { 259 out.println("#Lcode\tLanguageName\tStatus\tScode\tScriptName\tReferences"); 260 Pair<String, String> languageScript = new Pair<>("", ""); 261 for (String language : language_status_scripts.keySet()) { 262 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 263 for (BasicLanguageData.Type status : status_scripts.keySet()) { 264 for (String script : status_scripts.getAll(status)) { 265 String reference = language_script_references.get(languageScript.setFirst(language).setSecond( 266 script)); 267 out.println(language + "\t" + getLanguageName(language) + "\t" + status + "\t" + script + "\t" 268 + getDisplayScript(script) 269 + (reference == null ? "" : "\t" + reference)); 270 } 271 } 272 } 273 } 274 275 /** 276 * Write data in format: 277 * <languageData> 278 * <language type="aa" scripts="Latn" territories="DJ ER ET"/> 279 * 280 * @param sortedInput 281 */ writeNewBasicData2(Set<RowData> sortedInput)282 private static void writeNewBasicData2(Set<RowData> sortedInput) { 283 double cutoff = 0.2; // 20% 284 285 // Relation<String, BasicLanguageData> newLanguageData = new Relation(new TreeMap(), TreeSet.class); 286 LanguageTagParser ltp = new LanguageTagParser(); 287 Map<String, Relation<BasicLanguageData.Type, String>> language_status_territories = new TreeMap<>(); 288 //Map<String, Pair<String, String>> languageToBestCountry; 289 for (RowData rowData : sortedInput) { 290 if (rowData.countryCode.equals("ZZ")) continue; 291 ltp.set(rowData.languageCode); 292 String languageCode = ltp.getLanguage(); 293 Relation<BasicLanguageData.Type, String> status_territories = language_status_territories.get(languageCode); 294 if (status_territories == null) { 295 language_status_territories.put(languageCode, status_territories = Relation.of( 296 new TreeMap<BasicLanguageData.Type, Set<String>>(), 297 TreeSet.class)); 298 } 299 if (rowData.officialStatus.isMajor()) { 300 status_territories.put(BasicLanguageData.Type.primary, rowData.countryCode); 301 } else if (rowData.officialStatus.isOfficial() 302 || rowData.getLanguagePopulation() >= cutoff * rowData.countryPopulation 303 || rowData.getLanguagePopulation() >= 1000000) { 304 status_territories.put(BasicLanguageData.Type.secondary, rowData.countryCode); 305 } 306 } 307 308 Set<String> allLanguages = new TreeSet<>(language_status_territories.keySet()); 309 allLanguages.addAll(language_status_scripts.keySet()); 310 // now add all the remaining language-script info 311 // <language type="sv" scripts="Latn" territories="AX FI SE"/> 312 Set<String> warnings = new LinkedHashSet<>(); 313 Log.println("\t<languageData>"); 314 for (String languageSubtag : allLanguages) { 315 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(languageSubtag); 316 Relation<BasicLanguageData.Type, String> status_territories = language_status_territories 317 .get(languageSubtag); 318 319 // check against old: 320 Map<BasicLanguageData.Type, BasicLanguageData> oldData = supplementalData 321 .getBasicLanguageDataMap(languageSubtag); 322 if (oldData == null) { 323 oldData = Collections.emptyMap(); 324 } 325 326 EnumMap<BasicLanguageData.Type, BasicLanguageData> newData = new EnumMap<>( 327 BasicLanguageData.Type.class); 328 for (BasicLanguageData.Type status : BasicLanguageData.Type.values()) { 329 Set<String> scripts = status_scripts == null ? null : status_scripts.getAll(status); 330 Set<String> territories = status_territories == null ? null : status_territories.getAll(status); 331 if (scripts == null && territories == null) continue; 332 BasicLanguageData bld = new BasicLanguageData(); 333 bld.setTerritories(territories); 334 bld.setScripts(scripts); 335 bld.setType(status); 336 bld.freeze(); 337 newData.put(status, bld); 338 } 339 340 // compare 341 if (!CldrUtility.equals(oldData.entrySet(), newData.entrySet())) { 342 for (String problem : compare(oldData, newData)) { 343 warnings.add(BadItem.DETAIL.toString("changing <languageData>", languageSubtag 344 + "\t" + english.getName(languageSubtag), problem)); 345 } 346 } 347 348 for (BasicLanguageData bld : newData.values()) { 349 Set<String> scripts = bld.getScripts(); 350 Set<String> territories = bld.getTerritories(); 351 BasicLanguageData.Type status = bld.getType(); 352 Log.println("\t\t<language type=\"" + languageSubtag + "\"" 353 + (scripts.isEmpty() ? "" : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"") 354 + (territories.isEmpty() ? "" : " territories=\"" + CldrUtility.join(territories, " ") + "\"") 355 + (status == BasicLanguageData.Type.primary ? "" : " alt=\"secondary\"") 356 + "/>"); 357 } 358 } 359 Log.println("\t</languageData>"); 360 for (String s : warnings) { 361 if (s.contains("!")) { 362 System.out.println(s); 363 } 364 } 365 for (String s : warnings) { 366 if (!s.contains("!")) { 367 System.out.println(s); 368 } 369 } 370 } 371 compare(Map<BasicLanguageData.Type, BasicLanguageData> oldData, Map<BasicLanguageData.Type, BasicLanguageData> newData)372 private static List<String> compare(Map<BasicLanguageData.Type, BasicLanguageData> oldData, 373 Map<BasicLanguageData.Type, BasicLanguageData> newData) { 374 Map<String, BasicLanguageData.Type> oldDataToType = getDataToType(oldData.values(), true); 375 Map<String, BasicLanguageData.Type> newDataToType = getDataToType(newData.values(), true); 376 List<String> result = new ArrayList<>(); 377 StringBuilder temp = new StringBuilder(); 378 for (String s : Builder.with(new LinkedHashSet<String>()).addAll(oldDataToType.keySet()) 379 .addAll(newDataToType.keySet()).get()) { 380 BasicLanguageData.Type oldValue = oldDataToType.get(s); 381 BasicLanguageData.Type newValue = newDataToType.get(s); 382 if (!CldrUtility.equals(oldValue, newValue)) { 383 temp.setLength(0); 384 temp.append("[").append(s).append(":") 385 .append(english.getName(s.length() == 4 ? "script" : "region", s)).append("] "); 386 if (oldValue == null) { 387 temp.append(" added as ").append(newValue); 388 } else if (newValue == null) { 389 temp.append(" REMOVED!"); 390 } else if (oldValue == BasicLanguageData.Type.primary) { 391 temp.append(" DOWNGRADED TO! ").append(newValue); 392 } else { 393 temp.append(" upgraded to ").append(newValue); 394 } 395 result.add(temp.toString()); 396 } 397 } 398 result.add(newData.toString()); 399 return result; 400 } 401 getDataToType( Collection<BasicLanguageData> collection, boolean script)402 private static Map<String, BasicLanguageData.Type> getDataToType( 403 Collection<BasicLanguageData> collection, boolean script) { 404 Map<String, BasicLanguageData.Type> result = new TreeMap<>(); 405 for (BasicLanguageData i : collection) { 406 for (String s : i.getScripts()) { 407 result.put(s, i.getType()); 408 } 409 for (String s : i.getTerritories()) { 410 result.put(s, i.getType()); 411 } 412 } 413 return result; 414 } 415 checkBasicData(Map<String, RowData> localeToRowData)416 private static void checkBasicData(Map<String, RowData> localeToRowData) { 417 // find languages with multiple scripts 418 Relation<String, String> languageToScripts = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 419 for (String languageSubtag : language2BasicLanguageData.keySet()) { 420 for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) { 421 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), item.getScripts()); 422 } 423 } 424 // get primary combinations 425 Set<String> primaryCombos = new TreeSet<>(); 426 Set<String> basicCombos = new TreeSet<>(); 427 for (String languageSubtag : language2BasicLanguageData.keySet()) { 428 for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) { 429 Set<String> scripts = new TreeSet<>(); 430 scripts.addAll(item.getScripts()); 431 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), scripts); 432 if (scripts.size() == 0) { 433 scripts.add("Zzzz"); 434 } 435 Set<String> territories = new TreeSet<>(); 436 territories.addAll(item.getTerritories()); 437 if (territories.size() == 0) { 438 territories.add("ZZ"); 439 continue; 440 } 441 442 for (String script : scripts) { 443 for (String territory : territories) { 444 String locale = StandardCodes.fixLanguageTag(languageSubtag) 445 // + (script.equals("Zzzz") ? "" : languageToScripts.getAll(languageSubtag).size() <= 1 ? "" 446 // : "_" + script) 447 + (territories.equals("ZZ") ? "" : "_" + territory); 448 if (item.getType() != BasicLanguageData.Type.secondary) { 449 primaryCombos.add(locale); 450 } 451 basicCombos.add(locale); 452 } 453 } 454 } 455 } 456 Set<String> populationOver20 = new TreeSet<>(); 457 Set<String> population = new TreeSet<>(); 458 LanguageTagParser ltp = new LanguageTagParser(); 459 for (String rawLocale : localeToRowData.keySet()) { 460 ltp.set(rawLocale); 461 String locale = ltp.getLanguage() + (ltp.getRegion().length() == 0 ? "" : "_" + ltp.getRegion()); 462 population.add(locale); 463 RowData rowData = localeToRowData.get(rawLocale); 464 if (rowData.getLanguagePopulation() / rowData.countryPopulation >= 0.2 465 //|| rowData.getLanguagePopulation() > 900000 466 ) { 467 populationOver20.add(locale); 468 } else { 469 PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData( 470 ltp.getLanguageScript(), ltp.getRegion()); 471 if (popData != null && popData.getOfficialStatus().isOfficial()) { 472 populationOver20.add(locale); 473 } 474 } 475 } 476 Set<String> inBasicButNotPopulation = new TreeSet<>(primaryCombos); 477 478 inBasicButNotPopulation.removeAll(population); 479 for (String locale : inBasicButNotPopulation) { 480 ltp.set(locale); 481 String region = ltp.getRegion(); 482 String language = ltp.getLanguage(); 483 if (!sc.isModernLanguage(language)) continue; 484 PopulationData popData = supplementalData.getPopulationDataForTerritory(region); 485 // Afghanistan AF "29,928,987" 28.10% "21,500,000,000" Hazaragi haz "1,770,000" 28.10% 486 BadItem.WARNING.show("In Basic Data but not Population > 20%", 487 getDisplayCountry(region) 488 + "\t" + region 489 + "\t\"" + formatNumber(popData.getPopulation(), 0, false) + "\"" 490 + "\t\"" + formatPercent(popData.getLiteratePopulation() / popData.getPopulation(), 0, false) 491 + "\"" 492 + "\t\"" + formatPercent(popData.getGdp(), 0, false) + "\"" 493 + "\t" + "" 494 + "\t" + getLanguageName(language) 495 + "\t" + language 496 + "\t" + -1 497 + "\t\"" + formatPercent(popData.getLiteratePopulation() / popData.getPopulation(), 0, false) 498 + "\""); 499 } 500 501 Set<String> inPopulationButNotBasic = new TreeSet<>(populationOver20); 502 inPopulationButNotBasic.removeAll(basicCombos); 503 for (Iterator<String> it = inPopulationButNotBasic.iterator(); it.hasNext();) { 504 String locale = it.next(); 505 if (locale.endsWith("_ZZ")) { 506 it.remove(); 507 } 508 } 509 for (String locale : inPopulationButNotBasic) { 510 BadItem.WARNING.show("In Population>20% but not Basic Data", locale + " " + getLanguageName(locale), localeToRowData.get(locale).toString()); 511 } 512 } 513 514 static class LanguageInfo { 515 static LanguageInfo INSTANCE = new LanguageInfo(); 516 517 Map<String, Set<String>> languageToScripts = new TreeMap<>(); 518 Map<String, Set<String>> languageToRegions = new TreeMap<>(); 519 Map<String, Comments> languageToComments = new TreeMap<>(); 520 521 Map<String, Set<String>> languageToScriptsAlt = new TreeMap<>(); 522 Map<String, Set<String>> languageToRegionsAlt = new TreeMap<>(); 523 Map<String, Comments> languageToCommentsAlt = new TreeMap<>(); 524 LanguageInfo()525 private LanguageInfo() { 526 cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 527 //Set<String> available = cldrFactory.getAvailable(); 528 CLDRFile supplemental = cldrFactory.make("supplementalData", true); 529 for (Iterator<String> it = supplemental.iterator("//supplementalData/languageData/language"); it.hasNext();) { 530 String xpath = it.next(); 531 XPathParts parts = XPathParts.getFrozenInstance(xpath); 532 Map<String, String> x = parts.getAttributes(-1); 533 boolean alt = x.containsKey("alt"); 534 String lang = x.get("type"); 535 List<String> scripts = getAttributeList(x, "scripts"); 536 if (scripts != null) { 537 if (alt) { 538 putAll(languageToScriptsAlt, lang, new LinkedHashSet<>(scripts)); 539 } else { 540 putAll(languageToScripts, lang, new LinkedHashSet<>(scripts)); 541 } 542 } 543 List<String> regions = getAttributeList(x, "territories"); 544 if (regions != null) { 545 if (alt) { 546 putAll(languageToRegionsAlt, lang, new LinkedHashSet<>(regions)); 547 } else { 548 putAll(languageToRegions, lang, new LinkedHashSet<>(regions)); 549 } 550 } 551 } 552 } 553 getAttributeList(Map<String, String> x, String attribute)554 private List<String> getAttributeList(Map<String, String> x, String attribute) { 555 List<String> scripts = null; 556 String scriptString = x.get(attribute); 557 if (scriptString != null) { 558 scripts = Arrays.asList(scriptString.split("\\s+")); 559 } 560 return scripts; 561 } 562 } 563 putUnique(Map<K, V> map, K key, V value)564 private static <K, V> void putUnique(Map<K, V> map, K key, V value) { 565 V oldValue = map.get(key); 566 if (oldValue != null && !oldValue.equals(value)) { 567 throw new IllegalArgumentException("Duplicate value for <" + key + ">: <" + oldValue + ">, <" + value + ">"); 568 } 569 map.put(key, value); 570 } 571 putAll(Map<K, Set<W>> map, K key, Set<W> values)572 private static <K, W> void putAll(Map<K, Set<W>> map, K key, Set<W> values) { 573 Set<W> oldValue = map.get(key); 574 if (oldValue == null) { 575 map.put(key, values); 576 } else { 577 oldValue.addAll(values); 578 } 579 } 580 581 // public enum OfficialStatus {unknown, de_facto_official, official, official_regional, official_minority}; 582 583 static class RowData implements Comparable<Object> { 584 private final String countryCode; 585 private final double countryGdp; 586 private final double countryLiteracy; 587 private final double countryPopulation; 588 private final String languageCode; 589 private final OfficialStatus officialStatus; 590 private final double languagePopulation; 591 private final double languageLiteracy; 592 private final String comment; 593 private final String notes; 594 private final String badLanguageName; 595 private final boolean relativeLanguagePopulation; 596 // String badLanguageCode = ""; 597 private final static Set<String> doneCountries = new HashSet<>(); 598 599 private final static Set<String> countryCodes = sc.getGoodAvailableCodes("territory"); 600 RowData(String country, String language)601 public RowData(String country, String language) { 602 this.countryCode = country; 603 this.languageCode = language; 604 badLanguageName = country = language = notes = comment = ""; 605 officialStatus = OfficialStatus.unknown; 606 countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000); 607 countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d; 608 countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue(); 609 languagePopulation = languageLiteracy = Double.NaN; 610 relativeLanguagePopulation = false; 611 } 612 RowData(List<String> row)613 RowData(List<String> row) throws ParseException { 614 countryCode = fixCountryCode(row.get(COUNTRY_CODE), row); 615 616 if (!countryCodes.contains(countryCode)) { 617 System.err.println("WRONG COUNTRY CODE: " + row); 618 } 619 620 double countryPopulation1 = parseDecimal(row.get(COUNTRY_POPULATION)); 621 double countryLiteracy1 = parsePercent(row.get(COUNTRY_LITERACY), countryPopulation1); 622 623 countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000); 624 countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d; 625 countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue(); 626 627 String officialStatusString = row.get(OFFICIAL_STATUS).trim().replace(' ', '_'); 628 if (officialStatusString.equals("national")) { 629 officialStatusString = "official"; 630 } else if (officialStatusString.equals("regional_official")) { 631 officialStatusString = "official_regional"; 632 } else if (officialStatusString.length() == 0 || officialStatusString.equals("uninhabited")) { 633 officialStatusString = "unknown"; 634 } 635 try { 636 officialStatus = OfficialStatus.valueOf(officialStatusString); 637 } catch (RuntimeException e) { 638 throw new IllegalArgumentException("Can't interpret offical-status: " + officialStatusString); 639 } 640 641 String languageCode1 = row.get(LANGUAGE_CODE); 642 if (languageCode1.startsWith("*") || languageCode1.startsWith("\u00A7")) { 643 languageCode1 = languageCode1.substring(1); 644 } 645 languageCode = fixLanguageCode(languageCode1, row); 646 647 if (doneCountries.contains(countryCode) == false) { 648 // showDiff(countryGdp1, countryGdp); 649 // showDiff(countryLiteracy1, countryLiteracy); 650 if (SHOW_DIFF) showDiff(countryPopulation1, countryPopulation, 0.1, false); 651 doneCountries.add(countryCode); 652 } 653 654 double languagePopulation1 = parsePercent(row.get(LANGUAGE_POPULATION), countryPopulation1) 655 * countryPopulation1; 656 if ((officialStatus.isMajor()) 657 && languagePopulation1 * 100 < countryPopulation && languagePopulation1 < 1000000) { 658 BadItem.WARNING.show("official language has population < 1% of country & < 1,000,000", languageCode + ", " + Math.round(languagePopulation1), 659 row); 660 } 661 if (languagePopulation1 < 0.999) { 662 BadItem.WARNING.show("suspect language population, < 1", languageCode + ", " + Math.round(languagePopulation1), row); 663 } 664 if (languagePopulation1 > 10000) { 665 relativeLanguagePopulation = true; 666 languagePopulation1 = languagePopulation1 * countryPopulation / countryPopulation1; // correct the 667 // values 668 } else { 669 relativeLanguagePopulation = false; 670 } 671 if (isApproximatelyGreater(languagePopulation1, countryPopulation, 0.0001)) { 672 BadItem.ERROR.show("language population > country population", Math.round(languagePopulation1) + " > " + countryPopulation, row); 673 } 674 languagePopulation = languagePopulation1 < countryPopulation ? languagePopulation1 : countryPopulation; 675 676 if (SHOW_DIFF) 677 showDiff(languagePopulation1 / countryPopulation1, languagePopulation / countryPopulation, 0.01, true); 678 679 String stringLanguageLiteracy = row.size() <= LANGUAGE_LITERACY ? "" : row.get(LANGUAGE_LITERACY); 680 double languageLiteracy1 = stringLanguageLiteracy.length() == 0 ? countryLiteracy 681 : parsePercent(stringLanguageLiteracy, languagePopulation); 682 if (isApproximatelyEqual(languageLiteracy1, countryLiteracy1, 0.001)) { 683 languageLiteracy1 = countryLiteracy; // correct the values 684 } 685 languageLiteracy = languageLiteracy1; 686 687 if (row.size() > COMMENT) { 688 comment = row.get(COMMENT); 689 } else { 690 comment = ""; 691 } 692 if (row.size() > NOTES) { 693 notes = row.get(NOTES); 694 } else { 695 notes = ""; 696 } 697 badLanguageName = row.get(BAD_LANGUAGE_NAME); 698 } 699 showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang)700 private void showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang) { 701 final double diff = new_a / a - 1; 702 if (Math.abs(diff) > maxRelativeDiff) { 703 System.out.println(formatPercent(diff, 0, false) 704 + "\t" + countryCode + "\t" + getDisplayCountry(countryCode) 705 + (showLang ? "\t" + languageCode + "\t" + getLanguageName(languageCode) : "") 706 + "\t" + formatNumber(a, 0, false) + "\t=>\t" + formatNumber(new_a, 0, false)); 707 } 708 } 709 roundToPartsPer(double a, double whole)710 private double roundToPartsPer(double a, double whole) { 711 // break this out just to make it easier to follow. 712 double log10 = Math.log10(a / whole); 713 long digitsFound = (long) (log10); 714 long factor = (long) (Math.pow(10, digitsFound)); 715 double rounded = Math.round(a / factor); 716 double result = rounded * factor; 717 // if (Math.abs(result - a) >= 1) { 718 // System.out.println("Rounding " + a + " => " + result); 719 // } 720 return result; 721 } 722 isApproximatelyEqual(double a, double b, double epsilon)723 private static boolean isApproximatelyEqual(double a, double b, double epsilon) { 724 return a == b || Math.abs(a - b) < epsilon; 725 } 726 isApproximatelyGreater(double a, double b, double epsilon)727 private static boolean isApproximatelyGreater(double a, double b, double epsilon) { 728 return a > b + epsilon; 729 } 730 parseDecimal(String numericRepresentation)731 double parseDecimal(String numericRepresentation) throws ParseException { 732 try { 733 // if (numericRepresentation == null || numericRepresentation.length() == 0) return Double.NaN; 734 Number result = nf.parse(numericRepresentation); 735 // if (result == null) return Double.NaN; 736 return result.doubleValue(); 737 } catch (ParseException e) { 738 throw e; 739 // (RuntimeException) new IllegalArgumentException("can't parse <" + numericRepresentation + 740 // ">").initCause(e); 741 } 742 } 743 parsePercent(String numericRepresentation, double baseValue)744 double parsePercent(String numericRepresentation, double baseValue) throws ParseException { 745 try { 746 double result; 747 if (numericRepresentation.contains("%")) { 748 Number result0 = pf.parse(numericRepresentation); 749 result = result0.doubleValue(); 750 } else { 751 Number result0 = nf.parse(numericRepresentation); 752 result = result0.doubleValue() / baseValue; 753 } 754 // if (numericRepresentation == null || numericRepresentation.length() == 0) return Double.NaN; 755 // if (result == null) return Double.NaN; 756 return result; 757 } catch (ParseException e) { 758 throw e; 759 // (RuntimeException) new IllegalArgumentException("can't parse <" + numericRepresentation + 760 // ">").initCause(e); 761 } 762 } 763 getLanguageLiteratePopulation()764 public double getLanguageLiteratePopulation() { 765 return languageLiteracy * languagePopulation; 766 } 767 768 /** 769 * Get the weighted population 770 * 771 * @param weightIfNotOfficial 772 * @return 773 */ getLanguageLiteratePopulation(double weightIfNotOfficial)774 public double getLanguageLiteratePopulation(double weightIfNotOfficial) { 775 double result = languageLiteracy * languagePopulation; 776 if (!officialStatus.isMajor()) { 777 result *= weightIfNotOfficial; 778 } 779 return result; 780 } 781 782 @Override compareTo(Object o)783 public int compareTo(Object o) { 784 RowData that = (RowData) o; 785 int result; 786 if (0 != (result = GENERAL_COLLATOR.compare(countryCode, that.countryCode))) return result; 787 if (languagePopulation > that.languagePopulation) return -1; // descending 788 if (languagePopulation < that.languagePopulation) return 1; 789 if (0 != (result = GENERAL_COLLATOR.compare(languageCode, that.languageCode))) return result; 790 return 0; 791 } 792 toStringHeader()793 public static String toStringHeader() { 794 return "countryCode" + "\t" + "countryPopulation" + "\t" + "countryGdp" 795 + "\t" + "countryLiteracy" 796 + "\t" + "languagePopulation" + "\t" + "languageCode" 797 + "\t" + "writingPopulation"; 798 } 799 800 @Override toString()801 public String toString() { 802 return countryCode + "\t" + countryPopulation + "\t" + countryGdp 803 + "\t" + countryLiteracy 804 + "\t" + languagePopulation + "\t" + languageCode 805 + "\t" + languageLiteracy; 806 } 807 toString(boolean b)808 public String toString(boolean b) { 809 return "region:\t" + getCountryCodeAndName(countryCode) 810 + "\tpop:\t" + countryPopulation 811 + "\tgdp:\t" + countryGdp 812 + "\tlit:\t" + countryLiteracy 813 + "\tlang:\t" + getLanguageCodeAndName(languageCode) 814 + "\tpop:\t" + languagePopulation 815 + "\tlit:\t" + languageLiteracy; 816 } 817 818 static boolean MARK_OUTPUT = false; 819 getRickLanguageCode()820 public String getRickLanguageCode() { 821 if (languageCode.contains("_")) return languageCode; 822 Source source = Iso639Data.getSource(languageCode); 823 if (source == null) { 824 return "§" + languageCode; 825 } 826 if (MARK_OUTPUT) { 827 if (source == Source.ISO_639_3) { 828 return "*" + languageCode; 829 } 830 } 831 return languageCode; 832 } 833 834 static Map<String, String> oldToFixed = new HashMap<>(); 835 getRickLanguageName()836 public String getRickLanguageName() { 837 String cldrResult = getExcelQuote(english.getName(languageCode, true)); 838 // String result = getRickLanguageName2(); 839 // if (!result.equalsIgnoreCase(cldrResult)) { 840 // if (null == oldToFixed.put(result, cldrResult)) { 841 // System.out.println("## " + result + "!=" + cldrResult); 842 // } 843 // } 844 return cldrResult; 845 } 846 getRickLanguageName2()847 public String getRickLanguageName2() { 848 String result = new ULocale(languageCode).getDisplayName(); 849 if (!result.equals(languageCode)) return getExcelQuote(result); 850 Set<String> names = Iso639Data.getNames(languageCode); 851 if (names != null && names.size() != 0) { 852 if (MARK_OUTPUT) { 853 return getExcelQuote("*" + names.iterator().next()); 854 } else { 855 return getExcelQuote(names.iterator().next()); 856 } 857 } 858 return getExcelQuote("§" + badLanguageName); 859 } 860 getCountryName()861 public String getCountryName() { 862 return getExcelQuote(getDisplayCountry(countryCode)); 863 } 864 getCountryGdpString()865 public String getCountryGdpString() { 866 return getExcelQuote(formatNumber(countryGdp, 0, false)); 867 } 868 getCountryLiteracyString()869 public String getCountryLiteracyString() { 870 return formatPercent(countryLiteracy, 2, false); 871 } 872 getCountryPopulationString()873 public String getCountryPopulationString() { 874 return getExcelQuote(formatNumber(countryPopulation, 0, false)); 875 } 876 getLanguageLiteracyString()877 public String getLanguageLiteracyString() { 878 return formatPercent(languageLiteracy, 2, false); 879 } 880 getLanguagePopulationString()881 public String getLanguagePopulationString() { 882 883 try { 884 final double percent = languagePopulation / countryPopulation; 885 return getExcelQuote(relativeLanguagePopulation 886 && percent > 0.03 887 && languagePopulation > 10000 888 ? formatPercent(percent, 2, false) 889 : formatNumber(languagePopulation, 3, false)); 890 } catch (IllegalArgumentException e) { 891 return "NaN"; 892 } 893 } 894 getLanguagePopulation()895 private double getLanguagePopulation() { 896 return languagePopulation; 897 } 898 899 } 900 getExcelQuote(String comment)901 public static String getExcelQuote(String comment) { 902 return comment == null || comment.length() == 0 ? "" 903 : comment.contains(",") ? '"' + comment + '"' 904 : comment.contains("\"") ? '"' + comment.replace("\"", "\"\"") + '"' 905 : comment; 906 } 907 getCountryCodeAndName(String code)908 public static String getCountryCodeAndName(String code) { 909 if (code == null) return null; 910 return english.getName(CLDRFile.TERRITORY_NAME, code) + " [" + code + "]"; 911 } 912 913 static class RickComparator implements Comparator<RowData> { 914 @Override compare(RowData me, RowData that)915 public int compare(RowData me, RowData that) { 916 int result; 917 if (0 != (result = GENERAL_COLLATOR.compare(me.getCountryName(), that.getCountryName()))) return result; 918 if (0 != (result = GENERAL_COLLATOR.compare(me.getRickLanguageName(), that.getRickLanguageName()))) 919 return result; 920 return me.compareTo(that); 921 } 922 } 923 writeTerritoryLanguageData(List<String> failures, Set<RowData> sortedInput)924 private static void writeTerritoryLanguageData(List<String> failures, Set<RowData> sortedInput) { 925 926 String lastCountryCode = ""; 927 boolean first = true; 928 LanguageTagParser ltp = new LanguageTagParser(); 929 930 Log.println(" <!-- See http://unicode.org/cldr/data/diff/supplemental/territory_language_information.html for more information on territoryInfo. -->"); 931 Log.println("\t<territoryInfo>"); 932 933 for (RowData row : sortedInput) { 934 String countryCode = row.countryCode; 935 936 double countryPopulationRaw = row.countryPopulation; 937 double countryPopulation = countryPopulationRaw; // (long) Utility.roundToDecimals(countryPopulationRaw, 2); 938 double languageLiteracy = row.languageLiteracy; 939 double countryLiteracy = row.countryLiteracy; 940 941 double countryGDPRaw = row.countryGdp; 942 long countryGDP = Math.round(countryGDPRaw / gdpFactor); 943 944 String languageCode = row.languageCode; 945 946 double languagePopulationRaw = row.getLanguagePopulation(); 947 double languagePopulation = languagePopulationRaw; // (long) Utility.roundToDecimals(languagePopulationRaw, 948 // 2); 949 950 double languagePopulationPercent = languagePopulation / countryPopulation; 951 // Utility.roundToDecimals(Math.min(100, Math.max(0, 952 // languagePopulation*100 / (double)countryPopulation)),3); 953 954 if (!countryCode.equals(lastCountryCode)) { 955 if (first) { 956 first = false; 957 } else { 958 Log.println("\t\t</territory>"); 959 } 960 Log.print("\t\t<territory type=\"" + countryCode + "\"" 961 + " gdp=\"" + formatNumber(countryGDP, 4, true) + "\"" 962 + " literacyPercent=\"" + formatPercent(countryLiteracy, 3, true) + "\"" 963 + " population=\"" + formatNumber(countryPopulation, 6, true) + "\">"); 964 lastCountryCode = countryCode; 965 Log.println("\t<!--" + getDisplayCountry(countryCode) + "-->"); 966 } 967 968 if (languageCode.length() != 0 969 && languagePopulationPercent > 0.0000 970 && (ALLOW_SMALL_NUMBERS || languagePopulationPercent >= 1 || languagePopulationRaw > 100000 971 || languageCode.equals("haw") || row.officialStatus.isOfficial())) { 972 // add best case 973 addBestRegion(languageCode, countryCode, languagePopulationRaw); 974 String baseScriptLanguage = ltp.set(languageCode).getLanguageScript(); 975 if (!baseScriptLanguage.equals(languageCode)) { 976 addBestRegion(baseScriptLanguage, countryCode, languagePopulationRaw); 977 } 978 String baseLanguage = ltp.set(baseScriptLanguage).getLanguage(); 979 if (!baseLanguage.equals(baseScriptLanguage)) { 980 addBestRegion(baseLanguage, countryCode, languagePopulationRaw); 981 addBestScript(baseLanguage, ltp.set(languageCode).getScript(), languagePopulationRaw); 982 } 983 984 if (languageLiteracy != countryLiteracy) { 985 int debug = 0; 986 } 987 Log.print("\t\t\t<languagePopulation type=\"" 988 + languageCode 989 + "\"" 990 + (DoubleMath.fuzzyCompare(languageLiteracy, countryLiteracy, 0.0001) == 0 ? "" 991 : (DoubleMath.fuzzyCompare(languageLiteracy, 0.05, 0.0001) == 0 ? " writingPercent=\"" : " literacyPercent=\"") 992 + formatPercent(languageLiteracy, 2, true) + "\"") 993 + " populationPercent=\"" + formatPercent(languagePopulationPercent, 2, true) + "\"" 994 + (row.officialStatus.isOfficial() ? " officialStatus=\"" + row.officialStatus + "\"" : "") 995 + references.addReference(row.notes) 996 + "/>"); 997 Log.println("\t<!--" + getLanguageName(languageCode) + "-->"); 998 } else if (!row.countryCode.equals("ZZ")) { 999 failures.add(BadItem.ERROR.toString("too few speakers: suspect line", languageCode, row.toString(true))); 1000 } 1001 // if (first) { 1002 if (false) System.out.print( 1003 "countryCode: " + countryCode + "\t" 1004 + "countryPopulation: " + countryPopulation + "\t" 1005 + "countryGDP: " + countryGDP + "\t" 1006 + "languageCode: " + languageCode + "\t" 1007 + "languagePopulation: " + languagePopulation + CldrUtility.LINE_SEPARATOR); 1008 // } 1009 } 1010 1011 Log.println("\t\t</territory>"); 1012 Log.println("\t</territoryInfo>"); 1013 } 1014 getDisplayCountry(String countryCode)1015 private static String getDisplayCountry(String countryCode) { 1016 String result = getULocaleCountryName(countryCode); 1017 if (!result.equals(countryCode)) { 1018 return result; 1019 } 1020 result = sc.getData("territory", countryCode); 1021 if (result != null) { 1022 return result; 1023 } 1024 return countryCode; 1025 // new ULocale("und-" + countryCode).getDisplayCountry() 1026 } 1027 getDisplayScript(String scriptCode)1028 private static String getDisplayScript(String scriptCode) { 1029 String result = getULocaleScriptName(scriptCode); 1030 if (!result.equals(scriptCode)) { 1031 return result; 1032 } 1033 result = sc.getData("territory", scriptCode); 1034 if (result != null) { 1035 return result; 1036 } 1037 return scriptCode; 1038 // new ULocale("und-" + countryCode).getDisplayCountry() 1039 } 1040 getLanguageName(String languageCode)1041 private static String getLanguageName(String languageCode) { 1042 String result = getULocaleLocaleName(languageCode); 1043 if (!result.equals(languageCode)) return result; 1044 Set<String> names = Iso639Data.getNames(languageCode); 1045 if (names != null && names.size() != 0) { 1046 return names.iterator().next(); 1047 } 1048 return languageCode; 1049 } 1050 1051 static class References { 1052 Map<String, Pair<String, String>> Rxxx_to_reference = new TreeMap<>(); 1053 Map<Pair<String, String>, String> reference_to_Rxxx = new TreeMap<>(); 1054 Map<String, Pair<String, String>> Rxxx_to_oldReferences = supplementalData.getReferences(); 1055 Map<Pair<String, String>, String> oldReferences_to_Rxxx = new TreeMap<>(); 1056 { 1057 for (String Rxxx : Rxxx_to_oldReferences.keySet()) { Rxxx_to_oldReferences.get(Rxxx)1058 oldReferences_to_Rxxx.put(Rxxx_to_oldReferences.get(Rxxx), Rxxx); 1059 } 1060 } 1061 Matcher URI = PatternCache.get("([a-z]+\\://[\\S]+)\\s?(.*)").matcher(""); 1062 1063 static int referenceStart = 1000; 1064 1065 /** 1066 * Returns " references=\"" + Rxxx + "\"" or "" if there is no reference. 1067 * 1068 * @param rawReferenceText 1069 * @return 1070 */ addReference(String rawReferenceText)1071 private String addReference(String rawReferenceText) { 1072 if (rawReferenceText == null || rawReferenceText.length() == 0) return ""; 1073 Pair<String, String> p; 1074 if (URI.reset(rawReferenceText).matches()) { 1075 p = new Pair<>(URI.group(1), URI.group(2) == null || URI.group(2).length() == 0 ? "[missing]" 1076 : URI.group(2)).freeze(); 1077 } else { 1078 p = new Pair<String, String>(null, rawReferenceText).freeze(); 1079 } 1080 1081 String Rxxx = reference_to_Rxxx.get(p); 1082 if (Rxxx == null) { // add new 1083 Rxxx = oldReferences_to_Rxxx.get(p); 1084 if (Rxxx != null) { // if old, just keep number 1085 p = Rxxx_to_oldReferences.get(Rxxx); 1086 } else { // find an empty number 1087 while (true) { 1088 Rxxx = "R" + (referenceStart++); 1089 if (Rxxx_to_reference.get(Rxxx) == null && Rxxx_to_oldReferences.get(Rxxx) == null) { 1090 break; 1091 } 1092 } 1093 } 1094 // add to new references 1095 reference_to_Rxxx.put(p, Rxxx); 1096 Rxxx_to_reference.put(Rxxx, p); 1097 } 1098 // references="R034" 1099 return " references=\"" + Rxxx + "\""; 1100 } 1101 getReferenceHTML(String Rxxx)1102 String getReferenceHTML(String Rxxx) { 1103 Pair<String, String> p = Rxxx_to_reference.get(Rxxx); // exception if fails. 1104 String uri = p.getFirst(); 1105 String value = p.getSecond(); 1106 uri = uri == null ? "" : " uri=\"" + TransliteratorUtilities.toHTML.transliterate(uri) + "\""; 1107 value = value == null ? "[missing]" : TransliteratorUtilities.toHTML.transliterate(value); 1108 return "\t\t<reference type=\"" + Rxxx + "\"" + uri + ">" + value + "</reference>"; 1109 } 1110 printReferences()1111 void printReferences() { 1112 // <reference type="R034" uri="isbn:0-321-18578-1">The Unicode Standard 4.0</reference> 1113 Log.println("\t<references>"); 1114 for (String Rxxx : Rxxx_to_reference.keySet()) { 1115 Log.println(getReferenceHTML(Rxxx)); 1116 } 1117 Log.println("\t</references>"); 1118 } 1119 } 1120 1121 static References references = new References(); 1122 getExcelData(List<String> failures, Map<String, RowData> localeToRowData)1123 private static Set<RowData> getExcelData(List<String> failures, Map<String, RowData> localeToRowData) 1124 throws IOException { 1125 1126 LanguageTagParser ltp = new LanguageTagParser(); 1127 1128 String dir = CLDRPaths.GEN_DIRECTORY + "supplemental/"; 1129 final String ricksFile = "country_language_population_raw.txt"; 1130 System.out.println("\n# Problems in " + ricksFile + "\n"); 1131 List<List<String>> input = SpreadSheet.convert(CldrUtility.getUTF8Data(ricksFile)); 1132 1133 Set<String> languages = languagesNeeded; // sc.getGoodAvailableCodes("language"); 1134 1135 Set<String> territories = new TreeSet<>(sc.getGoodAvailableCodes("territory")); 1136 territories.removeAll(supplementalData.getContainers()); 1137 territories.remove("EU"); 1138 territories.remove("QO"); 1139 1140 Set<String> countriesNotFound = new TreeSet<>(territories); 1141 Set<OfficialStatus> statusFound = new TreeSet<>(); 1142 Set<String> countriesWithoutOfficial = new TreeSet<>(territories); 1143 countriesWithoutOfficial.remove("ZZ"); 1144 1145 Map<String, Row.R2<String, Double>> countryToLargestOfficialLanguage = new HashMap<>(); 1146 1147 Set<String> languagesNotFound = new TreeSet<>(languages); 1148 Set<RowData> sortedInput = new TreeSet<>(); 1149 int count = 0; 1150 for (List<String> row : input) { 1151 ++count; 1152 if (count == 1 || row.size() <= COUNTRY_GDP) { 1153 failures.add(join(row, "\t") + "\tShort row"); 1154 continue; 1155 } 1156 try { 1157 RowData x = new RowData(row); 1158 if (x.officialStatus.isOfficial()) { 1159 Row.R2<String, Double> largestOffical = countryToLargestOfficialLanguage.get(x.countryCode); 1160 if (largestOffical == null) { 1161 countryToLargestOfficialLanguage.put(x.countryCode, 1162 Row.of(x.languageCode, x.languagePopulation)); 1163 } else if (largestOffical.get1() < x.languagePopulation) { 1164 largestOffical.set0(x.languageCode); 1165 largestOffical.set1(x.languagePopulation); 1166 } 1167 } 1168 if (x.officialStatus.isMajor() || x.countryPopulation < 1000) { 1169 countriesWithoutOfficial.remove(x.countryCode); 1170 } 1171 if (!checkCode(LstrType.region, x.countryCode, row)) continue; 1172 statusFound.add(x.officialStatus); 1173 countriesNotFound.remove(x.countryCode); 1174 languagesNotFound.remove(x.languageCode); 1175 if (x.languageCode.contains("_")) { 1176 ltp.set(x.languageCode); 1177 languagesNotFound.remove(ltp.getLanguage()); 1178 if (!checkCode(LstrType.language, ltp.getLanguage(), row)) continue; 1179 if (!checkCode(LstrType.script, ltp.getScript(), row)) continue; 1180 } 1181 String locale = x.languageCode + "_" + x.countryCode; 1182 if (localeToRowData.get(locale) != null) { 1183 BadItem.ERROR.show("duplicate data", x.languageCode + " with " + x.countryCode, row); 1184 } 1185 localeToRowData.put(locale, x); 1186 sortedInput.add(x); 1187 } catch (ParseException e) { 1188 failures.add(join(row, "\t") + "\t" + e.getMessage() + "\t" 1189 + join(Arrays.asList(e.getStackTrace()), ";\t")); 1190 } catch (RuntimeException e) { 1191 throw (RuntimeException) new IllegalArgumentException("Failure on line " + count + ")\t" + row) 1192 .initCause(e); 1193 } 1194 } 1195 // System.out.println("Note: the following Status values were found in the data: " + 1196 // CldrUtility.join(statusFound, " | ")); 1197 1198 // make sure we have something 1199 for (String country : countriesNotFound) { 1200 RowData x = new RowData(country, "und"); 1201 sortedInput.add(x); 1202 } 1203 for (String language : languagesNotFound) { 1204 RowData x = new RowData("ZZ", language); 1205 sortedInput.add(x); 1206 } 1207 1208 for (RowData row : sortedInput) { 1209 // see which countries have languages that are larger than any offical language 1210 1211 if (!row.officialStatus.isOfficial()) { 1212 //String country = row.countryCode; 1213 Row.R2<String, Double> largestOffical = countryToLargestOfficialLanguage.get(row.countryCode); 1214 if (largestOffical != null && largestOffical.get1() < row.languagePopulation) { 1215 BadItem.WARNING.show("language population > all official languages", getLanguageCodeAndName(largestOffical.get0()), row.toString(true)); 1216 } 1217 } 1218 1219 // see which countries are missing an official language 1220 if (!countriesWithoutOfficial.contains(row.countryCode)) continue; 1221 BadItem.ERROR.show("missing official language", row.getCountryName() + "\t" + row.countryCode, row.toString(true)); 1222 countriesWithoutOfficial.remove(row.countryCode); 1223 } 1224 1225 // write out file for rick 1226 PrintWriter log = FileUtilities.openUTF8Writer(dir, ricksFile); 1227 log.println( 1228 "*\tCName" + 1229 "\tCCode" + 1230 "\tCPopulation" + 1231 "\tCLiteracy" + 1232 "\tCGdp" + 1233 "\tOfficialStatus" + 1234 "\tLanguage" + 1235 "\tLCode" + 1236 "\tLPopulation" + 1237 "\tWritingPop" + 1238 "\tReferences" + 1239 "\tNotes"); 1240 RickComparator rickSorting = new RickComparator(); 1241 Set<RowData> rickSorted = new TreeSet<>(rickSorting); 1242 rickSorted.addAll(sortedInput); 1243 1244 for (RowData row : rickSorted) { 1245 final String langLit = row.getLanguageLiteracyString(); 1246 final String countryLit = row.getCountryLiteracyString(); 1247 log.println( 1248 row.getCountryName() 1249 + "\t" + row.countryCode 1250 + "\t" + row.getCountryPopulationString() 1251 + "\t" + countryLit 1252 + "\t" + row.getCountryGdpString() 1253 + "\t" + (row.officialStatus == OfficialStatus.unknown ? "" : row.officialStatus) 1254 + "\t" + row.getRickLanguageName() 1255 + "\t" + row.getRickLanguageCode() 1256 + "\t" + row.getLanguagePopulationString() 1257 + "\t" + (langLit.equals(countryLit) ? "" : langLit) 1258 + "\t" + getExcelQuote(row.comment) 1259 + "\t" + getExcelQuote(row.notes)); 1260 } 1261 log.close(); 1262 return sortedInput; 1263 } 1264 getCldrParents(Set<String> available)1265 private static Set<String> getCldrParents(Set<String> available) { 1266 LanguageTagParser ltp2 = new LanguageTagParser(); 1267 Set<String> cldrParents = new TreeSet<>(); 1268 for (String locale : available) { 1269 if (skipLocales.contains(locale)) continue; 1270 try { 1271 ltp2.set(locale); 1272 } catch (RuntimeException e) { 1273 System.out.println("Skipping CLDR file: " + locale); 1274 continue; 1275 } 1276 String locale2 = ltp2.getLanguageScript(); 1277 if (locale2.equals("sh")) continue; 1278 // int lastPos = locale.lastIndexOf('_'); 1279 // if (lastPos < 0) continue; 1280 // String locale2 = locale.substring(0,lastPos); 1281 cldrParents.add(locale2); 1282 languageToMaxCountry.put(locale2, null); 1283 } 1284 //System.out.println("CLDR Parents: " + cldrParents); 1285 return cldrParents; 1286 } 1287 showFailures(List<String> failures)1288 private static void showFailures(List<String> failures) { 1289 if (failures.size() <= 1) { 1290 return; 1291 } 1292 System.out.println(); 1293 System.out.println("Failures in Output"); 1294 System.out.println(); 1295 1296 System.out.println(RowData.toStringHeader()); 1297 for (String failure : failures) { 1298 System.out.println(failure); 1299 } 1300 } 1301 getProcessedParent(String localeCode)1302 public static String getProcessedParent(String localeCode) { 1303 if (localeCode == null || localeCode.equals("root")) return null; 1304 int pos = localeCode.lastIndexOf('_'); 1305 if (pos < 0) return "root"; 1306 LanguageTagParser ltp = new LanguageTagParser(); 1307 String script = ltp.set(localeCode).getScript(); 1308 if (script.length() == 0) { 1309 return getFullyResolved(localeCode); 1310 } 1311 return localeCode.substring(0, pos); 1312 } 1313 getFullyResolved(String languageCode)1314 private static String getFullyResolved(String languageCode) { 1315 String result = defaultContent.get(languageCode); 1316 if (result != null) return result; 1317 // we missed. Try taking parent and trying again 1318 int pos = languageCode.length() + 1; 1319 while (true) { 1320 pos = languageCode.lastIndexOf('_', pos - 1); 1321 if (pos < 0) { 1322 return "***" + languageCode; 1323 } 1324 result = defaultContent.get(languageCode.substring(0, pos)); 1325 if (result != null) { 1326 LanguageTagParser ltp = new LanguageTagParser().set(languageCode); 1327 LanguageTagParser ltp2 = new LanguageTagParser().set(result); 1328 String region = ltp.getRegion(); 1329 if (region.length() == 0) { 1330 ltp.setRegion(ltp2.getRegion()); 1331 } 1332 String script = ltp.getScript(); 1333 if (script.length() == 0) { 1334 ltp.setScript(ltp2.getScript()); 1335 } 1336 return ltp.toString(); 1337 } 1338 } 1339 } 1340 1341 static Comparator<Iterable> firstElementComparator = new Comparator<Iterable>() { 1342 @Override 1343 public int compare(Iterable o1, Iterable o2) { 1344 int result = ((Comparable) o1.iterator().next()).compareTo((o2.iterator().next())); 1345 assert result != 0; 1346 return result; 1347 } 1348 }; 1349 showDefaults(Set<String> cldrParents, NumberFormat nf, Map<String, String> defaultContent, Map<String, RowData> localeToRowData, Set<String> defaultLocaleContent)1350 private static void showDefaults(Set<String> cldrParents, NumberFormat nf, Map<String, String> defaultContent, 1351 Map<String, RowData> localeToRowData, 1352 Set<String> defaultLocaleContent) { 1353 1354 if (SHOW_OLD_DEFAULT_CONTENTS) { 1355 System.out.println(); 1356 System.out.println("Computing Defaults Contents"); 1357 System.out.println(); 1358 } 1359 1360 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 1361 Set<String> locales = new TreeSet<>(cldrFactory.getAvailable()); 1362 LocaleIDParser lidp = new LocaleIDParser(); 1363 1364 // add all the combinations of language, script, and territory. 1365 for (String locale : localeToRowData.keySet()) { 1366 String baseLanguage = lidp.set(locale).getLanguage(); 1367 if (locales.contains(baseLanguage) && !locales.contains(locale)) { 1368 locales.add(locale); 1369 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding: " + locale); 1370 } 1371 } 1372 1373 // adding parents 1374 Set<String> toAdd = new TreeSet<>(); 1375 while (true) { 1376 for (String locale : locales) { 1377 String newguy = LocaleIDParser.getParent(locale); 1378 if (newguy != null && !locales.contains(newguy) && !toAdd.contains(newguy)) { 1379 toAdd.add(newguy); 1380 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding parent: " + newguy); 1381 } 1382 } 1383 if (toAdd.size() == 0) { 1384 break; 1385 } 1386 locales.addAll(toAdd); 1387 toAdd.clear(); 1388 } 1389 1390 // get sets of siblings 1391 Set<Set<String>> siblingSets = new TreeSet<>(firstElementComparator); 1392 Set<String> needsADoin = new TreeSet<>(locales); 1393 1394 Set<String> deprecatedLanguages = new TreeSet<>(); 1395 deprecatedLanguages.add("sh"); 1396 Set<String> deprecatedRegions = new TreeSet<>(); 1397 deprecatedRegions.add("YU"); 1398 deprecatedRegions.add("CS"); 1399 deprecatedRegions.add("ZZ"); 1400 1401 // first find all the language subtags that have scripts, and those we need to skip. Those are aliased-only 1402 Set<String> skippingItems = new TreeSet<>(); 1403 Set<String> hasAScript = new TreeSet<>(); 1404 //Set<LocaleIDParser.Level> languageOnly = EnumSet.of(LocaleIDParser.Level.Language); 1405 for (String locale : locales) { 1406 lidp.set(locale); 1407 if (lidp.getScript().length() != 0) { 1408 hasAScript.add(lidp.getLanguage()); 1409 } 1410 Set<LocaleIDParser.Level> levels = lidp.getLevels(); 1411 // must have no variants, must have either script or region, no deprecated elements 1412 if (levels.contains(LocaleIDParser.Level.Variants) // no variants 1413 || !(levels.contains(LocaleIDParser.Level.Script) 1414 || levels.contains(LocaleIDParser.Level.Region)) 1415 || deprecatedLanguages.contains(lidp.getLanguage()) 1416 || deprecatedRegions.contains(lidp.getRegion())) { 1417 // skip language-only locales, and ones with variants 1418 needsADoin.remove(locale); 1419 skippingItems.add(locale); 1420 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tremoving: " + locale); 1421 continue; 1422 } 1423 } 1424 // walk through the locales, getting the ones we care about. 1425 Map<String, Double> scriptLocaleToLanguageLiteratePopulation = new TreeMap<>(); 1426 1427 for (String locale : new TreeSet<>(needsADoin)) { 1428 if (!needsADoin.contains(locale)) continue; 1429 lidp.set(locale); 1430 Set<Level> level = lidp.getLevels(); 1431 // skip locales that need scripts and don't have them 1432 if (!level.contains(LocaleIDParser.Level.Script) // no script 1433 && hasAScript.contains(lidp.getLanguage())) { 1434 needsADoin.remove(locale); 1435 skippingItems.add(locale); 1436 continue; 1437 } 1438 // get siblings 1439 Set<String> siblingSet = lidp.getSiblings(needsADoin); 1440 // if it has a script and region 1441 if (level.contains(LocaleIDParser.Level.Script) && level.contains(LocaleIDParser.Level.Region)) { 1442 double languageLiteratePopulation = 0; 1443 for (String localeID2 : siblingSet) { 1444 RowData rowData = localeToRowData.get(localeID2); 1445 if (rowData != null) { 1446 languageLiteratePopulation += rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT); 1447 } 1448 } 1449 String parentID = LocaleIDParser.getParent(locale); 1450 scriptLocaleToLanguageLiteratePopulation.put(parentID, languageLiteratePopulation); 1451 } 1452 1453 try { 1454 siblingSets.add(siblingSet); 1455 } catch (RuntimeException e) { 1456 e.printStackTrace(); 1457 } 1458 needsADoin.removeAll(siblingSet); 1459 } 1460 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("ConvertLanguageData Skipping: " + skippingItems); 1461 if (needsADoin.size() != 0) { 1462 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("Missing: " + needsADoin); 1463 } 1464 1465 // walk through the data 1466 Set<String> skippingSingletons = new TreeSet<>(); 1467 1468 Set<String> missingData = new TreeSet<>(); 1469 for (Set<String> siblingSet : siblingSets) { 1470 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("** From siblings: " + siblingSet); 1471 1472 if (false & siblingSet.size() == 1) { 1473 skippingSingletons.add(siblingSet.iterator().next()); 1474 continue; 1475 } 1476 // get best 1477 double best = Double.NEGATIVE_INFINITY; 1478 String bestLocale = "???"; 1479 Set<Pair<Double, String>> data = new TreeSet<>(); 1480 LanguageTagParser ltp = new LanguageTagParser(); 1481 for (String locale : siblingSet) { 1482 RowData rowData = localeToRowData.get(locale); 1483 double languageLiteratePopulation = -1; 1484 if (rowData != null) { 1485 languageLiteratePopulation = rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT); 1486 } else { 1487 Double d = scriptLocaleToLanguageLiteratePopulation.get(locale); 1488 if (d != null) { 1489 languageLiteratePopulation = d; 1490 } else { 1491 final String region = ltp.set(locale).getRegion(); 1492 if (region.isEmpty() || StandardCodes.isCountry(region)) { 1493 missingData.add(locale); 1494 } 1495 } 1496 } 1497 data.add(new Pair<>(languageLiteratePopulation, locale)); 1498 if (best < languageLiteratePopulation) { 1499 best = languageLiteratePopulation; 1500 bestLocale = locale; 1501 } 1502 } 1503 // show it 1504 for (Pair<Double, String> datum : data) { 1505 if (SHOW_OLD_DEFAULT_CONTENTS) 1506 System.out.format( 1507 "\tContenders: %s %f (based on literate population)" + CldrUtility.LINE_SEPARATOR, 1508 datum.getSecond(), datum.getFirst()); 1509 } 1510 // System.out.format("\tPicking default content: %s %f (based on literate population)" + 1511 // Utility.LINE_SEPARATOR, bestLocale, best); 1512 // Hack to fix English 1513 // TODO Generalize in the future for other locales with non-primary scripts 1514 if (bestLocale.startsWith("en_")) { 1515 defaultLocaleContent.add("en_US"); 1516 } else { 1517 defaultLocaleContent.add(bestLocale); 1518 } 1519 } 1520 1521 for (String singleton : skippingSingletons) { 1522 BadItem.WARNING.show("skipping Singletons", singleton); 1523 } 1524 for (String missing : missingData) { 1525 BadItem.WARNING.show("Missing Data", missing); 1526 } 1527 1528 // LanguageTagParser ltp = new LanguageTagParser(); 1529 // Set<String> warnings = new LinkedHashSet(); 1530 // for (String languageCode : languageToMaxCountry.keySet()) { 1531 // CodeAndPopulation best = languageToMaxCountry.get(languageCode); 1532 // String languageSubtag = ltp.set(languageCode).getLanguage(); 1533 // String countryCode = "ZZ"; 1534 // double rawLanguagePopulation = -1; 1535 // if (best != null) { 1536 // countryCode = best.code; 1537 // rawLanguagePopulation = best.population; 1538 // Set<String> regions = LanguageInfo.INSTANCE.languageToRegions.get(languageSubtag); 1539 // if (regions == null || !regions.contains(countryCode)) { 1540 // Set<String> regions2 = LanguageInfo.INSTANCE.languageToRegionsAlt.get(languageSubtag); 1541 // if (regions2 == null || !regions2.contains(countryCode)) { 1542 // warnings.add("WARNING: " + languageCode + " => " + countryCode + ", not in " + regions + "/" + regions2); 1543 // } 1544 // } 1545 // } 1546 // String resolvedLanguageCode = languageCode + "_" + countryCode; 1547 // ltp.set(languageCode); 1548 // Set<String> scripts = LanguageInfo.INSTANCE.languageToScripts.get(languageCode); 1549 // String script = ltp.getScript(); 1550 // if (script.length() == 0) { 1551 // CodeAndPopulation bestScript = languageToMaxScript.get(languageCode); 1552 // if (bestScript != null) { 1553 // script = bestScript.code; 1554 // if (scripts == null || !scripts.contains(script)) { 1555 // warnings.add("WARNING: " + languageCode + " => " + script + ", not in " + scripts); 1556 // } 1557 // } else { 1558 // script = "Zzzz"; 1559 // if (scripts == null) { 1560 // scripts = LanguageInfo.INSTANCE.languageToScriptsAlt.get(languageCode); 1561 // } 1562 // if (scripts != null) { 1563 // script = scripts.iterator().next(); 1564 // if (scripts.size() != 1) { 1565 // warnings.add("WARNING: " + languageCode + " => " + scripts); 1566 // } 1567 // } 1568 // } 1569 // if (scripts == null) { 1570 // warnings.add("Missing scripts for: " + languageCode); 1571 // } else if (scripts.size() == 1){ 1572 // script = ""; 1573 // } 1574 // resolvedLanguageCode = languageCode 1575 // + (script.length() == 0 ? "" : "_" + script) 1576 // + "_" + countryCode; 1577 // } 1578 // 1579 // 1580 // System.out.println( 1581 // resolvedLanguageCode 1582 // + "\t" + languageCode 1583 // + "\t" + ULocale.getDisplayName(languageCode, ULocale.ENGLISH) 1584 // + "\t" + countryCode 1585 // + "\t" + ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH) 1586 // + "\t" + formatNumber(rawLanguagePopulation) 1587 // + (cldrParents.contains(languageCode) ? "\tCLDR" : "") 1588 // ); 1589 // if (languageCode.length() == 0) continue; 1590 // defaultContent.put(languageCode, resolvedLanguageCode); 1591 // } 1592 // for (String warning : warnings) { 1593 // System.out.println(warning); 1594 // } 1595 } 1596 1597 // private static void printDefaultContent(Set<String> defaultLocaleContent) { 1598 // String sep = Utility.LINE_SEPARATOR + "\t\t\t"; 1599 // String broken = Utility.breakLines(join(defaultLocaleContent," "), sep, PatternCache.get("(\\S)\\S*").matcher(""), 1600 // 80); 1601 // 1602 // Log.println("\t\t<defaultContent locales=\"" + broken + "\""); 1603 // Log.println("\t\t/>"); 1604 // } 1605 getSuppressScript(String languageCode)1606 private static Object getSuppressScript(String languageCode) { 1607 // TODO Auto-generated method stub 1608 return null; 1609 } 1610 join(Collection c, String separator)1611 public static String join(Collection c, String separator) { 1612 StringBuffer result = new StringBuffer(); 1613 boolean first = true; 1614 for (Object x : c) { 1615 if (first) 1616 first = false; 1617 else 1618 result.append(separator); 1619 result.append(x); 1620 } 1621 return result.toString(); 1622 } 1623 addBestRegion(String languageCode, String countryCode, double languagePopulationRaw)1624 private static void addBestRegion(String languageCode, String countryCode, double languagePopulationRaw) { 1625 addBest(languageCode, languagePopulationRaw, countryCode, languageToMaxCountry); 1626 } 1627 addBestScript(String languageCode, String scriptCode, double languagePopulationRaw)1628 private static void addBestScript(String languageCode, String scriptCode, double languagePopulationRaw) { 1629 addBest(languageCode, languagePopulationRaw, scriptCode, languageToMaxScript); 1630 } 1631 addBest(String languageCode, double languagePopulationRaw, String code, Map<String, CodeAndPopulation> languageToMaxCode)1632 private static void addBest(String languageCode, double languagePopulationRaw, String code, 1633 Map<String, CodeAndPopulation> languageToMaxCode) { 1634 if (languageCode.length() == 0) { 1635 throw new IllegalArgumentException(); 1636 } 1637 CodeAndPopulation best = languageToMaxCode.get(languageCode); 1638 if (best == null) { 1639 languageToMaxCode.put(languageCode, best = new CodeAndPopulation()); 1640 } else if (best.population >= languagePopulationRaw) { 1641 return; 1642 } 1643 best.population = languagePopulationRaw; 1644 best.code = code; 1645 } 1646 1647 static class CodeAndPopulation { 1648 String code = null; 1649 double population = Double.NaN; 1650 1651 @Override toString()1652 public String toString() { 1653 return "{" + code + "," + population + "}"; 1654 } 1655 } 1656 1657 static public class GeneralCollator implements Comparator<String> { 1658 static UTF16.StringComparator cpCompare = new UTF16.StringComparator(true, false, 0); 1659 static RuleBasedCollator UCA = (RuleBasedCollator) Collator 1660 .getInstance(ULocale.ROOT); 1661 static { 1662 UCA.setNumericCollation(true); 1663 } 1664 1665 @Override compare(String s1, String s2)1666 public int compare(String s1, String s2) { 1667 if (s1 == null) { 1668 return s2 == null ? 0 : -1; 1669 } else if (s2 == null) { 1670 return 1; 1671 } 1672 int result = UCA.compare(s1, s2); 1673 if (result != 0) return result; 1674 return cpCompare.compare(s1, s2); 1675 } 1676 } 1677 1678 public static class InverseComparator<T> implements Comparator<T> { 1679 private Comparator<T> other; 1680 InverseComparator()1681 public InverseComparator() { 1682 this.other = null; 1683 } 1684 InverseComparator(Comparator<T> other)1685 public InverseComparator(Comparator<T> other) { 1686 this.other = other; 1687 } 1688 1689 @Override compare(T a, T b)1690 public int compare(T a, T b) { 1691 return other == null 1692 ? ((Comparable) b).compareTo(a) 1693 : other.compare(b, a); 1694 } 1695 } 1696 1697 static Set<String> languagesNeeded = new TreeSet<>( 1698 Arrays 1699 .asList("ab ba bh bi bo fj fy gd ha ht ik iu ks ku ky lg mi na nb rm sa sd sg si sm sn su tg tk to tw vo yi za lb dv chr syr kha sco gv" 1700 .split("\\s"))); 1701 generateIso639_2Data()1702 static void generateIso639_2Data() { 1703 for (String languageSubtag : sc.getAvailableCodes("language")) { 1704 String alpha3 = Iso639Data.toAlpha3(languageSubtag); 1705 Type type = Iso639Data.getType(languageSubtag); 1706 Scope scope = Iso639Data.getScope(languageSubtag); 1707 if (type != null || alpha3 != null || scope != null) { 1708 Log.println("\t\t<languageCode type=\"" + languageSubtag + "\"" + 1709 (alpha3 == null ? "" : " iso639Alpha3=\"" + alpha3 + "\"") + 1710 (type == null ? "" : " iso639Type=\"" + type + "\"") + 1711 (scope == null ? "" : " iso639Scope=\"" + scope + "\"") + 1712 "/>"); 1713 } 1714 1715 } 1716 } 1717 1718 static Relation<String, BasicLanguageData> language2BasicLanguageData = Relation.of(new TreeMap<String, Set<BasicLanguageData>>(), TreeSet.class); 1719 1720 static Map<String, Relation<BasicLanguageData.Type, String>> language_status_scripts; 1721 static Map<Pair<String, String>, String> language_script_references = new TreeMap<>(); 1722 1723 static final Map<String, Map<String, R2<List<String>, String>>> LOCALE_ALIAS_INFO = SupplementalDataInfo 1724 .getInstance().getLocaleAliasInfo(); 1725 getLanguage2Scripts(Set<RowData> sortedInput)1726 static void getLanguage2Scripts(Set<RowData> sortedInput) throws IOException { 1727 language_status_scripts = new TreeMap<>(); 1728 1729 // // get current scripts 1730 // Relation<String,String> languageToDefaultScript = new Relation(new TreeMap(), TreeSet.class); 1731 // Relation<String,String> secondaryLanguageToDefaultScript = new Relation(new TreeMap(), TreeSet.class); 1732 // for (String languageSubtag : language2BasicLanguageData.keySet()) { 1733 // for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) { 1734 // for (String script : item.getScripts()) { 1735 // addLanguage2Script(languageSubtag, item.getType(), script); 1736 // } 1737 // } 1738 // } 1739 // System.out.println("Language 2 scripts: " + language_status_scripts); 1740 1741 // #Lcode LanguageName Status Scode ScriptName References 1742 List<List<String>> input = SpreadSheet.convert(CldrUtility.getUTF8Data("language_script_raw.txt")); 1743 System.out.println(CldrUtility.LINE_SEPARATOR + "# Problems in language_script_raw.txt" 1744 + CldrUtility.LINE_SEPARATOR); 1745 //int count = -1; 1746 for (List<String> row : input) { 1747 try { 1748 if (row.size() == 0) continue; 1749 //++count; 1750 String language = row.get(0).trim(); 1751 if (language.length() == 0 || language.startsWith("#")) continue; 1752 BasicLanguageData.Type status = BasicLanguageData.Type.valueOf(row.get(2)); 1753 String scripts = row.get(3); 1754 if (!checkCode(LstrType.language, language, row)) continue; 1755 for (String script : scripts.split("\\s+")) { 1756 if (!checkCode(LstrType.script, script, row)) continue; 1757 // if the script is not modern, demote 1758 Info scriptInfo = ScriptMetadata.getInfo(script); 1759 if (scriptInfo == null) { 1760 BadItem.ERROR.toString("illegal script; must be represented in Unicode, remove line or fix", script, row); 1761 continue; 1762 } 1763 IdUsage idUsage = scriptInfo.idUsage; 1764 if (status == BasicLanguageData.Type.primary && idUsage != IdUsage.RECOMMENDED) { 1765 if (idUsage == IdUsage.ASPIRATIONAL || idUsage == IdUsage.LIMITED_USE) { 1766 BadItem.WARNING.toString("Script has unexpected usage; make secondary if a Recommended script is used widely for the langauge", 1767 idUsage + ", " + script + "=" + getULocaleScriptName(script), row); 1768 } else { 1769 BadItem.ERROR.toString("Script is not modern; make secondary", idUsage + ", " + script + "=" + getULocaleScriptName(script), row); 1770 status = BasicLanguageData.Type.secondary; 1771 } 1772 } 1773 1774 // if the language is not modern, demote 1775 if (LOCALE_ALIAS_INFO.get("language").containsKey(language)) { 1776 BadItem.ERROR.toString("Remove/Change deprecated language", language + " " 1777 + getLanguageName(language) + "; " + LOCALE_ALIAS_INFO.get("language").get(language), row); 1778 continue; 1779 } 1780 if (status == BasicLanguageData.Type.primary && !sc.isModernLanguage(language)) { 1781 BadItem.ERROR.toString("Should be secondary, language is not modern", language + " " + getLanguageName(language), row); 1782 status = BasicLanguageData.Type.secondary; 1783 } 1784 1785 addLanguage2Script(language, status, script); 1786 if (row.size() > 5) { 1787 String reference = row.get(5); 1788 if (reference != null && reference.length() == 0) { 1789 language_script_references.put(new Pair<>(language, script), reference); 1790 } 1791 } 1792 } 1793 } catch (RuntimeException e) { 1794 System.err.println(row); 1795 throw e; 1796 } 1797 } 1798 1799 // System.out.println("Language 2 scripts: " + language_status_scripts); 1800 1801 for (String language : sc.getGoodAvailableCodes("language")) { 1802 if (supplementalData.getDeprecatedInfo("language", language) != null) { 1803 continue; 1804 } 1805 Map<String, String> registryData = sc.getLangData("language", language); 1806 if (registryData != null) { 1807 String suppressScript = registryData.get("Suppress-Script"); 1808 if (suppressScript == null) continue; 1809 if (ScriptMetadata.getInfo(suppressScript) == null) { 1810 // skip, not represented in Unicode 1811 continue; 1812 } 1813 // if there is something already there, we have a problem. 1814 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 1815 if (status_scripts == null) { 1816 System.out 1817 .println("Missing Suppress-Script: " + language + "\tSuppress-Script:\t" + suppressScript); 1818 } else if (!status_scripts.values().contains(suppressScript)) { 1819 System.out.println("Missing Suppress-Script: " + language + "\tSuppress-Script:\t" + suppressScript 1820 + "\tall:\t" + status_scripts.values()); 1821 } else { 1822 // at this point, the suppressScript is in the union of the primary and secondary. 1823 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary); 1824 if (primaryScripts != null && !primaryScripts.contains(suppressScript)) { 1825 System.out.println("Suppress-Script is not in primary: " + language + "\tSuppress-Script:\t" 1826 + suppressScript + "\tprimary:\t" 1827 + primaryScripts); 1828 } 1829 } 1830 addLanguage2Script(language, BasicLanguageData.Type.primary, suppressScript); 1831 } 1832 } 1833 1834 // remove primaries from secondaries 1835 // check for primaries for scripts 1836 for (String language : language_status_scripts.keySet()) { 1837 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 1838 Set<String> secondaryScripts = status_scripts.getAll(BasicLanguageData.Type.secondary); 1839 if (secondaryScripts == null) continue; 1840 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary); 1841 if (primaryScripts == null) { 1842 // status_scripts.putAll(BasicLanguageData.Type.primary, secondaryScripts); 1843 // status_scripts.removeAll(BasicLanguageData.Type.secondary); 1844 if (sc.isModernLanguage(language)) { 1845 BadItem.ERROR.show("modern language without primary script, might need to edit moribund_languages.txt", language + " " 1846 + getLanguageName(language)); 1847 } 1848 } else { 1849 status_scripts.removeAll(BasicLanguageData.Type.secondary, primaryScripts); 1850 } 1851 } 1852 1853 // check that every living language in the row data has a script 1854 Set<String> livingLanguagesWithTerritories = new TreeSet<>(); 1855 for (RowData rowData : sortedInput) { 1856 String language = rowData.languageCode; 1857 if (sc.isModernLanguage(language) && Iso639Data.getSource(language) != Iso639Data.Source.ISO_639_3) { 1858 livingLanguagesWithTerritories.add(language); 1859 } 1860 } 1861 for (String language : livingLanguagesWithTerritories) { 1862 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 1863 if (status_scripts != null) { 1864 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary); 1865 if (primaryScripts != null && primaryScripts.size() > 0) { 1866 continue; 1867 } 1868 } 1869 if (language.equals("tw")) continue; // TODO load aliases and check... 1870 BadItem.WARNING.show("ISO 639-1/2 language in language-territory list without primary script", language + "\t" + getLanguageName(language)); 1871 } 1872 1873 // System.out.println("Language 2 scripts: " + language_status_scripts); 1874 } 1875 checkScript(String script)1876 private static boolean checkScript(String script) { 1877 // TODO Auto-generated method stub 1878 return false; 1879 } 1880 1881 static Validity VALIDITY = Validity.getInstance(); 1882 checkCode(LstrType type, String code, List<String> sourceLine)1883 private static boolean checkCode(LstrType type, String code, List<String> sourceLine) { 1884 Status validity = VALIDITY.getCodeToStatus(type).get(code); 1885 if (validity == Status.regular) { 1886 if (type == LstrType.language && code.equals("no")) { 1887 validity = Status.invalid; 1888 } else { 1889 return true; 1890 } 1891 } else if (validity == Status.unknown && type == LstrType.region) { 1892 return true; 1893 } 1894 BadItem.ERROR.show("Illegitimate Code", type + ": " + code + " = " + validity, sourceLine); 1895 return false; 1896 } 1897 addLanguage2Script(String language, BasicLanguageData.Type type, String script)1898 private static void addLanguage2Script(String language, BasicLanguageData.Type type, String script) { 1899 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 1900 if (status_scripts == null) 1901 language_status_scripts.put(language, status_scripts = Relation.of(new TreeMap<BasicLanguageData.Type, Set<String>>(), TreeSet.class)); 1902 status_scripts.put(type, script); 1903 } 1904 addLanguageScriptData()1905 static void addLanguageScriptData() throws IOException { 1906 // check to make sure that every language subtag is in 639-3 1907 Set<String> langRegistryCodes = sc.getGoodAvailableCodes("language"); 1908 // Set<String> iso639_2_missing = new TreeSet(langRegistryCodes); 1909 // iso639_2_missing.removeAll(Iso639Data.getAvailable()); 1910 // iso639_2_missing.remove("root"); 1911 // if (iso639_2_missing.size() != 0) { 1912 // for (String missing : iso639_2_missing){ 1913 // System.out.println("*ERROR in StandardCodes* Missing Lang/Script data:\t" + missing + ", " + 1914 // sc.getData("language", missing)); 1915 // } 1916 // } 1917 1918 // Map<String, String> nameToTerritoryCode = new TreeMap(); 1919 // for (String territoryCode : sc.getGoodAvailableCodes("territory")) { 1920 // nameToTerritoryCode.put(sc.getData("territory", territoryCode).toLowerCase(), territoryCode); 1921 // } 1922 // nameToTerritoryCode.put("iran", nameToTerritoryCode.get("iran, islamic republic of")); // 1923 1924 //BasicLanguageData languageData = new BasicLanguageData(); 1925 1926 BufferedReader in = CldrUtility.getUTF8Data("extraLanguagesAndScripts.txt"); 1927 while (true) { 1928 String line = in.readLine(); 1929 if (line == null) break; 1930 String[] parts = line.split("\\t"); 1931 String alpha3 = parts[0]; 1932 alpha3 = stripBrackets(alpha3); 1933 String languageSubtag = Iso639Data.fromAlpha3(alpha3); 1934 if (languageSubtag == null) { 1935 if (langRegistryCodes.contains(alpha3)) { 1936 languageSubtag = alpha3; 1937 } else { 1938 BadItem.WARNING.show("Language subtag not found on line", alpha3, line); 1939 continue; 1940 } 1941 } 1942 //String name = parts[1]; 1943 Set<String> names = Iso639Data.getNames(languageSubtag); 1944 if (names == null) { 1945 Map<String, String> name2 = sc.getLangData("language", languageSubtag); 1946 if (name2 != null) { 1947 String name3 = name2.get("Description"); 1948 if (name3 != null) { 1949 names = new TreeSet<>(); 1950 names.add(name3); 1951 } 1952 } 1953 } 1954 // if (names == null || !names.contains(name)) { 1955 // System.out.println("Name <" + name + "> for <" + languageSubtag + "> not found in " + names); 1956 // } 1957 1958 // names all straight, now get scripts and territories 1959 // [Cyrl]; [Latn] 1960 Set<String> fullScriptList = sc.getGoodAvailableCodes("script"); 1961 1962 String[] scriptList = parts[2].split("[;,]\\s*"); 1963 Set<String> scripts = new TreeSet<>(); 1964 Set<String> scriptsAlt = new TreeSet<>(); 1965 for (String script : scriptList) { 1966 if (script.length() == 0) continue; 1967 boolean alt = false; 1968 if (script.endsWith("*")) { 1969 alt = true; 1970 script = script.substring(0, script.length() - 1); 1971 } 1972 script = stripBrackets(script); 1973 if (!fullScriptList.contains(script)) { 1974 System.out.println("Script <" + script + "> for <" + languageSubtag + "> not found in " 1975 + fullScriptList); 1976 } else if (alt) { 1977 scriptsAlt.add(script); 1978 } else { 1979 scripts.add(script); 1980 } 1981 } 1982 // now territories 1983 Set<String> territories = new TreeSet<>(); 1984 if (parts.length > 4) { 1985 String[] territoryList = parts[4].split("\\s*[;,-]\\s*"); 1986 for (String territoryName : territoryList) { 1987 if (territoryName.equals("ISO/DIS 639") || territoryName.equals("3")) continue; 1988 String territoryCode = CountryCodeConverter.getCodeFromName(territoryName, true); 1989 if (territoryCode == null) { 1990 BadItem.ERROR.show("no name found for territory", "<" + territoryName + ">", languageSubtag); 1991 } else { 1992 territories.add(territoryCode); 1993 } 1994 } 1995 } 1996 // <language type="de" scripts="Latn" territories="IT" alt="secondary"/> 1997 // we're going to go ahead and set these all to secondary. 1998 if (scripts.size() != 0) { 1999 language2BasicLanguageData.put(languageSubtag, 2000 new BasicLanguageData().setType(BasicLanguageData.Type.secondary).setScripts(scripts) 2001 .setTerritories(territories)); 2002 } 2003 if (scriptsAlt.size() != 0) { 2004 language2BasicLanguageData.put(languageSubtag, 2005 new BasicLanguageData().setType(BasicLanguageData.Type.secondary).setScripts(scriptsAlt) 2006 .setTerritories(territories)); 2007 } 2008 } 2009 in.close(); 2010 2011 // add other data 2012 for (String languageSubtag : supplementalData.getBasicLanguageDataLanguages()) { 2013 Set<BasicLanguageData> otherData = supplementalData.getBasicLanguageData(languageSubtag); 2014 language2BasicLanguageData.putAll(languageSubtag, otherData); 2015 } 2016 } 2017 2018 // private static void showAllBasicLanguageData(Relation<String, BasicLanguageData> language2basicData, String 2019 // comment) { 2020 // // now print 2021 // Relation<String, String> primaryCombos = new Relation(new TreeMap(), TreeSet.class); 2022 // Relation<String, String> secondaryCombos = new Relation(new TreeMap(), TreeSet.class); 2023 // 2024 // Log.println("\t<languageData>" + (comment == null ? "" : " <!-- " + comment + " -->")); 2025 // 2026 // for (String languageSubtag : language2basicData.keySet()) { 2027 // String duplicate = ""; 2028 // // script,territory 2029 // primaryCombos.clear(); 2030 // secondaryCombos.clear(); 2031 // 2032 // for (BasicLanguageData item : language2basicData.getAll(languageSubtag)) { 2033 // Set<String> scripts = item.getScripts(); 2034 // if (scripts.size() == 0) scripts = new TreeSet(Arrays.asList(new String[] { "Zzzz" })); 2035 // for (String script : scripts) { 2036 // Set<String> territories = item.getTerritories(); 2037 // if (territories.size() == 0) territories = new TreeSet(Arrays.asList(new String[] { "ZZ" })); 2038 // for (String territory : territories) { 2039 // if (item.getType().equals(BasicLanguageData.Type.primary)) { 2040 // primaryCombos.put(script, territory); 2041 // } else { 2042 // secondaryCombos.put(script, territory); 2043 // } 2044 // } 2045 // } 2046 // } 2047 // secondaryCombos.removeAll(primaryCombos); 2048 // showBasicLanguageData(languageSubtag, primaryCombos, null, BasicLanguageData.Type.primary); 2049 // showBasicLanguageData(languageSubtag, secondaryCombos, primaryCombos.keySet(), 2050 // BasicLanguageData.Type.secondary); 2051 // // System.out.println(item.toString(languageSubtag) + duplicate); 2052 // // duplicate = " <!-- " + "**" + " -->"; 2053 // } 2054 // Log.println("\t</languageData>"); 2055 // } 2056 showBasicLanguageData(String languageSubtag, Relation<String, String> primaryCombos, Set<String> suppressEmptyScripts, BasicLanguageData.Type type)2057 private static void showBasicLanguageData(String languageSubtag, Relation<String, String> primaryCombos, 2058 Set<String> suppressEmptyScripts, BasicLanguageData.Type type) { 2059 Set<String> scriptsWithSameTerritories = new TreeSet<>(); 2060 Set<String> lastTerritories = Collections.emptySet(); 2061 for (String script : primaryCombos.keySet()) { 2062 Set<String> territories = primaryCombos.getAll(script); 2063 if (lastTerritories == Collections.EMPTY_SET) { 2064 // skip first 2065 } else if (lastTerritories.equals(territories)) { 2066 scriptsWithSameTerritories.add(script); 2067 } else { 2068 showBasicLanguageData2(languageSubtag, scriptsWithSameTerritories, suppressEmptyScripts, 2069 lastTerritories, type); 2070 scriptsWithSameTerritories.clear(); 2071 } 2072 lastTerritories = territories; 2073 scriptsWithSameTerritories.add(script); 2074 } 2075 showBasicLanguageData2(languageSubtag, scriptsWithSameTerritories, suppressEmptyScripts, lastTerritories, type); 2076 } 2077 showBasicLanguageData2(String languageSubtag, Set<String> scripts, Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type)2078 private static void showBasicLanguageData2(String languageSubtag, Set<String> scripts, 2079 Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type) { 2080 scripts.remove("Zzzz"); 2081 territories.remove("ZZ"); 2082 if (territories.size() == 0 && suppressEmptyScripts != null) { 2083 scripts.removeAll(suppressEmptyScripts); 2084 } 2085 if (scripts.size() == 0 && territories.size() == 0) return; 2086 Log.println("\t\t<language type=\"" + languageSubtag + "\"" + 2087 (scripts.size() == 0 ? "" : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"") + 2088 (territories.size() == 0 ? "" : " territories=\"" + CldrUtility.join(territories, " ") + "\"") + 2089 (type == BasicLanguageData.Type.primary ? "" : " alt=\"" + type + "\"") + 2090 "/>"); 2091 } 2092 2093 /* 2094 * System.out.println( 2095 * "\t\t<language type=\"" + languageSubtag + "\"" + 2096 * " scripts=\"" + Utility.join(scripts," ") + "\"" + 2097 * (territories.size() == 0 ? "" : " territories=\"" + Utility.join(territories," ") + "\"") + 2098 * "/>" 2099 * ); 2100 */ 2101 stripBrackets(String alpha3)2102 private static String stripBrackets(String alpha3) { 2103 if (alpha3.startsWith("[") && alpha3.endsWith("]")) { 2104 alpha3 = alpha3.substring(1, alpha3.length() - 1); 2105 } 2106 return alpha3; 2107 } 2108 2109 static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH); 2110 static NumberFormat nf_no_comma = NumberFormat.getInstance(ULocale.ENGLISH); 2111 static { 2112 nf_no_comma.setGroupingUsed(false); 2113 } 2114 static NumberFormat pf = NumberFormat.getPercentInstance(ULocale.ENGLISH); 2115 formatNumber(double original, int roundDigits, boolean xml)2116 public static String formatNumber(double original, int roundDigits, boolean xml) { 2117 double d = original; 2118 if (roundDigits != 0) { 2119 d = CldrUtility.roundToDecimals(original, roundDigits); 2120 } 2121 if (Double.isNaN(d)) { 2122 d = CldrUtility.roundToDecimals(original, roundDigits); 2123 throw new IllegalArgumentException("Double is NaN"); 2124 } 2125 if (xml) { 2126 return nf_no_comma.format(d); 2127 } 2128 return nf.format(d); 2129 } 2130 formatPercent(double d, int roundDigits, boolean xml)2131 public static String formatPercent(double d, int roundDigits, boolean xml) { 2132 if (roundDigits != 0) { 2133 d = CldrUtility.roundToDecimals(d, roundDigits); 2134 } 2135 if (xml) { 2136 nf_no_comma.setMaximumFractionDigits(roundDigits + 2); 2137 return nf_no_comma.format(d * 100.0); 2138 } 2139 pf.setMaximumFractionDigits(roundDigits + 2); 2140 return pf.format(d); 2141 } 2142 2143 static final LanguageTagCanonicalizer languageTagCanonicalizer = new LanguageTagCanonicalizer(); 2144 fixLanguageCode(String languageCodeRaw, List<String> row)2145 private static String fixLanguageCode(String languageCodeRaw, List<String> row) { 2146 String languageCode = languageTagCanonicalizer.transform(languageCodeRaw); 2147 if (DEBUG && !languageCode.equals(languageCodeRaw)) { 2148 System.out.println("## " + languageCodeRaw + " => " + languageCode); 2149 } 2150 int bar = languageCode.indexOf('_'); 2151 String script = ""; 2152 if (bar >= 0) { 2153 script = languageCode.substring(bar); 2154 languageCode = languageCode.substring(0, bar); 2155 } 2156 R2<List<String>, String> replacement = supplementalData.getLocaleAliasInfo().get("language").get(languageCode); 2157 if (replacement != null) { 2158 String replacementCode = replacement.get0().get(0); 2159 BadItem.ERROR.show("deprecated language code", languageCode + " => " + replacementCode, row); 2160 languageCode = replacementCode; 2161 } 2162 if (!sc.getAvailableCodes("language").contains(languageCode)) { 2163 BadItem.ERROR.show("bad language code", languageCode, row); 2164 } 2165 return languageCode + script; 2166 } 2167 2168 enum BadItem { 2169 ERROR, WARNING, DETAIL; 2170 show(String problem, String details, String... items)2171 void show(String problem, String details, String... items) { 2172 System.out.println(toString(problem, details, items)); 2173 } 2174 show(String problem, String details, List<String> row)2175 void show(String problem, String details, List<String> row) { 2176 System.out.println(toString(problem, details, row)); 2177 } 2178 toString(String problem, String details, String... items)2179 private String toString(String problem, String details, String... items) { 2180 return toString(problem, details, Arrays.asList(items)); 2181 } 2182 toString(String problem, String details, List<String> row)2183 private String toString(String problem, String details, List<String> row) { 2184 return "* " + this 2185 + " *\t" + problem + ":" 2186 + "\t" + details 2187 + (row != null && row.size() > 0 ? "\t" + Joiner.on("\t").join(row) : ""); 2188 } 2189 } 2190 fixCountryCode(String countryCode, List<String> row)2191 private static String fixCountryCode(String countryCode, List<String> row) { 2192 R2<List<String>, String> replacement = supplementalData.getLocaleAliasInfo().get("territory").get(countryCode); 2193 if (replacement != null) { 2194 String replacementCode = replacement.get0().get(0); 2195 BadItem.ERROR.show("deprecated territory code", countryCode + " => " + replacementCode, row); 2196 countryCode = replacementCode; 2197 } 2198 if (!sc.getAvailableCodes("territory").contains(countryCode)) { 2199 BadItem.ERROR.show("bad territory code", countryCode, row); 2200 } 2201 return countryCode; 2202 } 2203 getULocaleLocaleName(String languageCode)2204 private static String getULocaleLocaleName(String languageCode) { 2205 return english.getName(languageCode, true); 2206 //return new ULocale(languageCode).getDisplayName(); 2207 } 2208 getULocaleScriptName(String scriptCode)2209 private static String getULocaleScriptName(String scriptCode) { 2210 return english.getName(CLDRFile.SCRIPT_NAME, scriptCode); 2211 // return ULocale.getDisplayScript("und_" + scriptCode, ULocale.ENGLISH); 2212 } 2213 getULocaleCountryName(String countryCode)2214 private static String getULocaleCountryName(String countryCode) { 2215 return english.getName(CLDRFile.TERRITORY_NAME, countryCode); 2216 //return ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH); 2217 } 2218 } 2219