1 package org.unicode.cldr.tool; 2 3 import java.io.BufferedReader; 4 import java.io.IOException; 5 import java.io.PrintWriter; 6 import java.text.ParseException; 7 import java.util.ArrayList; 8 import java.util.Arrays; 9 import java.util.Collection; 10 import java.util.Collections; 11 import java.util.Comparator; 12 import java.util.EnumMap; 13 import java.util.HashMap; 14 import java.util.HashSet; 15 import java.util.Iterator; 16 import java.util.LinkedHashSet; 17 import java.util.List; 18 import java.util.Map; 19 import java.util.Set; 20 import java.util.TreeMap; 21 import java.util.TreeSet; 22 import java.util.regex.Matcher; 23 24 import org.unicode.cldr.draft.FileUtilities; 25 import org.unicode.cldr.draft.ScriptMetadata; 26 import org.unicode.cldr.draft.ScriptMetadata.IdUsage; 27 import org.unicode.cldr.draft.ScriptMetadata.Info; 28 import org.unicode.cldr.util.Builder; 29 import org.unicode.cldr.util.CLDRFile; 30 import org.unicode.cldr.util.CLDRPaths; 31 import org.unicode.cldr.util.CldrUtility; 32 import org.unicode.cldr.util.Factory; 33 import org.unicode.cldr.util.Iso639Data; 34 import org.unicode.cldr.util.Iso639Data.Scope; 35 import org.unicode.cldr.util.Iso639Data.Source; 36 import org.unicode.cldr.util.Iso639Data.Type; 37 import org.unicode.cldr.util.LanguageTagCanonicalizer; 38 import org.unicode.cldr.util.LanguageTagParser; 39 import org.unicode.cldr.util.LocaleIDParser; 40 import org.unicode.cldr.util.LocaleIDParser.Level; 41 import org.unicode.cldr.util.Log; 42 import org.unicode.cldr.util.Pair; 43 import org.unicode.cldr.util.PatternCache; 44 import org.unicode.cldr.util.SpreadSheet; 45 import org.unicode.cldr.util.StandardCodes; 46 import org.unicode.cldr.util.StandardCodes.LstrType; 47 import org.unicode.cldr.util.SupplementalDataInfo; 48 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; 49 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; 50 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; 51 import org.unicode.cldr.util.TransliteratorUtilities; 52 import org.unicode.cldr.util.Validity; 53 import org.unicode.cldr.util.Validity.Status; 54 import org.unicode.cldr.util.XPathParts; 55 import org.unicode.cldr.util.XPathParts.Comments; 56 57 import com.google.common.collect.ImmutableSet; 58 import com.google.common.math.DoubleMath; 59 import com.ibm.icu.dev.util.CollectionUtilities; 60 import com.ibm.icu.impl.Relation; 61 import com.ibm.icu.impl.Row; 62 import com.ibm.icu.impl.Row.R2; 63 import com.ibm.icu.text.Collator; 64 import com.ibm.icu.text.NumberFormat; 65 import com.ibm.icu.text.RuleBasedCollator; 66 import com.ibm.icu.text.UTF16; 67 import com.ibm.icu.util.ULocale; 68 69 /** 70 * @author markdavis 71 * 72 */ 73 public class ConvertLanguageData { 74 75 private static final boolean DEBUG = false; 76 // change this if you need to override what is generated for the default contents. 77 private static final List<String> defaultOverrides = Arrays.asList("es_ES".split("\\s+")); // und_ZZ 78 79 public static final boolean SHOW_DIFF = false; 80 81 private static final boolean ALLOW_SMALL_NUMBERS = true; 82 83 static final Comparator<String> GENERAL_COLLATOR = new GeneralCollator(); 84 static final Comparator<String> INVERSE_GENERAL = new InverseComparator<String>(GENERAL_COLLATOR); 85 86 private static StandardCodes sc = StandardCodes.make(); 87 88 static final double populationFactor = 1; 89 static final double gdpFactor = 1; 90 static final int BAD_COUNTRY_NAME = 0, COUNTRY_CODE = 1, COUNTRY_POPULATION = 2, COUNTRY_LITERACY = 3, 91 COUNTRY_GDP = 4, OFFICIAL_STATUS = 5, BAD_LANGUAGE_NAME = 6, LANGUAGE_CODE = 7, LANGUAGE_POPULATION = 8, 92 LANGUAGE_LITERACY = 9, COMMENT = 10, NOTES = 11; 93 static final Map<String, CodeAndPopulation> languageToMaxCountry = new TreeMap<String, CodeAndPopulation>(); 94 static final Map<String, CodeAndPopulation> languageToMaxScript = new TreeMap<String, CodeAndPopulation>(); 95 96 private static final double NON_OFFICIAL_WEIGHT = 0.40; 97 98 private static final boolean SHOW_OLD_DEFAULT_CONTENTS = false; 99 100 private static final ImmutableSet<String> scriptAssumedLocales = ImmutableSet.of( 101 "bm_ML", "ha_GH", "ha_NE", "ha_NG", "kk_KZ", "ks_IN", "ky_KG", "mn_MN", "ms_BN", "ms_MY", "ms_SG", "tk_TM", "tzm_MA", "ug_CN"); 102 103 static Set<String> skipLocales = new HashSet<String>( 104 Arrays 105 .asList( 106 "sh sh_BA sh_CS sh_YU characters supplementalData supplementalData-old supplementalData-old2 supplementalData-old3 supplementalMetadata root" 107 .split("\\s"))); 108 109 static Map<String, String> defaultContent = new TreeMap<String, String>(); 110 111 static Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 112 static CLDRFile english = cldrFactory.make("en", true); 113 114 static SupplementalDataInfo supplementalData = SupplementalDataInfo 115 .getInstance(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY); 116 main(String[] args)117 public static void main(String[] args) throws IOException, ParseException { 118 BufferedReader oldFile = null; 119 try { 120 // load elements we care about 121 Log.setLogNoBOM(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalData.xml"); 122 // Log.println("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"); 123 // Log.println("<!DOCTYPE supplementalData SYSTEM \"http://www.unicode.org/cldr/data/dtd/ldmlSupplemental.dtd\">"); 124 // Log.println("<supplementalData version=\"1.5\">"); 125 126 oldFile = FileUtilities.openUTF8Reader(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY, "supplementalData.xml"); 127 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<languageData>\\s*"), Log.getLog(), false); 128 129 Set<String> available = cldrFactory.getAvailable(); 130 131 Set<String> cldrParents = getCldrParents(available); 132 133 List<String> failures = new ArrayList<String>(); 134 Map<String, RowData> localeToRowData = new TreeMap<String, RowData>(); 135 136 Set<RowData> sortedInput = getExcelData(failures, localeToRowData); 137 138 // get the locales (including parents) 139 Set<String> localesWithData = new TreeSet<String>(localeToRowData.keySet()); 140 for (String locale : localeToRowData.keySet()) { 141 while (true) { 142 String parent = LocaleIDParser.getParent(locale); 143 if (parent == null) break; 144 localesWithData.add(parent); 145 locale = parent; 146 } 147 } 148 149 final LanguageTagParser languageTagParser = new LanguageTagParser(); 150 151 for (String localeRaw : available) { 152 String locale = languageTagCanonicalizer.transform(localeRaw); 153 if (!localesWithData.contains(locale)) { 154 CLDRFile locFile = cldrFactory.make(localeRaw, false); 155 if (locFile.isAliasedAtTopLevel()) { 156 continue; 157 } 158 if (scriptAssumedLocales.contains(locale)) { 159 continue; 160 } 161 languageTagParser.set(locale); 162 if (languageTagParser.getVariants().size() != 0) { 163 continue; 164 } 165 String withoutScript = languageTagParser.setScript("").toString(); 166 if (!localesWithData.contains(withoutScript)) { 167 String region = new LanguageTagParser().set(locale).getRegion(); 168 if (StandardCodes.isCountry(region)) { 169 BadItem.ERROR.show("missing language/population data for CLDR locale", locale + " = " + getLanguageCodeAndName(locale)); 170 } 171 } else { 172 // These exceptions are OK, because these locales by default use the non-default script 173 Set<String> OKExceptions = ImmutableSet.of("sr_Cyrl_ME", "zh_Hans_HK", "zh_Hans_MO"); 174 if (OKExceptions.contains(locale)) { 175 continue; 176 } 177 BadItem.ERROR.show("missing language/population data for CLDR locale", locale + " = " + getLanguageCodeAndName(locale) 178 + " but have data for " + getLanguageCodeAndName(withoutScript)); 179 } 180 } 181 } 182 183 // TODO sort by country code, then functionalPopulation, then language code 184 // and keep the top country for each language code (even if < 1%) 185 186 addLanguageScriptData(); 187 188 // showAllBasicLanguageData(allLanguageData, "old"); 189 getLanguage2Scripts(sortedInput); 190 191 writeNewBasicData2(sortedInput); 192 // writeNewBasicData(sortedInput); 193 194 writeTerritoryLanguageData(failures, sortedInput); 195 196 checkBasicData(localeToRowData); 197 198 Set<String> defaultLocaleContent = new TreeSet<String>(); 199 200 showDefaults(cldrParents, nf, defaultContent, localeToRowData, defaultLocaleContent); 201 202 // showContent(available); 203 204 // certain items are overridden 205 206 List<String> toRemove = new ArrayList<String>(); 207 for (String override : defaultOverrides) { 208 String replacement = getReplacement(override, defaultLocaleContent); 209 if (replacement != null) { 210 toRemove.add(replacement); 211 } 212 } 213 defaultLocaleContent.removeAll(toRemove); 214 defaultLocaleContent.addAll(defaultOverrides); 215 216 showFailures(failures); 217 218 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</territoryInfo>\\s*"), null, false); 219 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<references>\\s*"), Log.getLog(), false); 220 // generateIso639_2Data(); 221 references.printReferences(); 222 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*</references>\\s*"), null, false); 223 CldrUtility.copyUpTo(oldFile, null, Log.getLog(), false); 224 // Log.println("</supplementalData>"); 225 Log.close(); 226 oldFile.close(); 227 228 Log.setLog(CLDRPaths.GEN_DIRECTORY + "/supplemental", "language_script_raw.txt"); 229 getLanguageScriptSpreadsheet(Log.getLog()); 230 Log.close(); 231 } catch (Exception e) { 232 e.printStackTrace(); 233 } finally { 234 if (oldFile != null) { 235 oldFile.close(); 236 } 237 System.out.println("DONE"); 238 } 239 } 240 getLanguageCodeAndName(String code)241 public static String getLanguageCodeAndName(String code) { 242 if (code == null) return null; 243 return english.getName(code) + " [" + code + "]"; 244 } 245 getReplacement(String oldDefault, Set<String> defaultLocaleContent)246 private static String getReplacement(String oldDefault, Set<String> defaultLocaleContent) { 247 String parent = LocaleIDParser.getParent(oldDefault); 248 for (String replacement : defaultLocaleContent) { 249 if (replacement.startsWith(parent)) { 250 if (parent.equals(LocaleIDParser.getParent(replacement))) { 251 return replacement; 252 } 253 } 254 } 255 return null; 256 } 257 getLanguageScriptSpreadsheet(PrintWriter out)258 private static void getLanguageScriptSpreadsheet(PrintWriter out) { 259 out.println("#Lcode LanguageName Status Scode ScriptName References"); 260 Pair<String, String> languageScript = new Pair<String, String>("", ""); 261 for (String language : language_status_scripts.keySet()) { 262 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 263 for (BasicLanguageData.Type status : status_scripts.keySet()) { 264 for (String script : status_scripts.getAll(status)) { 265 String reference = language_script_references.get(languageScript.setFirst(language).setSecond( 266 script)); 267 out.println(language + "\t" + getLanguageName(language) + "\t" + status + "\t" + script + "\t" 268 + getDisplayScript(script) 269 + (reference == null ? "" : "\t" + reference)); 270 } 271 } 272 } 273 } 274 275 /** 276 * Write data in format: 277 * <languageData> 278 * <language type="aa" scripts="Latn" territories="DJ ER ET"/> 279 * 280 * @param sortedInput 281 */ writeNewBasicData2(Set<RowData> sortedInput)282 private static void writeNewBasicData2(Set<RowData> sortedInput) { 283 double cutoff = 0.2; // 20% 284 285 // Relation<String, BasicLanguageData> newLanguageData = new Relation(new TreeMap(), TreeSet.class); 286 LanguageTagParser ltp = new LanguageTagParser(); 287 Map<String, Relation<BasicLanguageData.Type, String>> language_status_territories = new TreeMap<String, Relation<BasicLanguageData.Type, String>>(); 288 //Map<String, Pair<String, String>> languageToBestCountry; 289 for (RowData rowData : sortedInput) { 290 if (rowData.countryCode.equals("ZZ")) continue; 291 ltp.set(rowData.languageCode); 292 String languageCode = ltp.getLanguage(); 293 Relation<BasicLanguageData.Type, String> status_territories = language_status_territories.get(languageCode); 294 if (status_territories == null) { 295 language_status_territories.put(languageCode, status_territories = Relation.of( 296 new TreeMap<BasicLanguageData.Type, Set<String>>(), 297 TreeSet.class)); 298 } 299 if (rowData.officialStatus.isMajor()) { 300 status_territories.put(BasicLanguageData.Type.primary, rowData.countryCode); 301 } else if (rowData.officialStatus.isOfficial() 302 || rowData.getLanguagePopulation() >= cutoff * rowData.countryPopulation 303 || rowData.getLanguagePopulation() >= 1000000) { 304 status_territories.put(BasicLanguageData.Type.secondary, rowData.countryCode); 305 } 306 } 307 308 Set<String> allLanguages = new TreeSet<String>(language_status_territories.keySet()); 309 allLanguages.addAll(language_status_scripts.keySet()); 310 // now add all the remaining language-script info 311 // <language type="sv" scripts="Latn" territories="AX FI SE"/> 312 Set<String> warnings = new LinkedHashSet<String>(); 313 Log.println("\t<languageData>"); 314 for (String languageSubtag : allLanguages) { 315 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(languageSubtag); 316 Relation<BasicLanguageData.Type, String> status_territories = language_status_territories 317 .get(languageSubtag); 318 319 // check against old: 320 Map<BasicLanguageData.Type, BasicLanguageData> oldData = supplementalData 321 .getBasicLanguageDataMap(languageSubtag); 322 if (oldData == null) { 323 oldData = Collections.emptyMap(); 324 } 325 326 EnumMap<BasicLanguageData.Type, BasicLanguageData> newData = new EnumMap<BasicLanguageData.Type, BasicLanguageData>( 327 BasicLanguageData.Type.class); 328 for (BasicLanguageData.Type status : BasicLanguageData.Type.values()) { 329 Set<String> scripts = status_scripts == null ? null : status_scripts.getAll(status); 330 Set<String> territories = status_territories == null ? null : status_territories.getAll(status); 331 if (scripts == null && territories == null) continue; 332 BasicLanguageData bld = new BasicLanguageData(); 333 bld.setTerritories(territories); 334 bld.setScripts(scripts); 335 bld.setType(status); 336 bld.freeze(); 337 newData.put(status, bld); 338 } 339 340 // compare 341 if (!CldrUtility.equals(oldData.entrySet(), newData.entrySet())) { 342 for (String problem : compare(oldData, newData)) { 343 warnings.add(BadItem.DETAIL.toString("changing <languageData>", languageSubtag 344 + "\t" + english.getName(languageSubtag), problem)); 345 } 346 } 347 348 for (BasicLanguageData bld : newData.values()) { 349 Set<String> scripts = bld.getScripts(); 350 Set<String> territories = bld.getTerritories(); 351 BasicLanguageData.Type status = bld.getType(); 352 Log.println("\t\t<language type=\"" + languageSubtag + "\"" 353 + (scripts.isEmpty() ? "" : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"") 354 + (territories.isEmpty() ? "" : " territories=\"" + CldrUtility.join(territories, " ") + "\"") 355 + (status == BasicLanguageData.Type.primary ? "" : " alt=\"secondary\"") 356 + "/>"); 357 } 358 } 359 Log.println("\t</languageData>"); 360 for (String s : warnings) { 361 if (s.contains("!")) { 362 System.out.println(s); 363 } 364 } 365 for (String s : warnings) { 366 if (!s.contains("!")) { 367 System.out.println(s); 368 } 369 } 370 } 371 compare(Map<BasicLanguageData.Type, BasicLanguageData> oldData, Map<BasicLanguageData.Type, BasicLanguageData> newData)372 private static List<String> compare(Map<BasicLanguageData.Type, BasicLanguageData> oldData, 373 Map<BasicLanguageData.Type, BasicLanguageData> newData) { 374 Map<String, BasicLanguageData.Type> oldDataToType = getDataToType(oldData.values(), true); 375 Map<String, BasicLanguageData.Type> newDataToType = getDataToType(newData.values(), true); 376 List<String> result = new ArrayList<>(); 377 StringBuilder temp = new StringBuilder(); 378 for (String s : Builder.with(new LinkedHashSet<String>()).addAll(oldDataToType.keySet()) 379 .addAll(newDataToType.keySet()).get()) { 380 BasicLanguageData.Type oldValue = oldDataToType.get(s); 381 BasicLanguageData.Type newValue = newDataToType.get(s); 382 if (!CldrUtility.equals(oldValue, newValue)) { 383 temp.setLength(0); 384 temp.append("[").append(s).append(":") 385 .append(english.getName(s.length() == 4 ? "script" : "region", s)).append("] "); 386 if (oldValue == null) { 387 temp.append(" added as ").append(newValue); 388 } else if (newValue == null) { 389 temp.append(" REMOVED!"); 390 } else if (oldValue == BasicLanguageData.Type.primary) { 391 temp.append(" DOWNGRADED TO! ").append(newValue); 392 } else { 393 temp.append(" upgraded to ").append(newValue); 394 } 395 result.add(temp.toString()); 396 } 397 } 398 result.add(newData.toString()); 399 return result; 400 } 401 getDataToType( Collection<BasicLanguageData> collection, boolean script)402 private static Map<String, BasicLanguageData.Type> getDataToType( 403 Collection<BasicLanguageData> collection, boolean script) { 404 Map<String, BasicLanguageData.Type> result = new TreeMap<String, BasicLanguageData.Type>(); 405 for (BasicLanguageData i : collection) { 406 for (String s : i.getScripts()) { 407 result.put(s, i.getType()); 408 } 409 for (String s : i.getTerritories()) { 410 result.put(s, i.getType()); 411 } 412 } 413 return result; 414 } 415 checkBasicData(Map<String, RowData> localeToRowData)416 private static void checkBasicData(Map<String, RowData> localeToRowData) { 417 // find languages with multiple scripts 418 Relation<String, String> languageToScripts = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 419 for (String languageSubtag : language2BasicLanguageData.keySet()) { 420 for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) { 421 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), item.getScripts()); 422 } 423 } 424 // get primary combinations 425 Set<String> primaryCombos = new TreeSet<String>(); 426 Set<String> basicCombos = new TreeSet<String>(); 427 for (String languageSubtag : language2BasicLanguageData.keySet()) { 428 for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) { 429 Set<String> scripts = new TreeSet<String>(); 430 scripts.addAll(item.getScripts()); 431 languageToScripts.putAll(StandardCodes.fixLanguageTag(languageSubtag), scripts); 432 if (scripts.size() == 0) { 433 scripts.add("Zzzz"); 434 } 435 Set<String> territories = new TreeSet<String>(); 436 territories.addAll(item.getTerritories()); 437 if (territories.size() == 0) { 438 territories.add("ZZ"); 439 continue; 440 } 441 442 for (String script : scripts) { 443 for (String territory : territories) { 444 String locale = StandardCodes.fixLanguageTag(languageSubtag) 445 // + (script.equals("Zzzz") ? "" : languageToScripts.getAll(languageSubtag).size() <= 1 ? "" 446 // : "_" + script) 447 + (territories.equals("ZZ") ? "" : "_" + territory); 448 if (item.getType() != BasicLanguageData.Type.secondary) { 449 primaryCombos.add(locale); 450 } 451 basicCombos.add(locale); 452 } 453 } 454 } 455 } 456 Set<String> populationOver20 = new TreeSet<String>(); 457 Set<String> population = new TreeSet<String>(); 458 LanguageTagParser ltp = new LanguageTagParser(); 459 for (String rawLocale : localeToRowData.keySet()) { 460 ltp.set(rawLocale); 461 String locale = ltp.getLanguage() + (ltp.getRegion().length() == 0 ? "" : "_" + ltp.getRegion()); 462 population.add(locale); 463 RowData rowData = localeToRowData.get(rawLocale); 464 if (rowData.getLanguagePopulation() / rowData.countryPopulation >= 0.2 465 //|| rowData.getLanguagePopulation() > 900000 466 ) { 467 populationOver20.add(locale); 468 } else { 469 PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData( 470 ltp.getLanguageScript(), ltp.getRegion()); 471 if (popData != null && popData.getOfficialStatus().isOfficial()) { 472 populationOver20.add(locale); 473 } 474 } 475 } 476 Set<String> inBasicButNotPopulation = new TreeSet<String>(primaryCombos); 477 478 inBasicButNotPopulation.removeAll(population); 479 for (String locale : inBasicButNotPopulation) { 480 ltp.set(locale); 481 String region = ltp.getRegion(); 482 String language = ltp.getLanguage(); 483 if (!sc.isModernLanguage(language)) continue; 484 PopulationData popData = supplementalData.getPopulationDataForTerritory(region); 485 // Afghanistan AF "29,928,987" 28.10% "21,500,000,000" Hazaragi haz "1,770,000" 28.10% 486 BadItem.WARNING.show("In Basic Data but not Population > 20%", 487 getDisplayCountry(region) 488 + "\t" + region 489 + "\t\"" + formatNumber(popData.getPopulation(), 0, false) + "\"" 490 + "\t\"" + formatPercent(popData.getLiteratePopulation() / popData.getPopulation(), 0, false) 491 + "\"" 492 + "\t\"" + formatPercent(popData.getGdp(), 0, false) + "\"" 493 + "\t" + "" 494 + "\t" + getLanguageName(language) 495 + "\t" + language 496 + "\t" + -1 497 + "\t\"" + formatPercent(popData.getLiteratePopulation() / popData.getPopulation(), 0, false) 498 + "\""); 499 } 500 501 Set<String> inPopulationButNotBasic = new TreeSet<String>(populationOver20); 502 inPopulationButNotBasic.removeAll(basicCombos); 503 for (Iterator<String> it = inPopulationButNotBasic.iterator(); it.hasNext();) { 504 String locale = it.next(); 505 if (locale.endsWith("_ZZ")) { 506 it.remove(); 507 } 508 } 509 for (String locale : inPopulationButNotBasic) { 510 BadItem.WARNING.show("In Population>20% but not Basic Data", locale + " " + getLanguageName(locale), localeToRowData.get(locale).toString()); 511 } 512 } 513 514 static class LanguageInfo { 515 static LanguageInfo INSTANCE = new LanguageInfo(); 516 517 Map<String, Set<String>> languageToScripts = new TreeMap<String, Set<String>>(); 518 Map<String, Set<String>> languageToRegions = new TreeMap<String, Set<String>>(); 519 Map<String, Comments> languageToComments = new TreeMap<String, Comments>(); 520 521 Map<String, Set<String>> languageToScriptsAlt = new TreeMap<String, Set<String>>(); 522 Map<String, Set<String>> languageToRegionsAlt = new TreeMap<String, Set<String>>(); 523 Map<String, Comments> languageToCommentsAlt = new TreeMap<String, Comments>(); 524 LanguageInfo()525 private LanguageInfo() { 526 cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 527 //Set<String> available = cldrFactory.getAvailable(); 528 CLDRFile supplemental = cldrFactory.make("supplementalData", true); 529 XPathParts parts = new XPathParts(); 530 for (Iterator<String> it = supplemental.iterator("//supplementalData/languageData/language"); it.hasNext();) { 531 String xpath = it.next(); 532 Map<String, String> x = parts.set(xpath).getAttributes(-1); 533 boolean alt = x.containsKey("alt"); 534 String lang = x.get("type"); 535 List<String> scripts = getAttributeList(x, "scripts"); 536 if (scripts != null) { 537 if (alt) { 538 putAll(languageToScriptsAlt, lang, new LinkedHashSet<String>(scripts)); 539 } else { 540 putAll(languageToScripts, lang, new LinkedHashSet<String>(scripts)); 541 } 542 } 543 List<String> regions = getAttributeList(x, "territories"); 544 if (regions != null) { 545 if (alt) { 546 putAll(languageToRegionsAlt, lang, new LinkedHashSet<String>(regions)); 547 } else { 548 putAll(languageToRegions, lang, new LinkedHashSet<String>(regions)); 549 } 550 } 551 } 552 } 553 getAttributeList(Map<String, String> x, String attribute)554 private List<String> getAttributeList(Map<String, String> x, String attribute) { 555 List<String> scripts = null; 556 String scriptString = x.get(attribute); 557 if (scriptString != null) { 558 scripts = Arrays.asList(scriptString.split("\\s+")); 559 } 560 return scripts; 561 } 562 } 563 putUnique(Map<K, V> map, K key, V value)564 private static <K, V> void putUnique(Map<K, V> map, K key, V value) { 565 V oldValue = map.get(key); 566 if (oldValue != null && !oldValue.equals(value)) { 567 throw new IllegalArgumentException("Duplicate value for <" + key + ">: <" + oldValue + ">, <" + value + ">"); 568 } 569 map.put(key, value); 570 } 571 putAll(Map<K, Set<W>> map, K key, Set<W> values)572 private static <K, W> void putAll(Map<K, Set<W>> map, K key, Set<W> values) { 573 Set<W> oldValue = map.get(key); 574 if (oldValue == null) { 575 map.put(key, values); 576 } else { 577 oldValue.addAll(values); 578 } 579 } 580 581 // public enum OfficialStatus {unknown, de_facto_official, official, official_regional, official_minority}; 582 583 static class RowData implements Comparable<Object> { 584 private final String countryCode; 585 private final double countryGdp; 586 private final double countryLiteracy; 587 private final double countryPopulation; 588 private final String languageCode; 589 private final OfficialStatus officialStatus; 590 private final double languagePopulation; 591 private final double languageLiteracy; 592 private final String comment; 593 private final String notes; 594 private final String badLanguageName; 595 private final boolean relativeLanguagePopulation; 596 // String badLanguageCode = ""; 597 private final static Set<String> doneCountries = new HashSet<String>(); 598 599 private final static Set<String> countryCodes = sc.getGoodAvailableCodes("territory"); 600 RowData(String country, String language)601 public RowData(String country, String language) { 602 this.countryCode = country; 603 this.languageCode = language; 604 badLanguageName = country = language = notes = comment = ""; 605 officialStatus = OfficialStatus.unknown; 606 countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000); 607 countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d; 608 countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue(); 609 languagePopulation = languageLiteracy = Double.NaN; 610 relativeLanguagePopulation = false; 611 } 612 RowData(List<String> row)613 RowData(List<String> row) throws ParseException { 614 countryCode = fixCountryCode(row.get(COUNTRY_CODE), row); 615 616 if (!countryCodes.contains(countryCode)) { 617 System.err.println("WRONG COUNTRY CODE: " + row); 618 } 619 620 double countryPopulation1 = parseDecimal(row.get(COUNTRY_POPULATION)); 621 double countryLiteracy1 = parsePercent(row.get(COUNTRY_LITERACY), countryPopulation1); 622 623 countryGdp = roundToPartsPer(AddPopulationData.getGdp(countryCode).doubleValue(), 1000); 624 countryLiteracy = AddPopulationData.getLiteracy(countryCode).doubleValue() / 100.0d; 625 countryPopulation = AddPopulationData.getPopulation(countryCode).doubleValue(); 626 627 String officialStatusString = row.get(OFFICIAL_STATUS).trim().replace(' ', '_'); 628 if (officialStatusString.equals("national")) { 629 officialStatusString = "official"; 630 } else if (officialStatusString.equals("regional_official")) { 631 officialStatusString = "official_regional"; 632 } else if (officialStatusString.length() == 0 || officialStatusString.equals("uninhabited")) { 633 officialStatusString = "unknown"; 634 } 635 try { 636 officialStatus = OfficialStatus.valueOf(officialStatusString); 637 } catch (RuntimeException e) { 638 throw new IllegalArgumentException("Can't interpret offical-status: " + officialStatusString); 639 } 640 641 String languageCode1 = row.get(LANGUAGE_CODE); 642 if (languageCode1.startsWith("*") || languageCode1.startsWith("\u00A7")) { 643 languageCode1 = languageCode1.substring(1); 644 } 645 languageCode = fixLanguageCode(languageCode1, row); 646 647 if (doneCountries.contains(countryCode) == false) { 648 // showDiff(countryGdp1, countryGdp); 649 // showDiff(countryLiteracy1, countryLiteracy); 650 if (SHOW_DIFF) showDiff(countryPopulation1, countryPopulation, 0.1, false); 651 doneCountries.add(countryCode); 652 } 653 654 double languagePopulation1 = parsePercent(row.get(LANGUAGE_POPULATION), countryPopulation1) 655 * countryPopulation1; 656 if ((officialStatus.isMajor()) 657 && languagePopulation1 * 100 < countryPopulation && languagePopulation1 < 1000000) { 658 BadItem.WARNING.show("official language has population < 1% of country & < 1,000,000", languageCode + ", " + Math.round(languagePopulation1), 659 row); 660 } 661 if (languagePopulation1 < 0.999) { 662 BadItem.WARNING.show("suspect language population, < 1", languageCode + ", " + Math.round(languagePopulation1), row); 663 } 664 if (languagePopulation1 > 10000) { 665 relativeLanguagePopulation = true; 666 languagePopulation1 = languagePopulation1 * countryPopulation / countryPopulation1; // correct the 667 // values 668 } else { 669 relativeLanguagePopulation = false; 670 } 671 if (isApproximatelyGreater(languagePopulation1, countryPopulation, 0.0001)) { 672 BadItem.ERROR.show("language population > country population", Math.round(languagePopulation1) + " > " + countryPopulation, row); 673 } 674 languagePopulation = languagePopulation1 < countryPopulation ? languagePopulation1 : countryPopulation; 675 676 if (SHOW_DIFF) 677 showDiff(languagePopulation1 / countryPopulation1, languagePopulation / countryPopulation, 0.01, true); 678 679 String stringLanguageLiteracy = row.size() <= LANGUAGE_LITERACY ? "" : row.get(LANGUAGE_LITERACY); 680 double languageLiteracy1 = stringLanguageLiteracy.length() == 0 ? countryLiteracy 681 : parsePercent(stringLanguageLiteracy, languagePopulation); 682 if (isApproximatelyEqual(languageLiteracy1, countryLiteracy1, 0.001)) { 683 languageLiteracy1 = countryLiteracy; // correct the values 684 } 685 languageLiteracy = languageLiteracy1; 686 687 if (row.size() > COMMENT) { 688 comment = row.get(COMMENT); 689 } else { 690 comment = ""; 691 } 692 if (row.size() > NOTES) { 693 notes = row.get(NOTES); 694 } else { 695 notes = ""; 696 } 697 badLanguageName = row.get(BAD_LANGUAGE_NAME); 698 } 699 showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang)700 private void showDiff(double a, double new_a, double maxRelativeDiff, boolean showLang) { 701 final double diff = new_a / a - 1; 702 if (Math.abs(diff) > maxRelativeDiff) { 703 System.out.println(formatPercent(diff, 0, false) 704 + "\t" + countryCode + "\t" + getDisplayCountry(countryCode) 705 + (showLang ? "\t" + languageCode + "\t" + getLanguageName(languageCode) : "") 706 + "\t" + formatNumber(a, 0, false) + "\t=>\t" + formatNumber(new_a, 0, false)); 707 } 708 } 709 roundToPartsPer(double a, double whole)710 private double roundToPartsPer(double a, double whole) { 711 // break this out just to make it easier to follow. 712 double log10 = Math.log10(a / whole); 713 long digitsFound = (long) (log10); 714 long factor = (long) (Math.pow(10, digitsFound)); 715 double rounded = Math.round(a / factor); 716 double result = rounded * factor; 717 // if (Math.abs(result - a) >= 1) { 718 // System.out.println("Rounding " + a + " => " + result); 719 // } 720 return result; 721 } 722 isApproximatelyEqual(double a, double b, double epsilon)723 private static boolean isApproximatelyEqual(double a, double b, double epsilon) { 724 return a == b || Math.abs(a - b) < epsilon; 725 } 726 isApproximatelyGreater(double a, double b, double epsilon)727 private static boolean isApproximatelyGreater(double a, double b, double epsilon) { 728 return a > b + epsilon; 729 } 730 parseDecimal(String numericRepresentation)731 double parseDecimal(String numericRepresentation) throws ParseException { 732 try { 733 // if (numericRepresentation == null || numericRepresentation.length() == 0) return Double.NaN; 734 Number result = nf.parse(numericRepresentation); 735 // if (result == null) return Double.NaN; 736 return result.doubleValue(); 737 } catch (ParseException e) { 738 throw e; 739 // (RuntimeException) new IllegalArgumentException("can't parse <" + numericRepresentation + 740 // ">").initCause(e); 741 } 742 } 743 parsePercent(String numericRepresentation, double baseValue)744 double parsePercent(String numericRepresentation, double baseValue) throws ParseException { 745 try { 746 double result; 747 if (numericRepresentation.contains("%")) { 748 Number result0 = pf.parse(numericRepresentation); 749 result = result0.doubleValue(); 750 } else { 751 Number result0 = nf.parse(numericRepresentation); 752 result = result0.doubleValue() / baseValue; 753 } 754 // if (numericRepresentation == null || numericRepresentation.length() == 0) return Double.NaN; 755 // if (result == null) return Double.NaN; 756 return result; 757 } catch (ParseException e) { 758 throw e; 759 // (RuntimeException) new IllegalArgumentException("can't parse <" + numericRepresentation + 760 // ">").initCause(e); 761 } 762 } 763 getLanguageLiteratePopulation()764 public double getLanguageLiteratePopulation() { 765 return languageLiteracy * languagePopulation; 766 } 767 768 /** 769 * Get the weighted population 770 * 771 * @param weightIfNotOfficial 772 * @return 773 */ getLanguageLiteratePopulation(double weightIfNotOfficial)774 public double getLanguageLiteratePopulation(double weightIfNotOfficial) { 775 double result = languageLiteracy * languagePopulation; 776 if (!officialStatus.isMajor()) { 777 result *= weightIfNotOfficial; 778 } 779 return result; 780 } 781 compareTo(Object o)782 public int compareTo(Object o) { 783 RowData that = (RowData) o; 784 int result; 785 if (0 != (result = GENERAL_COLLATOR.compare(countryCode, that.countryCode))) return result; 786 if (languagePopulation > that.languagePopulation) return -1; // descending 787 if (languagePopulation < that.languagePopulation) return 1; 788 if (0 != (result = GENERAL_COLLATOR.compare(languageCode, that.languageCode))) return result; 789 return 0; 790 } 791 toStringHeader()792 public static String toStringHeader() { 793 return "countryCode" + "\t" + "countryPopulation" + "\t" + "countryGdp" 794 + "\t" + "countryLiteracy" 795 + "\t" + "languagePopulation" + "\t" + "languageCode" 796 + "\t" + "writingPopulation"; 797 } 798 toString()799 public String toString() { 800 return countryCode + "\t" + countryPopulation + "\t" + countryGdp 801 + "\t" + countryLiteracy 802 + "\t" + languagePopulation + "\t" + languageCode 803 + "\t" + languageLiteracy; 804 } 805 toString(boolean b)806 public String toString(boolean b) { 807 return "region:\t" + getCountryCodeAndName(countryCode) 808 + "\tpop:\t" + countryPopulation 809 + "\tgdp:\t" + countryGdp 810 + "\tlit:\t" + countryLiteracy 811 + "\tlang:\t" + getLanguageCodeAndName(languageCode) 812 + "\tpop:\t" + languagePopulation 813 + "\tlit:\t" + languageLiteracy; 814 } 815 816 static boolean MARK_OUTPUT = false; 817 getRickLanguageCode()818 public String getRickLanguageCode() { 819 if (languageCode.contains("_")) return languageCode; 820 Source source = Iso639Data.getSource(languageCode); 821 if (source == null) { 822 return "§" + languageCode; 823 } 824 if (MARK_OUTPUT) { 825 if (source == Source.ISO_639_3) { 826 return "*" + languageCode; 827 } 828 } 829 return languageCode; 830 } 831 832 static Map<String, String> oldToFixed = new HashMap<>(); 833 getRickLanguageName()834 public String getRickLanguageName() { 835 String cldrResult = getExcelQuote(english.getName(languageCode, true)); 836 // String result = getRickLanguageName2(); 837 // if (!result.equalsIgnoreCase(cldrResult)) { 838 // if (null == oldToFixed.put(result, cldrResult)) { 839 // System.out.println("## " + result + "!=" + cldrResult); 840 // } 841 // } 842 return cldrResult; 843 } 844 getRickLanguageName2()845 public String getRickLanguageName2() { 846 String result = new ULocale(languageCode).getDisplayName(); 847 if (!result.equals(languageCode)) return getExcelQuote(result); 848 Set<String> names = Iso639Data.getNames(languageCode); 849 if (names != null && names.size() != 0) { 850 if (MARK_OUTPUT) { 851 return getExcelQuote("*" + names.iterator().next()); 852 } else { 853 return getExcelQuote(names.iterator().next()); 854 } 855 } 856 return getExcelQuote("§" + badLanguageName); 857 } 858 getCountryName()859 public String getCountryName() { 860 return getExcelQuote(getDisplayCountry(countryCode)); 861 } 862 getCountryGdpString()863 public String getCountryGdpString() { 864 return getExcelQuote(formatNumber(countryGdp, 0, false)); 865 } 866 getCountryLiteracyString()867 public String getCountryLiteracyString() { 868 return formatPercent(countryLiteracy, 2, false); 869 } 870 getCountryPopulationString()871 public String getCountryPopulationString() { 872 return getExcelQuote(formatNumber(countryPopulation, 0, false)); 873 } 874 getLanguageLiteracyString()875 public String getLanguageLiteracyString() { 876 return formatPercent(languageLiteracy, 2, false); 877 } 878 getLanguagePopulationString()879 public String getLanguagePopulationString() { 880 881 try { 882 final double percent = languagePopulation / countryPopulation; 883 return getExcelQuote(relativeLanguagePopulation 884 && percent > 0.03 885 && languagePopulation > 10000 886 ? formatPercent(percent, 2, false) 887 : formatNumber(languagePopulation, 3, false)); 888 } catch (IllegalArgumentException e) { 889 return "NaN"; 890 } 891 } 892 getLanguagePopulation()893 private double getLanguagePopulation() { 894 return languagePopulation; 895 } 896 897 } 898 getExcelQuote(String comment)899 public static String getExcelQuote(String comment) { 900 return comment == null || comment.length() == 0 ? "" 901 : comment.contains(",") ? '"' + comment + '"' 902 : comment.contains("\"") ? '"' + comment.replace("\"", "\"\"") + '"' 903 : comment; 904 } 905 getCountryCodeAndName(String code)906 public static String getCountryCodeAndName(String code) { 907 if (code == null) return null; 908 return english.getName(CLDRFile.TERRITORY_NAME, code) + " [" + code + "]"; 909 } 910 911 static class RickComparator implements Comparator<RowData> { compare(RowData me, RowData that)912 public int compare(RowData me, RowData that) { 913 int result; 914 if (0 != (result = GENERAL_COLLATOR.compare(me.getCountryName(), that.getCountryName()))) return result; 915 if (0 != (result = GENERAL_COLLATOR.compare(me.getRickLanguageName(), that.getRickLanguageName()))) 916 return result; 917 return me.compareTo(that); 918 } 919 } 920 writeTerritoryLanguageData(List<String> failures, Set<RowData> sortedInput)921 private static void writeTerritoryLanguageData(List<String> failures, Set<RowData> sortedInput) { 922 923 String lastCountryCode = ""; 924 boolean first = true; 925 LanguageTagParser ltp = new LanguageTagParser(); 926 927 Log.println(" <!-- See http://unicode.org/cldr/data/diff/supplemental/territory_language_information.html for more information on territoryInfo. -->"); 928 Log.println("\t<territoryInfo>"); 929 930 for (RowData row : sortedInput) { 931 String countryCode = row.countryCode; 932 933 double countryPopulationRaw = row.countryPopulation; 934 double countryPopulation = countryPopulationRaw; // (long) Utility.roundToDecimals(countryPopulationRaw, 2); 935 double languageLiteracy = row.languageLiteracy; 936 double countryLiteracy = row.countryLiteracy; 937 938 double countryGDPRaw = row.countryGdp; 939 long countryGDP = Math.round(countryGDPRaw / gdpFactor); 940 941 String languageCode = row.languageCode; 942 943 double languagePopulationRaw = row.getLanguagePopulation(); 944 double languagePopulation = languagePopulationRaw; // (long) Utility.roundToDecimals(languagePopulationRaw, 945 // 2); 946 947 double languagePopulationPercent = languagePopulation / countryPopulation; 948 // Utility.roundToDecimals(Math.min(100, Math.max(0, 949 // languagePopulation*100 / (double)countryPopulation)),3); 950 951 if (!countryCode.equals(lastCountryCode)) { 952 if (first) { 953 first = false; 954 } else { 955 Log.println("\t\t</territory>"); 956 } 957 Log.print("\t\t<territory type=\"" + countryCode + "\"" 958 + " gdp=\"" + formatNumber(countryGDP, 4, true) + "\"" 959 + " literacyPercent=\"" + formatPercent(countryLiteracy, 3, true) + "\"" 960 + " population=\"" + formatNumber(countryPopulation, 6, true) + "\">"); 961 lastCountryCode = countryCode; 962 Log.println("\t<!--" + getDisplayCountry(countryCode) + "-->"); 963 } 964 965 if (languageCode.length() != 0 966 && languagePopulationPercent > 0.0000 967 && (ALLOW_SMALL_NUMBERS || languagePopulationPercent >= 1 || languagePopulationRaw > 100000 968 || languageCode.equals("haw") || row.officialStatus.isOfficial())) { 969 // add best case 970 addBestRegion(languageCode, countryCode, languagePopulationRaw); 971 String baseScriptLanguage = ltp.set(languageCode).getLanguageScript(); 972 if (!baseScriptLanguage.equals(languageCode)) { 973 addBestRegion(baseScriptLanguage, countryCode, languagePopulationRaw); 974 } 975 String baseLanguage = ltp.set(baseScriptLanguage).getLanguage(); 976 if (!baseLanguage.equals(baseScriptLanguage)) { 977 addBestRegion(baseLanguage, countryCode, languagePopulationRaw); 978 addBestScript(baseLanguage, ltp.set(languageCode).getScript(), languagePopulationRaw); 979 } 980 981 if (languageLiteracy != countryLiteracy) { 982 int debug = 0; 983 } 984 Log.print("\t\t\t<languagePopulation type=\"" 985 + languageCode 986 + "\"" 987 + (DoubleMath.fuzzyCompare(languageLiteracy, countryLiteracy, 0.0001) == 0 ? "" 988 : (DoubleMath.fuzzyCompare(languageLiteracy, 0.05, 0.0001) == 0 ? " writingPercent=\"" : " literacyPercent=\"") 989 + formatPercent(languageLiteracy, 2, true) + "\"") 990 + " populationPercent=\"" + formatPercent(languagePopulationPercent, 2, true) + "\"" 991 + (row.officialStatus.isOfficial() ? " officialStatus=\"" + row.officialStatus + "\"" : "") 992 + references.addReference(row.notes) 993 + "/>"); 994 Log.println("\t<!--" + getLanguageName(languageCode) + "-->"); 995 } else if (!row.countryCode.equals("ZZ")) { 996 failures.add(BadItem.ERROR.toString("too few speakers: suspect line", languageCode, row.toString(true))); 997 } 998 // if (first) { 999 if (false) System.out.print( 1000 "countryCode: " + countryCode + "\t" 1001 + "countryPopulation: " + countryPopulation + "\t" 1002 + "countryGDP: " + countryGDP + "\t" 1003 + "languageCode: " + languageCode + "\t" 1004 + "languagePopulation: " + languagePopulation + CldrUtility.LINE_SEPARATOR); 1005 // } 1006 } 1007 1008 Log.println("\t\t</territory>"); 1009 Log.println("\t</territoryInfo>"); 1010 } 1011 getDisplayCountry(String countryCode)1012 private static String getDisplayCountry(String countryCode) { 1013 String result = getULocaleCountryName(countryCode); 1014 if (!result.equals(countryCode)) { 1015 return result; 1016 } 1017 result = sc.getData("territory", countryCode); 1018 if (result != null) { 1019 return result; 1020 } 1021 return countryCode; 1022 // new ULocale("und-" + countryCode).getDisplayCountry() 1023 } 1024 getDisplayScript(String scriptCode)1025 private static String getDisplayScript(String scriptCode) { 1026 String result = getULocaleScriptName(scriptCode); 1027 if (!result.equals(scriptCode)) { 1028 return result; 1029 } 1030 result = sc.getData("territory", scriptCode); 1031 if (result != null) { 1032 return result; 1033 } 1034 return scriptCode; 1035 // new ULocale("und-" + countryCode).getDisplayCountry() 1036 } 1037 getLanguageName(String languageCode)1038 private static String getLanguageName(String languageCode) { 1039 String result = getULocaleLocaleName(languageCode); 1040 if (!result.equals(languageCode)) return result; 1041 Set<String> names = Iso639Data.getNames(languageCode); 1042 if (names != null && names.size() != 0) { 1043 return names.iterator().next(); 1044 } 1045 return languageCode; 1046 } 1047 1048 static class References { 1049 Map<String, Pair<String, String>> Rxxx_to_reference = new TreeMap<String, Pair<String, String>>(); 1050 Map<Pair<String, String>, String> reference_to_Rxxx = new TreeMap<Pair<String, String>, String>(); 1051 Map<String, Pair<String, String>> Rxxx_to_oldReferences = supplementalData.getReferences(); 1052 Map<Pair<String, String>, String> oldReferences_to_Rxxx = new TreeMap<Pair<String, String>, String>(); 1053 { 1054 for (String Rxxx : Rxxx_to_oldReferences.keySet()) { Rxxx_to_oldReferences.get(Rxxx)1055 oldReferences_to_Rxxx.put(Rxxx_to_oldReferences.get(Rxxx), Rxxx); 1056 } 1057 } 1058 Matcher URI = PatternCache.get("([a-z]+\\://[\\S]+)\\s?(.*)").matcher(""); 1059 1060 static int referenceStart = 1000; 1061 1062 /** 1063 * Returns " references=\"" + Rxxx + "\"" or "" if there is no reference. 1064 * 1065 * @param rawReferenceText 1066 * @return 1067 */ addReference(String rawReferenceText)1068 private String addReference(String rawReferenceText) { 1069 if (rawReferenceText == null || rawReferenceText.length() == 0) return ""; 1070 Pair<String, String> p; 1071 if (URI.reset(rawReferenceText).matches()) { 1072 p = new Pair<String, String>(URI.group(1), URI.group(2) == null || URI.group(2).length() == 0 ? "[missing]" 1073 : URI.group(2)).freeze(); 1074 } else { 1075 p = new Pair<String, String>(null, rawReferenceText).freeze(); 1076 } 1077 1078 String Rxxx = reference_to_Rxxx.get(p); 1079 if (Rxxx == null) { // add new 1080 Rxxx = oldReferences_to_Rxxx.get(p); 1081 if (Rxxx != null) { // if old, just keep number 1082 p = Rxxx_to_oldReferences.get(Rxxx); 1083 } else { // find an empty number 1084 while (true) { 1085 Rxxx = "R" + (referenceStart++); 1086 if (Rxxx_to_reference.get(Rxxx) == null && Rxxx_to_oldReferences.get(Rxxx) == null) { 1087 break; 1088 } 1089 } 1090 } 1091 // add to new references 1092 reference_to_Rxxx.put(p, Rxxx); 1093 Rxxx_to_reference.put(Rxxx, p); 1094 } 1095 // references="R034" 1096 return " references=\"" + Rxxx + "\""; 1097 } 1098 getReferenceHTML(String Rxxx)1099 String getReferenceHTML(String Rxxx) { 1100 Pair<String, String> p = Rxxx_to_reference.get(Rxxx); // exception if fails. 1101 String uri = p.getFirst(); 1102 String value = p.getSecond(); 1103 uri = uri == null ? "" : " uri=\"" + TransliteratorUtilities.toHTML.transliterate(uri) + "\""; 1104 value = value == null ? "[missing]" : TransliteratorUtilities.toHTML.transliterate(value); 1105 return "\t\t<reference type=\"" + Rxxx + "\"" + uri + ">" + value + "</reference>"; 1106 } 1107 printReferences()1108 void printReferences() { 1109 // <reference type="R034" uri="isbn:0-321-18578-1">The Unicode Standard 4.0</reference> 1110 Log.println("\t<references>"); 1111 for (String Rxxx : Rxxx_to_reference.keySet()) { 1112 Log.println(getReferenceHTML(Rxxx)); 1113 } 1114 Log.println("\t</references>"); 1115 } 1116 } 1117 1118 static References references = new References(); 1119 getExcelData(List<String> failures, Map<String, RowData> localeToRowData)1120 private static Set<RowData> getExcelData(List<String> failures, Map<String, RowData> localeToRowData) 1121 throws IOException { 1122 1123 LanguageTagParser ltp = new LanguageTagParser(); 1124 1125 String dir = CLDRPaths.GEN_DIRECTORY + "supplemental/"; 1126 final String ricksFile = "country_language_population_raw.txt"; 1127 System.out.println("\n# Problems in " + ricksFile + "\n"); 1128 List<List<String>> input = SpreadSheet.convert(CldrUtility.getUTF8Data(ricksFile)); 1129 1130 Set<String> languages = languagesNeeded; // sc.getGoodAvailableCodes("language"); 1131 1132 Set<String> territories = new TreeSet<String>(sc.getGoodAvailableCodes("territory")); 1133 territories.removeAll(supplementalData.getContainers()); 1134 territories.remove("EU"); 1135 territories.remove("QO"); 1136 1137 Set<String> countriesNotFound = new TreeSet<String>(territories); 1138 Set<OfficialStatus> statusFound = new TreeSet<OfficialStatus>(); 1139 Set<String> countriesWithoutOfficial = new TreeSet<String>(territories); 1140 countriesWithoutOfficial.remove("ZZ"); 1141 1142 Map<String, Row.R2<String, Double>> countryToLargestOfficialLanguage = new HashMap<String, Row.R2<String, Double>>(); 1143 1144 Set<String> languagesNotFound = new TreeSet<String>(languages); 1145 Set<RowData> sortedInput = new TreeSet<RowData>(); 1146 int count = 0; 1147 for (List<String> row : input) { 1148 ++count; 1149 if (count == 1 || row.size() <= COUNTRY_GDP) { 1150 failures.add(join(row, "\t") + "\tShort row"); 1151 continue; 1152 } 1153 try { 1154 RowData x = new RowData(row); 1155 if (x.officialStatus.isOfficial()) { 1156 Row.R2<String, Double> largestOffical = countryToLargestOfficialLanguage.get(x.countryCode); 1157 if (largestOffical == null) { 1158 countryToLargestOfficialLanguage.put(x.countryCode, 1159 Row.of(x.languageCode, x.languagePopulation)); 1160 } else if (largestOffical.get1() < x.languagePopulation) { 1161 largestOffical.set0(x.languageCode); 1162 largestOffical.set1(x.languagePopulation); 1163 } 1164 } 1165 if (x.officialStatus.isMajor() || x.countryPopulation < 1000) { 1166 countriesWithoutOfficial.remove(x.countryCode); 1167 } 1168 if (!checkCode(LstrType.region, x.countryCode, row)) continue; 1169 statusFound.add(x.officialStatus); 1170 countriesNotFound.remove(x.countryCode); 1171 languagesNotFound.remove(x.languageCode); 1172 if (x.languageCode.contains("_")) { 1173 ltp.set(x.languageCode); 1174 languagesNotFound.remove(ltp.getLanguage()); 1175 if (!checkCode(LstrType.language, ltp.getLanguage(), row)) continue; 1176 if (!checkCode(LstrType.script, ltp.getScript(), row)) continue; 1177 } 1178 String locale = x.languageCode + "_" + x.countryCode; 1179 if (localeToRowData.get(locale) != null) { 1180 BadItem.ERROR.show("duplicate data", x.languageCode + " with " + x.countryCode, row); 1181 } 1182 localeToRowData.put(locale, x); 1183 sortedInput.add(x); 1184 } catch (ParseException e) { 1185 failures.add(join(row, "\t") + "\t" + e.getMessage() + "\t" 1186 + join(Arrays.asList(e.getStackTrace()), ";\t")); 1187 } catch (RuntimeException e) { 1188 throw (RuntimeException) new IllegalArgumentException("Failure on line " + count + ")\t" + row) 1189 .initCause(e); 1190 } 1191 } 1192 // System.out.println("Note: the following Status values were found in the data: " + 1193 // CldrUtility.join(statusFound, " | ")); 1194 1195 // make sure we have something 1196 for (String country : countriesNotFound) { 1197 RowData x = new RowData(country, "und"); 1198 sortedInput.add(x); 1199 } 1200 for (String language : languagesNotFound) { 1201 RowData x = new RowData("ZZ", language); 1202 sortedInput.add(x); 1203 } 1204 1205 for (RowData row : sortedInput) { 1206 // see which countries have languages that are larger than any offical language 1207 1208 if (!row.officialStatus.isOfficial()) { 1209 //String country = row.countryCode; 1210 Row.R2<String, Double> largestOffical = countryToLargestOfficialLanguage.get(row.countryCode); 1211 if (largestOffical != null && largestOffical.get1() < row.languagePopulation) { 1212 BadItem.WARNING.show("language population > all official languages", getLanguageCodeAndName(largestOffical.get0()), row.toString(true)); 1213 } 1214 } 1215 1216 // see which countries are missing an official language 1217 if (!countriesWithoutOfficial.contains(row.countryCode)) continue; 1218 BadItem.ERROR.show("missing official language", row.getCountryName() + "\t" + row.countryCode, row.toString(true)); 1219 countriesWithoutOfficial.remove(row.countryCode); 1220 } 1221 1222 // write out file for rick 1223 PrintWriter log = FileUtilities.openUTF8Writer(dir, ricksFile); 1224 log.println( 1225 "*\tCName" + 1226 "\tCCode" + 1227 "\tCPopulation" + 1228 "\tCLiteracy" + 1229 "\tCGdp" + 1230 "\tOfficialStatus" + 1231 "\tLanguage" + 1232 "\tLCode" + 1233 "\tLPopulation" + 1234 "\tWritingPop" + 1235 "\tReferences" + 1236 "\tNotes"); 1237 RickComparator rickSorting = new RickComparator(); 1238 Set<RowData> rickSorted = new TreeSet<RowData>(rickSorting); 1239 rickSorted.addAll(sortedInput); 1240 1241 for (RowData row : rickSorted) { 1242 final String langLit = row.getLanguageLiteracyString(); 1243 final String countryLit = row.getCountryLiteracyString(); 1244 log.println( 1245 row.getCountryName() 1246 + "\t" + row.countryCode 1247 + "\t" + row.getCountryPopulationString() 1248 + "\t" + countryLit 1249 + "\t" + row.getCountryGdpString() 1250 + "\t" + (row.officialStatus == OfficialStatus.unknown ? "" : row.officialStatus) 1251 + "\t" + row.getRickLanguageName() 1252 + "\t" + row.getRickLanguageCode() 1253 + "\t" + row.getLanguagePopulationString() 1254 + "\t" + (langLit.equals(countryLit) ? "" : langLit) 1255 + "\t" + getExcelQuote(row.comment) 1256 + "\t" + getExcelQuote(row.notes)); 1257 } 1258 log.close(); 1259 return sortedInput; 1260 } 1261 getCldrParents(Set<String> available)1262 private static Set<String> getCldrParents(Set<String> available) { 1263 LanguageTagParser ltp2 = new LanguageTagParser(); 1264 Set<String> cldrParents = new TreeSet<String>(); 1265 for (String locale : available) { 1266 if (skipLocales.contains(locale)) continue; 1267 try { 1268 ltp2.set(locale); 1269 } catch (RuntimeException e) { 1270 System.out.println("Skipping CLDR file: " + locale); 1271 continue; 1272 } 1273 String locale2 = ltp2.getLanguageScript(); 1274 if (locale2.equals("sh")) continue; 1275 // int lastPos = locale.lastIndexOf('_'); 1276 // if (lastPos < 0) continue; 1277 // String locale2 = locale.substring(0,lastPos); 1278 cldrParents.add(locale2); 1279 languageToMaxCountry.put(locale2, null); 1280 } 1281 //System.out.println("CLDR Parents: " + cldrParents); 1282 return cldrParents; 1283 } 1284 showFailures(List<String> failures)1285 private static void showFailures(List<String> failures) { 1286 if (failures.size() <= 1) { 1287 return; 1288 } 1289 System.out.println(); 1290 System.out.println("Failures in Output"); 1291 System.out.println(); 1292 1293 System.out.println(RowData.toStringHeader()); 1294 for (String failure : failures) { 1295 System.out.println(failure); 1296 } 1297 } 1298 showContent(Set<String> available)1299 private static void showContent(Set<String> available) { 1300 System.out.println(); 1301 System.out.println("CLDR Content"); 1302 System.out.println(); 1303 Set<String> languagesLeft = new TreeSet<String>(defaultContent.keySet()); 1304 languagesLeft.remove("und"); 1305 for (String languageLeft : languagesLeft) { 1306 Log.println("\t\t<defaultContent type=\"" + languageLeft + "\" content=\"" 1307 + defaultContent.get(languageLeft) + "\"/>"); 1308 } 1309 // Set<String> warnings = new LinkedHashSet<String>(); 1310 // 1311 // CLDRFile supplemental = cldrFactory.make("supplementalData", true); 1312 // Comments tempComments = supplemental.getXpath_comments(); 1313 // PrintWriter pw = new PrintWriter(System.out); 1314 // Comparator attributeOrdering = supplemental.getAttributeComparator(); 1315 // Map defaultSuppressionMap = supplemental.getDefaultSuppressionMap(); 1316 // 1317 // XPathParts last = new XPathParts(attributeOrdering, defaultSuppressionMap); 1318 // XPathParts current = new XPathParts(attributeOrdering, defaultSuppressionMap); 1319 // XPathParts lastFiltered = new XPathParts(attributeOrdering, defaultSuppressionMap); 1320 // XPathParts currentFiltered = new XPathParts(attributeOrdering, defaultSuppressionMap); 1321 // 1322 // Set orderedSet = new TreeSet(supplemental.ldmlComparator); 1323 // CollectionUtilities.addAll(supplemental.iterator("//supplementalData/languageData/language"), orderedSet); 1324 // Set<String> languagesLeft = new TreeSet<String>(defaultContent.keySet()); 1325 // 1326 // for (Iterator it2 = orderedSet.iterator(); it2.hasNext();) { 1327 // String xpath = (String)it2.next(); 1328 // currentFiltered.set(xpath); 1329 // current.set(xpath); 1330 // 1331 // Map x = current.set(xpath).getAttributes(-1); 1332 // boolean alt = x.containsKey("alt"); 1333 // String lang = (String) x.get("type"); 1334 // String defaultValue = defaultContent.get(lang); 1335 // if (alt) { 1336 // // skip 1337 // } else if (defaultValue == null) { 1338 // warnings.add("Missing default value for " + lang); 1339 // } else if (!defaultValue.equals(lang)) { 1340 // x.put("defaultContent", defaultValue); 1341 // languagesLeft.remove(lang); 1342 // } 1343 // 1344 // current.writeDifference(pw, currentFiltered, last, lastFiltered, "", tempComments); 1345 // // exchange pairs of parts 1346 // XPathParts temp = current; 1347 // current = last; 1348 // last = temp; 1349 // temp = currentFiltered; 1350 // currentFiltered = lastFiltered; 1351 // lastFiltered = temp; 1352 // } 1353 // pw.flush(); 1354 1355 // for (String warning : warnings) { 1356 // System.out.println(warning); 1357 // } 1358 1359 // for (String localeCode : available) { 1360 // if (skipLocales.contains(localeCode)) continue; 1361 // String resolvedLanguageCode = getFullyResolved(localeCode); 1362 // // a locale will be empty if its parent has the same resolved code 1363 // String parent = getProcessedParent(localeCode); 1364 // String resolvedParent = getFullyResolved(parent); 1365 // System.out.println( 1366 // (resolvedLanguageCode.equals(resolvedParent) ? "empty" : "") 1367 // + "\t" + localeCode 1368 // + "\t" + resolvedLanguageCode 1369 // + "\t" + parent 1370 // + "\t" + ULocale.getDisplayName(localeCode, ULocale.ENGLISH)); 1371 // } 1372 } 1373 getProcessedParent(String localeCode)1374 public static String getProcessedParent(String localeCode) { 1375 if (localeCode == null || localeCode.equals("root")) return null; 1376 int pos = localeCode.lastIndexOf('_'); 1377 if (pos < 0) return "root"; 1378 LanguageTagParser ltp = new LanguageTagParser(); 1379 String script = ltp.set(localeCode).getScript(); 1380 if (script.length() == 0) { 1381 return getFullyResolved(localeCode); 1382 } 1383 return localeCode.substring(0, pos); 1384 } 1385 getFullyResolved(String languageCode)1386 private static String getFullyResolved(String languageCode) { 1387 String result = defaultContent.get(languageCode); 1388 if (result != null) return result; 1389 // we missed. Try taking parent and trying again 1390 int pos = languageCode.length() + 1; 1391 while (true) { 1392 pos = languageCode.lastIndexOf('_', pos - 1); 1393 if (pos < 0) { 1394 return "***" + languageCode; 1395 } 1396 result = defaultContent.get(languageCode.substring(0, pos)); 1397 if (result != null) { 1398 LanguageTagParser ltp = new LanguageTagParser().set(languageCode); 1399 LanguageTagParser ltp2 = new LanguageTagParser().set(result); 1400 String region = ltp.getRegion(); 1401 if (region.length() == 0) { 1402 ltp.setRegion(ltp2.getRegion()); 1403 } 1404 String script = ltp.getScript(); 1405 if (script.length() == 0) { 1406 ltp.setScript(ltp2.getScript()); 1407 } 1408 return ltp.toString(); 1409 } 1410 } 1411 } 1412 1413 static Comparator<Iterable> firstElementComparator = new Comparator<Iterable>() { 1414 public int compare(Iterable o1, Iterable o2) { 1415 int result = ((Comparable) o1.iterator().next()).compareTo((o2.iterator().next())); 1416 assert result != 0; 1417 return result; 1418 } 1419 }; 1420 showDefaults(Set<String> cldrParents, NumberFormat nf, Map<String, String> defaultContent, Map<String, RowData> localeToRowData, Set<String> defaultLocaleContent)1421 private static void showDefaults(Set<String> cldrParents, NumberFormat nf, Map<String, String> defaultContent, 1422 Map<String, RowData> localeToRowData, 1423 Set<String> defaultLocaleContent) { 1424 1425 if (SHOW_OLD_DEFAULT_CONTENTS) { 1426 System.out.println(); 1427 System.out.println("Computing Defaults Contents"); 1428 System.out.println(); 1429 } 1430 1431 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 1432 Set<String> locales = new TreeSet<String>(cldrFactory.getAvailable()); 1433 LocaleIDParser lidp = new LocaleIDParser(); 1434 1435 // add all the combinations of language, script, and territory. 1436 for (String locale : localeToRowData.keySet()) { 1437 String baseLanguage = lidp.set(locale).getLanguage(); 1438 if (locales.contains(baseLanguage) && !locales.contains(locale)) { 1439 locales.add(locale); 1440 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding: " + locale); 1441 } 1442 } 1443 1444 // adding parents 1445 Set<String> toAdd = new TreeSet<String>(); 1446 while (true) { 1447 for (String locale : locales) { 1448 String newguy = LocaleIDParser.getParent(locale); 1449 if (newguy != null && !locales.contains(newguy) && !toAdd.contains(newguy)) { 1450 toAdd.add(newguy); 1451 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tadding parent: " + newguy); 1452 } 1453 } 1454 if (toAdd.size() == 0) { 1455 break; 1456 } 1457 locales.addAll(toAdd); 1458 toAdd.clear(); 1459 } 1460 1461 // get sets of siblings 1462 Set<Set<String>> siblingSets = new TreeSet<Set<String>>(firstElementComparator); 1463 Set<String> needsADoin = new TreeSet<String>(locales); 1464 1465 Set<String> deprecatedLanguages = new TreeSet<String>(); 1466 deprecatedLanguages.add("sh"); 1467 Set<String> deprecatedRegions = new TreeSet<String>(); 1468 deprecatedRegions.add("YU"); 1469 deprecatedRegions.add("CS"); 1470 deprecatedRegions.add("ZZ"); 1471 1472 // first find all the language subtags that have scripts, and those we need to skip. Those are aliased-only 1473 Set<String> skippingItems = new TreeSet<String>(); 1474 Set<String> hasAScript = new TreeSet<String>(); 1475 //Set<LocaleIDParser.Level> languageOnly = EnumSet.of(LocaleIDParser.Level.Language); 1476 for (String locale : locales) { 1477 lidp.set(locale); 1478 if (lidp.getScript().length() != 0) { 1479 hasAScript.add(lidp.getLanguage()); 1480 } 1481 Set<LocaleIDParser.Level> levels = lidp.getLevels(); 1482 // must have no variants, must have either script or region, no deprecated elements 1483 if (levels.contains(LocaleIDParser.Level.Variants) // no variants 1484 || !(levels.contains(LocaleIDParser.Level.Script) 1485 || levels.contains(LocaleIDParser.Level.Region)) 1486 || deprecatedLanguages.contains(lidp.getLanguage()) 1487 || deprecatedRegions.contains(lidp.getRegion())) { 1488 // skip language-only locales, and ones with variants 1489 needsADoin.remove(locale); 1490 skippingItems.add(locale); 1491 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("\tremoving: " + locale); 1492 continue; 1493 } 1494 } 1495 // walk through the locales, getting the ones we care about. 1496 Map<String, Double> scriptLocaleToLanguageLiteratePopulation = new TreeMap<String, Double>(); 1497 1498 for (String locale : new TreeSet<String>(needsADoin)) { 1499 if (!needsADoin.contains(locale)) continue; 1500 lidp.set(locale); 1501 Set<Level> level = lidp.getLevels(); 1502 // skip locales that need scripts and don't have them 1503 if (!level.contains(LocaleIDParser.Level.Script) // no script 1504 && hasAScript.contains(lidp.getLanguage())) { 1505 needsADoin.remove(locale); 1506 skippingItems.add(locale); 1507 continue; 1508 } 1509 // get siblings 1510 Set<String> siblingSet = lidp.getSiblings(needsADoin); 1511 // if it has a script and region 1512 if (level.contains(LocaleIDParser.Level.Script) && level.contains(LocaleIDParser.Level.Region)) { 1513 double languageLiteratePopulation = 0; 1514 for (String localeID2 : siblingSet) { 1515 RowData rowData = localeToRowData.get(localeID2); 1516 if (rowData != null) { 1517 languageLiteratePopulation += rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT); 1518 } 1519 } 1520 String parentID = LocaleIDParser.getParent(locale); 1521 scriptLocaleToLanguageLiteratePopulation.put(parentID, languageLiteratePopulation); 1522 } 1523 1524 try { 1525 siblingSets.add(siblingSet); 1526 } catch (RuntimeException e) { 1527 e.printStackTrace(); 1528 } 1529 needsADoin.removeAll(siblingSet); 1530 } 1531 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("ConvertLanguageData Skipping: " + skippingItems); 1532 if (needsADoin.size() != 0) { 1533 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("Missing: " + needsADoin); 1534 } 1535 1536 // walk through the data 1537 Set<String> skippingSingletons = new TreeSet<String>(); 1538 1539 Set<String> missingData = new TreeSet<String>(); 1540 for (Set<String> siblingSet : siblingSets) { 1541 if (SHOW_OLD_DEFAULT_CONTENTS) System.out.println("** From siblings: " + siblingSet); 1542 1543 if (false & siblingSet.size() == 1) { 1544 skippingSingletons.add(siblingSet.iterator().next()); 1545 continue; 1546 } 1547 // get best 1548 double best = Double.NEGATIVE_INFINITY; 1549 String bestLocale = "???"; 1550 Set<Pair<Double, String>> data = new TreeSet<>(); 1551 LanguageTagParser ltp = new LanguageTagParser(); 1552 for (String locale : siblingSet) { 1553 RowData rowData = localeToRowData.get(locale); 1554 double languageLiteratePopulation = -1; 1555 if (rowData != null) { 1556 languageLiteratePopulation = rowData.getLanguageLiteratePopulation(NON_OFFICIAL_WEIGHT); 1557 } else { 1558 Double d = scriptLocaleToLanguageLiteratePopulation.get(locale); 1559 if (d != null) { 1560 languageLiteratePopulation = d; 1561 } else { 1562 final String region = ltp.set(locale).getRegion(); 1563 if (region.isEmpty() || StandardCodes.isCountry(region)) { 1564 missingData.add(locale); 1565 } 1566 } 1567 } 1568 data.add(new Pair<Double, String>(languageLiteratePopulation, locale)); 1569 if (best < languageLiteratePopulation) { 1570 best = languageLiteratePopulation; 1571 bestLocale = locale; 1572 } 1573 } 1574 // show it 1575 for (Pair<Double, String> datum : data) { 1576 if (SHOW_OLD_DEFAULT_CONTENTS) 1577 System.out.format( 1578 "\tContenders: %s %f (based on literate population)" + CldrUtility.LINE_SEPARATOR, 1579 datum.getSecond(), datum.getFirst()); 1580 } 1581 // System.out.format("\tPicking default content: %s %f (based on literate population)" + 1582 // Utility.LINE_SEPARATOR, bestLocale, best); 1583 // Hack to fix English 1584 // TODO Generalize in the future for other locales with non-primary scripts 1585 if (bestLocale.startsWith("en_")) { 1586 defaultLocaleContent.add("en_US"); 1587 } else { 1588 defaultLocaleContent.add(bestLocale); 1589 } 1590 } 1591 1592 for (String singleton : skippingSingletons) { 1593 BadItem.WARNING.show("skipping Singletons", singleton); 1594 } 1595 for (String missing : missingData) { 1596 BadItem.WARNING.show("Missing Data", missing); 1597 } 1598 1599 // LanguageTagParser ltp = new LanguageTagParser(); 1600 // Set<String> warnings = new LinkedHashSet(); 1601 // for (String languageCode : languageToMaxCountry.keySet()) { 1602 // CodeAndPopulation best = languageToMaxCountry.get(languageCode); 1603 // String languageSubtag = ltp.set(languageCode).getLanguage(); 1604 // String countryCode = "ZZ"; 1605 // double rawLanguagePopulation = -1; 1606 // if (best != null) { 1607 // countryCode = best.code; 1608 // rawLanguagePopulation = best.population; 1609 // Set<String> regions = LanguageInfo.INSTANCE.languageToRegions.get(languageSubtag); 1610 // if (regions == null || !regions.contains(countryCode)) { 1611 // Set<String> regions2 = LanguageInfo.INSTANCE.languageToRegionsAlt.get(languageSubtag); 1612 // if (regions2 == null || !regions2.contains(countryCode)) { 1613 // warnings.add("WARNING: " + languageCode + " => " + countryCode + ", not in " + regions + "/" + regions2); 1614 // } 1615 // } 1616 // } 1617 // String resolvedLanguageCode = languageCode + "_" + countryCode; 1618 // ltp.set(languageCode); 1619 // Set<String> scripts = LanguageInfo.INSTANCE.languageToScripts.get(languageCode); 1620 // String script = ltp.getScript(); 1621 // if (script.length() == 0) { 1622 // CodeAndPopulation bestScript = languageToMaxScript.get(languageCode); 1623 // if (bestScript != null) { 1624 // script = bestScript.code; 1625 // if (scripts == null || !scripts.contains(script)) { 1626 // warnings.add("WARNING: " + languageCode + " => " + script + ", not in " + scripts); 1627 // } 1628 // } else { 1629 // script = "Zzzz"; 1630 // if (scripts == null) { 1631 // scripts = LanguageInfo.INSTANCE.languageToScriptsAlt.get(languageCode); 1632 // } 1633 // if (scripts != null) { 1634 // script = scripts.iterator().next(); 1635 // if (scripts.size() != 1) { 1636 // warnings.add("WARNING: " + languageCode + " => " + scripts); 1637 // } 1638 // } 1639 // } 1640 // if (scripts == null) { 1641 // warnings.add("Missing scripts for: " + languageCode); 1642 // } else if (scripts.size() == 1){ 1643 // script = ""; 1644 // } 1645 // resolvedLanguageCode = languageCode 1646 // + (script.length() == 0 ? "" : "_" + script) 1647 // + "_" + countryCode; 1648 // } 1649 // 1650 // 1651 // System.out.println( 1652 // resolvedLanguageCode 1653 // + "\t" + languageCode 1654 // + "\t" + ULocale.getDisplayName(languageCode, ULocale.ENGLISH) 1655 // + "\t" + countryCode 1656 // + "\t" + ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH) 1657 // + "\t" + formatNumber(rawLanguagePopulation) 1658 // + (cldrParents.contains(languageCode) ? "\tCLDR" : "") 1659 // ); 1660 // if (languageCode.length() == 0) continue; 1661 // defaultContent.put(languageCode, resolvedLanguageCode); 1662 // } 1663 // for (String warning : warnings) { 1664 // System.out.println(warning); 1665 // } 1666 } 1667 1668 // private static void printDefaultContent(Set<String> defaultLocaleContent) { 1669 // String sep = Utility.LINE_SEPARATOR + "\t\t\t"; 1670 // String broken = Utility.breakLines(join(defaultLocaleContent," "), sep, PatternCache.get("(\\S)\\S*").matcher(""), 1671 // 80); 1672 // 1673 // Log.println("\t\t<defaultContent locales=\"" + broken + "\""); 1674 // Log.println("\t\t/>"); 1675 // } 1676 getSuppressScript(String languageCode)1677 private static Object getSuppressScript(String languageCode) { 1678 // TODO Auto-generated method stub 1679 return null; 1680 } 1681 join(Collection c, String separator)1682 public static String join(Collection c, String separator) { 1683 StringBuffer result = new StringBuffer(); 1684 boolean first = true; 1685 for (Object x : c) { 1686 if (first) 1687 first = false; 1688 else 1689 result.append(separator); 1690 result.append(x); 1691 } 1692 return result.toString(); 1693 } 1694 addBestRegion(String languageCode, String countryCode, double languagePopulationRaw)1695 private static void addBestRegion(String languageCode, String countryCode, double languagePopulationRaw) { 1696 addBest(languageCode, languagePopulationRaw, countryCode, languageToMaxCountry); 1697 } 1698 addBestScript(String languageCode, String scriptCode, double languagePopulationRaw)1699 private static void addBestScript(String languageCode, String scriptCode, double languagePopulationRaw) { 1700 addBest(languageCode, languagePopulationRaw, scriptCode, languageToMaxScript); 1701 } 1702 addBest(String languageCode, double languagePopulationRaw, String code, Map<String, CodeAndPopulation> languageToMaxCode)1703 private static void addBest(String languageCode, double languagePopulationRaw, String code, 1704 Map<String, CodeAndPopulation> languageToMaxCode) { 1705 if (languageCode.length() == 0) { 1706 throw new IllegalArgumentException(); 1707 } 1708 CodeAndPopulation best = languageToMaxCode.get(languageCode); 1709 if (best == null) { 1710 languageToMaxCode.put(languageCode, best = new CodeAndPopulation()); 1711 } else if (best.population >= languagePopulationRaw) { 1712 return; 1713 } 1714 best.population = languagePopulationRaw; 1715 best.code = code; 1716 } 1717 1718 static class CodeAndPopulation { 1719 String code = null; 1720 double population = Double.NaN; 1721 toString()1722 public String toString() { 1723 return "{" + code + "," + population + "}"; 1724 } 1725 } 1726 1727 static public class GeneralCollator implements Comparator<String> { 1728 static UTF16.StringComparator cpCompare = new UTF16.StringComparator(true, false, 0); 1729 static RuleBasedCollator UCA = (RuleBasedCollator) Collator 1730 .getInstance(ULocale.ROOT); 1731 static { 1732 UCA.setNumericCollation(true); 1733 } 1734 compare(String s1, String s2)1735 public int compare(String s1, String s2) { 1736 if (s1 == null) { 1737 return s2 == null ? 0 : -1; 1738 } else if (s2 == null) { 1739 return 1; 1740 } 1741 int result = UCA.compare(s1, s2); 1742 if (result != 0) return result; 1743 return cpCompare.compare(s1, s2); 1744 } 1745 }; 1746 1747 public static class InverseComparator<T> implements Comparator<T> { 1748 private Comparator<T> other; 1749 InverseComparator()1750 public InverseComparator() { 1751 this.other = null; 1752 } 1753 InverseComparator(Comparator<T> other)1754 public InverseComparator(Comparator<T> other) { 1755 this.other = other; 1756 } 1757 compare(T a, T b)1758 public int compare(T a, T b) { 1759 return other == null 1760 ? ((Comparable) b).compareTo(a) 1761 : other.compare(b, a); 1762 } 1763 } 1764 1765 static Set<String> languagesNeeded = new TreeSet<String>( 1766 Arrays 1767 .asList("ab ba bh bi bo fj fy gd ha ht ik iu ks ku ky lg mi na nb rm sa sd sg si sm sn su tg tk to tw vo yi za lb dv chr syr kha sco gv" 1768 .split("\\s"))); 1769 generateIso639_2Data()1770 static void generateIso639_2Data() { 1771 for (String languageSubtag : sc.getAvailableCodes("language")) { 1772 String alpha3 = Iso639Data.toAlpha3(languageSubtag); 1773 Type type = Iso639Data.getType(languageSubtag); 1774 Scope scope = Iso639Data.getScope(languageSubtag); 1775 if (type != null || alpha3 != null || scope != null) { 1776 Log.println("\t\t<languageCode type=\"" + languageSubtag + "\"" + 1777 (alpha3 == null ? "" : " iso639Alpha3=\"" + alpha3 + "\"") + 1778 (type == null ? "" : " iso639Type=\"" + type + "\"") + 1779 (scope == null ? "" : " iso639Scope=\"" + scope + "\"") + 1780 "/>"); 1781 } 1782 1783 } 1784 } 1785 1786 static Relation<String, BasicLanguageData> language2BasicLanguageData = Relation.of(new TreeMap<String, Set<BasicLanguageData>>(), TreeSet.class); 1787 1788 static Map<String, Relation<BasicLanguageData.Type, String>> language_status_scripts; 1789 static Map<Pair<String, String>, String> language_script_references = new TreeMap<Pair<String, String>, String>(); 1790 1791 static final Map<String, Map<String, R2<List<String>, String>>> LOCALE_ALIAS_INFO = SupplementalDataInfo 1792 .getInstance().getLocaleAliasInfo(); 1793 getLanguage2Scripts(Set<RowData> sortedInput)1794 static void getLanguage2Scripts(Set<RowData> sortedInput) throws IOException { 1795 language_status_scripts = new TreeMap<String, Relation<BasicLanguageData.Type, String>>(); 1796 1797 // // get current scripts 1798 // Relation<String,String> languageToDefaultScript = new Relation(new TreeMap(), TreeSet.class); 1799 // Relation<String,String> secondaryLanguageToDefaultScript = new Relation(new TreeMap(), TreeSet.class); 1800 // for (String languageSubtag : language2BasicLanguageData.keySet()) { 1801 // for (BasicLanguageData item : language2BasicLanguageData.getAll(languageSubtag)) { 1802 // for (String script : item.getScripts()) { 1803 // addLanguage2Script(languageSubtag, item.getType(), script); 1804 // } 1805 // } 1806 // } 1807 // System.out.println("Language 2 scripts: " + language_status_scripts); 1808 1809 // #Lcode LanguageName Status Scode ScriptName References 1810 List<List<String>> input = SpreadSheet.convert(CldrUtility.getUTF8Data("language_script_raw.txt")); 1811 System.out.println(CldrUtility.LINE_SEPARATOR + "# Problems in language_script_raw.txt" 1812 + CldrUtility.LINE_SEPARATOR); 1813 //int count = -1; 1814 for (List<String> row : input) { 1815 try { 1816 if (row.size() == 0) continue; 1817 //++count; 1818 String language = row.get(0).trim(); 1819 if (language.length() == 0 || language.startsWith("#")) continue; 1820 BasicLanguageData.Type status = BasicLanguageData.Type.valueOf(row.get(2)); 1821 String scripts = row.get(3); 1822 if (!checkCode(LstrType.language, language, row)) continue; 1823 for (String script : scripts.split("\\s+")) { 1824 if (!checkCode(LstrType.script, script, row)) continue; 1825 // if the script is not modern, demote 1826 Info scriptInfo = ScriptMetadata.getInfo(script); 1827 if (scriptInfo == null) { 1828 BadItem.ERROR.toString("illegal script; must be represented in Unicode, remove line or fix", script, row); 1829 continue; 1830 } 1831 IdUsage idUsage = scriptInfo.idUsage; 1832 if (status == BasicLanguageData.Type.primary && idUsage != IdUsage.RECOMMENDED) { 1833 if (idUsage == IdUsage.ASPIRATIONAL || idUsage == IdUsage.LIMITED_USE) { 1834 BadItem.WARNING.toString("Script has unexpected usage; make secondary if a Recommended script is used widely for the langauge", 1835 idUsage + ", " + script + "=" + getULocaleScriptName(script), row); 1836 } else { 1837 BadItem.ERROR.toString("Script is not modern; make secondary", idUsage + ", " + script + "=" + getULocaleScriptName(script), row); 1838 status = BasicLanguageData.Type.secondary; 1839 } 1840 } 1841 1842 // if the language is not modern, demote 1843 if (LOCALE_ALIAS_INFO.get("language").containsKey(language)) { 1844 BadItem.ERROR.toString("Remove/Change deprecated language", language + " " 1845 + getLanguageName(language) + "; " + LOCALE_ALIAS_INFO.get("language").get(language), row); 1846 continue; 1847 } 1848 if (status == BasicLanguageData.Type.primary && !sc.isModernLanguage(language)) { 1849 BadItem.ERROR.toString("Should be secondary, language is not modern", language + " " + getLanguageName(language), row); 1850 status = BasicLanguageData.Type.secondary; 1851 } 1852 1853 addLanguage2Script(language, status, script); 1854 if (row.size() > 5) { 1855 String reference = row.get(5); 1856 if (reference != null && reference.length() == 0) { 1857 language_script_references.put(new Pair<String, String>(language, script), reference); 1858 } 1859 } 1860 } 1861 } catch (RuntimeException e) { 1862 System.err.println(row); 1863 throw e; 1864 } 1865 } 1866 1867 // System.out.println("Language 2 scripts: " + language_status_scripts); 1868 1869 for (String language : sc.getGoodAvailableCodes("language")) { 1870 if (supplementalData.getDeprecatedInfo("language", language) != null) { 1871 continue; 1872 } 1873 Map<String, String> registryData = sc.getLangData("language", language); 1874 if (registryData != null) { 1875 String suppressScript = registryData.get("Suppress-Script"); 1876 if (suppressScript == null) continue; 1877 if (ScriptMetadata.getInfo(suppressScript) == null) { 1878 // skip, not represented in Unicode 1879 continue; 1880 } 1881 // if there is something already there, we have a problem. 1882 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 1883 if (status_scripts == null) { 1884 System.out 1885 .println("Missing Suppress-Script: " + language + "\tSuppress-Script:\t" + suppressScript); 1886 } else if (!status_scripts.values().contains(suppressScript)) { 1887 System.out.println("Missing Suppress-Script: " + language + "\tSuppress-Script:\t" + suppressScript 1888 + "\tall:\t" + status_scripts.values()); 1889 } else { 1890 // at this point, the suppressScript is in the union of the primary and secondary. 1891 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary); 1892 if (primaryScripts != null && !primaryScripts.contains(suppressScript)) { 1893 System.out.println("Suppress-Script is not in primary: " + language + "\tSuppress-Script:\t" 1894 + suppressScript + "\tprimary:\t" 1895 + primaryScripts); 1896 } 1897 } 1898 addLanguage2Script(language, BasicLanguageData.Type.primary, suppressScript); 1899 } 1900 } 1901 1902 // remove primaries from secondaries 1903 // check for primaries for scripts 1904 for (String language : language_status_scripts.keySet()) { 1905 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 1906 Set<String> secondaryScripts = status_scripts.getAll(BasicLanguageData.Type.secondary); 1907 if (secondaryScripts == null) continue; 1908 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary); 1909 if (primaryScripts == null) { 1910 // status_scripts.putAll(BasicLanguageData.Type.primary, secondaryScripts); 1911 // status_scripts.removeAll(BasicLanguageData.Type.secondary); 1912 if (sc.isModernLanguage(language)) { 1913 BadItem.ERROR.show("modern language without primary script, might need to edit moribund_languages.txt", language + " " 1914 + getLanguageName(language)); 1915 } 1916 } else { 1917 status_scripts.removeAll(BasicLanguageData.Type.secondary, primaryScripts); 1918 } 1919 } 1920 1921 // check that every living language in the row data has a script 1922 Set<String> livingLanguagesWithTerritories = new TreeSet<String>(); 1923 for (RowData rowData : sortedInput) { 1924 String language = rowData.languageCode; 1925 if (sc.isModernLanguage(language) && Iso639Data.getSource(language) != Iso639Data.Source.ISO_639_3) { 1926 livingLanguagesWithTerritories.add(language); 1927 } 1928 } 1929 for (String language : livingLanguagesWithTerritories) { 1930 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 1931 if (status_scripts != null) { 1932 Set<String> primaryScripts = status_scripts.getAll(BasicLanguageData.Type.primary); 1933 if (primaryScripts != null && primaryScripts.size() > 0) { 1934 continue; 1935 } 1936 } 1937 if (language.equals("tw")) continue; // TODO load aliases and check... 1938 BadItem.WARNING.show("ISO 639-1/2 language in language-territory list without primary script", language + "\t" + getLanguageName(language)); 1939 } 1940 1941 // System.out.println("Language 2 scripts: " + language_status_scripts); 1942 } 1943 checkScript(String script)1944 private static boolean checkScript(String script) { 1945 // TODO Auto-generated method stub 1946 return false; 1947 } 1948 1949 static Validity VALIDITY = Validity.getInstance(); 1950 checkCode(LstrType type, String code, List<String> sourceLine)1951 private static boolean checkCode(LstrType type, String code, List<String> sourceLine) { 1952 Status validity = VALIDITY.getCodeToStatus(type).get(code); 1953 if (validity == Status.regular) { 1954 if (type == LstrType.language && code.equals("no")) { 1955 validity = Status.invalid; 1956 } else { 1957 return true; 1958 } 1959 } else if (validity == Status.unknown && type == LstrType.region) { 1960 return true; 1961 } 1962 BadItem.ERROR.show("Illegitimate Code", type + ": " + code + " = " + validity, sourceLine); 1963 return false; 1964 } 1965 addLanguage2Script(String language, BasicLanguageData.Type type, String script)1966 private static void addLanguage2Script(String language, BasicLanguageData.Type type, String script) { 1967 Relation<BasicLanguageData.Type, String> status_scripts = language_status_scripts.get(language); 1968 if (status_scripts == null) 1969 language_status_scripts.put(language, status_scripts = Relation.of(new TreeMap<BasicLanguageData.Type, Set<String>>(), TreeSet.class)); 1970 status_scripts.put(type, script); 1971 } 1972 addLanguageScriptData()1973 static void addLanguageScriptData() throws IOException { 1974 // check to make sure that every language subtag is in 639-3 1975 Set<String> langRegistryCodes = sc.getGoodAvailableCodes("language"); 1976 // Set<String> iso639_2_missing = new TreeSet(langRegistryCodes); 1977 // iso639_2_missing.removeAll(Iso639Data.getAvailable()); 1978 // iso639_2_missing.remove("root"); 1979 // if (iso639_2_missing.size() != 0) { 1980 // for (String missing : iso639_2_missing){ 1981 // System.out.println("*ERROR in StandardCodes* Missing Lang/Script data:\t" + missing + ", " + 1982 // sc.getData("language", missing)); 1983 // } 1984 // } 1985 1986 // Map<String, String> nameToTerritoryCode = new TreeMap(); 1987 // for (String territoryCode : sc.getGoodAvailableCodes("territory")) { 1988 // nameToTerritoryCode.put(sc.getData("territory", territoryCode).toLowerCase(), territoryCode); 1989 // } 1990 // nameToTerritoryCode.put("iran", nameToTerritoryCode.get("iran, islamic republic of")); // 1991 1992 //BasicLanguageData languageData = new BasicLanguageData(); 1993 1994 BufferedReader in = CldrUtility.getUTF8Data("extraLanguagesAndScripts.txt"); 1995 while (true) { 1996 String line = in.readLine(); 1997 if (line == null) break; 1998 String[] parts = line.split("\\t"); 1999 String alpha3 = parts[0]; 2000 alpha3 = stripBrackets(alpha3); 2001 String languageSubtag = Iso639Data.fromAlpha3(alpha3); 2002 if (languageSubtag == null) { 2003 if (langRegistryCodes.contains(alpha3)) { 2004 languageSubtag = alpha3; 2005 } else { 2006 BadItem.WARNING.show("Language subtag not found on line", alpha3, line); 2007 continue; 2008 } 2009 } 2010 //String name = parts[1]; 2011 Set<String> names = Iso639Data.getNames(languageSubtag); 2012 if (names == null) { 2013 Map<String, String> name2 = sc.getLangData("language", languageSubtag); 2014 if (name2 != null) { 2015 String name3 = name2.get("Description"); 2016 if (name3 != null) { 2017 names = new TreeSet<String>(); 2018 names.add(name3); 2019 } 2020 } 2021 } 2022 // if (names == null || !names.contains(name)) { 2023 // System.out.println("Name <" + name + "> for <" + languageSubtag + "> not found in " + names); 2024 // } 2025 2026 // names all straight, now get scripts and territories 2027 // [Cyrl]; [Latn] 2028 Set<String> fullScriptList = sc.getGoodAvailableCodes("script"); 2029 2030 String[] scriptList = parts[2].split("[;,]\\s*"); 2031 Set<String> scripts = new TreeSet<String>(); 2032 Set<String> scriptsAlt = new TreeSet<String>(); 2033 for (String script : scriptList) { 2034 if (script.length() == 0) continue; 2035 boolean alt = false; 2036 if (script.endsWith("*")) { 2037 alt = true; 2038 script = script.substring(0, script.length() - 1); 2039 } 2040 script = stripBrackets(script); 2041 if (!fullScriptList.contains(script)) { 2042 System.out.println("Script <" + script + "> for <" + languageSubtag + "> not found in " 2043 + fullScriptList); 2044 } else if (alt) { 2045 scriptsAlt.add(script); 2046 } else { 2047 scripts.add(script); 2048 } 2049 } 2050 // now territories 2051 Set<String> territories = new TreeSet<String>(); 2052 if (parts.length > 4) { 2053 String[] territoryList = parts[4].split("\\s*[;,-]\\s*"); 2054 for (String territoryName : territoryList) { 2055 if (territoryName.equals("ISO/DIS 639") || territoryName.equals("3")) continue; 2056 String territoryCode = CountryCodeConverter.getCodeFromName(territoryName); 2057 if (territoryCode == null) { 2058 BadItem.ERROR.show("no name found for territory", "<" + territoryName + ">", languageSubtag); 2059 } else { 2060 territories.add(territoryCode); 2061 } 2062 } 2063 } 2064 // <language type="de" scripts="Latn" territories="IT" alt="secondary"/> 2065 // we're going to go ahead and set these all to secondary. 2066 if (scripts.size() != 0) { 2067 language2BasicLanguageData.put(languageSubtag, 2068 new BasicLanguageData().setType(BasicLanguageData.Type.secondary).setScripts(scripts) 2069 .setTerritories(territories)); 2070 } 2071 if (scriptsAlt.size() != 0) { 2072 language2BasicLanguageData.put(languageSubtag, 2073 new BasicLanguageData().setType(BasicLanguageData.Type.secondary).setScripts(scriptsAlt) 2074 .setTerritories(territories)); 2075 } 2076 } 2077 in.close(); 2078 2079 // add other data 2080 for (String languageSubtag : supplementalData.getBasicLanguageDataLanguages()) { 2081 Set<BasicLanguageData> otherData = supplementalData.getBasicLanguageData(languageSubtag); 2082 language2BasicLanguageData.putAll(languageSubtag, otherData); 2083 } 2084 } 2085 2086 // private static void showAllBasicLanguageData(Relation<String, BasicLanguageData> language2basicData, String 2087 // comment) { 2088 // // now print 2089 // Relation<String, String> primaryCombos = new Relation(new TreeMap(), TreeSet.class); 2090 // Relation<String, String> secondaryCombos = new Relation(new TreeMap(), TreeSet.class); 2091 // 2092 // Log.println("\t<languageData>" + (comment == null ? "" : " <!-- " + comment + " -->")); 2093 // 2094 // for (String languageSubtag : language2basicData.keySet()) { 2095 // String duplicate = ""; 2096 // // script,territory 2097 // primaryCombos.clear(); 2098 // secondaryCombos.clear(); 2099 // 2100 // for (BasicLanguageData item : language2basicData.getAll(languageSubtag)) { 2101 // Set<String> scripts = item.getScripts(); 2102 // if (scripts.size() == 0) scripts = new TreeSet(Arrays.asList(new String[] { "Zzzz" })); 2103 // for (String script : scripts) { 2104 // Set<String> territories = item.getTerritories(); 2105 // if (territories.size() == 0) territories = new TreeSet(Arrays.asList(new String[] { "ZZ" })); 2106 // for (String territory : territories) { 2107 // if (item.getType().equals(BasicLanguageData.Type.primary)) { 2108 // primaryCombos.put(script, territory); 2109 // } else { 2110 // secondaryCombos.put(script, territory); 2111 // } 2112 // } 2113 // } 2114 // } 2115 // secondaryCombos.removeAll(primaryCombos); 2116 // showBasicLanguageData(languageSubtag, primaryCombos, null, BasicLanguageData.Type.primary); 2117 // showBasicLanguageData(languageSubtag, secondaryCombos, primaryCombos.keySet(), 2118 // BasicLanguageData.Type.secondary); 2119 // // System.out.println(item.toString(languageSubtag) + duplicate); 2120 // // duplicate = " <!-- " + "**" + " -->"; 2121 // } 2122 // Log.println("\t</languageData>"); 2123 // } 2124 showBasicLanguageData(String languageSubtag, Relation<String, String> primaryCombos, Set<String> suppressEmptyScripts, BasicLanguageData.Type type)2125 private static void showBasicLanguageData(String languageSubtag, Relation<String, String> primaryCombos, 2126 Set<String> suppressEmptyScripts, BasicLanguageData.Type type) { 2127 Set<String> scriptsWithSameTerritories = new TreeSet<String>(); 2128 Set<String> lastTerritories = Collections.emptySet(); 2129 for (String script : primaryCombos.keySet()) { 2130 Set<String> territories = primaryCombos.getAll(script); 2131 if (lastTerritories == Collections.EMPTY_SET) { 2132 // skip first 2133 } else if (lastTerritories.equals(territories)) { 2134 scriptsWithSameTerritories.add(script); 2135 } else { 2136 showBasicLanguageData2(languageSubtag, scriptsWithSameTerritories, suppressEmptyScripts, 2137 lastTerritories, type); 2138 scriptsWithSameTerritories.clear(); 2139 } 2140 lastTerritories = territories; 2141 scriptsWithSameTerritories.add(script); 2142 } 2143 showBasicLanguageData2(languageSubtag, scriptsWithSameTerritories, suppressEmptyScripts, lastTerritories, type); 2144 } 2145 showBasicLanguageData2(String languageSubtag, Set<String> scripts, Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type)2146 private static void showBasicLanguageData2(String languageSubtag, Set<String> scripts, 2147 Set<String> suppressEmptyScripts, Set<String> territories, BasicLanguageData.Type type) { 2148 scripts.remove("Zzzz"); 2149 territories.remove("ZZ"); 2150 if (territories.size() == 0 && suppressEmptyScripts != null) { 2151 scripts.removeAll(suppressEmptyScripts); 2152 } 2153 if (scripts.size() == 0 && territories.size() == 0) return; 2154 Log.println("\t\t<language type=\"" + languageSubtag + "\"" + 2155 (scripts.size() == 0 ? "" : " scripts=\"" + CldrUtility.join(scripts, " ") + "\"") + 2156 (territories.size() == 0 ? "" : " territories=\"" + CldrUtility.join(territories, " ") + "\"") + 2157 (type == BasicLanguageData.Type.primary ? "" : " alt=\"" + type + "\"") + 2158 "/>"); 2159 } 2160 2161 /* 2162 * System.out.println( 2163 * "\t\t<language type=\"" + languageSubtag + "\"" + 2164 * " scripts=\"" + Utility.join(scripts," ") + "\"" + 2165 * (territories.size() == 0 ? "" : " territories=\"" + Utility.join(territories," ") + "\"") + 2166 * "/>" 2167 * ); 2168 */ 2169 stripBrackets(String alpha3)2170 private static String stripBrackets(String alpha3) { 2171 if (alpha3.startsWith("[") && alpha3.endsWith("]")) { 2172 alpha3 = alpha3.substring(1, alpha3.length() - 1); 2173 } 2174 return alpha3; 2175 } 2176 2177 static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH); 2178 static NumberFormat nf_no_comma = NumberFormat.getInstance(ULocale.ENGLISH); 2179 static { 2180 nf_no_comma.setGroupingUsed(false); 2181 } 2182 static NumberFormat pf = NumberFormat.getPercentInstance(ULocale.ENGLISH); 2183 formatNumber(double original, int roundDigits, boolean xml)2184 public static String formatNumber(double original, int roundDigits, boolean xml) { 2185 double d = original; 2186 if (roundDigits != 0) { 2187 d = CldrUtility.roundToDecimals(original, roundDigits); 2188 } 2189 if (Double.isNaN(d)) { 2190 d = CldrUtility.roundToDecimals(original, roundDigits); 2191 throw new IllegalArgumentException("Double is NaN"); 2192 } 2193 if (xml) { 2194 return nf_no_comma.format(d); 2195 } 2196 return nf.format(d); 2197 } 2198 formatPercent(double d, int roundDigits, boolean xml)2199 public static String formatPercent(double d, int roundDigits, boolean xml) { 2200 if (roundDigits != 0) { 2201 d = CldrUtility.roundToDecimals(d, roundDigits); 2202 } 2203 if (xml) { 2204 nf_no_comma.setMaximumFractionDigits(roundDigits + 2); 2205 return nf_no_comma.format(d * 100.0); 2206 } 2207 pf.setMaximumFractionDigits(roundDigits + 2); 2208 return pf.format(d); 2209 } 2210 2211 static final LanguageTagCanonicalizer languageTagCanonicalizer = new LanguageTagCanonicalizer(); 2212 fixLanguageCode(String languageCodeRaw, List<String> row)2213 private static String fixLanguageCode(String languageCodeRaw, List<String> row) { 2214 String languageCode = languageTagCanonicalizer.transform(languageCodeRaw); 2215 if (DEBUG && !languageCode.equals(languageCodeRaw)) { 2216 System.out.println("## " + languageCodeRaw + " => " + languageCode); 2217 } 2218 int bar = languageCode.indexOf('_'); 2219 String script = ""; 2220 if (bar >= 0) { 2221 script = languageCode.substring(bar); 2222 languageCode = languageCode.substring(0, bar); 2223 } 2224 R2<List<String>, String> replacement = supplementalData.getLocaleAliasInfo().get("language").get(languageCode); 2225 if (replacement != null) { 2226 String replacementCode = replacement.get0().get(0); 2227 BadItem.ERROR.show("deprecated language code", languageCode + " => " + replacementCode, row); 2228 languageCode = replacementCode; 2229 } 2230 if (!sc.getAvailableCodes("language").contains(languageCode)) { 2231 BadItem.ERROR.show("bad language code", languageCode, row); 2232 } 2233 return languageCode + script; 2234 } 2235 2236 enum BadItem { 2237 ERROR, WARNING, DETAIL; 2238 show(String problem, String details, String... items)2239 void show(String problem, String details, String... items) { 2240 System.out.println(toString(problem, details, items)); 2241 } 2242 show(String problem, String details, List<String> row)2243 void show(String problem, String details, List<String> row) { 2244 System.out.println(toString(problem, details, row)); 2245 } 2246 toString(String problem, String details, String... items)2247 private String toString(String problem, String details, String... items) { 2248 return toString(problem, details, Arrays.asList(items)); 2249 } 2250 toString(String problem, String details, List<String> row)2251 private String toString(String problem, String details, List<String> row) { 2252 return "* " + this 2253 + " *\t" + problem + ":" 2254 + "\t" + details 2255 + (row != null && row.size() > 0 ? "\t" + CollectionUtilities.join(row, "\t") : ""); 2256 } 2257 } 2258 fixCountryCode(String countryCode, List<String> row)2259 private static String fixCountryCode(String countryCode, List<String> row) { 2260 R2<List<String>, String> replacement = supplementalData.getLocaleAliasInfo().get("territory").get(countryCode); 2261 if (replacement != null) { 2262 String replacementCode = replacement.get0().get(0); 2263 BadItem.ERROR.show("deprecated territory code", countryCode + " => " + replacementCode, row); 2264 countryCode = replacementCode; 2265 } 2266 if (!sc.getAvailableCodes("territory").contains(countryCode)) { 2267 BadItem.ERROR.show("bad territory code", countryCode, row); 2268 } 2269 return countryCode; 2270 } 2271 getULocaleLocaleName(String languageCode)2272 private static String getULocaleLocaleName(String languageCode) { 2273 return english.getName(languageCode, true); 2274 //return new ULocale(languageCode).getDisplayName(); 2275 } 2276 getULocaleScriptName(String scriptCode)2277 private static String getULocaleScriptName(String scriptCode) { 2278 return english.getName(CLDRFile.SCRIPT_NAME, scriptCode); 2279 // return ULocale.getDisplayScript("und_" + scriptCode, ULocale.ENGLISH); 2280 } 2281 getULocaleCountryName(String countryCode)2282 private static String getULocaleCountryName(String countryCode) { 2283 return english.getName(CLDRFile.TERRITORY_NAME, countryCode); 2284 //return ULocale.getDisplayCountry("und_" + countryCode, ULocale.ENGLISH); 2285 } 2286 } 2287