1 package org.unicode.cldr.tool; 2 3 import java.io.BufferedReader; 4 import java.io.File; 5 import java.io.IOException; 6 import java.io.PrintWriter; 7 import java.util.Arrays; 8 import java.util.BitSet; 9 import java.util.Collection; 10 import java.util.Comparator; 11 import java.util.HashMap; 12 import java.util.HashSet; 13 import java.util.LinkedHashSet; 14 import java.util.List; 15 import java.util.Map; 16 import java.util.Map.Entry; 17 import java.util.Set; 18 import java.util.TreeMap; 19 import java.util.TreeSet; 20 21 import org.unicode.cldr.draft.FileUtilities; 22 import org.unicode.cldr.draft.ScriptMetadata; 23 import org.unicode.cldr.draft.ScriptMetadata.Info; 24 import org.unicode.cldr.util.Builder; 25 import org.unicode.cldr.util.CLDRConfig; 26 import org.unicode.cldr.util.CLDRFile; 27 import org.unicode.cldr.util.CLDRLocale; 28 import org.unicode.cldr.util.CLDRPaths; 29 import org.unicode.cldr.util.CldrUtility; 30 import org.unicode.cldr.util.Containment; 31 import org.unicode.cldr.util.Counter; 32 import org.unicode.cldr.util.Factory; 33 import org.unicode.cldr.util.Iso639Data; 34 import org.unicode.cldr.util.Iso639Data.Scope; 35 import org.unicode.cldr.util.LanguageTagParser; 36 import org.unicode.cldr.util.LocaleIDParser; 37 import org.unicode.cldr.util.Log; 38 import org.unicode.cldr.util.Organization; 39 import org.unicode.cldr.util.PatternCache; 40 import org.unicode.cldr.util.SimpleFactory; 41 import org.unicode.cldr.util.StandardCodes; 42 import org.unicode.cldr.util.SupplementalDataInfo; 43 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; 44 import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type; 45 import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; 46 import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; 47 48 import com.google.common.base.Joiner; 49 import com.google.common.collect.ImmutableMap; 50 import com.google.common.collect.ImmutableSet; 51 import com.ibm.icu.impl.Relation; 52 import com.ibm.icu.impl.Row; 53 import com.ibm.icu.impl.Row.R2; 54 import com.ibm.icu.impl.Row.R3; 55 import com.ibm.icu.impl.Row.R4; 56 import com.ibm.icu.lang.UScript; 57 import com.ibm.icu.text.Collator; 58 import com.ibm.icu.text.NumberFormat; 59 import com.ibm.icu.text.UTF16; 60 import com.ibm.icu.text.UnicodeSet; 61 import com.ibm.icu.text.UnicodeSetIterator; 62 import com.ibm.icu.util.ULocale; 63 64 /** 65 * Problems: 66 * "und_Hani", "zh_Hani" 67 * "und_Sinh", "si_Sinh" 68 * 69 * @author markdavis 70 * 71 */ 72 public class GenerateMaximalLocales { 73 74 private static final String TEMP_UNKNOWN_REGION = "XZ"; 75 76 private static final String DEBUG_ADD_KEY = "und_Latn_ZA"; 77 78 private static final boolean SHOW_ADD = CldrUtility.getProperty("GenerateMaximalLocalesDebug", false); 79 private static final boolean SUPPRESS_CHANGES = CldrUtility.getProperty("GenerateMaximalLocalesSuppress", false); 80 private static final boolean SHOW_CONTAINERS = false; 81 82 enum OutputStyle { 83 PLAINTEXT, C, C_ALT, XML 84 } 85 86 private static OutputStyle OUTPUT_STYLE = OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML") 87 .toUpperCase()); 88 89 // set based on above 90 private static final String SEPARATOR = OUTPUT_STYLE == OutputStyle.C || OUTPUT_STYLE == OutputStyle.C_ALT ? CldrUtility.LINE_SEPARATOR 91 : "\t"; 92 private static final String TAG_SEPARATOR = OUTPUT_STYLE == OutputStyle.C_ALT ? "-" : "_"; 93 // private static final boolean FAVOR_REGION = true; // OUTPUT_STYLE == OutputStyle.C_ALT; 94 95 private static final boolean tryDifferent = true; 96 97 private static final File list[] = { 98 new File(CLDRPaths.MAIN_DIRECTORY), 99 new File(CLDRPaths.SEED_DIRECTORY), 100 new File(CLDRPaths.EXEMPLARS_DIRECTORY) }; 101 102 private static Factory factory = SimpleFactory.make(list, ".*"); 103 private static Factory mainFactory = CLDRConfig.getInstance().getCldrFactory(); 104 private static SupplementalDataInfo supplementalData = SupplementalDataInfo 105 .getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY); 106 private static StandardCodes standardCodes = StandardCodes.make(); 107 private static CLDRFile english = factory.make("en", false); 108 static Relation<String, String> cldrContainerToLanguages = Relation.of(new HashMap<String, Set<String>>(), HashSet.class); 109 static { 110 for (CLDRLocale locale : ToolConfig.getToolInstance().getCldrFactory().getAvailableCLDRLocales()) { 111 String region = locale.getCountry(); 112 if (region == null || region.isEmpty() || Containment.isLeaf(region)) { 113 continue; 114 } cldrContainerToLanguages.put(region, locale.getLanguage())115 cldrContainerToLanguages.put(region, locale.getLanguage()); 116 } cldrContainerToLanguages.freeze()117 cldrContainerToLanguages.freeze(); 118 System.out.println("Keep containers " + cldrContainerToLanguages); 119 } 120 121 private static final List<String> KEEP_TARGETS = Arrays.asList("und_Arab_PK", "und_Latn_ET"); 122 private static final ImmutableSet<String> deprecatedISONotInLST = ImmutableSet.of("scc", "scr"); 123 124 /** 125 * This is the simplest way to override, by supplying the max value. 126 * It gets a very low weight, so doesn't override any stronger value. 127 */ 128 private static final String[] MAX_ADDITIONS = new String[] { 129 "bss_Latn_CM", 130 "gez_Ethi_ET", 131 "ken_Latn_CM", 132 "und_Arab_PK", 133 "wa_Latn_BE", 134 135 "fub_Arab_CM", 136 "fuf_Latn_GN", 137 "kby_Arab_NE", 138 "kdh_Arab_TG", 139 "apd_Arab_TG", 140 "zlm_Latn_TG", 141 142 "cr_Cans_CA", 143 "hif_Latn_FJ", 144 "gon_Telu_IN", 145 "lzz_Latn_TR", 146 "lif_Deva_NP", 147 "unx_Beng_IN", 148 "unr_Beng_IN", 149 "ttt_Latn_AZ", 150 "pnt_Grek_GR", 151 "tly_Latn_AZ", 152 "tkr_Latn_AZ", 153 "bsq_Bass_LR", 154 "ccp_Cakm_BD", 155 "blt_Tavt_VN", 156 "rhg_Arab_MM", 157 "rhg_Rohg_MM", 158 }; 159 160 /** 161 * The following overrides MASH the final values, so they may not result in consistent results. Safer is to add to MAX_ADDITIONS. 162 * However, if you add, add both the language and language+script mappings. 163 */ 164 // Many of the overrides below can be removed once the language/pop/country data is updated. 165 private static final Map<String, String> LANGUAGE_OVERRIDES = CldrUtility.asMap(new String[][] { 166 { "cic", "cic_Latn_US" }, 167 { "cic_Latn", "cic_Latn_US" }, 168 { "eo", "eo_Latn_001" }, 169 { "eo_Latn", "eo_Latn_001" }, 170 { "es", "es_Latn_ES" }, 171 { "es_Latn", "es_Latn_ES" }, 172 { "ff_BF", "ff_Latn_BF" }, 173 { "ff_GM", "ff_Latn_GM" }, 174 { "ff_GH", "ff_Latn_GH" }, 175 { "ff_GW", "ff_Latn_GW" }, 176 { "ff_LR", "ff_Latn_LR" }, 177 { "ff_NE", "ff_Latn_NE" }, 178 { "ff_NG", "ff_Latn_NG" }, 179 { "ff_SL", "ff_Latn_SL" }, 180 { "ff_Adlm", "ff_Adlm_GN" }, 181 { "ia", "ia_Latn_001" }, 182 { "ia_Latn", "ia_Latn_001" }, 183 { "io", "io_Latn_001" }, 184 { "io_Latn", "io_Latn_001" }, 185 { "jbo", "jbo_Latn_001" }, 186 { "jbo_Latn", "jbo_Latn_001" }, 187 { "ku_Arab", "ku_Arab_IQ" }, 188 { "lrc", "lrc_Arab_IR" }, 189 { "lrc_Arab", "lrc_Arab_IR" }, 190 { "man", "man_Latn_GM" }, 191 { "man_Latn", "man_Latn_GM" }, 192 { "mas", "mas_Latn_KE" }, 193 { "mas_Latn", "mas_Latn_KE" }, 194 { "mn", "mn_Cyrl_MN" }, 195 { "mn_Cyrl", "mn_Cyrl_MN" }, 196 { "mro", "mro_Mroo_BD" }, 197 { "mro_BD", "mro_Mroo_BD" }, 198 { "ms_Arab", "ms_Arab_MY" }, 199 { "pap", "pap_Latn_AW" }, 200 { "pap_Latn", "pap_Latn_AW" }, 201 { "prg", "prg_Latn_001" }, 202 { "prg_Latn", "prg_Latn_001" }, 203 { "rif", "rif_Tfng_MA" }, 204 { "rif_Latn", "rif_Latn_MA" }, 205 { "rif_Tfng", "rif_Tfng_MA" }, 206 { "rif_MA", "rif_Tfng_MA" }, 207 { "shi", "shi_Tfng_MA" }, 208 { "shi_Tfng", "shi_Tfng_MA" }, 209 { "shi_MA", "shi_Tfng_MA" }, 210 { "sr_Latn", "sr_Latn_RS" }, 211 { "ss", "ss_Latn_ZA" }, 212 { "ss_Latn", "ss_Latn_ZA" }, 213 { "swc", "swc_Latn_CD" }, 214 { "ti", "ti_Ethi_ET" }, 215 { "ti_Ethi", "ti_Ethi_ET" }, 216 { "und", "en_Latn_US" }, 217 { "und_Adlm", "ff_Adlm_GN" }, 218 { "und_Adlm_GN", "ff_Adlm_GN" }, 219 { "und_Arab", "ar_Arab_EG" }, 220 { "und_Arab_PK", "ur_Arab_PK" }, 221 { "und_Bopo", "zh_Bopo_TW" }, 222 { "und_Deva_FJ", "hif_Deva_FJ" }, 223 { "und_EZ", "de_Latn_EZ" }, 224 { "und_Hani", "zh_Hani_CN" }, 225 { "und_Hani_CN", "zh_Hani_CN" }, 226 { "und_Kana", "ja_Kana_JP" }, 227 { "und_Kana_JP", "ja_Kana_JP" }, 228 { "und_Latn", "en_Latn_US" }, 229 { "und_Latn_ET", "en_Latn_ET" }, 230 { "und_Latn_NE", "ha_Latn_NE" }, 231 { "und_Latn_PH", "fil_Latn_PH" }, 232 { "und_ML", "bm_Latn_ML" }, 233 { "und_Latn_ML", "bm_Latn_ML" }, 234 { "und_MU", "mfe_Latn_MU" }, 235 { "und_NE", "ha_Latn_NE" }, 236 { "und_PH", "fil_Latn_PH" }, 237 { "und_PK", "ur_Arab_PK" }, 238 { "und_SO", "so_Latn_SO" }, 239 { "und_SS", "en_Latn_SS" }, 240 { "und_TK", "tkl_Latn_TK" }, 241 { "und_UN", "en_Latn_UN" }, 242 { "und_005", "pt_Latn_BR" }, 243 { "vo", "vo_Latn_001" }, 244 { "vo_Latn", "vo_Latn_001" }, 245 { "yi", "yi_Hebr_001" }, 246 { "yi_Hebr", "yi_Hebr_001" }, 247 { "yue", "yue_Hant_HK" }, 248 { "yue_Hant", "yue_Hant_HK" }, 249 { "yue_Hans", "yue_Hans_CN" }, 250 { "yue_CN", "yue_Hans_CN" }, 251 { "zh_Hani", "zh_Hani_CN" }, 252 253 { "zh_Bopo", "zh_Bopo_TW" }, 254 { "ccp", "ccp_Cakm_BD" }, 255 { "ccp_Cakm", "ccp_Cakm_BD" }, 256 { "und_Cakm", "ccp_Cakm_BD" }, 257 { "cu_Glag", "cu_Glag_BG" }, 258 { "sd_Khoj", "sd_Khoj_IN" }, 259 { "lif_Limb", "lif_Limb_IN" }, 260 { "grc_Linb", "grc_Linb_GR" }, 261 { "arc_Nbat", "arc_Nbat_JO" }, 262 { "arc_Palm", "arc_Palm_SY" }, 263 { "pal_Phlp", "pal_Phlp_CN" }, 264 { "en_Shaw", "en_Shaw_GB" }, 265 { "sd_Sind", "sd_Sind_IN" }, 266 { "und_Brai", "fr_Brai_FR" }, // hack 267 { "und_Hanb", "zh_Hanb_TW" }, // Special script code 268 { "zh_Hanb", "zh_Hanb_TW" }, // Special script code 269 { "und_Jamo", "ko_Jamo_KR" }, // Special script code 270 271 //{"und_Cyrl_PL", "be_Cyrl_PL"}, 272 273 // {"cr", "cr_Cans_CA"}, 274 // {"hif", "hif_Latn_FJ"}, 275 // {"gon", "gon_Telu_IN"}, 276 // {"lzz", "lzz_Latn_TR"}, 277 // {"lif", "lif_Deva_NP"}, 278 // {"unx", "unx_Beng_IN"}, 279 // {"unr", "unr_Beng_IN"}, 280 // {"ttt", "ttt_Latn_AZ"}, 281 // {"pnt", "pnt_Grek_GR"}, 282 // {"tly", "tly_Latn_AZ"}, 283 // {"tkr", "tkr_Latn_AZ"}, 284 // {"bsq", "bsq_Bass_LR"}, 285 // {"ccp", "ccp_Cakm_BD"}, 286 // {"blt", "blt_Tavt_VN"}, 287 { "mis_Medf", "mis_Medf_NG" }, 288 289 { "ku_Yezi", "ku_Yezi_GE" }, 290 { "und_EU", "en_Latn_IE" }, 291 }); 292 293 /** 294 * The following supplements the suppress-script. It overrides info from exemplars and the locale info. 295 */ 296 private static String[][] SpecialScripts = { 297 { "zh", "Hans" }, // Hans (not Hani) 298 { "yue", "Hant" }, // Hans (not Hani) 299 { "chk", "Latn" }, // Chuukese (Micronesia) 300 { "fil", "Latn" }, // Filipino (Philippines)" 301 { "ko", "Kore" }, // Korean (North Korea) 302 { "ko_KR", "Kore" }, // Korean (North Korea) 303 { "pap", "Latn" }, // Papiamento (Netherlands Antilles) 304 { "pau", "Latn" }, // Palauan (Palau) 305 { "su", "Latn" }, // Sundanese (Indonesia) 306 { "tet", "Latn" }, // Tetum (East Timor) 307 { "tk", "Latn" }, // Turkmen (Turkmenistan) 308 { "ty", "Latn" }, // Tahitian (French Polynesia) 309 { "ja", "Jpan" }, // Special script for japan 310 { "und", "Latn" }, // Ultimate fallback 311 }; 312 313 private static Map<String, String> localeToScriptCache = new TreeMap<>(); 314 static { 315 for (String language : standardCodes.getAvailableCodes("language")) { 316 Map<String, String> info = standardCodes.getLangData("language", language); 317 String script = info.get("Suppress-Script"); 318 if (script != null) { localeToScriptCache.put(language, script)319 localeToScriptCache.put(language, script); 320 } 321 } 322 for (String[] pair : SpecialScripts) { localeToScriptCache.put(pair[0], pair[1])323 localeToScriptCache.put(pair[0], pair[1]); 324 } 325 } 326 327 private static Map<String, String> FALLBACK_SCRIPTS; 328 static { 329 LanguageTagParser additionLtp = new LanguageTagParser(); 330 Map<String, String> _FALLBACK_SCRIPTS = new TreeMap<>(); 331 for (String addition : MAX_ADDITIONS) { 332 additionLtp.set(addition); 333 String lan = additionLtp.getLanguage(); _FALLBACK_SCRIPTS.put(lan, additionLtp.getScript())334 _FALLBACK_SCRIPTS.put(lan, additionLtp.getScript()); 335 } 336 FALLBACK_SCRIPTS = ImmutableMap.copyOf(_FALLBACK_SCRIPTS); 337 } 338 339 private static int errorCount; 340 main(String[] args)341 public static void main(String[] args) throws IOException { 342 343 printDefaultLanguagesAndScripts(); 344 345 Map<String, String> toMaximized = new TreeMap<>(); 346 347 tryDifferentAlgorithm(toMaximized); 348 349 minimize(toMaximized); 350 351 // HACK TEMP_UNKNOWN_REGION 352 // this is to get around the removal of items with ZZ in minimize. 353 // probably cleaner way to do it, but this provides control over just those we want to retain. 354 Set<String> toRemove = new TreeSet<>(); 355 Map<String, String> toFix = new TreeMap<>(); 356 for (Entry<String, String> entry : toMaximized.entrySet()) { 357 String key = entry.getKey(); 358 String value = entry.getValue(); 359 if (key.contains(TEMP_UNKNOWN_REGION)) { 360 toRemove.add(key); 361 } else if (value.contains(TEMP_UNKNOWN_REGION)) { 362 toFix.put(key, value.replace(TEMP_UNKNOWN_REGION, UNKNOWN_REGION)); 363 } 364 } 365 for (String key : toRemove) { 366 toMaximized.remove(key); 367 } 368 toMaximized.putAll(toFix); 369 370 Map<String, String> oldLikely = SupplementalDataInfo.getInstance().getLikelySubtags(); 371 Set<String> changes = compareMapsAndFixNew("*WARNING* Likely Subtags: ", oldLikely, toMaximized, "ms_Arab", 372 "ms_Arab_ID"); 373 System.out.println(Joiner.on("\n").join(changes)); 374 375 if (OUTPUT_STYLE == OutputStyle.C_ALT) { 376 doAlt(toMaximized); 377 } 378 379 if (SHOW_ADD) 380 System.out 381 .println("/*" 382 + CldrUtility.LINE_SEPARATOR 383 + " To Maximize:" 384 + 385 CldrUtility.LINE_SEPARATOR 386 + " If using raw strings, make sure the input language/locale uses the right separator, and has the right casing." 387 + 388 CldrUtility.LINE_SEPARATOR 389 + " Remove the script Zzzz and the region ZZ if they occur; change an empty language subtag to 'und'." 390 + 391 CldrUtility.LINE_SEPARATOR 392 + " Get the language, region, and script from the cleaned-up tag, plus any variants/extensions" 393 + 394 CldrUtility.LINE_SEPARATOR 395 + " Try each of the following in order (where the field exists)" 396 + 397 CldrUtility.LINE_SEPARATOR 398 + " Lookup language-script-region. If in the table, return the result + variants" 399 + 400 CldrUtility.LINE_SEPARATOR 401 + " Lookup language-script. If in the table, return the result (substituting the original region if it exists) + variants" 402 + 403 CldrUtility.LINE_SEPARATOR 404 + " Lookup language-region. If in the table, return the result (substituting the original script if it exists) + variants" 405 + 406 CldrUtility.LINE_SEPARATOR 407 + " Lookup language. If in the table, return the result (substituting the original region and script if either or both exist) + variants" 408 + 409 CldrUtility.LINE_SEPARATOR 410 + 411 CldrUtility.LINE_SEPARATOR 412 + " Example: Input is zh-ZZZZ-SG." 413 + 414 CldrUtility.LINE_SEPARATOR 415 + " Normalize to zh-SG. Lookup in table. No match." 416 + 417 CldrUtility.LINE_SEPARATOR 418 + " Remove SG, but remember it. Lookup zh, and get the match (zh-Hans-CN). Substitute SG, and return zh-Hans-SG." 419 + 420 CldrUtility.LINE_SEPARATOR 421 + 422 CldrUtility.LINE_SEPARATOR 423 + " To Minimize:" 424 + 425 CldrUtility.LINE_SEPARATOR 426 + " First get max = maximize(input)." 427 + 428 CldrUtility.LINE_SEPARATOR 429 + " Then for trial in {language, language-region, language-script}" 430 + 431 CldrUtility.LINE_SEPARATOR 432 + " If maximize(trial) == max, then return trial." 433 + 434 CldrUtility.LINE_SEPARATOR 435 + " If you don't get a match, return max." 436 + 437 CldrUtility.LINE_SEPARATOR 438 + 439 CldrUtility.LINE_SEPARATOR 440 + " Example: Input is zh-Hant. Maximize to get zh-Hant-TW." 441 + 442 CldrUtility.LINE_SEPARATOR 443 + " zh => zh-Hans-CN. No match, so continue." 444 + 445 CldrUtility.LINE_SEPARATOR 446 + " zh-TW => zh-Hans-TW. Match, so return zh-TW." 447 + 448 CldrUtility.LINE_SEPARATOR 449 + 450 CldrUtility.LINE_SEPARATOR 451 + " (A variant of this uses {language, language-script, language-region}): that is, tries script before language." 452 + 453 CldrUtility.LINE_SEPARATOR + " toMaximal size:\t" + toMaximized.size() + 454 CldrUtility.LINE_SEPARATOR + "*/"); 455 456 printLikelySubtags(toMaximized); 457 458 // if (OUTPUT_STYLE != OutputStyle.XML) { 459 // printMap("const MapToMinimalSubtags default_subtags[]", toMinimized, null); 460 // } 461 462 printDefaultContent(toMaximized); 463 464 System.out.println(CldrUtility.LINE_SEPARATOR + "ERRORS:\t" + errorCount + CldrUtility.LINE_SEPARATOR); 465 466 } 467 468 static class RowData implements Comparable<RowData> { 469 OfficialStatus os; 470 String name; 471 Long pop; 472 RowData(OfficialStatus os, String name, Long pop)473 public RowData(OfficialStatus os, String name, Long pop) { 474 this.os = os; 475 this.name = name; 476 this.pop = pop; 477 } 478 getStatus()479 public OfficialStatus getStatus() { 480 // TODO Auto-generated method stub 481 return os; 482 } 483 getName()484 public CharSequence getName() { 485 // TODO Auto-generated method stub 486 return name; 487 } 488 getLiteratePopulation()489 public Long getLiteratePopulation() { 490 // TODO Auto-generated method stub 491 return pop; 492 } 493 494 @Override compareTo(RowData o)495 public int compareTo(RowData o) { 496 // TODO Auto-generated method stub 497 int result = os.compareTo(o.os); 498 if (result != 0) return -result; 499 long result2 = pop - o.pop; 500 if (result2 != 0) return result2 < 0 ? 1 : -1; 501 return name.compareTo(o.name); 502 } 503 504 @Override equals(Object o)505 public boolean equals(Object o) { 506 return 0 == compareTo((RowData) o); 507 } 508 509 @Override hashCode()510 public int hashCode() { 511 throw new UnsupportedOperationException(); 512 } 513 } 514 printDefaultLanguagesAndScripts()515 private static void printDefaultLanguagesAndScripts() { 516 517 final int minTotalPopulation = 10000000; 518 final int minTerritoryPopulation = 1000000; 519 final double minTerritoryPercent = 1.0 / 3; 520 Map<String, Set<RowData>> languageToReason = new TreeMap<>(); 521 Counter<String> languageToLiteratePopulation = new Counter<>(); 522 NumberFormat nf = NumberFormat.getIntegerInstance(ULocale.ENGLISH); 523 nf.setGroupingUsed(true); 524 LanguageTagParser ltp = new LanguageTagParser(); 525 LikelySubtags likelySubtags = new LikelySubtags(); 526 /* 527 * A. X is a qualified language**, and at least one of the following is true: 528 * 529 * 1. X is has official status* in any country 530 * 2. X exceeds a threshold population† of literate users worldwide: 1M 531 * 3. X exceeds a threshold population† in some country Z: 100K and 20% of Z's population†. 532 * 533 * B. X is an exception explicitly approved by the committee or X has minimal 534 * language coverage‡ in CLDR itself. 535 * C. The language is in the CLDR-target locales 536 */ 537 OfficialStatus minimalStatus = OfficialStatus.official_regional; // OfficialStatus.de_facto_official; 538 Map<String, String> languages = new TreeMap<>(); 539 for (String language : standardCodes.getAvailableCodes("language")) { 540 String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language); 541 String result = english.getStringValue(path); 542 if (result != null) { 543 languages.put(language, result); 544 } 545 } 546 for (String language : languages.keySet()) { 547 System.out.println(language + "\t" + languages.get(language)); 548 } 549 550 // also CLDR-target locales 551 final Set<String> CLDRMainLanguages = new TreeSet<>(StandardCodes.make().getLocaleCoverageLocales(Organization.cldr)); 552 553 for (String territory : supplementalData.getTerritoriesWithPopulationData()) { 554 PopulationData territoryPop = supplementalData.getPopulationDataForTerritory(territory); 555 double territoryPopulation = territoryPop.getLiteratePopulation(); 556 for (String languageScript : supplementalData.getLanguagesForTerritoryWithPopulationData(territory)) { 557 PopulationData popData = supplementalData.getLanguageAndTerritoryPopulationData(languageScript, 558 territory); 559 ltp.set(languageScript); 560 String language = ltp.getLanguage(); 561 // if (ltp.getScript().isEmpty()) { 562 // String max = likelySubtags.maximize(languageScript); 563 // if (max != null) { 564 // ltp.set(max).setRegion(""); 565 // languageScript = ltp.toString(); 566 // } 567 // } 568 boolean add = false; 569 // #1 570 OfficialStatus status = popData.getOfficialStatus(); 571 if (status.compareTo(minimalStatus) >= 0) { 572 add = true; 573 } 574 long literatePopulation = getWritingPopulation(popData); 575 // #2 576 languageToLiteratePopulation.add(language, literatePopulation); 577 // #3 578 if (literatePopulation > minTerritoryPopulation 579 && literatePopulation > minTerritoryPercent * territoryPopulation) { 580 add = true; 581 } 582 if (add == false && CLDRMainLanguages.contains(language)) { 583 add = true; 584 } 585 if (add) { 586 add(languageToReason, language, territory, status, literatePopulation); 587 // Add the containing regions 588 for (String container : Containment.leafToContainer(territory)) { 589 add(languageToReason, language, container, OfficialStatus.unknown, literatePopulation); 590 } 591 } 592 } 593 } 594 // #2, now that we have the data 595 for (String language : languageToLiteratePopulation.keySet()) { 596 long totalPop = languageToLiteratePopulation.getCount(language); 597 if (totalPop > minTotalPopulation) { 598 add(languageToReason, language, "001", OfficialStatus.unknown, totalPop); 599 } 600 } 601 602 // Specials 603 add(languageToReason, "und", "001", OfficialStatus.unknown, 0); 604 605 // for (String language : Iso639Data.getAvailable()) { 606 // Scope scope = Iso639Data.getScope(language); 607 // Type type = Iso639Data.getType(language); 608 // if (scope == Scope.Special) { 609 // add(languageToReason, language, "001", OfficialStatus.unknown, -1); 610 // } 611 // } 612 // print them 613 614 System.out.println("Detailed - Including:\t" + languageToReason.size()); 615 616 for (String language : languageToReason.keySet()) { 617 Set<RowData> reasons = languageToReason.get(language); 618 619 RowData lastReason = reasons.iterator().next(); 620 621 System.out.append(language) 622 .append("\t") 623 .append(english.getName(language)) 624 .append("\t") 625 .append(lastReason.getStatus().toShortString()) 626 .append("\t") 627 .append(nf.format(languageToLiteratePopulation.getCount(language))); 628 for (RowData reason : reasons) { 629 String status = reason.getStatus().toShortString(); 630 System.out.append("\t") 631 .append(status) 632 .append("-") 633 .append(reason.getName()) 634 .append("-") 635 .append(nf.format(reason.getLiteratePopulation())); 636 } 637 System.out.append("\n"); 638 } 639 640 // now list them 641 642 Set<String> others = new TreeSet<>(); 643 others.addAll(standardCodes.getGoodAvailableCodes("language")); 644 others.removeAll(languageToReason.keySet()); 645 System.out.println("\nIncluded Languages:\t" + languageToReason.keySet().size()); 646 showLanguages(languageToReason.keySet(), languageToReason); 647 System.out.println("\nExcluded Languages:\t" + others.size()); 648 showLanguages(others, languageToReason); 649 } 650 getWritingPopulation(PopulationData popData)651 private static long getWritingPopulation(PopulationData popData) { 652 final double writingPopulation = popData.getWritingPopulation(); 653 if (!Double.isNaN(writingPopulation)) { 654 return (long) writingPopulation; 655 } 656 return (long) popData.getLiteratePopulation(); 657 } 658 showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason)659 private static void showLanguages(Set<String> others, Map<String, Set<RowData>> languageToReason) { 660 Set<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ENGLISH)); 661 for (String language : others) { 662 sorted.add(getLanguageName(language, languageToReason)); 663 } 664 char last = 0; 665 for (String language : sorted) { 666 final char curr = language.charAt(0); 667 if (last != curr) { 668 System.out.println(); 669 } else if (last != '\u0000') { 670 System.out.print(", "); 671 } 672 System.out.print(language); 673 last = curr; 674 } 675 System.out.println(); 676 } 677 getLanguageName(String language, Map<String, Set<RowData>> languageToReason)678 private static String getLanguageName(String language, 679 Map<String, Set<RowData>> languageToReason) { 680 OfficialStatus best = OfficialStatus.unknown; 681 Set<RowData> reasons = languageToReason.get(language); 682 if (reasons != null) { 683 for (RowData reason : reasons) { 684 final OfficialStatus currentStatus = reason.getStatus(); 685 if (best.compareTo(currentStatus) < 0) { 686 best = currentStatus; 687 } 688 } 689 } 690 String status = best.toShortString(); 691 Scope scope = Iso639Data.getScope(language); 692 if (scope == Scope.Special) { 693 status = "S"; 694 } 695 String languageFormatted = english.getName(language) + " [" + language + "]-" + status; 696 return languageFormatted; 697 } 698 add(Map<String, Set<RowData>> languageToReason, String language, String territoryRaw, OfficialStatus status, long population)699 private static void add(Map<String, Set<RowData>> languageToReason, String language, 700 String territoryRaw, OfficialStatus status, long population) { 701 String territory = english.getName("territory", territoryRaw) + " [" + territoryRaw + "]"; 702 Set<RowData> set = languageToReason.get(language); 703 if (set == null) { 704 languageToReason.put(language, set = new TreeSet<>()); 705 } 706 set.add(new RowData(status, territory, population)); 707 } 708 printDefaultContent(Map<String, String> toMaximized)709 private static void printDefaultContent(Map<String, String> toMaximized) throws IOException { 710 711 Set<String> defaultLocaleContent = new TreeSet<>(); 712 713 // go through all the cldr locales, and add default contents 714 // now computed from toMaximized 715 Set<String> available = factory.getAvailable(); 716 Relation<String, String> toChildren = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 717 LanguageTagParser ltp = new LanguageTagParser(); 718 719 // System.out.println(maximize("az_Latn_AZ", toMaximized)); 720 Set<String> hasScript = new TreeSet<>(); 721 722 // first get a mapping to children 723 for (String locale : available) { 724 if (locale.equals("root")) { 725 continue; 726 } 727 if (ltp.set(locale).getVariants().size() != 0) { 728 continue; 729 } 730 String parent = LocaleIDParser.getSimpleParent(locale); 731 if (ltp.getScript().length() != 0) { 732 hasScript.add(parent); 733 } 734 if (parent.equals("root")) { 735 continue; 736 } 737 toChildren.put(parent, locale); 738 } 739 740 // Suppress script for locales for which we only have one locale in common/main. See ticket #7834. 741 Set<String> suppressScriptLocales = new HashSet<>(Arrays.asList( 742 "bm_ML", "en_US", "ha_NG", "iu_CA", "ms_MY", "mn_MN", 743 "byn_ER", "ff_SN", "dyo_SN", "kk_KZ", "ku_TR", "ky_KG", "ml_IN", "so_SO", "sw_TZ", "wo_SN", "yo_NG", "dje_NE", 744 "blt_VN", 745 "hi_IN", 746 "nv_US", 747 "doi_IN" 748 )); 749 750 // if any have a script, then throw out any that don't have a script (unless they're specifically included.) 751 Set<String> toRemove = new TreeSet<>(); 752 for (String locale : hasScript) { 753 toRemove.clear(); 754 Set<String> children = toChildren.getAll(locale); 755 for (String child : children) { 756 if (ltp.set(child).getScript().length() == 0 && !suppressScriptLocales.contains(child)) { 757 toRemove.add(child); 758 } 759 } 760 if (toRemove.size() != 0) { 761 System.out.println("\tRemoving:\t" + locale + "\t" + toRemove + "\tfrom\t" + children); 762 toChildren.removeAll(locale, toRemove); 763 } 764 } 765 766 // we add a child as a default locale if it has the same maximization 767 main: for (String locale : toChildren.keySet()) { 768 String maximized = maximize(locale, toMaximized); 769 if (maximized == null) { 770 if (SHOW_ADD) System.out.println("Missing maximized:\t" + locale); 771 continue; 772 } 773 Set<String> children = toChildren.getAll(locale); 774 Map<String, String> debugStuff = new TreeMap<>(); 775 for (String child : children) { 776 String maximizedChild = maximize(child, toMaximized); 777 if (maximized.equals(maximizedChild)) { 778 defaultLocaleContent.add(child); 779 continue main; 780 } 781 debugStuff.put(child, maximizedChild); 782 } 783 if (SHOW_ADD) System.out.println("Can't find maximized: " + locale + "=" + maximized 784 + "\tin\t" + debugStuff); 785 } 786 787 defaultLocaleContent.remove("und_ZZ"); // und_ZZ isn't ever a real locale. 788 789 showDefaultContentDifferencesAndFix(defaultLocaleContent); 790 791 Log.setLogNoBOM(CLDRPaths.GEN_DIRECTORY + "/supplemental", "supplementalMetadata.xml"); 792 BufferedReader oldFile = FileUtilities.openUTF8Reader(CLDRPaths.SUPPLEMENTAL_DIRECTORY, "supplementalMetadata.xml"); 793 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*<defaultContent locales=\"\\s*"), Log.getLog(), false); 794 795 String sep = CldrUtility.LINE_SEPARATOR + "\t\t\t"; 796 String broken = CldrUtility.breakLines(CldrUtility.join(defaultLocaleContent, " "), sep, 797 PatternCache.get("(\\S)\\S*").matcher(""), 80); 798 799 Log.println("\t\t<defaultContent locales=\"" + broken + "\""); 800 Log.println("\t\t/>"); 801 802 // Log.println("</supplementalData>"); 803 CldrUtility.copyUpTo(oldFile, PatternCache.get("\\s*/>\\s*(<!--.*)?"), null, true); // skip to matching > 804 CldrUtility.copyUpTo(oldFile, null, Log.getLog(), true); // copy the rest 805 806 Log.close(); 807 oldFile.close(); 808 } 809 810 // private static void oldAlgorithm(Map<String,String> toMaximized) { 811 // Set<String> defaultContentLocales = supplementalData.getDefaultContentLocales(); 812 // LanguageTagParser parser = new LanguageTagParser(); 813 // for (String locale : defaultContentLocales) { 814 // String parent = parser.getParent(locale); 815 // toMaximized.put(parent, locale); 816 // if (SHOW_ADD) System.out.println("Adding:\t" + parent + "\t=>\t" + locale + "\t\tDefaultContent"); 817 // } 818 // 819 // for (String[] specialCase : SpecialCases) { 820 // toMaximized.put(specialCase[0], specialCase[1]); 821 // if (SHOW_ADD) System.out.println("Adding:\t" + specialCase[0] + "\t=>\t" + specialCase[1] + "\t\tSpecial"); 822 // } 823 // 824 // // recurse and close 825 // closeMapping(toMaximized); 826 // 827 // addScript(toMaximized, parser); 828 // 829 // closeMapping(toMaximized); 830 // 831 // addLanguageScript(toMaximized, parser); 832 // 833 // closeMapping(toMaximized); 834 // 835 // addLanguageCountry(toMaximized, parser); 836 // 837 // closeMapping(toMaximized); 838 // 839 // addCountries(toMaximized); 840 // addScript(toMaximized, parser); 841 // closeMapping(toMaximized); 842 // closeUnd(toMaximized); 843 // 844 // addDeprecated(toMaximized); 845 // 846 // closeMapping(toMaximized); 847 // 848 // checkConsistency(toMaximized); 849 // } 850 851 private static class MaxData { 852 Relation<String, Row.R3<Double, String, String>> languages = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class); 853 Map<String, Counter<String>> languagesToScripts = new TreeMap<>(); 854 Map<String, Counter<String>> languagesToRegions = new TreeMap<>(); 855 856 Relation<String, Row.R3<Double, String, String>> scripts = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class); 857 Map<String, Counter<String>> scriptsToLanguages = new TreeMap<>(); 858 Map<String, Counter<String>> scriptsToRegions = new TreeMap<>(); 859 860 Relation<String, Row.R3<Double, String, String>> regions = Relation.of(new TreeMap<String, Set<Row.R3<Double, String, String>>>(), TreeSet.class); 861 Map<String, Counter<String>> regionsToLanguages = new TreeMap<>(); 862 Map<String, Counter<String>> regionsToScripts = new TreeMap<>(); 863 864 Map<String, Counter<Row.R2<String, String>>> containersToLanguage = new TreeMap<>(); 865 Relation<String, Row.R4<Double, String, String, String>> containersToLangRegion = Relation.of( 866 new TreeMap<String, Set<Row.R4<Double, String, String, String>>>(), TreeSet.class); 867 868 Relation<Row.R2<String, String>, Row.R2<Double, String>> languageScripts = Relation.of( 869 new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(), 870 TreeSet.class); 871 Relation<Row.R2<String, String>, Row.R2<Double, String>> scriptRegions = Relation.of( 872 new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(), 873 TreeSet.class); 874 Relation<Row.R2<String, String>, Row.R2<Double, String>> languageRegions = Relation.of( 875 new TreeMap<Row.R2<String, String>, Set<Row.R2<Double, String>>>(), 876 TreeSet.class); 877 878 /** 879 * Add population information. "order" is the negative of the population (makes the first be the highest). 880 * @param language 881 * @param script 882 * @param region 883 * @param order 884 */ add(String language, String script, String region, Double order)885 void add(String language, String script, String region, Double order) { 886 if (language.equals("cpp")) { 887 System.out.println(language + "\t" + script + "\t" + region + "\t" + -order); 888 } 889 languages.put(language, Row.of(order, script, region)); 890 // addCounter(languagesToScripts, language, script, order); 891 // addCounter(languagesToRegions, language, region, order); 892 893 scripts.put(script, Row.of(order, language, region)); 894 // addCounter(scriptsToLanguages, script, language, order); 895 // addCounter(scriptsToRegions, script, region, order); 896 897 regions.put(region, Row.of(order, language, script)); 898 // addCounter(regionsToLanguages, region, language, order); 899 // addCounter(regionsToScripts, region, script, order); 900 901 languageScripts.put(Row.of(language, script), Row.of(order, region)); 902 scriptRegions.put(Row.of(script, region), Row.of(order, language)); 903 languageRegions.put(Row.of(language, region), Row.of(order, script)); 904 905 Set<String> containerSet = Containment.leafToContainer(region); 906 if (containerSet != null) { 907 for (String container : containerSet) { 908 909 containersToLangRegion.put(container, Row.of(order, language, script, region)); 910 Counter<R2<String, String>> data = containersToLanguage.get(container); 911 if (data == null) { 912 containersToLanguage.put(container, data = new Counter<>()); 913 } 914 data.add(Row.of(language, script), (long) (double) order); 915 916 } 917 } 918 919 if (SHOW_ADD) System.out.println("Data:\t" + language + "\t" + script + "\t" + region + "\t" + order); 920 } 921 // private void addCounter(Map<String, Counter<String>> map, String key, String key2, Double count) { 922 // Counter<String> counter = map.get(key); 923 // if (counter == null) { 924 // map.put(key, counter = new Counter<String>()); 925 // } 926 // counter.add(key2, count.longValue()); 927 // } 928 } 929 930 private static final double MIN_UNOFFICIAL_LANGUAGE_SIZE = 10000000; 931 private static final double MIN_UNOFFICIAL_LANGUAGE_PROPORTION = 0.20; 932 private static final double MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE = 100000; 933 private static final double UNOFFICIAL_SCALE_DOWN = 0.2; 934 935 private static NumberFormat percent = NumberFormat.getPercentInstance(); 936 private static NumberFormat number = NumberFormat.getIntegerInstance(); 937 tryDifferentAlgorithm(Map<String, String> toMaximized)938 private static void tryDifferentAlgorithm(Map<String, String> toMaximized) { 939 // we are going to try a different approach. 940 // first gather counts for maximized values 941 // Set<Row.R3<String,String,String>,Double> rowsToCounts = new TreeMap(); 942 MaxData maxData = new MaxData(); 943 Set<String> cldrLocales = factory.getAvailable(); 944 Set<String> otherTerritories = new TreeSet<>(standardCodes.getGoodAvailableCodes("territory")); 945 946 // process all the information to get the top values for each triple. 947 // each of the combinations of 1 or 2 components gets to be a key. 948 for (String region : supplementalData.getTerritoriesWithPopulationData()) { 949 otherTerritories.remove(region); 950 PopulationData regionData = supplementalData.getPopulationDataForTerritory(region); 951 final double literateTerritoryPopulation = regionData.getLiteratePopulation(); 952 // we need any unofficial language to meet a certain absolute size requirement and proportion size 953 // requirement. 954 // so the bar is x percent of the population, reset up to y absolute size. 955 double minimalLiteratePopulation = literateTerritoryPopulation * MIN_UNOFFICIAL_LANGUAGE_PROPORTION; 956 if (minimalLiteratePopulation < MIN_UNOFFICIAL_LANGUAGE_SIZE) { 957 minimalLiteratePopulation = MIN_UNOFFICIAL_LANGUAGE_SIZE; 958 } 959 960 for (String writtenLanguage : supplementalData.getLanguagesForTerritoryWithPopulationData(region)) { 961 PopulationData data = supplementalData.getLanguageAndTerritoryPopulationData(writtenLanguage, region); 962 final double literatePopulation = getWritingPopulation(data); //data.getLiteratePopulation(); 963 double order = -literatePopulation; // negative so we get the inverse order 964 965 if (data.getOfficialStatus() == OfficialStatus.unknown) { 966 final String locale = writtenLanguage + "_" + region; 967 if (literatePopulation >= minimalLiteratePopulation) { 968 // ok, skip 969 } else if (literatePopulation >= MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE && cldrLocales.contains(locale)) { 970 // ok, skip 971 } else { 972 // if (SHOW_ADD) 973 // System.out.println("Skipping:\t" + writtenLanguage + "\t" + region + "\t" 974 // + english.getName(locale) 975 // + "\t-- too small:\t" + number.format(literatePopulation)); 976 // continue; 977 } 978 order *= UNOFFICIAL_SCALE_DOWN; 979 if (SHOW_ADD) 980 System.out.println("Retaining\t" + writtenLanguage + "\t" + region + "\t" 981 + english.getName(locale) 982 + "\t" + number.format(literatePopulation) 983 + "\t" + percent.format(literatePopulation / literateTerritoryPopulation) 984 + (cldrLocales.contains(locale) ? "\tin-CLDR" : "")); 985 } 986 String script; 987 String language = writtenLanguage; 988 final int pos = writtenLanguage.indexOf('_'); 989 if (pos > 0) { 990 language = writtenLanguage.substring(0, pos); 991 script = writtenLanguage.substring(pos + 1); 992 } else { 993 script = getScriptForLocale2(language); 994 } 995 maxData.add(language, script, region, order); 996 } 997 } 998 999 LanguageTagParser additionLtp = new LanguageTagParser(); 1000 1001 for (String addition : MAX_ADDITIONS) { 1002 additionLtp.set(addition); 1003 String lan = additionLtp.getLanguage(); 1004 Set<R3<Double, String, String>> key = maxData.languages.get(lan); 1005 if (key == null) { 1006 maxData.add(lan, additionLtp.getScript(), additionLtp.getRegion(), 1.0); 1007 } else { 1008 int debug = 0; 1009 } 1010 } 1011 1012 for (Entry<String, Collection<String>> entry : DeriveScripts.getLanguageToScript().asMap().entrySet()) { 1013 String language = entry.getKey(); 1014 final Collection<String> values = entry.getValue(); 1015 if (values.size() != 1) { 1016 continue; // skip, no either way 1017 } 1018 Set<R3<Double, String, String>> old = maxData.languages.get(language); 1019 if (!maxData.languages.containsKey(language)) { 1020 maxData.add(language, values.iterator().next(), TEMP_UNKNOWN_REGION, 1.0); 1021 } 1022 } 1023 1024 // add others, with English default 1025 for (String region : otherTerritories) { 1026 if (region.length() == 3) continue; // FIX ONCE WE ADD REGIONS 1027 maxData.add("en", "Latn", region, 1.0); 1028 } 1029 1030 // get a reverse mapping, so that we can add the aliases 1031 1032 Map<String, R2<List<String>, String>> languageAliases = SupplementalDataInfo.getInstance().getLocaleAliasInfo() 1033 .get("language"); 1034 for (Entry<String, R2<List<String>, String>> str : languageAliases.entrySet()) { 1035 String reason = str.getValue().get1(); 1036 if ("overlong".equals(reason) || "bibliographic".equals(reason) || "macrolanguage".equals(reason)) { 1037 continue; 1038 } 1039 List<String> replacements = str.getValue().get0(); 1040 if (replacements == null) { 1041 continue; 1042 } 1043 String goodLanguage = replacements.get(0); 1044 1045 String badLanguage = str.getKey(); 1046 if (badLanguage.contains("_")) { 1047 continue; 1048 } 1049 if (deprecatedISONotInLST.contains(badLanguage)) { 1050 continue; 1051 } 1052 Set<R3<Double, String, String>> goodLanguageData = maxData.languages.getAll(goodLanguage); 1053 if (goodLanguageData == null) { 1054 continue; 1055 } 1056 R3<Double, String, String> value = goodLanguageData.iterator().next(); 1057 final String script = value.get1(); 1058 final String region = value.get2(); 1059 maxData.add(badLanguage, script, region, 1.0); 1060 System.out.println("Adding aliases: " + badLanguage + ", " + script + ", " + region + ", " + reason); 1061 } 1062 1063 // now, get the best for each one 1064 for (String language : maxData.languages.keySet()) { 1065 R3<Double, String, String> value = maxData.languages.getAll(language).iterator().next(); 1066 final Comparable<String> script = value.get1(); 1067 final Comparable<String> region = value.get2(); 1068 add(language, language + "_" + script + "_" + region, toMaximized, "L->SR", LocaleOverride.REPLACE_EXISTING, 1069 SHOW_ADD); 1070 } 1071 for (String language : maxData.languagesToScripts.keySet()) { 1072 String script = maxData.languagesToScripts.get(language).getKeysetSortedByCount(true).iterator().next(); 1073 add(language, language + "_" + script, toMaximized, "L->S", LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1074 } 1075 for (String language : maxData.languagesToRegions.keySet()) { 1076 String region = maxData.languagesToRegions.get(language).getKeysetSortedByCount(true).iterator().next(); 1077 add(language, language + "_" + region, toMaximized, "L->R", LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1078 } 1079 1080 for (String script : maxData.scripts.keySet()) { 1081 R3<Double, String, String> value = maxData.scripts.getAll(script).iterator().next(); 1082 final Comparable<String> language = value.get1(); 1083 final Comparable<String> region = value.get2(); 1084 add("und_" + script, language + "_" + script + "_" + region, toMaximized, "S->LR", 1085 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1086 } 1087 for (String script : maxData.scriptsToLanguages.keySet()) { 1088 String language = maxData.scriptsToLanguages.get(script).getKeysetSortedByCount(true).iterator().next(); 1089 add("und_" + script, language + "_" + script, toMaximized, "S->L", LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1090 } 1091 for (String script : maxData.scriptsToRegions.keySet()) { 1092 String region = maxData.scriptsToRegions.get(script).getKeysetSortedByCount(true).iterator().next(); 1093 add("und_" + script, "und_" + script + "_" + region, toMaximized, "S->R", LocaleOverride.REPLACE_EXISTING, 1094 SHOW_ADD); 1095 } 1096 1097 for (String region : maxData.regions.keySet()) { 1098 R3<Double, String, String> value = maxData.regions.getAll(region).iterator().next(); 1099 final Comparable<String> language = value.get1(); 1100 final Comparable<String> script = value.get2(); 1101 add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R->LS", 1102 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1103 } 1104 for (String region : maxData.regionsToLanguages.keySet()) { 1105 String language = maxData.regionsToLanguages.get(region).getKeysetSortedByCount(true).iterator().next(); 1106 add("und_" + region, language + "_" + region, toMaximized, "R->L", LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1107 } 1108 for (String region : maxData.regionsToScripts.keySet()) { 1109 String script = maxData.regionsToScripts.get(region).getKeysetSortedByCount(true).iterator().next(); 1110 add("und_" + region, "und_" + script + "_" + region, toMaximized, "R->S", LocaleOverride.REPLACE_EXISTING, 1111 SHOW_ADD); 1112 } 1113 1114 for (Entry<String, Counter<R2<String, String>>> containerAndInfo : maxData.containersToLanguage.entrySet()) { 1115 String region = containerAndInfo.getKey(); 1116 if (region.equals("001")) { 1117 continue; 1118 } 1119 Counter<R2<String, String>> data = containerAndInfo.getValue(); 1120 Set<R2<String, String>> keysetSortedByCount = data.getKeysetSortedByCount(true); 1121 if (SHOW_CONTAINERS) { // debug 1122 System.out.println("Container2L:\t" + region + "\t" + shorten(data.getEntrySetSortedByCount(true, null))); 1123 System.out.println("Container2LR:\t" + region + "\t" + maxData.containersToLangRegion.get(region)); 1124 } 1125 R2<String, String> value = keysetSortedByCount.iterator().next(); // will get most negative 1126 final Comparable<String> language = value.get0(); 1127 final Comparable<String> script = value.get1(); 1128 1129 // fix special cases like es-419, where a locale exists. 1130 // for those cases, what we add as output is the container. Otherwise the region. 1131 Set<String> skipLanguages = cldrContainerToLanguages.get(region); 1132 if (skipLanguages != null 1133 && skipLanguages.contains(language)) { 1134 add("und_" + region, language + "_" + script + "_" + region, toMaximized, "R*->LS", 1135 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1136 continue; 1137 } 1138 1139 // we now have the best language and script. Find the best region for that 1140 for (R4<Double, String, String, String> e : maxData.containersToLangRegion.get(region)) { 1141 final Comparable<String> language2 = e.get1(); 1142 final Comparable<String> script2 = e.get2(); 1143 if (language2.equals(language) && script2.equals(script)) { 1144 add("und_" + region, language + "_" + script + "_" + e.get3(), toMaximized, "R*->LS", 1145 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1146 break; 1147 } 1148 } 1149 } 1150 1151 for (R2<String, String> languageScript : maxData.languageScripts.keySet()) { 1152 R2<Double, String> value = maxData.languageScripts.getAll(languageScript).iterator().next(); 1153 final Comparable<String> language = languageScript.get0(); 1154 final Comparable<String> script = languageScript.get1(); 1155 final Comparable<String> region = value.get1(); 1156 add(language + "_" + script, language + "_" + script + "_" + region, toMaximized, "LS->R", 1157 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1158 } 1159 1160 for (R2<String, String> scriptRegion : maxData.scriptRegions.keySet()) { 1161 R2<Double, String> value = maxData.scriptRegions.getAll(scriptRegion).iterator().next(); 1162 final Comparable<String> script = scriptRegion.get0(); 1163 final Comparable<String> region = scriptRegion.get1(); 1164 final Comparable<String> language = value.get1(); 1165 add("und_" + script + "_" + region, language + "_" + script + "_" + region, toMaximized, "SR->L", 1166 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1167 } 1168 1169 for (R2<String, String> languageRegion : maxData.languageRegions.keySet()) { 1170 R2<Double, String> value = maxData.languageRegions.getAll(languageRegion).iterator().next(); 1171 final Comparable<String> language = languageRegion.get0(); 1172 final Comparable<String> region = languageRegion.get1(); 1173 final Comparable<String> script = value.get1(); 1174 add(language + "_" + region, language + "_" + script + "_" + region, toMaximized, "LR->S", 1175 LocaleOverride.REPLACE_EXISTING, SHOW_ADD); 1176 } 1177 1178 // get the script info from metadata as fallback 1179 1180 TreeSet<String> sorted = new TreeSet<>(ScriptMetadata.getScripts()); 1181 for (String script : sorted) { 1182 Info i = ScriptMetadata.getInfo(script); 1183 String likelyLanguage = i.likelyLanguage; 1184 String originCountry = i.originCountry; 1185 final String result = likelyLanguage + "_" + script + "_" + originCountry; 1186 add("und_" + script, result, toMaximized, "S->LR•", 1187 LocaleOverride.KEEP_EXISTING, SHOW_ADD); 1188 add(likelyLanguage, result, toMaximized, "L->SR•", 1189 LocaleOverride.KEEP_EXISTING, SHOW_ADD); 1190 } 1191 1192 // add overrides 1193 for (String key : LANGUAGE_OVERRIDES.keySet()) { 1194 add(key, LANGUAGE_OVERRIDES.get(key), toMaximized, "OVERRIDE", LocaleOverride.REPLACE_EXISTING, true); 1195 } 1196 } 1197 shorten(Object data)1198 public static String shorten(Object data) { 1199 String info = data.toString(); 1200 if (info.length() > 255) { 1201 info = info.substring(0, 127) + "…"; 1202 } 1203 return info; 1204 } 1205 doAlt(Map<String, String> toMaximized)1206 private static void doAlt(Map<String, String> toMaximized) { 1207 // TODO Auto-generated method stub 1208 Map<String, String> temp = new TreeMap<>(); 1209 for (String locale : toMaximized.keySet()) { 1210 String target = toMaximized.get(locale); 1211 temp.put(toAlt(locale, true), toAlt(target, true)); 1212 } 1213 toMaximized.clear(); 1214 toMaximized.putAll(temp); 1215 } 1216 maximize(String languageTag, Map<String, String> toMaximized)1217 public static String maximize(String languageTag, Map<String, String> toMaximized) { 1218 LanguageTagParser ltp = new LanguageTagParser(); 1219 1220 // clean up the input by removing Zzzz, ZZ, and changing "" into und. 1221 ltp.set(languageTag); 1222 String language = ltp.getLanguage(); 1223 String region = ltp.getRegion(); 1224 String script = ltp.getScript(); 1225 boolean changed = false; 1226 if (language.equals("")) { 1227 ltp.setLanguage(language = "und"); 1228 changed = true; 1229 } 1230 if (region.equals(UNKNOWN_SCRIPT)) { 1231 ltp.setScript(script = ""); 1232 changed = true; 1233 } 1234 if (ltp.getRegion().equals(UNKNOWN_REGION)) { 1235 ltp.setRegion(region = ""); 1236 changed = true; 1237 } 1238 if (changed) { 1239 languageTag = ltp.toString(); 1240 } 1241 // check whole 1242 String result = toMaximized.get(languageTag); 1243 if (result != null) { 1244 return result; 1245 } 1246 // try empty region 1247 if (region.length() != 0) { 1248 result = toMaximized.get(ltp.setRegion("").toString()); 1249 if (result != null) { 1250 return ltp.set(result).setRegion(region).toString(); 1251 } 1252 ltp.setRegion(region); // restore 1253 } 1254 // try empty script 1255 if (script.length() != 0) { 1256 result = toMaximized.get(ltp.setScript("").toString()); 1257 if (result != null) { 1258 return ltp.set(result).setScript(script).toString(); 1259 } 1260 // try empty script and region 1261 if (region.length() != 0) { 1262 result = toMaximized.get(ltp.setRegion("").toString()); 1263 if (result != null) { 1264 return ltp.set(result).setScript(script).setRegion(region).toString(); 1265 } 1266 } 1267 } 1268 if (!language.equals("und") && script.length() != 0 && region.length() != 0) { 1269 return languageTag; // it was ok, and we couldn't do anything with it 1270 } 1271 return null; // couldn't maximize 1272 } 1273 minimize(String input, Map<String, String> toMaximized, boolean favorRegion)1274 public static String minimize(String input, Map<String, String> toMaximized, boolean favorRegion) { 1275 if (input.equals("nb_Latn_SJ")) { 1276 System.out.print(""); // debug 1277 } 1278 String maximized = maximize(input, toMaximized); 1279 if (maximized == null) { 1280 return null; // failed 1281 } 1282 LanguageTagParser ltp = new LanguageTagParser().set(maximized); 1283 String language = ltp.getLanguage(); 1284 String region = ltp.getRegion(); 1285 String script = ltp.getScript(); 1286 // try building up from shorter to longer, and find the first that matches 1287 // could be more optimized, but for this code we want simplest 1288 String[] trials = { language, 1289 language + TAG_SEPARATOR + (favorRegion ? region : script), 1290 language + TAG_SEPARATOR + (!favorRegion ? region : script) }; 1291 for (String trial : trials) { 1292 String newMaximized = maximize(trial, toMaximized); 1293 if (maximized.equals(newMaximized)) { 1294 return trial; 1295 } 1296 } 1297 return maximized; 1298 } 1299 1300 // /** 1301 // * Verify that we can map from each language, script, and country to something. 1302 // * @param toMaximized 1303 // */ 1304 // private static void checkConsistency(Map<String, String> toMaximized) { 1305 // Map<String,String> needMappings = new TreeMap(); 1306 // LanguageTagParser parser = new LanguageTagParser(); 1307 // for (String maximized : new TreeSet<String>(toMaximized.values())) { 1308 // parser.set(maximized); 1309 // final String language = parser.getLanguage(); 1310 // final String script = parser.getScript(); 1311 // final String region = parser.getRegion(); 1312 // if (language.length() == 0 || script.length() == 0 || region.length() == 0) { 1313 // failure(" { \"" + maximized + "\", \"" + maximized + "\" }, // " + english.getName(maximized) + 1314 // "\t\tFailed-Consistency"); 1315 // continue; 1316 // } 1317 // addIfNotIn(language, maximized, needMappings, toMaximized, "Consistency"); 1318 // addIfNotIn(language + "_" + script, maximized, needMappings, toMaximized, "Consistency"); 1319 // addIfNotIn(language + "_" + region, maximized, needMappings, toMaximized, "Consistency"); 1320 // addIfNotIn("und_" + script, maximized, needMappings, toMaximized, "Consistency"); 1321 // addIfNotIn("und_" + script + "_" + region, maximized, needMappings, toMaximized, "Consistency"); 1322 // addIfNotIn("und_" + region, maximized, needMappings, toMaximized, "Consistency"); 1323 // } 1324 // toMaximized.putAll(needMappings); 1325 // } 1326 1327 // private static void failure(String string) { 1328 // System.out.println(string); 1329 // errorCount++; 1330 // } 1331 1332 // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Map<String, String> 1333 // otherToCheck, String kind) { 1334 // addIfNotIn(key, value, toAdd, otherToCheck == null ? null : otherToCheck.keySet(), null, kind); 1335 // } 1336 1337 // private static void addIfNotIn(String key, String value, Map<String, String> toAdd, Set<String> skipKey, 1338 // Set<String> skipValue, String kind) { 1339 // if (!key.equals(value) 1340 // && !toAdd.containsKey(key) 1341 // && (skipKey == null || !skipKey.contains(key)) 1342 // && (skipValue == null || !skipValue.contains(value))) { 1343 // add(key, value, toAdd, kind); 1344 // } 1345 // } 1346 1347 enum LocaleOverride { 1348 KEEP_EXISTING, REPLACE_EXISTING 1349 } 1350 add(String key, String value, Map<String, String> toAdd, String kind, LocaleOverride override, boolean showAction)1351 private static void add(String key, String value, Map<String, String> toAdd, String kind, LocaleOverride override, 1352 boolean showAction) { 1353 if (key.equals(DEBUG_ADD_KEY)) { 1354 System.out.println("*debug*"); 1355 } 1356 String oldValue = toAdd.get(key); 1357 if (oldValue == null) { 1358 if (showAction) { 1359 System.out.println("\tAdding:\t\t" + getName(key) + "\t=>\t" + getName(value) + "\t\t\t\t" + kind); 1360 } 1361 } else if (override == LocaleOverride.KEEP_EXISTING || value.equals(oldValue)) { 1362 // if (showAction) { 1363 // System.out.println("Skipping:\t" + key + "\t=>\t" + value + "\t\t\t\t" + kind); 1364 // } 1365 return; 1366 } else { 1367 if (showAction) { 1368 System.out.println("\tReplacing:\t" + getName(key) + "\t=>\t" + getName(value) + "\t, was\t" + getName(oldValue) + "\t\t" + kind); 1369 } 1370 } 1371 toAdd.put(key, value); 1372 } 1373 getName(String value)1374 private static String getName(String value) { 1375 return ConvertLanguageData.getLanguageCodeAndName(value); 1376 } 1377 1378 // private static void addCountries(Map<String, String> toMaximized) { 1379 // Map <String, Map<String, Double>> scriptToLanguageToSize = new TreeMap(); 1380 // 1381 // for (String territory : supplementalData.getTerritoriesWithPopulationData()) { 1382 // Set<String> languages = supplementalData.getLanguagesForTerritoryWithPopulationData(territory); 1383 // String biggestOfficial = null; 1384 // double biggest = -1; 1385 // for (String language : languages) { 1386 // PopulationData info = supplementalData.getLanguageAndTerritoryPopulationData(language, territory); 1387 // // add to info about script 1388 // 1389 // String script = getScriptForLocale(language); 1390 // if (script != null) { 1391 // Map<String, Double> languageInfo = scriptToLanguageToSize.get(script); 1392 // if (languageInfo == null) scriptToLanguageToSize.put(script, languageInfo = new TreeMap()); 1393 // String baseLanguage = language; 1394 // int pos = baseLanguage.indexOf('_'); 1395 // if (pos >= 0) { 1396 // baseLanguage = baseLanguage.substring(0,pos); 1397 // } 1398 // Double size = languageInfo.get(baseLanguage); 1399 // languageInfo.put(baseLanguage, (size == null ? 0 : size) + info.getLiteratePopulation()); 1400 // } 1401 // 1402 // 1403 // final OfficialStatus officialStatus = info.getOfficialStatus(); 1404 // if (officialStatus == OfficialStatus.de_facto_official || officialStatus == OfficialStatus.official) { 1405 // double size2 = info.getLiteratePopulation(); 1406 // if (biggest < size2) { 1407 // biggest = size2; 1408 // biggestOfficial = language; 1409 // } 1410 // } 1411 // } 1412 // if (biggestOfficial != null) { 1413 // final String replacementTag = "und_" + territory; 1414 // String maximized = biggestOfficial + "_" + territory; 1415 // toMaximized.put(replacementTag, maximized); 1416 // if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tLanguage-Territory"); 1417 // } 1418 // } 1419 // 1420 // for (String script : scriptToLanguageToSize.keySet()) { 1421 // String biggestOfficial = null; 1422 // double biggest = -1; 1423 // 1424 // final Map<String, Double> languageToSize = scriptToLanguageToSize.get(script); 1425 // for (String language : languageToSize.keySet()) { 1426 // double size = languageToSize.get(language); 1427 // if (biggest < size) { 1428 // biggest = size; 1429 // biggestOfficial = language; 1430 // } 1431 // } 1432 // if (biggestOfficial != null) { 1433 // final String replacementTag = "und_" + script; 1434 // String maximized = biggestOfficial + "_" + script; 1435 // toMaximized.put(replacementTag, maximized); 1436 // if (SHOW_ADD) System.out.println("Adding:\t" + replacementTag + "\t=>\t" + maximized + "\t\tUnd-Script"); 1437 // } 1438 // } 1439 // } 1440 1441 // private static void closeUnd(Map<String, String> toMaximized) { 1442 // Map<String,String> toAdd = new TreeMap<String,String>(); 1443 // for (String oldSource : toMaximized.keySet()) { 1444 // String maximized = toMaximized.get(oldSource); 1445 // if (!maximized.startsWith("und")) { 1446 // int pos = maximized.indexOf("_"); 1447 // if (pos >= 0) { 1448 // addIfNotIn( "und" + maximized.substring(pos), maximized, toAdd, toMaximized, "CloseUnd"); 1449 // } 1450 // } 1451 // } 1452 // toMaximized.putAll(toAdd); 1453 // } 1454 1455 /** 1456 * Generate tags where the deprecated values map to the expanded values 1457 * 1458 * @param toMaximized 1459 */ 1460 // private static void addDeprecated(Map<String, String> toMaximized) { 1461 // Map<String, Map<String, List<String>>> typeToTagToReplacement = supplementalData.getLocaleAliasInfo(); 1462 // LanguageTagParser temp = new LanguageTagParser(); 1463 // LanguageTagParser tagParsed = new LanguageTagParser(); 1464 // LanguageTagParser replacementParsed = new LanguageTagParser(); 1465 // Map<String,String> toAdd = new TreeMap<String,String>(); 1466 // while (true) { 1467 // toAdd.clear(); 1468 // for (String type : typeToTagToReplacement.keySet()) { 1469 // if (type.equals("variant") || type.equals("zone")) continue; 1470 // boolean addUnd = !type.equals("language"); 1471 // 1472 // Map<String, List<String>> tagToReplacement = typeToTagToReplacement.get(type); 1473 // System.out.println("*" + type + " = " + tagToReplacement); 1474 // 1475 // for (String tag: tagToReplacement.keySet()) { 1476 // 1477 // final List<String> list = tagToReplacement.get(tag); 1478 // if (list == null) continue; // we don't have any information 1479 // String replacement = list.get(0); 1480 // 1481 // // only do multiples 1482 // if (tag.contains("_") || !replacement.contains("_")) { 1483 // continue; 1484 // } 1485 // 1486 // // we now have a tag and a replacement value 1487 // // make parsers that we can use 1488 // try { 1489 // tagParsed.set(addUnd ? "und-" + tag : tag); 1490 // replacementParsed.set(addUnd ? "und-" + replacement : replacement); 1491 // } catch (RuntimeException e) { 1492 // continue; 1493 // } 1494 // addIfNotIn(tag, replacement, toAdd, toMaximized,"Deprecated"); 1495 // 1496 // for (String locale : toMaximized.keySet()) { 1497 // String maximized = toMaximized.get(locale); 1498 // addIfMatches(temp.set(locale), maximized, replacementParsed, tagParsed, toAdd, toMaximized); 1499 // addIfMatches(temp.set(maximized), maximized, replacementParsed, tagParsed, toAdd, toMaximized); 1500 // } 1501 // } 1502 // } 1503 // if (toAdd.size() == 0) { 1504 // break; 1505 // } 1506 // toMaximized.putAll(toAdd); 1507 // } 1508 // } 1509 1510 // private static void addIfMatches(LanguageTagParser locale, String maximized, LanguageTagParser tagParsed, 1511 // LanguageTagParser replacementParsed, Map<String, String> toAdd, Map<String, String> toMaximized) { 1512 // if (!tagParsed.getLanguage().equals(locale.getLanguage()) && !tagParsed.getLanguage().equals("und")) { 1513 // return; 1514 // } 1515 // if (!tagParsed.getScript().equals(locale.getScript()) && !tagParsed.getScript().equals("")) { 1516 // return; 1517 // } 1518 // if (!tagParsed.getRegion().equals(locale.getRegion()) && !tagParsed.getRegion().equals("")) { 1519 // return; 1520 // } 1521 // if (!replacementParsed.getLanguage().equals("und")) { 1522 // locale.setLanguage(replacementParsed.getLanguage()); 1523 // } 1524 // if (!replacementParsed.getScript().equals("")) { 1525 // locale.setScript(replacementParsed.getScript()); 1526 // } 1527 // if (!replacementParsed.getRegion().equals("")) { 1528 // locale.setRegion(replacementParsed.getRegion()); 1529 // } 1530 // addIfNotIn(locale.toString(), maximized, toAdd, toMaximized,"Deprecated"); 1531 // } 1532 1533 // private static int getSubtagPosition(String locale, String subtags) { 1534 // int pos = -1; 1535 // while (true) { 1536 // pos = locale.indexOf(subtags, pos + 1); 1537 // if (pos < 0) return -1; 1538 // // make sure boundaries are ok 1539 // if (pos != 0) { 1540 // char charBefore = locale.charAt(pos-1); 1541 // if (charBefore != '_' && charBefore != '_') return -1; 1542 // } 1543 // int limit = pos + subtags.length(); 1544 // if (limit != locale.length()) { 1545 // char charAfter = locale.charAt(limit); 1546 // if (charAfter != '_' && charAfter != '_') return -1; 1547 // } 1548 // return pos; 1549 // } 1550 // } 1551 1552 /* 1553 * Format 1554 * const DefaultSubtags default_subtags[] = { 1555 * { 1556 * // Afar => Afar (Latin, Ethiopia) 1557 * "aa", 1558 * "aa_Latn_ET" 1559 * },{ 1560 * // Afrikaans => Afrikaans (Latin, South Africa) 1561 * "af", 1562 * "af_Latn_ZA" 1563 * },{ 1564 */ 1565 printLikelySubtags(Map<String, String> fluffup)1566 private static void printLikelySubtags(Map<String, String> fluffup) throws IOException { 1567 1568 PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY, 1569 "/supplemental/likelySubtags" + (OUTPUT_STYLE == OutputStyle.XML ? ".xml" : ".txt")); 1570 String spacing = OUTPUT_STYLE == OutputStyle.PLAINTEXT ? "\t" : " "; 1571 String header = OUTPUT_STYLE != OutputStyle.XML ? "const MapToMaximalSubtags default_subtags[] = {" 1572 : "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + CldrUtility.LINE_SEPARATOR 1573 + "<!DOCTYPE supplementalData SYSTEM \"../../common/dtd/ldmlSupplemental.dtd\">" 1574 + CldrUtility.LINE_SEPARATOR 1575 + "<!--" 1576 + CldrUtility.LINE_SEPARATOR 1577 + CldrUtility.getCopyrightString() 1578 + CldrUtility.LINE_SEPARATOR 1579 + "-->" 1580 + CldrUtility.LINE_SEPARATOR 1581 + "<!--" 1582 + CldrUtility.LINE_SEPARATOR 1583 + "Likely subtags data is generated programatically from CLDR's language/territory/population" + CldrUtility.LINE_SEPARATOR 1584 + "data using the GenerateMaximalLocales tool. Under normal circumstances, this file should" + CldrUtility.LINE_SEPARATOR 1585 + "not be patched by hand, as any changes made in that fashion may be lost." 1586 + CldrUtility.LINE_SEPARATOR 1587 + "-->" 1588 + CldrUtility.LINE_SEPARATOR 1589 + "<supplementalData>" + CldrUtility.LINE_SEPARATOR 1590 + " <version number=\"$" + 1591 "Revision$\"/>" + CldrUtility.LINE_SEPARATOR 1592 + " <likelySubtags>"; 1593 String footer = OUTPUT_STYLE != OutputStyle.XML ? SEPARATOR + "};" 1594 : " </likelySubtags>" + CldrUtility.LINE_SEPARATOR 1595 + "</supplementalData>"; 1596 out.println(header); 1597 boolean first = true; 1598 Set<String> keys = new TreeSet<>(new LocaleStringComparator()); 1599 keys.addAll(fluffup.keySet()); 1600 for (String printingLocale : keys) { 1601 String printingTarget = fluffup.get(printingLocale); 1602 String comment = printingName(printingLocale, spacing) + spacing + "=>" + spacing 1603 + printingName(printingTarget, spacing); 1604 1605 if (OUTPUT_STYLE == OutputStyle.XML) { 1606 out.println("\t\t<likelySubtag from=\"" + printingLocale + 1607 "\" to=\"" + printingTarget + "\"" + 1608 "/>" + CldrUtility.LINE_SEPARATOR + "\t\t" + "<!--" + comment + "-->"); 1609 } else { 1610 if (first) { 1611 first = false; 1612 } else { 1613 out.print(","); 1614 } 1615 if (comment.length() > 70 && SEPARATOR.equals(CldrUtility.LINE_SEPARATOR)) { 1616 comment = printingName(printingLocale, spacing) + SEPARATOR + " // " + spacing + "=>" + spacing 1617 + printingName(printingTarget, spacing); 1618 } 1619 out.print( 1620 " {" 1621 + SEPARATOR + " // " + comment 1622 + SEPARATOR + " \"" + printingLocale + "\"," 1623 + SEPARATOR + " \"" + printingTarget + "\"" 1624 + CldrUtility.LINE_SEPARATOR + " }"); 1625 } 1626 } 1627 out.println(footer); 1628 out.close(); 1629 } 1630 printingName(String locale, String spacing)1631 public static String printingName(String locale, String spacing) { 1632 if (locale == null) { 1633 return null; 1634 } 1635 LanguageTagParser parser = new LanguageTagParser().set(locale); 1636 String lang = parser.getLanguage(); 1637 String script = parser.getScript(); 1638 String region = parser.getRegion(); 1639 return "{" + spacing + 1640 (lang.equals("und") ? "?" : english.getName(CLDRFile.LANGUAGE_NAME, lang)) + ";" + spacing + 1641 (script == null || script.equals("") ? "?" : english.getName(CLDRFile.SCRIPT_NAME, script)) + ";" + spacing 1642 + 1643 (region == null || region.equals("") ? "?" : english.getName(CLDRFile.TERRITORY_NAME, region)) + spacing 1644 + "}"; 1645 } 1646 1647 private static final String[][] ALT_REVERSAL = { 1648 { "nb", "no" }, 1649 { "no", "nb" }, 1650 { "he", "iw" }, 1651 { "iw", "he" }, 1652 }; 1653 toAlt(String locale, boolean change)1654 public static String toAlt(String locale, boolean change) { 1655 if (!change || locale == null) { 1656 return locale; 1657 } 1658 String firstTag = getFirstTag(locale); 1659 for (String[] pair : ALT_REVERSAL) { 1660 if (firstTag.equals(pair[0])) { 1661 locale = pair[1] + locale.substring(pair[1].length()); 1662 break; 1663 } 1664 } 1665 locale = locale.replace("_", "-"); 1666 return locale; 1667 } 1668 getFirstTag(String locale)1669 private static String getFirstTag(String locale) { 1670 int pos = locale.indexOf('_'); 1671 return pos < 0 ? locale : locale.substring(0, pos); 1672 } 1673 1674 // private static Map<String, String> getBackMapping(Map<String, String> fluffup) { 1675 // Relation<String,String> backMap = new Relation(new TreeMap(), TreeSet.class, BEST_LANGUAGE_COMPARATOR); 1676 // for (String source : fluffup.keySet()) { 1677 // if (source.startsWith("und")) { 1678 // continue; 1679 // } 1680 // String maximized = fluffup.get(source); 1681 // backMap.put(maximized, source); // put in right order 1682 // } 1683 // Map<String,String> returnBackMap = new TreeMap(); 1684 // for (String maximized : backMap.keySet()) { 1685 // final Set<String> all = backMap.getAll(maximized); 1686 // final String minimized = all.iterator().next(); 1687 // returnBackMap.put(maximized, minimized); 1688 // } 1689 // return returnBackMap; 1690 // } 1691 1692 /** 1693 * Language tags are presumed to share the first language, except possibly "und". Best is least 1694 */ 1695 // private static Comparator BEST_LANGUAGE_COMPARATOR = new Comparator<String>() { 1696 // LanguageTagParser p1 = new LanguageTagParser(); 1697 // LanguageTagParser p2 = new LanguageTagParser(); 1698 // public int compare(String o1, String o2) { 1699 // if (o1.equals(o2)) return 0; 1700 // p1.set(o1); 1701 // p2.set(o2); 1702 // String lang1 = p1.getLanguage(); 1703 // String lang2 = p2.getLanguage(); 1704 // 1705 // // compare languages first 1706 // // put und at the end 1707 // int result = lang1.compareTo(lang2); 1708 // if (result != 0) { 1709 // if (lang1.equals("und")) return 1; 1710 // if (lang2.equals("und")) return -1; 1711 // return result; 1712 // } 1713 // 1714 // // now scripts and regions. 1715 // // if they have different numbers of fields, the shorter wins. 1716 // // If there are two fields, region is lowest. 1717 // // The simplest way is to just compare scripts first 1718 // // so zh-TW < zh-Hant, because we first compare "" to Hant 1719 // String script1 = p1.getScript(); 1720 // String script2 = p2.getScript(); 1721 // int scriptOrder = script1.compareTo(script2); 1722 // if (scriptOrder != 0) return scriptOrder; 1723 // 1724 // String region1 = p1.getRegion(); 1725 // String region2 = p2.getRegion(); 1726 // int regionOrder = region1.compareTo(region2); 1727 // if (regionOrder != 0) return regionOrder; 1728 // 1729 // return o1.compareTo(o2); 1730 // } 1731 // 1732 // }; 1733 minimize(Map<String, String> fluffup)1734 public static void minimize(Map<String, String> fluffup) { 1735 LanguageTagParser parser = new LanguageTagParser(); 1736 LanguageTagParser targetParser = new LanguageTagParser(); 1737 Set<String> removals = new TreeSet<>(); 1738 while (true) { 1739 removals.clear(); 1740 for (String locale : fluffup.keySet()) { 1741 String target = fluffup.get(locale); 1742 if (targetParser.set(target).getRegion().equals(UNKNOWN_REGION)) { 1743 removals.add(locale); 1744 if (SHOW_ADD) 1745 System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) 1746 + "\t\t - Unknown Region in target"); 1747 continue; 1748 } 1749 if (targetParser.getScript().equals(UNKNOWN_SCRIPT)) { 1750 removals.add(locale); 1751 if (SHOW_ADD) 1752 System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) 1753 + "\t\t - Unknown Script in target"); 1754 continue; 1755 } 1756 1757 String region = parser.set(locale).getRegion(); 1758 if (region.length() != 0) { 1759 if (region.equals(UNKNOWN_REGION)) { 1760 removals.add(locale); 1761 if (SHOW_ADD) 1762 System.out.println("Removing:\t" + getName(locale) + "\t=>\t" + getName(target) 1763 + "\t\t - Unknown Region in source"); 1764 continue; 1765 } 1766 parser.setRegion(""); 1767 String newLocale = parser.toString(); 1768 String newTarget = fluffup.get(newLocale); 1769 if (newTarget != null) { 1770 newTarget = targetParser.set(newTarget).setRegion(region).toString(); 1771 if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { 1772 removals.add(locale); 1773 if (SHOW_ADD) 1774 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with " 1775 + newLocale); 1776 continue; 1777 } 1778 } 1779 } 1780 String script = parser.set(locale).getScript(); 1781 if (locale.equals(DEBUG_ADD_KEY)) { 1782 System.out.println("*debug*"); 1783 } 1784 if (script.length() != 0) { 1785 if (script.equals(UNKNOWN_SCRIPT)) { 1786 removals.add(locale); 1787 if (SHOW_ADD) 1788 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\t - Unknown Script"); 1789 continue; 1790 } 1791 parser.setScript(""); 1792 String newLocale = parser.toString(); 1793 String newTarget = fluffup.get(newLocale); 1794 if (newTarget != null) { 1795 newTarget = targetParser.set(newTarget).setScript(script).toString(); 1796 if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { 1797 removals.add(locale); 1798 if (SHOW_ADD) 1799 System.out.println("Removing:\t" + locale + "\t=>\t" + target + "\t\tRedundant with " 1800 + newLocale); 1801 continue; 1802 } 1803 } 1804 } 1805 } 1806 if (removals.size() == 0) { 1807 break; 1808 } 1809 for (String locale : removals) { 1810 fluffup.remove(locale); 1811 } 1812 } 1813 } 1814 1815 // private static void addLanguageScript(Map<String, String> fluffup, LanguageTagParser parser) { 1816 // // add script 1817 // Map<String, String> temp = new TreeMap<String, String>(); 1818 // while (true) { 1819 // temp.clear(); 1820 // for (String target : new TreeSet<String>(fluffup.values())) { 1821 // parser.set(target); 1822 // final String territory = parser.getRegion(); 1823 // if (territory.length() == 0) { 1824 // continue; 1825 // } 1826 // parser.setRegion(""); 1827 // String possibleSource = parser.toString(); 1828 // if (fluffup.containsKey(possibleSource)) { 1829 // continue; 1830 // } 1831 // String other = temp.get(possibleSource); 1832 // if (other != null) { 1833 // if (!target.equals(other)) { 1834 // System.out.println("**Failure with multiple sources in addLanguageScript: " 1835 // + possibleSource + "\t=>\t" + target + ", " + other); 1836 // } 1837 // continue; 1838 // } 1839 // temp.put(possibleSource, target); 1840 // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguage-Script"); 1841 // } 1842 // if (temp.size() == 0) { 1843 // break; 1844 // } 1845 // fluffup.putAll(temp); 1846 // } 1847 // 1848 // } 1849 1850 // private static void addLanguageCountry(Map<String, String> fluffup, LanguageTagParser parser) { 1851 // // add script 1852 // Map<String, String> temp = new TreeMap<String, String>(); 1853 // while (true) { 1854 // temp.clear(); 1855 // for (String target : new TreeSet<String>(fluffup.values())) { 1856 // parser.set(target); 1857 // String script = parser.getScript(); 1858 // if (script.length() == 0) { 1859 // continue; 1860 // } 1861 // parser.setScript(""); 1862 // String possibleSource = parser.toString(); 1863 // if (fluffup.containsKey(possibleSource)) { 1864 // continue; 1865 // } 1866 // String other = temp.get(possibleSource); 1867 // 1868 // if (other != null) { 1869 // if (!target.equals(other)) { 1870 // script = getScriptForLocale(possibleSource); 1871 // if (script == null) { 1872 // System.out.println("**Failure with multiple sources in addLanguageCountry: " 1873 // + possibleSource + "\t=>\t" + target + ", " + other); 1874 // continue; // error message in routine 1875 // } 1876 // parser.setScript(script); 1877 // target = parser.toString(); 1878 // } 1879 // } 1880 // 1881 // temp.put(possibleSource, target); 1882 // if (SHOW_ADD) System.out.println("Adding:\t" + possibleSource + "\t=>\t" + target + "\t\tLanguageCountry"); 1883 // } 1884 // if (temp.size() == 0) { 1885 // break; 1886 // } 1887 // fluffup.putAll(temp); 1888 // } 1889 // 1890 // } 1891 1892 // private static void addScript(Map<String, String> fluffup, LanguageTagParser parser) { 1893 // // add script 1894 // Map<String, String> temp = new TreeMap<String, String>(); 1895 // while (true) { 1896 // temp.clear(); 1897 // Set skipTarget = fluffup.keySet(); 1898 // for (String locale : fluffup.keySet()) { 1899 // String target = fluffup.get(locale); 1900 // parser.set(target); 1901 // if (parser.getScript().length() != 0) { 1902 // continue; 1903 // } 1904 // String script = getScriptForLocale(target); 1905 // 1906 // if (script == null) { 1907 // continue; // error message in routine 1908 // } 1909 // parser.setScript(script); 1910 // String furtherTarget = parser.toString(); 1911 // addIfNotIn(target, furtherTarget, temp, fluffup, "Script"); 1912 // } 1913 // if (temp.size() == 0) { 1914 // break; 1915 // } 1916 // fluffup.putAll(temp); 1917 // } 1918 // } 1919 1920 // private static String getScriptForLocale(String locale) { 1921 // String result = getScriptForLocale2(locale); 1922 // if (result != null) return result; 1923 // int pos = locale.indexOf('_'); 1924 // if (pos >= 0) { 1925 // result = getScriptForLocale2(locale.substring(0,pos)); 1926 // } 1927 // return result; 1928 // } 1929 1930 private static String UNKNOWN_SCRIPT = "Zzzz"; 1931 private static String UNKNOWN_REGION = "ZZ"; 1932 getScriptForLocale2(String locale)1933 private static String getScriptForLocale2(String locale) { 1934 String result = localeToScriptCache.get(locale); 1935 if (result != null) { 1936 return result; 1937 } 1938 if (locale.equals("ky")) { 1939 int debug = 0; 1940 } 1941 try { 1942 Map<Type, BasicLanguageData> data = supplementalData.getBasicLanguageDataMap(locale); 1943 if (data != null) { 1944 for (BasicLanguageData datum : data.values()) { 1945 final Set<String> scripts = datum.getScripts(); 1946 boolean isPrimary = datum.getType() == BasicLanguageData.Type.primary; 1947 if (scripts.size() != 1) { 1948 if (scripts.size() > 1 && isPrimary) { 1949 break; 1950 } 1951 continue; 1952 } 1953 String script = scripts.iterator().next(); 1954 if (isPrimary) { 1955 return result = script; 1956 } else if (result == null) { 1957 result = script; 1958 } 1959 } 1960 if (result != null) { 1961 return result; 1962 } 1963 } 1964 CLDRFile cldrFile; 1965 try { 1966 cldrFile = factory.make(locale, true); 1967 } catch (RuntimeException e) { 1968 result = FALLBACK_SCRIPTS.get(locale); 1969 if (result == null) { 1970 System.out.println("***Failed to find script for: " + locale + "\t" + english.getName(locale)); 1971 return result = UNKNOWN_SCRIPT; 1972 } else { 1973 return result; 1974 } 1975 } 1976 UnicodeSet exemplars = getExemplarSet(cldrFile, ""); 1977 Set<String> CLDRScripts = getScriptsFromUnicodeSet(exemplars); 1978 CLDRScripts.remove(UNKNOWN_SCRIPT); 1979 if (CLDRScripts.size() == 1) { 1980 return result = CLDRScripts.iterator().next(); 1981 } else if (CLDRScripts.size() == 0) { 1982 System.out.println("**Failed to get script for:\t" + locale); 1983 return result = UNKNOWN_SCRIPT; 1984 } else { 1985 System.out.println("**Failed, too many scripts for:\t" + locale + ", " + CLDRScripts); 1986 return result = UNKNOWN_SCRIPT; 1987 } 1988 } finally { 1989 if (result.equals(UNKNOWN_SCRIPT)) { 1990 String temp = LANGUAGE_OVERRIDES.get(locale); 1991 if (temp != null) { 1992 result = new LanguageTagParser().set(temp).getScript(); 1993 System.out.println("Getting script from LANGUAGE_OVERRIDES for " + locale + " => " + result); 1994 } 1995 } 1996 localeToScriptCache.put(locale, result); 1997 if (SHOW_ADD) 1998 System.out.println("Script:\t" + locale + "\t" + english.getName(locale) + "\t=>\t" + result + "\t" 1999 + english.getName(CLDRFile.SCRIPT_NAME, result)); 2000 } 2001 } 2002 2003 // private static Map<String, String> closeMapping(Map<String, String> fluffup) { 2004 // if (SHOW_ADD) System.out.flush(); 2005 // Map<String,String> temp = new TreeMap<String,String>(); 2006 // while (true) { 2007 // temp.clear(); 2008 // for (String locale : fluffup.keySet()) { 2009 // String target = fluffup.get(locale); 2010 // if (target.equals("si_Sinh") || target.equals("zh-Hani")) { 2011 // System.out.println("????"); 2012 // } 2013 // String furtherTarget = fluffup.get(target); 2014 // if (furtherTarget == null) { 2015 // continue; 2016 // } 2017 // addIfNotIn(locale, furtherTarget, temp, null, "Close"); 2018 // } 2019 // if (temp.size() == 0) { 2020 // break; 2021 // } 2022 // fluffup.putAll(temp); 2023 // } 2024 // if (SHOW_ADD) System.out.flush(); 2025 // return temp; 2026 // } 2027 getScriptsFromUnicodeSet(UnicodeSet exemplars)2028 public static Set<String> getScriptsFromUnicodeSet(UnicodeSet exemplars) { 2029 // use bits first, since that's faster 2030 BitSet scriptBits = new BitSet(); 2031 boolean show = false; 2032 for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) { 2033 if (show) 2034 System.out.println(Integer.toHexString(it.codepoint)); 2035 if (it.codepoint != UnicodeSetIterator.IS_STRING) { 2036 scriptBits.set(UScript.getScript(it.codepoint)); 2037 } else { 2038 int cp; 2039 for (int i = 0; i < it.string.length(); i += UTF16.getCharCount(cp)) { 2040 scriptBits.set(UScript.getScript(cp = UTF16.charAt(it.string, i))); 2041 } 2042 } 2043 } 2044 scriptBits.clear(UScript.COMMON); 2045 scriptBits.clear(UScript.INHERITED); 2046 Set<String> scripts = new TreeSet<>(); 2047 for (int j = 0; j < scriptBits.size(); ++j) { 2048 if (scriptBits.get(j)) { 2049 scripts.add(UScript.getShortName(j)); 2050 } 2051 } 2052 return scripts; 2053 } 2054 getExemplarSet(CLDRFile cldrfile, String type)2055 public static UnicodeSet getExemplarSet(CLDRFile cldrfile, String type) { 2056 if (type.length() != 0) 2057 type = "[@type=\"" + type + "\"]"; 2058 String v = cldrfile.getStringValue("//ldml/characters/exemplarCharacters" 2059 + type); 2060 if (v == null) 2061 return new UnicodeSet(); 2062 return new UnicodeSet(v); 2063 } 2064 2065 // private static String[][] SpecialCases = { 2066 // { "zh_Hani", "zh_Hans_CN"}, 2067 // { "si_Sinh", "si_Sinh_LK"}, 2068 // { "ii", "ii_CN"}, // Sichuan Yi (Yi) 2069 // { "iu", "iu_CA"}, // Inuktitut (Unified Canadian Aboriginal Syllabics) 2070 // { "und", "en"}, // English default 2071 // }; 2072 showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent)2073 static void showDefaultContentDifferencesAndFix(Set<String> defaultLocaleContent) { 2074 Set<String> errors = new LinkedHashSet<>(); 2075 Map<String, String> oldDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents( 2076 ConvertLanguageData.supplementalData.getDefaultContentLocales(), new TreeMap<String, String>(), errors); 2077 if (!errors.isEmpty()) { 2078 System.out.println(Joiner.on("\n").join(errors)); 2079 errors.clear(); 2080 } 2081 Map<String, String> newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent, 2082 new TreeMap<String, String>(), errors); 2083 if (!errors.isEmpty()) { 2084 System.out.println("Default Content errors: " + Joiner.on("\n").join(errors)); 2085 errors.clear(); 2086 } 2087 Set<String> changes = compareMapsAndFixNew("*WARNING* Default Content: ", oldDefaultContent, newDefaultContent, 2088 "ar", "ar_001"); 2089 System.out.println(Joiner.on("\n").join(changes)); 2090 defaultLocaleContent.clear(); 2091 defaultLocaleContent.addAll(newDefaultContent.values()); 2092 newDefaultContent = SupplementalDataInfo.makeLocaleToDefaultContents(defaultLocaleContent, 2093 new TreeMap<String, String>(), errors); 2094 if (!errors.isEmpty()) { 2095 System.out.println("***New Errors: " + Joiner.on("\n").join(errors)); 2096 } 2097 } 2098 compareMapsAndFixNew(String title, Map<String, String> oldContent, Map<String, String> newContent, String... allowedOverrideValues)2099 private static Set<String> compareMapsAndFixNew(String title, 2100 Map<String, String> oldContent, 2101 Map<String, String> newContent, String... allowedOverrideValues) { 2102 Map<String, String> allowedOverrideValuesTest = new HashMap<>(); 2103 for (int i = 0; i < allowedOverrideValues.length; i += 2) { 2104 allowedOverrideValuesTest.put(allowedOverrideValues[i], allowedOverrideValues[i + 1]); 2105 } 2106 Set<String> changes = new TreeSet<>(); 2107 for (String parent : Builder.with(new TreeSet<String>()).addAll(newContent.keySet()) 2108 .addAll(oldContent.keySet()).get()) { 2109 String oldValue = oldContent.get(parent); 2110 String newValue = newContent.get(parent); 2111 String overrideValue = allowedOverrideValuesTest.get(parent); 2112 if (overrideValue != null) { 2113 newContent.put(parent, overrideValue); 2114 newValue = overrideValue; 2115 } 2116 if (CldrUtility.equals(oldValue, newValue)) { 2117 continue; 2118 } 2119 String message; 2120 if (oldValue == null) { 2121 message = "Adding " + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2122 + ConvertLanguageData.getLanguageCodeAndName(newValue); 2123 newContent.put(parent, newValue); 2124 } else if (newValue == null) { 2125 if (SUPPRESS_CHANGES) { 2126 message = "Suppressing removal of " 2127 + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2128 + ConvertLanguageData.getLanguageCodeAndName(oldValue); 2129 newContent.put(parent, oldValue); 2130 } else { 2131 message = "Removing " 2132 + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2133 + ConvertLanguageData.getLanguageCodeAndName(oldValue); 2134 newContent.remove(oldValue); 2135 } 2136 } else { 2137 if (SUPPRESS_CHANGES) { 2138 message = "Suppressing change of " 2139 + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2140 + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to " 2141 + ConvertLanguageData.getLanguageCodeAndName(newValue); 2142 newContent.remove(newValue); 2143 newContent.put(parent, oldValue); 2144 } else { 2145 message = "Changing " 2146 + ConvertLanguageData.getLanguageCodeAndName(parent) + " => " 2147 + ConvertLanguageData.getLanguageCodeAndName(oldValue) + " to " 2148 + ConvertLanguageData.getLanguageCodeAndName(newValue); 2149 newContent.remove(oldValue); 2150 newContent.put(parent, newValue); 2151 } 2152 } 2153 changes.add(title + message); 2154 } 2155 return changes; 2156 } 2157 2158 public static class LocaleStringComparator implements Comparator<String> { 2159 LanguageTagParser ltp0 = new LanguageTagParser(); 2160 LanguageTagParser ltp1 = new LanguageTagParser(); 2161 2162 @Override compare(String arg0, String arg1)2163 public int compare(String arg0, String arg1) { 2164 ltp0.set(arg0); 2165 ltp1.set(arg1); 2166 String s0 = ltp0.getLanguage(); 2167 String s1 = ltp1.getLanguage(); 2168 int result = s0.compareTo(s1); 2169 if (result != 0) { 2170 return s0.equals("und") ? 1 2171 : s1.equals("und") ? -1 2172 : result; 2173 } 2174 s0 = ltp0.getScript(); 2175 s1 = ltp1.getScript(); 2176 result = s0.compareTo(s1); 2177 if (result != 0) { 2178 return result; 2179 } 2180 s0 = ltp0.getRegion(); 2181 s1 = ltp1.getRegion(); 2182 result = s0.compareTo(s1); 2183 if (result != 0) { 2184 return result; 2185 } 2186 return arg0.compareTo(arg1); // just in case 2187 } 2188 2189 } 2190 } 2191