1 package org.unicode.cldr.unittest; 2 3 import java.util.Arrays; 4 import java.util.HashSet; 5 import java.util.Map; 6 import java.util.Map.Entry; 7 import java.util.Set; 8 import java.util.TreeMap; 9 import java.util.TreeSet; 10 11 import org.unicode.cldr.draft.ScriptMetadata; 12 import org.unicode.cldr.draft.ScriptMetadata.Info; 13 import org.unicode.cldr.tool.LikelySubtags; 14 import org.unicode.cldr.util.CLDRConfig; 15 import org.unicode.cldr.util.CLDRFile; 16 import org.unicode.cldr.util.ChainedMap; 17 import org.unicode.cldr.util.ChainedMap.M3; 18 import org.unicode.cldr.util.Containment; 19 import org.unicode.cldr.util.LanguageTagParser; 20 import org.unicode.cldr.util.StandardCodes; 21 import org.unicode.cldr.util.SupplementalDataInfo; 22 23 import com.ibm.icu.dev.test.TestFmwk; 24 import com.ibm.icu.lang.UCharacter; 25 import com.ibm.icu.lang.UProperty; 26 import com.ibm.icu.lang.UScript; 27 import com.ibm.icu.text.UnicodeSet; 28 import com.ibm.icu.util.VersionInfo; 29 30 public class LikelySubtagsTest extends TestFmwk { 31 32 private boolean DEBUG = false; 33 private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = CLDRConfig 34 .getInstance().getSupplementalDataInfo(); 35 static final Map<String, String> likely = SUPPLEMENTAL_DATA_INFO 36 .getLikelySubtags(); 37 static final LikelySubtags LIKELY = new LikelySubtags( 38 SUPPLEMENTAL_DATA_INFO, likely); 39 main(String[] args)40 public static void main(String[] args) { 41 new LikelySubtagsTest().run(args); 42 } 43 44 static class Tags { 45 final Set<String> languages = new TreeSet<String>(); 46 final Set<String> scripts = new TreeSet<String>(); 47 final Set<String> regions = new TreeSet<String>(); 48 final Set<String> scriptRegion = new TreeSet<String>(); 49 final Set<String> languageScript = new TreeSet<String>(); 50 final Set<String> languageRegion = new TreeSet<String>(); 51 final Set<String> all = new TreeSet<String>(); 52 final ChainedMap.M4<String, String, String, Boolean> languageToScriptToRegions = ChainedMap 53 .of(new TreeMap<String, Object>(), 54 new TreeMap<String, Object>(), 55 new TreeMap<String, Object>(), Boolean.class); 56 final ChainedMap.M3<String, String, Boolean> languageToRegions = ChainedMap 57 .of(new TreeMap<String, Object>(), 58 new TreeMap<String, Object>(), Boolean.class); 59 Tags()60 public Tags() { 61 final LanguageTagParser ltp = new LanguageTagParser(); 62 for (Entry<String, String> entry : likely.entrySet()) { 63 add(ltp.set(entry.getKey()), true); 64 add(ltp.set(entry.getValue()), false); 65 } 66 // add unfamiliar script, unfamiliar region 67 for (String lang : languageToScriptToRegions.keySet()) { 68 if (lang.equals("und")) { 69 continue; 70 } 71 M3<String, String, Boolean> scriptToRegion = languageToScriptToRegions 72 .get(lang); 73 final Set<String> scriptsFor = scriptToRegion.keySet(); 74 final Set<String> regionsFor = languageToRegions.get(lang) 75 .keySet(); 76 77 String firstScriptNotIn = getNonEmptyNotIn(scripts, scriptsFor); 78 String firstRegionNotIn = getNonEmptyNotIn(regions, regionsFor); 79 80 languageToScriptToRegions.put(lang, firstScriptNotIn, 81 firstRegionNotIn, Boolean.TRUE); 82 // clone for safety before iterating 83 for (String script : new HashSet<String>(scriptsFor)) { 84 languageToScriptToRegions.put(lang, script, 85 firstRegionNotIn, Boolean.TRUE); 86 } 87 for (String region : new HashSet<String>(regionsFor)) { 88 languageToScriptToRegions.put(lang, firstScriptNotIn, 89 region, Boolean.TRUE); 90 } 91 } 92 93 // System.out.println("all: " + all); 94 // System.out.println("scriptRegion: " + scriptRegion); 95 // System.out.println("languageScript: " + languageScript); 96 // System.out.println("languageRegion: " + languageRegion); 97 } 98 getNonEmptyNotIn(Iterable<T> a, Set<T> b)99 private static <T> T getNonEmptyNotIn(Iterable<T> a, Set<T> b) { 100 for (T x : a) { 101 if (!b.contains(x) && !x.toString().isEmpty()) { 102 return x; 103 } 104 } 105 throw new IllegalArgumentException(); 106 } 107 add(LanguageTagParser ltp, boolean source)108 void add(LanguageTagParser ltp, boolean source) { 109 String sourceLanguage = ltp.getLanguage(); 110 String sourceScript = ltp.getScript(); 111 String sourceRegion = ltp.getRegion(); 112 languageToScriptToRegions.put(sourceLanguage, sourceScript, 113 sourceRegion, Boolean.TRUE); 114 languageToScriptToRegions.put(sourceLanguage, sourceScript, "", 115 Boolean.TRUE); 116 languageToScriptToRegions.put(sourceLanguage, "", "", Boolean.TRUE); 117 languageToRegions.put(sourceLanguage, "", Boolean.TRUE); 118 if (StandardCodes.isCountry(sourceRegion)) { 119 languageToScriptToRegions.put(sourceLanguage, "", sourceRegion, 120 Boolean.TRUE); 121 languageToRegions.put(sourceLanguage, sourceRegion, 122 Boolean.TRUE); 123 } 124 125 // capture all cases of 2 items 126 if (source) { 127 if (!sourceScript.isEmpty() && !sourceRegion.isEmpty()) { 128 if (!sourceLanguage.equals("und")) { 129 all.add(ltp.toString()); 130 } else { 131 scriptRegion.add(ltp.toString()); 132 } 133 } else if (!sourceLanguage.equals("und")) { 134 if (!sourceScript.isEmpty()) { 135 languageScript.add(ltp.toString()); 136 } else if (!sourceRegion.isEmpty()) { 137 languageRegion.add(ltp.toString()); 138 } 139 } 140 } 141 languages.add(sourceLanguage); 142 scripts.add(sourceScript); 143 if (StandardCodes.isCountry(sourceRegion) || sourceRegion.isEmpty()) { 144 regions.add(sourceRegion); 145 } 146 } 147 } 148 149 static final Tags TAGS = new Tags(); 150 151 final LanguageTagParser maxLtp = new LanguageTagParser(); 152 final LanguageTagParser sourceLtp = new LanguageTagParser(); 153 154 /** 155 * Return false if we should skip the language 156 * 157 * @param source 158 * @return 159 */ checkAdding(String source)160 public boolean checkAdding(String source) { 161 // if X maps to Y, then adding a field from Y to X will still map to Y 162 // Example: 163 // und_AF => fa_Arab_AF 164 // therefore, the following should also be true: 165 // und_Arab_AF => fa_Arab_AF 166 // fa_AF => fa_Arab_AF 167 // fa_Arab_AF => fa_Arab_AF 168 169 String max = LIKELY.maximize(source); 170 if (!assertNotEquals("Maximize " + source, null, max)) { 171 return source.contains("_"); 172 } 173 sourceLtp.set(source); 174 if (!sourceLtp.getRegion().isEmpty() 175 && !StandardCodes.isCountry(sourceLtp.getRegion())) { 176 return true; 177 } 178 maxLtp.set(max); 179 for (int i = 1; i < 8; ++i) { 180 if ((i & 1) != 0) { 181 if (!sourceLtp.getLanguage().equals("und")) 182 continue; 183 sourceLtp.setLanguage(maxLtp.getLanguage()); 184 } 185 if ((i & 2) != 0) { 186 if (!sourceLtp.getScript().isEmpty()) 187 continue; 188 sourceLtp.setScript(maxLtp.getScript()); 189 } 190 if ((i & 4) != 0) { 191 if (!sourceLtp.getRegion().isEmpty()) 192 continue; 193 sourceLtp.setRegion(maxLtp.getRegion()); 194 } 195 String test = sourceLtp.toString(); 196 final String maximize = LIKELY.maximize(test); 197 if (!max.equals(maximize)) { 198 if (!assertEquals(source + " -> " + max + ", so testing " 199 + test, max, maximize)) { 200 LIKELY.maximize(test); // do again for debugging 201 } 202 } 203 sourceLtp.set(source); // restore 204 } 205 return true; 206 } 207 TestCompleteness()208 public void TestCompleteness() { 209 // if (logKnownIssue("Cldrbug:7121", 210 // "Problems with likely subtags test")) { 211 // return; 212 // } 213 // checkAdding("und_Bopo"); 214 // checkAdding("und_Brai"); 215 // checkAdding("und_Limb"); 216 // checkAdding("und_Cakm"); 217 // checkAdding("und_Shaw"); 218 219 final LanguageTagParser ltp = new LanguageTagParser(); 220 if (DEBUG) { 221 System.out.println(TAGS.languages.size() + "\t" + TAGS.languages); 222 System.out.println(TAGS.scripts.size() + "\t" + TAGS.scripts); 223 System.out.println(TAGS.regions.size() + "\t" + TAGS.regions); 224 } 225 main: for (Entry<String, Map<String, Map<String, Boolean>>> languageScriptRegion : TAGS.languageToScriptToRegions) { 226 String language = languageScriptRegion.getKey(); 227 ltp.set(language); // clears script, region 228 for (Entry<String, Map<String, Boolean>> scriptRegion : languageScriptRegion 229 .getValue().entrySet()) { 230 String script = scriptRegion.getKey(); 231 ltp.setScript(script); 232 for (String region : scriptRegion.getValue().keySet()) { 233 ltp.setRegion(region); 234 String testTag = ltp.toString(); 235 // System.out.println(testTag); 236 if (!checkAdding(testTag)) { 237 continue main; 238 } 239 } 240 } 241 } 242 } 243 244 static Set<String> exceptions = new HashSet<String>(Arrays.asList("Zyyy", 245 "Zinh", "Zzzz", "Brai")); 246 TestStability()247 public void TestStability() { 248 // when maximized must never change 249 // first get all the subtags 250 // then test all the combinations 251 LanguageTagParser ltp = new LanguageTagParser(); 252 for (Entry<String, String> entry : likely.entrySet()) { 253 ltp.set(entry.getKey()); 254 String sourceLanguage = ltp.getLanguage(); 255 if (sourceLanguage.equals("und")) { 256 sourceLanguage = ""; 257 } 258 String sourceScript = ltp.getScript(); 259 String sourceRegion = ltp.getRegion(); 260 ltp.set(entry.getValue()); 261 String targetLanguage = ltp.getLanguage(); 262 String targetScript = ltp.getScript(); 263 String targetRegion = ltp.getRegion(); 264 if (!sourceLanguage.isEmpty()) { 265 assertEquals("language", sourceLanguage, targetLanguage); 266 } 267 if (!sourceScript.isEmpty()) { 268 assertEquals("script", sourceScript, targetScript); 269 } 270 if (!sourceRegion.isEmpty()) { 271 if (Containment.isLeaf(sourceRegion)) { 272 assertEquals("region", sourceRegion, targetRegion); 273 } 274 } 275 } 276 277 } 278 TestForMissingScriptMetadata()279 public void TestForMissingScriptMetadata() { 280 TreeSet<String> metadataScripts = new TreeSet<String>( 281 ScriptMetadata.getScripts()); 282 UnicodeSet current = new UnicodeSet(0, 0x10FFFF); 283 UnicodeSet toRemove = new UnicodeSet(); 284 285 while (!current.isEmpty()) { 286 int ch = current.charAt(0); 287 int script = UScript.getScript(ch); 288 String shortName = UScript.getShortName(script); 289 Info i = ScriptMetadata.getInfo(shortName); 290 if (i == null) { 291 errln("Script Metadata is missing: " + shortName); 292 continue; 293 } 294 if (i.likelyLanguage.equals("und") 295 && !exceptions.contains(shortName)) { 296 errln("Script has no likely language: " + shortName); 297 } 298 toRemove.applyIntPropertyValue(UProperty.SCRIPT, script); 299 current.removeAll(toRemove); 300 metadataScripts.remove(shortName); 301 } 302 metadataScripts 303 .removeAll(Arrays.asList("Hans", "Hant", "Hanb", "Jamo", "Jpan", "Kore")); // remove 304 // "combo" 305 // scripts 306 if (!metadataScripts.isEmpty()) { 307 // Warning, not error, so that we can add scripts to the script metadata 308 // and later update to the Unicode version that has characters for those scripts. 309 warnln("Script Metadata for characters not in Unicode: " 310 + metadataScripts); 311 } 312 } 313 TestMissingInfoForLanguage()314 public void TestMissingInfoForLanguage() { 315 CLDRFile english = CLDRConfig.getInstance().getEnglish(); 316 317 for (String language : CLDRConfig.getInstance().getCldrFactory() 318 .getAvailableLanguages()) { 319 if (language.contains("_") || language.equals("root")) { 320 continue; 321 } 322 String likelyExpansion = likely.get(language); 323 if (likelyExpansion == null) { 324 errln("Missing likely subtags for: " + language); 325 } else { 326 logln("Likely subtags for " + language + ":\t " + likely); 327 } 328 String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language); 329 String englishName = english.getStringValue(path); 330 if (englishName == null) { 331 errln("Missing English translation for: " + language); 332 } 333 } 334 } 335 TestMissingInfoForRegion()336 public void TestMissingInfoForRegion() { 337 CLDRFile english = CLDRConfig.getInstance().getEnglish(); 338 339 for (String region : StandardCodes.make().getGoodAvailableCodes( 340 "territory")) { 341 String likelyExpansion = likely.get("und_" + region); 342 if (likelyExpansion == null) { 343 if (region.equals("ZZ") || region.equals("001") || region.equals("UN") 344 || SUPPLEMENTAL_DATA_INFO.getContained(region) == null) { // not 345 // container 346 String likelyTag = LikelySubtags.maximize("und_" + region, 347 likely); 348 if (likelyTag == null || !likelyTag.startsWith("en_Latn_")) { 349 errln("Missing likely subtags for region: " + region 350 + "\t" + english.getName("territory", region)); 351 } 352 } else { // container 353 errln("Missing likely subtags for macroregion (fix to exclude regions having 'en'): " 354 + region 355 + "\t" 356 + english.getName("territory", region)); 357 } 358 } else { 359 logln("Likely subtags for region: " + region + ":\t " + likely); 360 } 361 String path = CLDRFile.getKey(CLDRFile.TERRITORY_NAME, region); 362 String englishName = english.getStringValue(path); 363 if (englishName == null) { 364 errln("Missing English translation for: " + region); 365 } 366 } 367 } 368 TestMissingInfoForScript()369 public void TestMissingInfoForScript() { 370 VersionInfo icuUnicodeVersion = UCharacter.getUnicodeVersion(); 371 TreeSet<String> sorted = new TreeSet<String>( 372 ScriptMetadata.getScripts()); 373 Set<String> exceptions2 = new HashSet<String>( 374 Arrays.asList("zh_Hans_CN")); 375 for (String script : sorted) { 376 if (exceptions.contains(script) || script.equals("Latn") 377 || script.equals("Dsrt")) { 378 // we minimize away und_X, when the code puts in en...US 379 continue; 380 } 381 Info i = ScriptMetadata.getInfo(script); 382 // System.out.println(i); 383 String likelyLanguage = i.likelyLanguage; 384 String originCountry = i.originCountry; 385 String undScript = "und_" + script; 386 String langScript = likelyLanguage + "_" + script + "_"; 387 String likelyExpansion = likely.get(undScript); 388 if (likelyExpansion == null) { 389 String msg = "Missing likely language for script (und_" + script 390 + ") should be something like:\t " 391 + showOverride(script, originCountry, langScript); 392 if (i.age.compareTo(icuUnicodeVersion) <= 0) { 393 // Error: Missing data for a script in ICU's Unicode version. 394 errln(msg); 395 } else { 396 // Warning: Missing data for a script in a future Unicode version. 397 warnln(msg); 398 } 399 } else if (!exceptions2.contains(likelyExpansion) 400 && !likelyExpansion.startsWith(langScript)) { 401 // if 402 // (logKnownIssue("Cldrbug:7181","Missing script metadata for " 403 // + script) 404 // && (script.equals("Tfng") || script.equals("Brah"))) { 405 // logln("Wrong likely language for script (und_" + script + 406 // "). Should not be " + likelyExpansion 407 // + ", but something like:\t " + showOverride(script, 408 // originCountry, langScript)); 409 // } else { 410 errln("Wrong likely language for script (und_" + script 411 + "). Should not be " + likelyExpansion 412 + ", but something like:\t " 413 + showOverride(script, originCountry, langScript)); 414 // } 415 } else { 416 logln("OK: " + undScript + " => " + likelyExpansion); 417 } 418 } 419 /** 420 * und_Bopo => zh_Bopo_TW und_Copt => cop_Copt_EG // fix 002 und_Dsrt => 421 * en_Dsrt_US // fix US 422 */ 423 } 424 showOverride(String script, String originCountry, String langScript)425 public String showOverride(String script, String originCountry, 426 String langScript) { 427 return "{\"und_" + script + "\", \"" + langScript + originCountry 428 + "\"},"; 429 } 430 } 431