1 package org.unicode.cldr.unittest; 2 3 import java.util.Arrays; 4 import java.util.HashSet; 5 import java.util.Map; 6 import java.util.Map.Entry; 7 import java.util.Set; 8 import java.util.TreeMap; 9 import java.util.TreeSet; 10 11 import org.unicode.cldr.draft.ScriptMetadata; 12 import org.unicode.cldr.draft.ScriptMetadata.Info; 13 import org.unicode.cldr.tool.LikelySubtags; 14 import org.unicode.cldr.util.CLDRConfig; 15 import org.unicode.cldr.util.CLDRFile; 16 import org.unicode.cldr.util.ChainedMap; 17 import org.unicode.cldr.util.ChainedMap.M3; 18 import org.unicode.cldr.util.Containment; 19 import org.unicode.cldr.util.LanguageTagParser; 20 import org.unicode.cldr.util.StandardCodes; 21 import org.unicode.cldr.util.SupplementalDataInfo; 22 23 import com.ibm.icu.dev.test.TestFmwk; 24 import com.ibm.icu.lang.UCharacter; 25 import com.ibm.icu.lang.UProperty; 26 import com.ibm.icu.lang.UScript; 27 import com.ibm.icu.text.UnicodeSet; 28 import com.ibm.icu.util.VersionInfo; 29 30 public class LikelySubtagsTest extends TestFmwk { 31 32 private boolean DEBUG = false; 33 private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = CLDRConfig 34 .getInstance().getSupplementalDataInfo(); 35 static final Map<String, String> likely = SUPPLEMENTAL_DATA_INFO 36 .getLikelySubtags(); 37 static final LikelySubtags LIKELY = new LikelySubtags(); 38 main(String[] args)39 public static void main(String[] args) { 40 new LikelySubtagsTest().run(args); 41 } 42 43 static class Tags { 44 final Set<String> languages = new TreeSet<String>(); 45 final Set<String> scripts = new TreeSet<String>(); 46 final Set<String> regions = new TreeSet<String>(); 47 final Set<String> scriptRegion = new TreeSet<String>(); 48 final Set<String> languageScript = new TreeSet<String>(); 49 final Set<String> languageRegion = new TreeSet<String>(); 50 final Set<String> all = new TreeSet<String>(); 51 final ChainedMap.M4<String, String, String, Boolean> languageToScriptToRegions = ChainedMap 52 .of(new TreeMap<String, Object>(), 53 new TreeMap<String, Object>(), 54 new TreeMap<String, Object>(), Boolean.class); 55 final ChainedMap.M3<String, String, Boolean> languageToRegions = ChainedMap 56 .of(new TreeMap<String, Object>(), 57 new TreeMap<String, Object>(), Boolean.class); 58 Tags()59 public Tags() { 60 final LanguageTagParser ltp = new LanguageTagParser(); 61 for (Entry<String, String> entry : likely.entrySet()) { 62 add(ltp.set(entry.getKey()), true); 63 add(ltp.set(entry.getValue()), false); 64 } 65 // add unfamiliar script, unfamiliar region 66 for (String lang : languageToScriptToRegions.keySet()) { 67 if (lang.equals("und")) { 68 continue; 69 } 70 M3<String, String, Boolean> scriptToRegion = languageToScriptToRegions 71 .get(lang); 72 final Set<String> scriptsFor = scriptToRegion.keySet(); 73 final Set<String> regionsFor = languageToRegions.get(lang) 74 .keySet(); 75 76 String firstScriptNotIn = getNonEmptyNotIn(scripts, scriptsFor); 77 String firstRegionNotIn = getNonEmptyNotIn(regions, regionsFor); 78 79 languageToScriptToRegions.put(lang, firstScriptNotIn, 80 firstRegionNotIn, Boolean.TRUE); 81 // clone for safety before iterating 82 for (String script : new HashSet<String>(scriptsFor)) { 83 languageToScriptToRegions.put(lang, script, 84 firstRegionNotIn, Boolean.TRUE); 85 } 86 for (String region : new HashSet<String>(regionsFor)) { 87 languageToScriptToRegions.put(lang, firstScriptNotIn, 88 region, Boolean.TRUE); 89 } 90 } 91 92 // System.out.println("all: " + all); 93 // System.out.println("scriptRegion: " + scriptRegion); 94 // System.out.println("languageScript: " + languageScript); 95 // System.out.println("languageRegion: " + languageRegion); 96 } 97 getNonEmptyNotIn(Iterable<T> a, Set<T> b)98 private static <T> T getNonEmptyNotIn(Iterable<T> a, Set<T> b) { 99 for (T x : a) { 100 if (!b.contains(x) && !x.toString().isEmpty()) { 101 return x; 102 } 103 } 104 throw new IllegalArgumentException(); 105 } 106 add(LanguageTagParser ltp, boolean source)107 void add(LanguageTagParser ltp, boolean source) { 108 String sourceLanguage = ltp.getLanguage(); 109 String sourceScript = ltp.getScript(); 110 String sourceRegion = ltp.getRegion(); 111 languageToScriptToRegions.put(sourceLanguage, sourceScript, 112 sourceRegion, Boolean.TRUE); 113 languageToScriptToRegions.put(sourceLanguage, sourceScript, "", 114 Boolean.TRUE); 115 languageToScriptToRegions.put(sourceLanguage, "", "", Boolean.TRUE); 116 languageToRegions.put(sourceLanguage, "", Boolean.TRUE); 117 if (StandardCodes.isCountry(sourceRegion)) { 118 languageToScriptToRegions.put(sourceLanguage, "", sourceRegion, 119 Boolean.TRUE); 120 languageToRegions.put(sourceLanguage, sourceRegion, 121 Boolean.TRUE); 122 } 123 124 // capture all cases of 2 items 125 if (source) { 126 if (!sourceScript.isEmpty() && !sourceRegion.isEmpty()) { 127 if (!sourceLanguage.equals("und")) { 128 all.add(ltp.toString()); 129 } else { 130 scriptRegion.add(ltp.toString()); 131 } 132 } else if (!sourceLanguage.equals("und")) { 133 if (!sourceScript.isEmpty()) { 134 languageScript.add(ltp.toString()); 135 } else if (!sourceRegion.isEmpty()) { 136 languageRegion.add(ltp.toString()); 137 } 138 } 139 } 140 languages.add(sourceLanguage); 141 scripts.add(sourceScript); 142 if (StandardCodes.isCountry(sourceRegion) || sourceRegion.isEmpty()) { 143 regions.add(sourceRegion); 144 } 145 } 146 } 147 148 static final Tags TAGS = new Tags(); 149 150 final LanguageTagParser maxLtp = new LanguageTagParser(); 151 final LanguageTagParser sourceLtp = new LanguageTagParser(); 152 153 /** 154 * Return false if we should skip the language 155 * 156 * @param source 157 * @return 158 */ checkAdding(String source)159 public boolean checkAdding(String source) { 160 // if X maps to Y, then adding a field from Y to X will still map to Y 161 // Example: 162 // und_AF => fa_Arab_AF 163 // therefore, the following should also be true: 164 // und_Arab_AF => fa_Arab_AF 165 // fa_AF => fa_Arab_AF 166 // fa_Arab_AF => fa_Arab_AF 167 168 String max = LIKELY.maximize(source); 169 if (!assertNotEquals("Maximize " + source, null, max)) { 170 return source.contains("_"); 171 } 172 sourceLtp.set(source); 173 if (!sourceLtp.getRegion().isEmpty() 174 && !StandardCodes.isCountry(sourceLtp.getRegion())) { 175 return true; 176 } 177 maxLtp.set(max); 178 for (int i = 1; i < 8; ++i) { 179 if ((i & 1) != 0) { 180 if (!sourceLtp.getLanguage().equals("und")) 181 continue; 182 sourceLtp.setLanguage(maxLtp.getLanguage()); 183 } 184 if ((i & 2) != 0) { 185 if (!sourceLtp.getScript().isEmpty()) 186 continue; 187 sourceLtp.setScript(maxLtp.getScript()); 188 } 189 if ((i & 4) != 0) { 190 if (!sourceLtp.getRegion().isEmpty()) 191 continue; 192 sourceLtp.setRegion(maxLtp.getRegion()); 193 } 194 String test = sourceLtp.toString(); 195 final String maximize = LIKELY.maximize(test); 196 if (!max.equals(maximize)) { 197 if (!assertEquals(source + " -> " + max + ", so testing " 198 + test, max, maximize)) { 199 LIKELY.maximize(test); // do again for debugging 200 } 201 } 202 sourceLtp.set(source); // restore 203 } 204 return true; 205 } 206 TestCompleteness()207 public void TestCompleteness() { 208 // if (logKnownIssue("Cldrbug:7121", 209 // "Problems with likely subtags test")) { 210 // return; 211 // } 212 // checkAdding("und_Bopo"); 213 // checkAdding("und_Brai"); 214 // checkAdding("und_Limb"); 215 // checkAdding("und_Cakm"); 216 // checkAdding("und_Shaw"); 217 218 final LanguageTagParser ltp = new LanguageTagParser(); 219 if (DEBUG) { 220 System.out.println(TAGS.languages.size() + "\t" + TAGS.languages); 221 System.out.println(TAGS.scripts.size() + "\t" + TAGS.scripts); 222 System.out.println(TAGS.regions.size() + "\t" + TAGS.regions); 223 } 224 main: for (Entry<String, Map<String, Map<String, Boolean>>> languageScriptRegion : TAGS.languageToScriptToRegions) { 225 String language = languageScriptRegion.getKey(); 226 ltp.set(language); // clears script, region 227 for (Entry<String, Map<String, Boolean>> scriptRegion : languageScriptRegion 228 .getValue().entrySet()) { 229 String script = scriptRegion.getKey(); 230 ltp.setScript(script); 231 for (String region : scriptRegion.getValue().keySet()) { 232 ltp.setRegion(region); 233 String testTag = ltp.toString(); 234 // System.out.println(testTag); 235 if (!checkAdding(testTag)) { 236 continue main; 237 } 238 } 239 } 240 } 241 } 242 243 static Set<String> exceptions = new HashSet<String>(Arrays.asList("Zyyy", 244 "Zinh", "Zzzz", "Brai")); 245 TestStability()246 public void TestStability() { 247 // when maximized must never change 248 // first get all the subtags 249 // then test all the combinations 250 LanguageTagParser ltp = new LanguageTagParser(); 251 for (Entry<String, String> entry : likely.entrySet()) { 252 ltp.set(entry.getKey()); 253 String sourceLanguage = ltp.getLanguage(); 254 if (sourceLanguage.equals("und")) { 255 sourceLanguage = ""; 256 } 257 String sourceScript = ltp.getScript(); 258 String sourceRegion = ltp.getRegion(); 259 ltp.set(entry.getValue()); 260 String targetLanguage = ltp.getLanguage(); 261 String targetScript = ltp.getScript(); 262 String targetRegion = ltp.getRegion(); 263 if (!sourceLanguage.isEmpty()) { 264 assertEquals("language", sourceLanguage, targetLanguage); 265 } 266 if (!sourceScript.isEmpty()) { 267 assertEquals("script", sourceScript, targetScript); 268 } 269 if (!sourceRegion.isEmpty()) { 270 if (Containment.isLeaf(sourceRegion)) { 271 assertEquals("region", sourceRegion, targetRegion); 272 } 273 } 274 } 275 276 } 277 TestForMissingScriptMetadata()278 public void TestForMissingScriptMetadata() { 279 TreeSet<String> metadataScripts = new TreeSet<String>( 280 ScriptMetadata.getScripts()); 281 UnicodeSet current = new UnicodeSet(0, 0x10FFFF); 282 UnicodeSet toRemove = new UnicodeSet(); 283 284 while (!current.isEmpty()) { 285 int ch = current.charAt(0); 286 int script = UScript.getScript(ch); 287 String shortName = UScript.getShortName(script); 288 Info i = ScriptMetadata.getInfo(shortName); 289 if (i == null) { 290 errln("Script Metadata is missing: " + shortName); 291 continue; 292 } 293 if (i.likelyLanguage.equals("und") 294 && !exceptions.contains(shortName)) { 295 errln("Script has no likely language: " + shortName); 296 } 297 toRemove.applyIntPropertyValue(UProperty.SCRIPT, script); 298 current.removeAll(toRemove); 299 metadataScripts.remove(shortName); 300 } 301 metadataScripts 302 .removeAll(Arrays.asList("Hans", "Hant", "Hanb", "Jamo", "Jpan", "Kore")); // remove 303 // "combo" 304 // scripts 305 if (!metadataScripts.isEmpty()) { 306 // Warning, not error, so that we can add scripts to the script metadata 307 // and later update to the Unicode version that has characters for those scripts. 308 warnln("Script Metadata for characters not in Unicode: " 309 + metadataScripts); 310 } 311 } 312 TestMissingInfoForLanguage()313 public void TestMissingInfoForLanguage() { 314 CLDRFile english = CLDRConfig.getInstance().getEnglish(); 315 316 for (String language : CLDRConfig.getInstance().getCldrFactory() 317 .getAvailableLanguages()) { 318 if (language.contains("_") || language.equals("root")) { 319 continue; 320 } 321 String likelyExpansion = likely.get(language); 322 if (likelyExpansion == null) { 323 errln("Missing likely subtags for: " + language); 324 } else { 325 logln("Likely subtags for " + language + ":\t " + likely); 326 } 327 String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language); 328 String englishName = english.getStringValue(path); 329 if (englishName == null) { 330 errln("Missing English translation for: " + language); 331 } 332 } 333 } 334 TestMissingInfoForRegion()335 public void TestMissingInfoForRegion() { 336 CLDRFile english = CLDRConfig.getInstance().getEnglish(); 337 338 for (String region : StandardCodes.make().getGoodAvailableCodes( 339 "territory")) { 340 String likelyExpansion = likely.get("und_" + region); 341 if (likelyExpansion == null) { 342 if (region.equals("ZZ") || region.equals("001") || region.equals("UN") 343 || SUPPLEMENTAL_DATA_INFO.getContained(region) == null) { // not 344 // container 345 String likelyTag = LikelySubtags.maximize("und_" + region, 346 likely); 347 if (likelyTag == null || !likelyTag.startsWith("en_Latn_")) { 348 errln("Missing likely subtags for region: " + region 349 + "\t" + english.getName("territory", region)); 350 } 351 } else { // container 352 errln("Missing likely subtags for macroregion (fix to exclude regions having 'en'): " 353 + region 354 + "\t" 355 + english.getName("territory", region)); 356 } 357 } else { 358 logln("Likely subtags for region: " + region + ":\t " + likely); 359 } 360 String path = CLDRFile.getKey(CLDRFile.TERRITORY_NAME, region); 361 String englishName = english.getStringValue(path); 362 if (englishName == null) { 363 errln("Missing English translation for: " + region); 364 } 365 } 366 } 367 TestMissingInfoForScript()368 public void TestMissingInfoForScript() { 369 VersionInfo icuUnicodeVersion = UCharacter.getUnicodeVersion(); 370 TreeSet<String> sorted = new TreeSet<String>( 371 ScriptMetadata.getScripts()); 372 Set<String> exceptions2 = new HashSet<String>( 373 Arrays.asList("zh_Hans_CN")); 374 for (String script : sorted) { 375 if (exceptions.contains(script) || script.equals("Latn") 376 || script.equals("Dsrt")) { 377 // we minimize away und_X, when the code puts in en...US 378 continue; 379 } 380 Info i = ScriptMetadata.getInfo(script); 381 // System.out.println(i); 382 String likelyLanguage = i.likelyLanguage; 383 String originCountry = i.originCountry; 384 String undScript = "und_" + script; 385 String langScript = likelyLanguage + "_" + script + "_"; 386 String likelyExpansion = likely.get(undScript); 387 if (likelyExpansion == null) { 388 String msg = "Missing likely language for script (und_" + script 389 + ") should be something like:\t " 390 + showOverride(script, originCountry, langScript); 391 if (i.age.compareTo(icuUnicodeVersion) <= 0) { 392 // Error: Missing data for a script in ICU's Unicode version. 393 errln(msg); 394 } else { 395 // Warning: Missing data for a script in a future Unicode version. 396 warnln(msg); 397 } 398 } else if (!exceptions2.contains(likelyExpansion) 399 && !likelyExpansion.startsWith(langScript)) { 400 // if 401 // (logKnownIssue("Cldrbug:7181","Missing script metadata for " 402 // + script) 403 // && (script.equals("Tfng") || script.equals("Brah"))) { 404 // logln("Wrong likely language for script (und_" + script + 405 // "). Should not be " + likelyExpansion 406 // + ", but something like:\t " + showOverride(script, 407 // originCountry, langScript)); 408 // } else { 409 errln("Wrong likely language for script (und_" + script 410 + "). Should not be " + likelyExpansion 411 + ", but something like:\t " 412 + showOverride(script, originCountry, langScript)); 413 // } 414 } else { 415 logln("OK: " + undScript + " => " + likelyExpansion); 416 } 417 } 418 /** 419 * und_Bopo => zh_Bopo_TW und_Copt => cop_Copt_EG // fix 002 und_Dsrt => 420 * en_Dsrt_US // fix US 421 */ 422 } 423 showOverride(String script, String originCountry, String langScript)424 public String showOverride(String script, String originCountry, 425 String langScript) { 426 return "{\"und_" + script + "\", \"" + langScript + originCountry 427 + "\"},"; 428 } 429 } 430