1 package org.unicode.cldr.unittest;
2 
3 import java.util.Arrays;
4 import java.util.HashSet;
5 import java.util.Map;
6 import java.util.Map.Entry;
7 import java.util.Set;
8 import java.util.TreeMap;
9 import java.util.TreeSet;
10 
11 import org.unicode.cldr.draft.ScriptMetadata;
12 import org.unicode.cldr.draft.ScriptMetadata.Info;
13 import org.unicode.cldr.tool.LikelySubtags;
14 import org.unicode.cldr.util.CLDRConfig;
15 import org.unicode.cldr.util.CLDRFile;
16 import org.unicode.cldr.util.ChainedMap;
17 import org.unicode.cldr.util.ChainedMap.M3;
18 import org.unicode.cldr.util.Containment;
19 import org.unicode.cldr.util.LanguageTagParser;
20 import org.unicode.cldr.util.StandardCodes;
21 import org.unicode.cldr.util.SupplementalDataInfo;
22 
23 import com.ibm.icu.dev.test.TestFmwk;
24 import com.ibm.icu.lang.UCharacter;
25 import com.ibm.icu.lang.UProperty;
26 import com.ibm.icu.lang.UScript;
27 import com.ibm.icu.text.UnicodeSet;
28 import com.ibm.icu.util.VersionInfo;
29 
30 public class LikelySubtagsTest extends TestFmwk {
31 
32     private boolean DEBUG = false;
33     private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = CLDRConfig
34         .getInstance().getSupplementalDataInfo();
35     static final Map<String, String> likely = SUPPLEMENTAL_DATA_INFO
36         .getLikelySubtags();
37     static final LikelySubtags LIKELY = new LikelySubtags(
38         SUPPLEMENTAL_DATA_INFO, likely);
39 
main(String[] args)40     public static void main(String[] args) {
41         new LikelySubtagsTest().run(args);
42     }
43 
44     static class Tags {
45         final Set<String> languages = new TreeSet<String>();
46         final Set<String> scripts = new TreeSet<String>();
47         final Set<String> regions = new TreeSet<String>();
48         final Set<String> scriptRegion = new TreeSet<String>();
49         final Set<String> languageScript = new TreeSet<String>();
50         final Set<String> languageRegion = new TreeSet<String>();
51         final Set<String> all = new TreeSet<String>();
52         final ChainedMap.M4<String, String, String, Boolean> languageToScriptToRegions = ChainedMap
53             .of(new TreeMap<String, Object>(),
54                 new TreeMap<String, Object>(),
55                 new TreeMap<String, Object>(), Boolean.class);
56         final ChainedMap.M3<String, String, Boolean> languageToRegions = ChainedMap
57             .of(new TreeMap<String, Object>(),
58                 new TreeMap<String, Object>(), Boolean.class);
59 
Tags()60         public Tags() {
61             final LanguageTagParser ltp = new LanguageTagParser();
62             for (Entry<String, String> entry : likely.entrySet()) {
63                 add(ltp.set(entry.getKey()), true);
64                 add(ltp.set(entry.getValue()), false);
65             }
66             // add unfamiliar script, unfamiliar region
67             for (String lang : languageToScriptToRegions.keySet()) {
68                 if (lang.equals("und")) {
69                     continue;
70                 }
71                 M3<String, String, Boolean> scriptToRegion = languageToScriptToRegions
72                     .get(lang);
73                 final Set<String> scriptsFor = scriptToRegion.keySet();
74                 final Set<String> regionsFor = languageToRegions.get(lang)
75                     .keySet();
76 
77                 String firstScriptNotIn = getNonEmptyNotIn(scripts, scriptsFor);
78                 String firstRegionNotIn = getNonEmptyNotIn(regions, regionsFor);
79 
80                 languageToScriptToRegions.put(lang, firstScriptNotIn,
81                     firstRegionNotIn, Boolean.TRUE);
82                 // clone for safety before iterating
83                 for (String script : new HashSet<String>(scriptsFor)) {
84                     languageToScriptToRegions.put(lang, script,
85                         firstRegionNotIn, Boolean.TRUE);
86                 }
87                 for (String region : new HashSet<String>(regionsFor)) {
88                     languageToScriptToRegions.put(lang, firstScriptNotIn,
89                         region, Boolean.TRUE);
90                 }
91             }
92 
93             // System.out.println("all: " + all);
94             // System.out.println("scriptRegion: " + scriptRegion);
95             // System.out.println("languageScript: " + languageScript);
96             // System.out.println("languageRegion: " + languageRegion);
97         }
98 
getNonEmptyNotIn(Iterable<T> a, Set<T> b)99         private static <T> T getNonEmptyNotIn(Iterable<T> a, Set<T> b) {
100             for (T x : a) {
101                 if (!b.contains(x) && !x.toString().isEmpty()) {
102                     return x;
103                 }
104             }
105             throw new IllegalArgumentException();
106         }
107 
add(LanguageTagParser ltp, boolean source)108         void add(LanguageTagParser ltp, boolean source) {
109             String sourceLanguage = ltp.getLanguage();
110             String sourceScript = ltp.getScript();
111             String sourceRegion = ltp.getRegion();
112             languageToScriptToRegions.put(sourceLanguage, sourceScript,
113                 sourceRegion, Boolean.TRUE);
114             languageToScriptToRegions.put(sourceLanguage, sourceScript, "",
115                 Boolean.TRUE);
116             languageToScriptToRegions.put(sourceLanguage, "", "", Boolean.TRUE);
117             languageToRegions.put(sourceLanguage, "", Boolean.TRUE);
118             if (StandardCodes.isCountry(sourceRegion)) {
119                 languageToScriptToRegions.put(sourceLanguage, "", sourceRegion,
120                     Boolean.TRUE);
121                 languageToRegions.put(sourceLanguage, sourceRegion,
122                     Boolean.TRUE);
123             }
124 
125             // capture all cases of 2 items
126             if (source) {
127                 if (!sourceScript.isEmpty() && !sourceRegion.isEmpty()) {
128                     if (!sourceLanguage.equals("und")) {
129                         all.add(ltp.toString());
130                     } else {
131                         scriptRegion.add(ltp.toString());
132                     }
133                 } else if (!sourceLanguage.equals("und")) {
134                     if (!sourceScript.isEmpty()) {
135                         languageScript.add(ltp.toString());
136                     } else if (!sourceRegion.isEmpty()) {
137                         languageRegion.add(ltp.toString());
138                     }
139                 }
140             }
141             languages.add(sourceLanguage);
142             scripts.add(sourceScript);
143             if (StandardCodes.isCountry(sourceRegion) || sourceRegion.isEmpty()) {
144                 regions.add(sourceRegion);
145             }
146         }
147     }
148 
149     static final Tags TAGS = new Tags();
150 
151     final LanguageTagParser maxLtp = new LanguageTagParser();
152     final LanguageTagParser sourceLtp = new LanguageTagParser();
153 
154     /**
155      * Return false if we should skip the language
156      *
157      * @param source
158      * @return
159      */
checkAdding(String source)160     public boolean checkAdding(String source) {
161         // if X maps to Y, then adding a field from Y to X will still map to Y
162         // Example:
163         // und_AF => fa_Arab_AF
164         // therefore, the following should also be true:
165         // und_Arab_AF => fa_Arab_AF
166         // fa_AF => fa_Arab_AF
167         // fa_Arab_AF => fa_Arab_AF
168 
169         String max = LIKELY.maximize(source);
170         if (!assertNotEquals("Maximize " + source, null, max)) {
171             return source.contains("_");
172         }
173         sourceLtp.set(source);
174         if (!sourceLtp.getRegion().isEmpty()
175             && !StandardCodes.isCountry(sourceLtp.getRegion())) {
176             return true;
177         }
178         maxLtp.set(max);
179         for (int i = 1; i < 8; ++i) {
180             if ((i & 1) != 0) {
181                 if (!sourceLtp.getLanguage().equals("und"))
182                     continue;
183                 sourceLtp.setLanguage(maxLtp.getLanguage());
184             }
185             if ((i & 2) != 0) {
186                 if (!sourceLtp.getScript().isEmpty())
187                     continue;
188                 sourceLtp.setScript(maxLtp.getScript());
189             }
190             if ((i & 4) != 0) {
191                 if (!sourceLtp.getRegion().isEmpty())
192                     continue;
193                 sourceLtp.setRegion(maxLtp.getRegion());
194             }
195             String test = sourceLtp.toString();
196             final String maximize = LIKELY.maximize(test);
197             if (!max.equals(maximize)) {
198                 if (!assertEquals(source + " -> " + max + ", so testing "
199                     + test, max, maximize)) {
200                     LIKELY.maximize(test); // do again for debugging
201                 }
202             }
203             sourceLtp.set(source); // restore
204         }
205         return true;
206     }
207 
TestCompleteness()208     public void TestCompleteness() {
209         // if (logKnownIssue("Cldrbug:7121",
210         // "Problems with likely subtags test")) {
211         // return;
212         // }
213         // checkAdding("und_Bopo");
214         // checkAdding("und_Brai");
215         // checkAdding("und_Limb");
216         // checkAdding("und_Cakm");
217         // checkAdding("und_Shaw");
218 
219         final LanguageTagParser ltp = new LanguageTagParser();
220         if (DEBUG) {
221             System.out.println(TAGS.languages.size() + "\t" + TAGS.languages);
222             System.out.println(TAGS.scripts.size() + "\t" + TAGS.scripts);
223             System.out.println(TAGS.regions.size() + "\t" + TAGS.regions);
224         }
225         main: for (Entry<String, Map<String, Map<String, Boolean>>> languageScriptRegion : TAGS.languageToScriptToRegions) {
226             String language = languageScriptRegion.getKey();
227             ltp.set(language); // clears script, region
228             for (Entry<String, Map<String, Boolean>> scriptRegion : languageScriptRegion
229                 .getValue().entrySet()) {
230                 String script = scriptRegion.getKey();
231                 ltp.setScript(script);
232                 for (String region : scriptRegion.getValue().keySet()) {
233                     ltp.setRegion(region);
234                     String testTag = ltp.toString();
235                     // System.out.println(testTag);
236                     if (!checkAdding(testTag)) {
237                         continue main;
238                     }
239                 }
240             }
241         }
242     }
243 
244     static Set<String> exceptions = new HashSet<String>(Arrays.asList("Zyyy",
245         "Zinh", "Zzzz", "Brai"));
246 
TestStability()247     public void TestStability() {
248         // when maximized must never change
249         // first get all the subtags
250         // then test all the combinations
251         LanguageTagParser ltp = new LanguageTagParser();
252         for (Entry<String, String> entry : likely.entrySet()) {
253             ltp.set(entry.getKey());
254             String sourceLanguage = ltp.getLanguage();
255             if (sourceLanguage.equals("und")) {
256                 sourceLanguage = "";
257             }
258             String sourceScript = ltp.getScript();
259             String sourceRegion = ltp.getRegion();
260             ltp.set(entry.getValue());
261             String targetLanguage = ltp.getLanguage();
262             String targetScript = ltp.getScript();
263             String targetRegion = ltp.getRegion();
264             if (!sourceLanguage.isEmpty()) {
265                 assertEquals("language", sourceLanguage, targetLanguage);
266             }
267             if (!sourceScript.isEmpty()) {
268                 assertEquals("script", sourceScript, targetScript);
269             }
270             if (!sourceRegion.isEmpty()) {
271                 if (Containment.isLeaf(sourceRegion)) {
272                     assertEquals("region", sourceRegion, targetRegion);
273                 }
274             }
275         }
276 
277     }
278 
TestForMissingScriptMetadata()279     public void TestForMissingScriptMetadata() {
280         TreeSet<String> metadataScripts = new TreeSet<String>(
281             ScriptMetadata.getScripts());
282         UnicodeSet current = new UnicodeSet(0, 0x10FFFF);
283         UnicodeSet toRemove = new UnicodeSet();
284 
285         while (!current.isEmpty()) {
286             int ch = current.charAt(0);
287             int script = UScript.getScript(ch);
288             String shortName = UScript.getShortName(script);
289             Info i = ScriptMetadata.getInfo(shortName);
290             if (i == null) {
291                 errln("Script Metadata is missing: " + shortName);
292                 continue;
293             }
294             if (i.likelyLanguage.equals("und")
295                 && !exceptions.contains(shortName)) {
296                 errln("Script has no likely language: " + shortName);
297             }
298             toRemove.applyIntPropertyValue(UProperty.SCRIPT, script);
299             current.removeAll(toRemove);
300             metadataScripts.remove(shortName);
301         }
302         metadataScripts
303             .removeAll(Arrays.asList("Hans", "Hant", "Hanb", "Jamo", "Jpan", "Kore")); // remove
304         // "combo"
305         // scripts
306         if (!metadataScripts.isEmpty()) {
307             // Warning, not error, so that we can add scripts to the script metadata
308             // and later update to the Unicode version that has characters for those scripts.
309             warnln("Script Metadata for characters not in Unicode: "
310                 + metadataScripts);
311         }
312     }
313 
TestMissingInfoForLanguage()314     public void TestMissingInfoForLanguage() {
315         CLDRFile english = CLDRConfig.getInstance().getEnglish();
316 
317         for (String language : CLDRConfig.getInstance().getCldrFactory()
318             .getAvailableLanguages()) {
319             if (language.contains("_") || language.equals("root")) {
320                 continue;
321             }
322             String likelyExpansion = likely.get(language);
323             if (likelyExpansion == null) {
324                 errln("Missing likely subtags for: " + language);
325             } else {
326                 logln("Likely subtags for " + language + ":\t " + likely);
327             }
328             String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language);
329             String englishName = english.getStringValue(path);
330             if (englishName == null) {
331                 errln("Missing English translation for: " + language);
332             }
333         }
334     }
335 
TestMissingInfoForRegion()336     public void TestMissingInfoForRegion() {
337         CLDRFile english = CLDRConfig.getInstance().getEnglish();
338 
339         for (String region : StandardCodes.make().getGoodAvailableCodes(
340             "territory")) {
341             String likelyExpansion = likely.get("und_" + region);
342             if (likelyExpansion == null) {
343                 if (region.equals("ZZ") || region.equals("001") || region.equals("UN")
344                     || SUPPLEMENTAL_DATA_INFO.getContained(region) == null) { // not
345                     // container
346                     String likelyTag = LikelySubtags.maximize("und_" + region,
347                         likely);
348                     if (likelyTag == null || !likelyTag.startsWith("en_Latn_")) {
349                         errln("Missing likely subtags for region: " + region
350                             + "\t" + english.getName("territory", region));
351                     }
352                 } else { // container
353                     errln("Missing likely subtags for macroregion (fix to exclude regions having 'en'): "
354                         + region
355                         + "\t"
356                         + english.getName("territory", region));
357                 }
358             } else {
359                 logln("Likely subtags for region: " + region + ":\t " + likely);
360             }
361             String path = CLDRFile.getKey(CLDRFile.TERRITORY_NAME, region);
362             String englishName = english.getStringValue(path);
363             if (englishName == null) {
364                 errln("Missing English translation for: " + region);
365             }
366         }
367     }
368 
TestMissingInfoForScript()369     public void TestMissingInfoForScript() {
370         VersionInfo icuUnicodeVersion = UCharacter.getUnicodeVersion();
371         TreeSet<String> sorted = new TreeSet<String>(
372             ScriptMetadata.getScripts());
373         Set<String> exceptions2 = new HashSet<String>(
374             Arrays.asList("zh_Hans_CN"));
375         for (String script : sorted) {
376             if (exceptions.contains(script) || script.equals("Latn")
377                 || script.equals("Dsrt")) {
378                 // we minimize away und_X, when the code puts in en...US
379                 continue;
380             }
381             Info i = ScriptMetadata.getInfo(script);
382             // System.out.println(i);
383             String likelyLanguage = i.likelyLanguage;
384             String originCountry = i.originCountry;
385             String undScript = "und_" + script;
386             String langScript = likelyLanguage + "_" + script + "_";
387             String likelyExpansion = likely.get(undScript);
388             if (likelyExpansion == null) {
389                 String msg = "Missing likely language for script (und_" + script
390                     + ")  should be something like:\t "
391                     + showOverride(script, originCountry, langScript);
392                 if (i.age.compareTo(icuUnicodeVersion) <= 0) {
393                     // Error: Missing data for a script in ICU's Unicode version.
394                     errln(msg);
395                 } else {
396                     // Warning: Missing data for a script in a future Unicode version.
397                     warnln(msg);
398                 }
399             } else if (!exceptions2.contains(likelyExpansion)
400                 && !likelyExpansion.startsWith(langScript)) {
401                 // if
402                 // (logKnownIssue("Cldrbug:7181","Missing script metadata for "
403                 // + script)
404                 // && (script.equals("Tfng") || script.equals("Brah"))) {
405                 // logln("Wrong likely language for script (und_" + script +
406                 // "). Should not be " + likelyExpansion
407                 // + ", but something like:\t " + showOverride(script,
408                 // originCountry, langScript));
409                 // } else {
410                 errln("Wrong likely language for script (und_" + script
411                     + "). Should not be " + likelyExpansion
412                     + ", but something like:\t "
413                     + showOverride(script, originCountry, langScript));
414                 // }
415             } else {
416                 logln("OK: " + undScript + " => " + likelyExpansion);
417             }
418         }
419         /**
420          * und_Bopo => zh_Bopo_TW und_Copt => cop_Copt_EG // fix 002 und_Dsrt =>
421          * en_Dsrt_US // fix US
422          */
423     }
424 
showOverride(String script, String originCountry, String langScript)425     public String showOverride(String script, String originCountry,
426         String langScript) {
427         return "{\"und_" + script + "\", \"" + langScript + originCountry
428             + "\"},";
429     }
430 }
431