1 package org.unicode.cldr.unittest;
2 
3 import java.util.Arrays;
4 import java.util.HashSet;
5 import java.util.Map;
6 import java.util.Map.Entry;
7 import java.util.Set;
8 import java.util.TreeMap;
9 import java.util.TreeSet;
10 
11 import org.unicode.cldr.draft.ScriptMetadata;
12 import org.unicode.cldr.draft.ScriptMetadata.Info;
13 import org.unicode.cldr.tool.LikelySubtags;
14 import org.unicode.cldr.util.CLDRConfig;
15 import org.unicode.cldr.util.CLDRFile;
16 import org.unicode.cldr.util.ChainedMap;
17 import org.unicode.cldr.util.ChainedMap.M3;
18 import org.unicode.cldr.util.Containment;
19 import org.unicode.cldr.util.LanguageTagParser;
20 import org.unicode.cldr.util.StandardCodes;
21 import org.unicode.cldr.util.SupplementalDataInfo;
22 
23 import com.ibm.icu.dev.test.TestFmwk;
24 import com.ibm.icu.lang.UCharacter;
25 import com.ibm.icu.lang.UProperty;
26 import com.ibm.icu.lang.UScript;
27 import com.ibm.icu.text.UnicodeSet;
28 import com.ibm.icu.util.VersionInfo;
29 
30 public class LikelySubtagsTest extends TestFmwk {
31 
    // When true, TestCompleteness prints the collected language, script and region sets.
    private boolean DEBUG = false;
    // Supplemental data, fetched once from the shared CLDRConfig instance.
    private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = CLDRConfig
        .getInstance().getSupplementalDataInfo();
    // Likely-subtags table taken from the supplemental data (source tag -> target tag).
    static final Map<String, String> likely = SUPPLEMENTAL_DATA_INFO
        .getLikelySubtags();
    // Maximizer used by checkAdding to expand the tags under test.
    static final LikelySubtags LIKELY = new LikelySubtags();
38 
main(String[] args)39     public static void main(String[] args) {
40         new LikelySubtagsTest().run(args);
41     }
42 
43     static class Tags {
        // Every language subtag passed to add().
        final Set<String> languages = new TreeSet<String>();
        // Every script subtag passed to add(); includes "" for tags without a script.
        final Set<String> scripts = new TreeSet<String>();
        // Region subtags that are either empty or accepted by StandardCodes.isCountry.
        final Set<String> regions = new TreeSet<String>();
        // Source tags with language "und" that carry both a script and a region.
        final Set<String> scriptRegion = new TreeSet<String>();
        // Source tags with a language other than "und" and a script, but no region.
        final Set<String> languageScript = new TreeSet<String>();
        // Source tags with a language other than "und" and a region, but no script.
        final Set<String> languageRegion = new TreeSet<String>();
        // Source tags with a language other than "und", a script and a region.
        final Set<String> all = new TreeSet<String>();
        // language -> script -> region combinations; iterated by TestCompleteness.
        final ChainedMap.M4<String, String, String, Boolean> languageToScriptToRegions = ChainedMap
            .of(new TreeMap<String, Object>(),
                new TreeMap<String, Object>(),
                new TreeMap<String, Object>(), Boolean.class);
        // language -> regions seen with that language ("" is always present).
        final ChainedMap.M3<String, String, Boolean> languageToRegions = ChainedMap
            .of(new TreeMap<String, Object>(),
                new TreeMap<String, Object>(), Boolean.class);
58 
Tags()59         public Tags() {
60             final LanguageTagParser ltp = new LanguageTagParser();
61             for (Entry<String, String> entry : likely.entrySet()) {
62                 add(ltp.set(entry.getKey()), true);
63                 add(ltp.set(entry.getValue()), false);
64             }
65             // add unfamiliar script, unfamiliar region
66             for (String lang : languageToScriptToRegions.keySet()) {
67                 if (lang.equals("und")) {
68                     continue;
69                 }
70                 M3<String, String, Boolean> scriptToRegion = languageToScriptToRegions
71                     .get(lang);
72                 final Set<String> scriptsFor = scriptToRegion.keySet();
73                 final Set<String> regionsFor = languageToRegions.get(lang)
74                     .keySet();
75 
76                 String firstScriptNotIn = getNonEmptyNotIn(scripts, scriptsFor);
77                 String firstRegionNotIn = getNonEmptyNotIn(regions, regionsFor);
78 
79                 languageToScriptToRegions.put(lang, firstScriptNotIn,
80                     firstRegionNotIn, Boolean.TRUE);
81                 // clone for safety before iterating
82                 for (String script : new HashSet<String>(scriptsFor)) {
83                     languageToScriptToRegions.put(lang, script,
84                         firstRegionNotIn, Boolean.TRUE);
85                 }
86                 for (String region : new HashSet<String>(regionsFor)) {
87                     languageToScriptToRegions.put(lang, firstScriptNotIn,
88                         region, Boolean.TRUE);
89                 }
90             }
91 
92             // System.out.println("all: " + all);
93             // System.out.println("scriptRegion: " + scriptRegion);
94             // System.out.println("languageScript: " + languageScript);
95             // System.out.println("languageRegion: " + languageRegion);
96         }
97 
getNonEmptyNotIn(Iterable<T> a, Set<T> b)98         private static <T> T getNonEmptyNotIn(Iterable<T> a, Set<T> b) {
99             for (T x : a) {
100                 if (!b.contains(x) && !x.toString().isEmpty()) {
101                     return x;
102                 }
103             }
104             throw new IllegalArgumentException();
105         }
106 
add(LanguageTagParser ltp, boolean source)107         void add(LanguageTagParser ltp, boolean source) {
108             String sourceLanguage = ltp.getLanguage();
109             String sourceScript = ltp.getScript();
110             String sourceRegion = ltp.getRegion();
111             languageToScriptToRegions.put(sourceLanguage, sourceScript,
112                 sourceRegion, Boolean.TRUE);
113             languageToScriptToRegions.put(sourceLanguage, sourceScript, "",
114                 Boolean.TRUE);
115             languageToScriptToRegions.put(sourceLanguage, "", "", Boolean.TRUE);
116             languageToRegions.put(sourceLanguage, "", Boolean.TRUE);
117             if (StandardCodes.isCountry(sourceRegion)) {
118                 languageToScriptToRegions.put(sourceLanguage, "", sourceRegion,
119                     Boolean.TRUE);
120                 languageToRegions.put(sourceLanguage, sourceRegion,
121                     Boolean.TRUE);
122             }
123 
124             // capture all cases of 2 items
125             if (source) {
126                 if (!sourceScript.isEmpty() && !sourceRegion.isEmpty()) {
127                     if (!sourceLanguage.equals("und")) {
128                         all.add(ltp.toString());
129                     } else {
130                         scriptRegion.add(ltp.toString());
131                     }
132                 } else if (!sourceLanguage.equals("und")) {
133                     if (!sourceScript.isEmpty()) {
134                         languageScript.add(ltp.toString());
135                     } else if (!sourceRegion.isEmpty()) {
136                         languageRegion.add(ltp.toString());
137                     }
138                 }
139             }
140             languages.add(sourceLanguage);
141             scripts.add(sourceScript);
142             if (StandardCodes.isCountry(sourceRegion) || sourceRegion.isEmpty()) {
143                 regions.add(sourceRegion);
144             }
145         }
146     }
147 
    // Subtag combinations gathered once from the likely-subtags table.
    static final Tags TAGS = new Tags();

    // Scratch parsers used by checkAdding: maxLtp holds the maximized tag,
    // sourceLtp holds the source tag while fields are being filled in.
    final LanguageTagParser maxLtp = new LanguageTagParser();
    final LanguageTagParser sourceLtp = new LanguageTagParser();
152 
153     /**
154      * Return false if we should skip the language
155      *
156      * @param source
157      * @return
158      */
checkAdding(String source)159     public boolean checkAdding(String source) {
160         // if X maps to Y, then adding a field from Y to X will still map to Y
161         // Example:
162         // und_AF => fa_Arab_AF
163         // therefore, the following should also be true:
164         // und_Arab_AF => fa_Arab_AF
165         // fa_AF => fa_Arab_AF
166         // fa_Arab_AF => fa_Arab_AF
167 
168         String max = LIKELY.maximize(source);
169         if (!assertNotEquals("Maximize " + source, null, max)) {
170             return source.contains("_");
171         }
172         sourceLtp.set(source);
173         if (!sourceLtp.getRegion().isEmpty()
174             && !StandardCodes.isCountry(sourceLtp.getRegion())) {
175             return true;
176         }
177         maxLtp.set(max);
178         for (int i = 1; i < 8; ++i) {
179             if ((i & 1) != 0) {
180                 if (!sourceLtp.getLanguage().equals("und"))
181                     continue;
182                 sourceLtp.setLanguage(maxLtp.getLanguage());
183             }
184             if ((i & 2) != 0) {
185                 if (!sourceLtp.getScript().isEmpty())
186                     continue;
187                 sourceLtp.setScript(maxLtp.getScript());
188             }
189             if ((i & 4) != 0) {
190                 if (!sourceLtp.getRegion().isEmpty())
191                     continue;
192                 sourceLtp.setRegion(maxLtp.getRegion());
193             }
194             String test = sourceLtp.toString();
195             final String maximize = LIKELY.maximize(test);
196             if (!max.equals(maximize)) {
197                 if (!assertEquals(source + " -> " + max + ", so testing "
198                     + test, max, maximize)) {
199                     LIKELY.maximize(test); // do again for debugging
200                 }
201             }
202             sourceLtp.set(source); // restore
203         }
204         return true;
205     }
206 
TestCompleteness()207     public void TestCompleteness() {
208         // if (logKnownIssue("Cldrbug:7121",
209         // "Problems with likely subtags test")) {
210         // return;
211         // }
212         // checkAdding("und_Bopo");
213         // checkAdding("und_Brai");
214         // checkAdding("und_Limb");
215         // checkAdding("und_Cakm");
216         // checkAdding("und_Shaw");
217 
218         final LanguageTagParser ltp = new LanguageTagParser();
219         if (DEBUG) {
220             System.out.println(TAGS.languages.size() + "\t" + TAGS.languages);
221             System.out.println(TAGS.scripts.size() + "\t" + TAGS.scripts);
222             System.out.println(TAGS.regions.size() + "\t" + TAGS.regions);
223         }
224         main: for (Entry<String, Map<String, Map<String, Boolean>>> languageScriptRegion : TAGS.languageToScriptToRegions) {
225             String language = languageScriptRegion.getKey();
226             ltp.set(language); // clears script, region
227             for (Entry<String, Map<String, Boolean>> scriptRegion : languageScriptRegion
228                 .getValue().entrySet()) {
229                 String script = scriptRegion.getKey();
230                 ltp.setScript(script);
231                 for (String region : scriptRegion.getValue().keySet()) {
232                     ltp.setRegion(region);
233                     String testTag = ltp.toString();
234                     // System.out.println(testTag);
235                     if (!checkAdding(testTag)) {
236                         continue main;
237                     }
238                 }
239             }
240         }
241     }
242 
    // Script codes exempted by the script tests: TestForMissingScriptMetadata
    // does not require a likely language for them, and TestMissingInfoForScript
    // skips them altogether.
    static Set<String> exceptions = new HashSet<String>(Arrays.asList("Zyyy",
        "Zinh", "Zzzz", "Brai"));
245 
TestStability()246     public void TestStability() {
247         // when maximized must never change
248         // first get all the subtags
249         // then test all the combinations
250         LanguageTagParser ltp = new LanguageTagParser();
251         for (Entry<String, String> entry : likely.entrySet()) {
252             ltp.set(entry.getKey());
253             String sourceLanguage = ltp.getLanguage();
254             if (sourceLanguage.equals("und")) {
255                 sourceLanguage = "";
256             }
257             String sourceScript = ltp.getScript();
258             String sourceRegion = ltp.getRegion();
259             ltp.set(entry.getValue());
260             String targetLanguage = ltp.getLanguage();
261             String targetScript = ltp.getScript();
262             String targetRegion = ltp.getRegion();
263             if (!sourceLanguage.isEmpty()) {
264                 assertEquals("language", sourceLanguage, targetLanguage);
265             }
266             if (!sourceScript.isEmpty()) {
267                 assertEquals("script", sourceScript, targetScript);
268             }
269             if (!sourceRegion.isEmpty()) {
270                 if (Containment.isLeaf(sourceRegion)) {
271                     assertEquals("region", sourceRegion, targetRegion);
272                 }
273             }
274         }
275 
276     }
277 
TestForMissingScriptMetadata()278     public void TestForMissingScriptMetadata() {
279         TreeSet<String> metadataScripts = new TreeSet<String>(
280             ScriptMetadata.getScripts());
281         UnicodeSet current = new UnicodeSet(0, 0x10FFFF);
282         UnicodeSet toRemove = new UnicodeSet();
283 
284         while (!current.isEmpty()) {
285             int ch = current.charAt(0);
286             int script = UScript.getScript(ch);
287             String shortName = UScript.getShortName(script);
288             Info i = ScriptMetadata.getInfo(shortName);
289             if (i == null) {
290                 errln("Script Metadata is missing: " + shortName);
291                 continue;
292             }
293             if (i.likelyLanguage.equals("und")
294                 && !exceptions.contains(shortName)) {
295                 errln("Script has no likely language: " + shortName);
296             }
297             toRemove.applyIntPropertyValue(UProperty.SCRIPT, script);
298             current.removeAll(toRemove);
299             metadataScripts.remove(shortName);
300         }
301         metadataScripts
302             .removeAll(Arrays.asList("Hans", "Hant", "Hanb", "Jamo", "Jpan", "Kore")); // remove
303         // "combo"
304         // scripts
305         if (!metadataScripts.isEmpty()) {
306             // Warning, not error, so that we can add scripts to the script metadata
307             // and later update to the Unicode version that has characters for those scripts.
308             warnln("Script Metadata for characters not in Unicode: "
309                 + metadataScripts);
310         }
311     }
312 
TestMissingInfoForLanguage()313     public void TestMissingInfoForLanguage() {
314         CLDRFile english = CLDRConfig.getInstance().getEnglish();
315 
316         for (String language : CLDRConfig.getInstance().getCldrFactory()
317             .getAvailableLanguages()) {
318             if (language.contains("_") || language.equals("root")) {
319                 continue;
320             }
321             String likelyExpansion = likely.get(language);
322             if (likelyExpansion == null) {
323                 errln("Missing likely subtags for: " + language);
324             } else {
325                 logln("Likely subtags for " + language + ":\t " + likely);
326             }
327             String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language);
328             String englishName = english.getStringValue(path);
329             if (englishName == null) {
330                 errln("Missing English translation for: " + language);
331             }
332         }
333     }
334 
TestMissingInfoForRegion()335     public void TestMissingInfoForRegion() {
336         CLDRFile english = CLDRConfig.getInstance().getEnglish();
337 
338         for (String region : StandardCodes.make().getGoodAvailableCodes(
339             "territory")) {
340             String likelyExpansion = likely.get("und_" + region);
341             if (likelyExpansion == null) {
342                 if (region.equals("ZZ") || region.equals("001") || region.equals("UN")
343                     || SUPPLEMENTAL_DATA_INFO.getContained(region) == null) { // not
344                     // container
345                     String likelyTag = LikelySubtags.maximize("und_" + region,
346                         likely);
347                     if (likelyTag == null || !likelyTag.startsWith("en_Latn_")) {
348                         errln("Missing likely subtags for region: " + region
349                             + "\t" + english.getName("territory", region));
350                     }
351                 } else { // container
352                     errln("Missing likely subtags for macroregion (fix to exclude regions having 'en'): "
353                         + region
354                         + "\t"
355                         + english.getName("territory", region));
356                 }
357             } else {
358                 logln("Likely subtags for region: " + region + ":\t " + likely);
359             }
360             String path = CLDRFile.getKey(CLDRFile.TERRITORY_NAME, region);
361             String englishName = english.getStringValue(path);
362             if (englishName == null) {
363                 errln("Missing English translation for: " + region);
364             }
365         }
366     }
367 
TestMissingInfoForScript()368     public void TestMissingInfoForScript() {
369         VersionInfo icuUnicodeVersion = UCharacter.getUnicodeVersion();
370         TreeSet<String> sorted = new TreeSet<String>(
371             ScriptMetadata.getScripts());
372         Set<String> exceptions2 = new HashSet<String>(
373             Arrays.asList("zh_Hans_CN"));
374         for (String script : sorted) {
375             if (exceptions.contains(script) || script.equals("Latn")
376                 || script.equals("Dsrt")) {
377                 // we minimize away und_X, when the code puts in en...US
378                 continue;
379             }
380             Info i = ScriptMetadata.getInfo(script);
381             // System.out.println(i);
382             String likelyLanguage = i.likelyLanguage;
383             String originCountry = i.originCountry;
384             String undScript = "und_" + script;
385             String langScript = likelyLanguage + "_" + script + "_";
386             String likelyExpansion = likely.get(undScript);
387             if (likelyExpansion == null) {
388                 String msg = "Missing likely language for script (und_" + script
389                     + ")  should be something like:\t "
390                     + showOverride(script, originCountry, langScript);
391                 if (i.age.compareTo(icuUnicodeVersion) <= 0) {
392                     // Error: Missing data for a script in ICU's Unicode version.
393                     errln(msg);
394                 } else {
395                     // Warning: Missing data for a script in a future Unicode version.
396                     warnln(msg);
397                 }
398             } else if (!exceptions2.contains(likelyExpansion)
399                 && !likelyExpansion.startsWith(langScript)) {
400                 // if
401                 // (logKnownIssue("Cldrbug:7181","Missing script metadata for "
402                 // + script)
403                 // && (script.equals("Tfng") || script.equals("Brah"))) {
404                 // logln("Wrong likely language for script (und_" + script +
405                 // "). Should not be " + likelyExpansion
406                 // + ", but something like:\t " + showOverride(script,
407                 // originCountry, langScript));
408                 // } else {
409                 errln("Wrong likely language for script (und_" + script
410                     + "). Should not be " + likelyExpansion
411                     + ", but something like:\t "
412                     + showOverride(script, originCountry, langScript));
413                 // }
414             } else {
415                 logln("OK: " + undScript + " => " + likelyExpansion);
416             }
417         }
418         /**
419          * und_Bopo => zh_Bopo_TW und_Copt => cop_Copt_EG // fix 002 und_Dsrt =>
420          * en_Dsrt_US // fix US
421          */
422     }
423 
showOverride(String script, String originCountry, String langScript)424     public String showOverride(String script, String originCountry,
425         String langScript) {
426         return "{\"und_" + script + "\", \"" + langScript + originCountry
427             + "\"},";
428     }
429 }
430