1 package org.unicode.cldr.tool;
2 
3 import java.util.Collections;
4 import java.util.HashMap;
5 import java.util.HashSet;
6 import java.util.LinkedHashMap;
7 import java.util.LinkedHashSet;
8 import java.util.List;
9 import java.util.Locale;
10 import java.util.Map;
11 import java.util.Map.Entry;
12 import java.util.Set;
13 import java.util.TreeMap;
14 import java.util.TreeSet;
15 
16 import org.unicode.cldr.util.Builder;
17 import org.unicode.cldr.util.CLDRConfig;
18 import org.unicode.cldr.util.CldrUtility;
19 import org.unicode.cldr.util.LanguageTagParser;
20 import org.unicode.cldr.util.StandardCodes;
21 import org.unicode.cldr.util.StringIterables;
22 
23 import com.ibm.icu.impl.Row.R2;
24 import com.ibm.icu.util.ULocale;
25 
26 public class LanguageCodeConverter {
27     private static Map<String, String> languageNameToCode = new TreeMap<>();
28     private static Set<String> exceptionCodes = new TreeSet<>();
29     private static Set<String> parseErrors = new LinkedHashSet<>();
30 
31     private static Map<String, R2<List<String>, String>> languageAliases = CLDRConfig.getInstance().getSupplementalDataInfo().getLocaleAliasInfo()
32         .get("language");
33 
34     /**
35      * Public only for testing.
36      *
37      * @internal
38      */
39     public static final Map<String, String> GOOGLE_CLDR = Builder.with(new LinkedHashMap<String, String>()) // preserve order
40         .put("iw", "he")
41         .put("jw", "jv")
42         .put("no", "nb")
43         .put("tl", "fil")
44         .put("pt-BR", "pt")
45         .put("xx-bork", "x_bork")
46         .put("xx-elmer", "x_elmer")
47         .put("xx-hacker", "x_hacker")
48         .put("xx-pirate", "x_pirate")
49         .put("xx-klingon", "tlh")
50         .put("zh-CN", "zh")
51         .put("zh-TW", "zh_Hant")
52         .put("zh-HK", "zh_Hant_HK")
53         .put("sit-NP", "lif")
54         .put("ut", "und")
55         .put("un", "und")
56         .put("xx", "und")
57 
58         // .put("sh", "fil")
59         .freeze();
60 
61     /**
62      * Public only for testing.
63      *
64      * @internal
65      */
66     public static final Map<String, String> CLDR_GOOGLE = Builder.with(new HashMap<String, String>())
67         .putAllTransposed(GOOGLE_CLDR)
68         .freeze();
69 
70     /**
71      * Public only for testing.
72      *
73      * @internal
74      */
75     public static final Map<String, String> EXTRA_SCRIPTS = Builder.with(new HashMap<String, String>())
76         .on("crs", "pcm", "tlh").put("Latn")
77         .freeze();
78 
79     static {
80         // Reads the CLDR copy of
81         // http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
82         Map<String, Map<String, Map<String, String>>> lstreg = StandardCodes.getLStreg();
83         Map<String, Map<String, String>> languages = lstreg.get("language");
84         Set<String> validCodes = new HashSet<>();
85 
86         for (Entry<String, Map<String, String>> codeInfo : languages.entrySet()) {
87             String code = codeInfo.getKey();
88             R2<List<String>, String> replacement = languageAliases.get(code);
89             // Returns "sh" -> <{"sr_Latn"}, reason>
90             if (replacement != null) {
91                 List<String> replacements = replacement.get0();
92                 if (replacements.size() != 1) {
93                     continue;
94                 }
95                 code = replacements.get(0);
96                 if (code.contains("_")) {
97                     continue;
98                 }
99             }
100             // if (languageAliases.containsKey(code)) {
101             // continue;
102             // }
103             final Map<String, String> info = codeInfo.getValue();
104             String deprecated = info.get("Deprecated");
105             if (deprecated != null) {
106                 continue;
107             }
108             String name = info.get("Description");
109             if (name.equals("Private use")) {
110                 continue;
111             }
112             validCodes.add(code);
113             if (name.contains(StandardCodes.DESCRIPTION_SEPARATOR)) {
114                 for (String namePart : name.split(StandardCodes.DESCRIPTION_SEPARATOR)) {
115                     addNameToCode("lstr", code, namePart);
116                 }
117             } else {
118                 addNameToCode("lstr", code, name);
119             }
120         }
121 
122         // CLDRFile english; // = testInfo.getEnglish();
123         for (String code : validCodes) {
124             String icuName = ULocale.getDisplayName(code, "en");
125             addNameToCode("cldr", code, icuName);
126             // if (languageAliases.containsKey(code)) {
127             // continue;
128             // }
129             // String cldrName = english.getName("language", code);
130             // if (cldrName != null && !cldrName.equals("private-use")) {
131             // addNameToCode("cldr", code, cldrName);
132             // }
133         }
134         // add exceptions
135         LanguageTagParser ltp = new LanguageTagParser();
136         for (String line : StringIterables.in(CldrUtility.getUTF8Data("external/alternate_language_names.txt"))) {
137             String[] parts = CldrUtility.cleanSemiFields(line);
138             if (parts == null || parts.length == 0) continue;
139             String code = parts[0];
140             if (!validCodes.contains(code)) {
141                 if (code.equals("*OMIT")) {
142                     parseErrors.add("Skipping " + line);
143                     continue;
144                 }
145                 String base = ltp.set(code).getLanguage();
146                 if (!validCodes.contains(base)) {
147                     R2<List<String>, String> alias = languageAliases.get(base);
148                     if (alias != null) {
149                         code = alias.get0().get(0);
150                     } else {
151                         parseErrors.add("Skipping " + line);
152                         continue;
153                     }
154                 }
155             }
toUnderbarLocale(code)156             exceptionCodes.add(toUnderbarLocale(code));
157             if (parts.length < 2) {
158                 continue;
159             }
160             String name = parts[1];
161             if (parts.length > 2) {
162                 name += ";" + parts[2]; // HACK
163             }
164             addNameToCode("exception", code, name);
165         }
166         for (String cldr : GOOGLE_CLDR.values()) {
167             String goodCode = toUnderbarLocale(cldr);
168             exceptionCodes.add(goodCode);
169         }
170         languageNameToCode = Collections.unmodifiableMap(languageNameToCode);
171         exceptionCodes = Collections.unmodifiableSet(exceptionCodes);
172         parseErrors = Collections.unmodifiableSet(parseErrors);
173     }
174 
addNameToCode(final String type, final String code, String name)175     private static void addNameToCode(final String type, final String code, String name) {
176         if (code.equals("mru") && name.equals("mru")) {
177             // mru=Mono (Cameroon)
178             // mro=Mru
179             // Ignore the CLDR mapping of the code to itself,
180             // to avoid clobbering the mapping of the real name Mru to the real code mro.
181             return;
182         }
183         name = name.toLowerCase(Locale.ENGLISH);
184         String oldCode = languageNameToCode.get(name);
185         if (oldCode != null) {
186             if (!oldCode.equals(code)) {
187                 parseErrors.add("Name Collision! " + type + ": " + name + " <" + oldCode + ", " + code + ">");
188             } else {
189                 return;
190             }
191         }
192         languageNameToCode.put(name, code);
193     }
194 
toGoogleLocaleId(String localeId)195     public static String toGoogleLocaleId(String localeId) {
196         // TODO fix to do languages, etc. field by field
197         localeId = localeId.replace("-", "_");
198         String result = CLDR_GOOGLE.get(localeId);
199         result = result == null ? localeId : result;
200         return result.replace("_", "-");
201     }
202 
fromGoogleLocaleId(String localeId)203     public static String fromGoogleLocaleId(String localeId) {
204         localeId = localeId.replace("_", "-");
205         // TODO fix to do languages, etc. field by field
206         String result = GOOGLE_CLDR.get(localeId);
207         result = result == null ? localeId : result;
208         return result.replace("-", "_");
209     }
210 
toUnderbarLocale(String localeId)211     public static String toUnderbarLocale(String localeId) {
212         return localeId.replace("-", "_");
213     }
214 
toHyphenLocale(String localeId)215     public static String toHyphenLocale(String localeId) {
216         return localeId.replace("_", "-");
217     }
218 
getCodeForName(String languageName)219     public static String getCodeForName(String languageName) {
220         return languageNameToCode.get(languageName.toLowerCase(Locale.ENGLISH));
221     }
222 
getExceptionCodes()223     public static Set<String> getExceptionCodes() {
224         return exceptionCodes;
225     }
226 
getParseErrors()227     public static Set<String> getParseErrors() {
228         return parseErrors;
229     }
230 
getLanguageNameToCode()231     public static Map<String, String> getLanguageNameToCode() {
232         return languageNameToCode;
233     }
234 
235 }
236