• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2011 Google Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are
6  * met:
7  *
8  *     * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above
11  * copyright notice, this list of conditions and the following disclaimer
12  * in the documentation and/or other materials provided with the
13  * distribution.
14  *     * Neither the name of Google Inc. nor the names of its
15  * contributors may be used to endorse or promote products derived from
16  * this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include "config.h"
32 #include "platform/text/LocaleToScriptMapping.h"
33 
34 #include "wtf/HashMap.h"
35 #include "wtf/HashSet.h"
36 #include "wtf/text/StringHash.h"
37 
38 namespace blink {
39 
scriptNameToCode(const String & scriptName)40 UScriptCode scriptNameToCode(const String& scriptName)
41 {
42     struct ScriptNameCode {
43         const char* name;
44         UScriptCode code;
45     };
46 
47     // This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are
48     // treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to
49     // USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered
50     // using the same font setting.
51     static const ScriptNameCode scriptNameCodeList[] = {
52         { "zyyy", USCRIPT_COMMON },
53         { "qaai", USCRIPT_INHERITED },
54         { "arab", USCRIPT_ARABIC },
55         { "armn", USCRIPT_ARMENIAN },
56         { "beng", USCRIPT_BENGALI },
57         { "bopo", USCRIPT_BOPOMOFO },
58         { "cher", USCRIPT_CHEROKEE },
59         { "copt", USCRIPT_COPTIC },
60         { "cyrl", USCRIPT_CYRILLIC },
61         { "dsrt", USCRIPT_DESERET },
62         { "deva", USCRIPT_DEVANAGARI },
63         { "ethi", USCRIPT_ETHIOPIC },
64         { "geor", USCRIPT_GEORGIAN },
65         { "goth", USCRIPT_GOTHIC },
66         { "grek", USCRIPT_GREEK },
67         { "gujr", USCRIPT_GUJARATI },
68         { "guru", USCRIPT_GURMUKHI },
69         { "hani", USCRIPT_HAN },
70         { "hang", USCRIPT_HANGUL },
71         { "hebr", USCRIPT_HEBREW },
72         { "hira", USCRIPT_KATAKANA_OR_HIRAGANA },
73         { "knda", USCRIPT_KANNADA },
74         { "kana", USCRIPT_KATAKANA_OR_HIRAGANA },
75         { "khmr", USCRIPT_KHMER },
76         { "laoo", USCRIPT_LAO },
77         { "latn", USCRIPT_LATIN },
78         { "mlym", USCRIPT_MALAYALAM },
79         { "mong", USCRIPT_MONGOLIAN },
80         { "mymr", USCRIPT_MYANMAR },
81         { "ogam", USCRIPT_OGHAM },
82         { "ital", USCRIPT_OLD_ITALIC },
83         { "orya", USCRIPT_ORIYA },
84         { "runr", USCRIPT_RUNIC },
85         { "sinh", USCRIPT_SINHALA },
86         { "syrc", USCRIPT_SYRIAC },
87         { "taml", USCRIPT_TAMIL },
88         { "telu", USCRIPT_TELUGU },
89         { "thaa", USCRIPT_THAANA },
90         { "thai", USCRIPT_THAI },
91         { "tibt", USCRIPT_TIBETAN },
92         { "cans", USCRIPT_CANADIAN_ABORIGINAL },
93         { "yiii", USCRIPT_YI },
94         { "tglg", USCRIPT_TAGALOG },
95         { "hano", USCRIPT_HANUNOO },
96         { "buhd", USCRIPT_BUHID },
97         { "tagb", USCRIPT_TAGBANWA },
98         { "brai", USCRIPT_BRAILLE },
99         { "cprt", USCRIPT_CYPRIOT },
100         { "limb", USCRIPT_LIMBU },
101         { "linb", USCRIPT_LINEAR_B },
102         { "osma", USCRIPT_OSMANYA },
103         { "shaw", USCRIPT_SHAVIAN },
104         { "tale", USCRIPT_TAI_LE },
105         { "ugar", USCRIPT_UGARITIC },
106         { "hrkt", USCRIPT_KATAKANA_OR_HIRAGANA },
107         { "bugi", USCRIPT_BUGINESE },
108         { "glag", USCRIPT_GLAGOLITIC },
109         { "khar", USCRIPT_KHAROSHTHI },
110         { "sylo", USCRIPT_SYLOTI_NAGRI },
111         { "talu", USCRIPT_NEW_TAI_LUE },
112         { "tfng", USCRIPT_TIFINAGH },
113         { "xpeo", USCRIPT_OLD_PERSIAN },
114         { "bali", USCRIPT_BALINESE },
115         { "batk", USCRIPT_BATAK },
116         { "blis", USCRIPT_BLISSYMBOLS },
117         { "brah", USCRIPT_BRAHMI },
118         { "cham", USCRIPT_CHAM },
119         { "cirt", USCRIPT_CIRTH },
120         { "cyrs", USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC },
121         { "egyd", USCRIPT_DEMOTIC_EGYPTIAN },
122         { "egyh", USCRIPT_HIERATIC_EGYPTIAN },
123         { "egyp", USCRIPT_EGYPTIAN_HIEROGLYPHS },
124         { "geok", USCRIPT_KHUTSURI },
125         { "hans", USCRIPT_SIMPLIFIED_HAN },
126         { "hant", USCRIPT_TRADITIONAL_HAN },
127         { "hmng", USCRIPT_PAHAWH_HMONG },
128         { "hung", USCRIPT_OLD_HUNGARIAN },
129         { "inds", USCRIPT_HARAPPAN_INDUS },
130         { "java", USCRIPT_JAVANESE },
131         { "kali", USCRIPT_KAYAH_LI },
132         { "latf", USCRIPT_LATIN_FRAKTUR },
133         { "latg", USCRIPT_LATIN_GAELIC },
134         { "lepc", USCRIPT_LEPCHA },
135         { "lina", USCRIPT_LINEAR_A },
136         { "mand", USCRIPT_MANDAEAN },
137         { "maya", USCRIPT_MAYAN_HIEROGLYPHS },
138         { "mero", USCRIPT_MEROITIC },
139         { "nkoo", USCRIPT_NKO },
140         { "orkh", USCRIPT_ORKHON },
141         { "perm", USCRIPT_OLD_PERMIC },
142         { "phag", USCRIPT_PHAGS_PA },
143         { "phnx", USCRIPT_PHOENICIAN },
144         { "plrd", USCRIPT_PHONETIC_POLLARD },
145         { "roro", USCRIPT_RONGORONGO },
146         { "sara", USCRIPT_SARATI },
147         { "syre", USCRIPT_ESTRANGELO_SYRIAC },
148         { "syrj", USCRIPT_WESTERN_SYRIAC },
149         { "syrn", USCRIPT_EASTERN_SYRIAC },
150         { "teng", USCRIPT_TENGWAR },
151         { "vaii", USCRIPT_VAI },
152         { "visp", USCRIPT_VISIBLE_SPEECH },
153         { "xsux", USCRIPT_CUNEIFORM },
154         { "jpan", USCRIPT_KATAKANA_OR_HIRAGANA },
155         { "kore", USCRIPT_HANGUL },
156         { "zxxx", USCRIPT_UNWRITTEN_LANGUAGES },
157         { "zzzz", USCRIPT_UNKNOWN }
158     };
159 
160     typedef HashMap<String, UScriptCode> ScriptNameCodeMap;
161     DEFINE_STATIC_LOCAL(ScriptNameCodeMap, scriptNameCodeMap, ());
162     if (scriptNameCodeMap.isEmpty()) {
163         for (size_t i = 0; i < sizeof(scriptNameCodeList) / sizeof(scriptNameCodeList[0]); ++i)
164             scriptNameCodeMap.set(scriptNameCodeList[i].name, scriptNameCodeList[i].code);
165     }
166 
167     HashMap<String, UScriptCode>::iterator it = scriptNameCodeMap.find(scriptName.lower());
168     if (it != scriptNameCodeMap.end())
169         return it->value;
170     return USCRIPT_INVALID_CODE;
171 }
172 
localeToScriptCodeForFontSelection(const String & locale)173 UScriptCode localeToScriptCodeForFontSelection(const String& locale)
174 {
175     struct LocaleScript {
176         const char* locale;
177         UScriptCode script;
178     };
179 
180     static const LocaleScript localeScriptList[] = {
181         { "aa", USCRIPT_LATIN },
182         { "ab", USCRIPT_CYRILLIC },
183         { "ady", USCRIPT_CYRILLIC },
184         { "af", USCRIPT_LATIN },
185         { "ak", USCRIPT_LATIN },
186         { "am", USCRIPT_ETHIOPIC },
187         { "ar", USCRIPT_ARABIC },
188         { "as", USCRIPT_BENGALI },
189         { "ast", USCRIPT_LATIN },
190         { "av", USCRIPT_CYRILLIC },
191         { "ay", USCRIPT_LATIN },
192         { "az", USCRIPT_LATIN },
193         { "ba", USCRIPT_CYRILLIC },
194         { "be", USCRIPT_CYRILLIC },
195         { "bg", USCRIPT_CYRILLIC },
196         { "bi", USCRIPT_LATIN },
197         { "bn", USCRIPT_BENGALI },
198         { "bo", USCRIPT_TIBETAN },
199         { "bs", USCRIPT_LATIN },
200         { "ca", USCRIPT_LATIN },
201         { "ce", USCRIPT_CYRILLIC },
202         { "ceb", USCRIPT_LATIN },
203         { "ch", USCRIPT_LATIN },
204         { "chk", USCRIPT_LATIN },
205         { "cs", USCRIPT_LATIN },
206         { "cy", USCRIPT_LATIN },
207         { "da", USCRIPT_LATIN },
208         { "de", USCRIPT_LATIN },
209         { "dv", USCRIPT_THAANA },
210         { "dz", USCRIPT_TIBETAN },
211         { "ee", USCRIPT_LATIN },
212         { "efi", USCRIPT_LATIN },
213         { "el", USCRIPT_GREEK },
214         { "en", USCRIPT_LATIN },
215         { "es", USCRIPT_LATIN },
216         { "et", USCRIPT_LATIN },
217         { "eu", USCRIPT_LATIN },
218         { "fa", USCRIPT_ARABIC },
219         { "fi", USCRIPT_LATIN },
220         { "fil", USCRIPT_LATIN },
221         { "fj", USCRIPT_LATIN },
222         { "fo", USCRIPT_LATIN },
223         { "fr", USCRIPT_LATIN },
224         { "fur", USCRIPT_LATIN },
225         { "fy", USCRIPT_LATIN },
226         { "ga", USCRIPT_LATIN },
227         { "gaa", USCRIPT_LATIN },
228         { "gd", USCRIPT_LATIN },
229         { "gil", USCRIPT_LATIN },
230         { "gl", USCRIPT_LATIN },
231         { "gn", USCRIPT_LATIN },
232         { "gsw", USCRIPT_LATIN },
233         { "gu", USCRIPT_GUJARATI },
234         { "ha", USCRIPT_LATIN },
235         { "haw", USCRIPT_LATIN },
236         { "he", USCRIPT_HEBREW },
237         { "hi", USCRIPT_DEVANAGARI },
238         { "hil", USCRIPT_LATIN },
239         { "ho", USCRIPT_LATIN },
240         { "hr", USCRIPT_LATIN },
241         { "ht", USCRIPT_LATIN },
242         { "hu", USCRIPT_LATIN },
243         { "hy", USCRIPT_ARMENIAN },
244         { "id", USCRIPT_LATIN },
245         { "ig", USCRIPT_LATIN },
246         { "ii", USCRIPT_YI },
247         { "ilo", USCRIPT_LATIN },
248         { "inh", USCRIPT_CYRILLIC },
249         { "is", USCRIPT_LATIN },
250         { "it", USCRIPT_LATIN },
251         { "iu", USCRIPT_CANADIAN_ABORIGINAL },
252         { "ja", USCRIPT_KATAKANA_OR_HIRAGANA },
253         { "jv", USCRIPT_LATIN },
254         { "ka", USCRIPT_GEORGIAN },
255         { "kaj", USCRIPT_LATIN },
256         { "kam", USCRIPT_LATIN },
257         { "kbd", USCRIPT_CYRILLIC },
258         { "kha", USCRIPT_LATIN },
259         { "kk", USCRIPT_CYRILLIC },
260         { "kl", USCRIPT_LATIN },
261         { "km", USCRIPT_KHMER },
262         { "kn", USCRIPT_KANNADA },
263         { "ko", USCRIPT_HANGUL },
264         { "kok", USCRIPT_DEVANAGARI },
265         { "kos", USCRIPT_LATIN },
266         { "kpe", USCRIPT_LATIN },
267         { "krc", USCRIPT_CYRILLIC },
268         { "ks", USCRIPT_ARABIC },
269         { "ku", USCRIPT_ARABIC },
270         { "kum", USCRIPT_CYRILLIC },
271         { "ky", USCRIPT_CYRILLIC },
272         { "la", USCRIPT_LATIN },
273         { "lah", USCRIPT_ARABIC },
274         { "lb", USCRIPT_LATIN },
275         { "lez", USCRIPT_CYRILLIC },
276         { "ln", USCRIPT_LATIN },
277         { "lo", USCRIPT_LAO },
278         { "lt", USCRIPT_LATIN },
279         { "lv", USCRIPT_LATIN },
280         { "mai", USCRIPT_DEVANAGARI },
281         { "mdf", USCRIPT_CYRILLIC },
282         { "mg", USCRIPT_LATIN },
283         { "mh", USCRIPT_LATIN },
284         { "mi", USCRIPT_LATIN },
285         { "mk", USCRIPT_CYRILLIC },
286         { "ml", USCRIPT_MALAYALAM },
287         { "mn", USCRIPT_CYRILLIC },
288         { "mr", USCRIPT_DEVANAGARI },
289         { "ms", USCRIPT_LATIN },
290         { "mt", USCRIPT_LATIN },
291         { "my", USCRIPT_MYANMAR },
292         { "myv", USCRIPT_CYRILLIC },
293         { "na", USCRIPT_LATIN },
294         { "nb", USCRIPT_LATIN },
295         { "ne", USCRIPT_DEVANAGARI },
296         { "niu", USCRIPT_LATIN },
297         { "nl", USCRIPT_LATIN },
298         { "nn", USCRIPT_LATIN },
299         { "nr", USCRIPT_LATIN },
300         { "nso", USCRIPT_LATIN },
301         { "ny", USCRIPT_LATIN },
302         { "oc", USCRIPT_LATIN },
303         { "om", USCRIPT_LATIN },
304         { "or", USCRIPT_ORIYA },
305         { "os", USCRIPT_CYRILLIC },
306         { "pa", USCRIPT_GURMUKHI },
307         { "pag", USCRIPT_LATIN },
308         { "pap", USCRIPT_LATIN },
309         { "pau", USCRIPT_LATIN },
310         { "pl", USCRIPT_LATIN },
311         { "pon", USCRIPT_LATIN },
312         { "ps", USCRIPT_ARABIC },
313         { "pt", USCRIPT_LATIN },
314         { "qu", USCRIPT_LATIN },
315         { "rm", USCRIPT_LATIN },
316         { "rn", USCRIPT_LATIN },
317         { "ro", USCRIPT_LATIN },
318         { "ru", USCRIPT_CYRILLIC },
319         { "rw", USCRIPT_LATIN },
320         { "sa", USCRIPT_DEVANAGARI },
321         { "sah", USCRIPT_CYRILLIC },
322         { "sat", USCRIPT_LATIN },
323         { "sd", USCRIPT_ARABIC },
324         { "se", USCRIPT_LATIN },
325         { "sg", USCRIPT_LATIN },
326         { "si", USCRIPT_SINHALA },
327         { "sid", USCRIPT_LATIN },
328         { "sk", USCRIPT_LATIN },
329         { "sl", USCRIPT_LATIN },
330         { "sm", USCRIPT_LATIN },
331         { "so", USCRIPT_LATIN },
332         { "sq", USCRIPT_LATIN },
333         { "sr", USCRIPT_CYRILLIC },
334         { "ss", USCRIPT_LATIN },
335         { "st", USCRIPT_LATIN },
336         { "su", USCRIPT_LATIN },
337         { "sv", USCRIPT_LATIN },
338         { "sw", USCRIPT_LATIN },
339         { "ta", USCRIPT_TAMIL },
340         { "te", USCRIPT_TELUGU },
341         { "tet", USCRIPT_LATIN },
342         { "tg", USCRIPT_CYRILLIC },
343         { "th", USCRIPT_THAI },
344         { "ti", USCRIPT_ETHIOPIC },
345         { "tig", USCRIPT_ETHIOPIC },
346         { "tk", USCRIPT_LATIN },
347         { "tkl", USCRIPT_LATIN },
348         { "tl", USCRIPT_LATIN },
349         { "tn", USCRIPT_LATIN },
350         { "to", USCRIPT_LATIN },
351         { "tpi", USCRIPT_LATIN },
352         { "tr", USCRIPT_LATIN },
353         { "trv", USCRIPT_LATIN },
354         { "ts", USCRIPT_LATIN },
355         { "tt", USCRIPT_CYRILLIC },
356         { "tvl", USCRIPT_LATIN },
357         { "tw", USCRIPT_LATIN },
358         { "ty", USCRIPT_LATIN },
359         { "tyv", USCRIPT_CYRILLIC },
360         { "udm", USCRIPT_CYRILLIC },
361         { "ug", USCRIPT_ARABIC },
362         { "uk", USCRIPT_CYRILLIC },
363         { "und", USCRIPT_LATIN },
364         { "ur", USCRIPT_ARABIC },
365         { "uz", USCRIPT_CYRILLIC },
366         { "ve", USCRIPT_LATIN },
367         { "vi", USCRIPT_LATIN },
368         { "wal", USCRIPT_ETHIOPIC },
369         { "war", USCRIPT_LATIN },
370         { "wo", USCRIPT_LATIN },
371         { "xh", USCRIPT_LATIN },
372         { "yap", USCRIPT_LATIN },
373         { "yo", USCRIPT_LATIN },
374         { "za", USCRIPT_LATIN },
375         { "zh", USCRIPT_SIMPLIFIED_HAN },
376         { "zh_hk", USCRIPT_TRADITIONAL_HAN },
377         { "zh_tw", USCRIPT_TRADITIONAL_HAN },
378         { "zu", USCRIPT_LATIN }
379     };
380 
381     typedef HashMap<String, UScriptCode> LocaleScriptMap;
382     DEFINE_STATIC_LOCAL(LocaleScriptMap, localeScriptMap, ());
383     if (localeScriptMap.isEmpty()) {
384         for (size_t i = 0; i < sizeof(localeScriptList) / sizeof(localeScriptList[0]); ++i)
385             localeScriptMap.set(localeScriptList[i].locale, localeScriptList[i].script);
386     }
387 
388     String canonicalLocale = locale.lower().replace('-', '_');
389     while (!canonicalLocale.isEmpty()) {
390         HashMap<String, UScriptCode>::iterator it = localeScriptMap.find(canonicalLocale);
391         if (it != localeScriptMap.end())
392             return it->value;
393         size_t pos = canonicalLocale.reverseFind('_');
394         if (pos == kNotFound)
395             break;
396         UScriptCode code = scriptNameToCode(canonicalLocale.substring(pos + 1));
397         if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN)
398             return code;
399         canonicalLocale = canonicalLocale.substring(0, pos);
400     }
401     return USCRIPT_COMMON;
402 }
403 
404 } // namespace blink
405