1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
#include "Locale.h"

#include <algorithm>
#include <cstring>

#include <hb.h>

#include "minikin/LocaleList.h"

#include "LocaleListCache.h"
#include "MinikinInternal.h"
#include "StringPiece.h"
28 
29 namespace minikin {
30 
// Mask selecting the low five bits of a value (0x1f).
constexpr uint32_t FIVE_BITS = (1u << 5) - 1u;
32 
registerLocaleList(const std::string & locales)33 uint32_t registerLocaleList(const std::string& locales) {
34     return LocaleListCache::getId(locales);
35 }
36 
// Returns true when the first |subtagLen| characters of |buf| equal |subtag| and the match ends
// at a subtag boundary: the end of the buffer, a NUL character, '-' or '_'.
//
// |bufLen| is the number of readable characters in |buf|; |subtagLen| is the number of
// characters of |subtag| to compare.
static bool isEmojiSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) {
    if (bufLen < subtagLen) {
        return false;  // The buffer is too short to hold the subtag.
    }
    if (std::strncmp(buf, subtag, subtagLen) != 0) {
        return false;  // The leading characters differ.
    }
    if (bufLen == subtagLen) {
        return true;  // Nothing follows the subtag; buf[subtagLen] must not be read.
    }
    const char following = buf[subtagLen];
    return following == '\0' || following == '-' || following == '_';
}
48 
49 // Pack the three letter code into 15 bits and stored to 16 bit integer. The highest bit is 0.
50 // For the region code, the letters must be all digits in three letter case, so the number of
51 // possible values are 10. For the language code, the letters must be all small alphabets, so the
52 // number of possible values are 26. Thus, 5 bits are sufficient for each case and we can pack the
53 // three letter language code or region code to 15 bits.
54 //
55 // In case of two letter code, use fullbit(0x1f) for the first letter instead.
packLanguageOrRegion(const StringPiece & in,uint8_t twoLetterBase,uint8_t threeLetterBase)56 static uint16_t packLanguageOrRegion(const StringPiece& in, uint8_t twoLetterBase,
57                                      uint8_t threeLetterBase) {
58     if (in.length() == 2) {
59         return 0x7c00u |  // 0x1fu << 10
60                (uint16_t)(in[0] - twoLetterBase) << 5 | (uint16_t)(in[1] - twoLetterBase);
61     } else {
62         return ((uint16_t)(in[0] - threeLetterBase) << 10) |
63                (uint16_t)(in[1] - threeLetterBase) << 5 | (uint16_t)(in[2] - threeLetterBase);
64     }
65 }
66 
unpackLanguageOrRegion(uint16_t in,char * out,uint8_t twoLetterBase,uint8_t threeLetterBase)67 static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase,
68                                      uint8_t threeLetterBase) {
69     uint8_t first = (in >> 10) & FIVE_BITS;
70     uint8_t second = (in >> 5) & FIVE_BITS;
71     uint8_t third = in & FIVE_BITS;
72 
73     if (first == 0x1f) {
74         out[0] = second + twoLetterBase;
75         out[1] = third + twoLetterBase;
76         return 2;
77     } else {
78         out[0] = first + threeLetterBase;
79         out[1] = second + threeLetterBase;
80         out[2] = third + threeLetterBase;
81         return 3;
82     }
83 }
84 
packLanguage(const StringPiece & in)85 static uint16_t packLanguage(const StringPiece& in) {
86     return packLanguageOrRegion(in, 'a', 'a');
87 }
88 
unpackLanguage(uint16_t in,char * out)89 static size_t unpackLanguage(uint16_t in, char* out) {
90     return unpackLanguageOrRegion(in, out, 'a', 'a');
91 }
92 
// Packs a four-character script code into 20 bits, five bits per character. The first character
// is stored as its offset from 'A' and the other three as their offsets from 'a'.
constexpr uint32_t packScript(char c1, char c2, char c3, char c4) {
    constexpr char kUpperBase = 'A';
    constexpr char kLowerBase = 'a';
    const uint32_t first = static_cast<uint32_t>(c1 - kUpperBase);
    const uint32_t second = static_cast<uint32_t>(c2 - kLowerBase);
    const uint32_t third = static_cast<uint32_t>(c3 - kLowerBase);
    const uint32_t fourth = static_cast<uint32_t>(c4 - kLowerBase);
    return (first << 15) | (second << 10) | (third << 5) | fourth;
}
99 
packScript(uint32_t script)100 constexpr uint32_t packScript(uint32_t script) {
101     return packScript(script >> 24, (script >> 16) & 0xff, (script >> 8) & 0xff, script & 0xff);
102 }
103 
unpackScript(uint32_t packedScript)104 constexpr uint32_t unpackScript(uint32_t packedScript) {
105     constexpr char FIRST_LETTER_BASE = 'A';
106     constexpr char REST_LETTER_BASE = 'a';
107     const uint32_t first = (packedScript >> 15) + FIRST_LETTER_BASE;
108     const uint32_t second = ((packedScript >> 10) & FIVE_BITS) + REST_LETTER_BASE;
109     const uint32_t third = ((packedScript >> 5) & FIVE_BITS) + REST_LETTER_BASE;
110     const uint32_t fourth = (packedScript & FIVE_BITS) + REST_LETTER_BASE;
111 
112     return first << 24 | second << 16 | third << 8 | fourth;
113 }
114 
packRegion(const StringPiece & in)115 static uint16_t packRegion(const StringPiece& in) {
116     return packLanguageOrRegion(in, 'A', '0');
117 }
118 
unpackRegion(uint16_t in,char * out)119 static size_t unpackRegion(uint16_t in, char* out) {
120     return unpackLanguageOrRegion(in, out, 'A', '0');
121 }
122 
// Returns true if |c| is in the range 'a'..'z'.
static inline bool isLowercase(char c) {
    return !(c < 'a' || c > 'z');
}
126 
// Returns true if |c| is in the range 'A'..'Z'.
static inline bool isUppercase(char c) {
    return !(c < 'A' || c > 'Z');
}
130 
// Returns true if |c| is in the range '0'..'9'.
static inline bool isDigit(char c) {
    return !(c < '0' || c > '9');
}
134 
135 // Returns true if the buffer is valid for language code.
isValidLanguageCode(const StringPiece & buffer)136 static inline bool isValidLanguageCode(const StringPiece& buffer) {
137     if (buffer.length() != 2 && buffer.length() != 3) return false;
138     if (!isLowercase(buffer[0])) return false;
139     if (!isLowercase(buffer[1])) return false;
140     if (buffer.length() == 3 && !isLowercase(buffer[2])) return false;
141     return true;
142 }
143 
144 // Returns true if buffer is valid for script code. The length of buffer must be 4.
isValidScriptCode(const StringPiece & buffer)145 static inline bool isValidScriptCode(const StringPiece& buffer) {
146     return buffer.size() == 4 && isUppercase(buffer[0]) && isLowercase(buffer[1]) &&
147            isLowercase(buffer[2]) && isLowercase(buffer[3]);
148 }
149 
150 // Returns true if the buffer is valid for region code.
isValidRegionCode(const StringPiece & buffer)151 static inline bool isValidRegionCode(const StringPiece& buffer) {
152     return (buffer.size() == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) ||
153            (buffer.size() == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2]));
154 }
155 
156 // Parse BCP 47 language identifier into internal structure
Locale(const StringPiece & input)157 Locale::Locale(const StringPiece& input) : Locale() {
158     SplitIterator it(input, '-');
159 
160     StringPiece language = it.next();
161     if (isValidLanguageCode(language)) {
162         mLanguage = packLanguage(language);
163     } else {
164         // We don't understand anything other than two-letter or three-letter
165         // language codes, so we skip parsing the rest of the string.
166         return;
167     }
168 
169     if (!it.hasNext()) {
170         return;  // Language code only.
171     }
172     StringPiece token = it.next();
173 
174     if (isValidScriptCode(token)) {
175         mScript = packScript(token[0], token[1], token[2], token[3]);
176         mSubScriptBits = scriptToSubScriptBits(mScript);
177 
178         if (!it.hasNext()) {
179             goto finalize;  // No variant, emoji subtag and region code.
180         }
181         token = it.next();
182     }
183 
184     if (isValidRegionCode(token)) {
185         mRegion = packRegion(token);
186 
187         if (!it.hasNext()) {
188             goto finalize;  // No variant or emoji subtag.
189         }
190         token = it.next();
191     }
192 
193     if (language == "de") {  // We are only interested in German variants.
194         if (token == "1901") {
195             mVariant = Variant::GERMAN_1901_ORTHOGRAPHY;
196         } else if (token == "1996") {
197             mVariant = Variant::GERMAN_1996_ORTHOGRAPHY;
198         }
199 
200         if (mVariant != Variant::NO_VARIANT) {
201             if (!it.hasNext()) {
202                 goto finalize;  // No emoji subtag.
203             }
204 
205             token = it.next();
206         }
207     }
208 
209     mEmojiStyle = resolveEmojiStyle(input.data(), input.length());
210 
211 finalize:
212     if (mEmojiStyle == EmojiStyle::EMPTY) {
213         mEmojiStyle = scriptToEmojiStyle(mScript);
214     }
215 }
216 
217 // static
resolveEmojiStyle(const char * buf,size_t length)218 EmojiStyle Locale::resolveEmojiStyle(const char* buf, size_t length) {
219     // First, lookup emoji subtag.
220     // 10 is the length of "-u-em-text", which is the shortest emoji subtag,
221     // unnecessary comparison can be avoided if total length is smaller than 10.
222     const size_t kMinSubtagLength = 10;
223     if (length >= kMinSubtagLength) {
224         static const char kPrefix[] = "-u-em-";
225         const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
226         if (pos != buf + length) {  // found
227             pos += strlen(kPrefix);
228             const size_t remainingLength = length - (pos - buf);
229             if (isEmojiSubtag(pos, remainingLength, "emoji", 5)) {
230                 return EmojiStyle::EMOJI;
231             } else if (isEmojiSubtag(pos, remainingLength, "text", 4)) {
232                 return EmojiStyle::TEXT;
233             } else if (isEmojiSubtag(pos, remainingLength, "default", 7)) {
234                 return EmojiStyle::DEFAULT;
235             }
236         }
237     }
238     return EmojiStyle::EMPTY;
239 }
240 
scriptToEmojiStyle(uint32_t script)241 EmojiStyle Locale::scriptToEmojiStyle(uint32_t script) {
242     // If no emoji subtag was provided, resolve the emoji style from script code.
243     if (script == packScript('Z', 's', 'y', 'e')) {
244         return EmojiStyle::EMOJI;
245     } else if (script == packScript('Z', 's', 'y', 'm')) {
246         return EmojiStyle::TEXT;
247     }
248     return EmojiStyle::EMPTY;
249 }
250 
251 // static
scriptToSubScriptBits(uint32_t script)252 uint8_t Locale::scriptToSubScriptBits(uint32_t script) {
253     uint8_t subScriptBits = 0u;
254     switch (script) {
255         case packScript('B', 'o', 'p', 'o'):
256             subScriptBits = kBopomofoFlag;
257             break;
258         case packScript('H', 'a', 'n', 'g'):
259             subScriptBits = kHangulFlag;
260             break;
261         case packScript('H', 'a', 'n', 'b'):
262             // Bopomofo is almost exclusively used in Taiwan.
263             subScriptBits = kHanFlag | kBopomofoFlag;
264             break;
265         case packScript('H', 'a', 'n', 'i'):
266             subScriptBits = kHanFlag;
267             break;
268         case packScript('H', 'a', 'n', 's'):
269             subScriptBits = kHanFlag | kSimplifiedChineseFlag;
270             break;
271         case packScript('H', 'a', 'n', 't'):
272             subScriptBits = kHanFlag | kTraditionalChineseFlag;
273             break;
274         case packScript('H', 'i', 'r', 'a'):
275             subScriptBits = kHiraganaFlag;
276             break;
277         case packScript('H', 'r', 'k', 't'):
278             subScriptBits = kKatakanaFlag | kHiraganaFlag;
279             break;
280         case packScript('J', 'p', 'a', 'n'):
281             subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag;
282             break;
283         case packScript('K', 'a', 'n', 'a'):
284             subScriptBits = kKatakanaFlag;
285             break;
286         case packScript('K', 'o', 'r', 'e'):
287             subScriptBits = kHanFlag | kHangulFlag;
288             break;
289     }
290     return subScriptBits;
291 }
292 
getString() const293 std::string Locale::getString() const {
294     char buf[24];
295     size_t i;
296     if (mLanguage == NO_LANGUAGE) {
297         buf[0] = 'u';
298         buf[1] = 'n';
299         buf[2] = 'd';
300         i = 3;
301     } else {
302         i = unpackLanguage(mLanguage, buf);
303     }
304     if (mScript != NO_SCRIPT) {
305         uint32_t rawScript = unpackScript(mScript);
306         buf[i++] = '-';
307         buf[i++] = (rawScript >> 24) & 0xFFu;
308         buf[i++] = (rawScript >> 16) & 0xFFu;
309         buf[i++] = (rawScript >> 8) & 0xFFu;
310         buf[i++] = rawScript & 0xFFu;
311     }
312     if (mRegion != NO_REGION) {
313         buf[i++] = '-';
314         i += unpackRegion(mRegion, buf + i);
315     }
316     if (mVariant != Variant::NO_VARIANT) {
317         buf[i++] = '-';
318         buf[i++] = '1';
319         buf[i++] = '9';
320         switch (mVariant) {
321             case Variant::GERMAN_1901_ORTHOGRAPHY:
322                 buf[i++] = '0';
323                 buf[i++] = '1';
324                 break;
325             case Variant::GERMAN_1996_ORTHOGRAPHY:
326                 buf[i++] = '9';
327                 buf[i++] = '6';
328                 break;
329             default:
330                 MINIKIN_ASSERT(false, "Must not reached.");
331         }
332     }
333     return std::string(buf, i);
334 }
335 
getPartialLocale(SubtagBits bits) const336 Locale Locale::getPartialLocale(SubtagBits bits) const {
337     Locale subLocale;
338     if ((bits & SubtagBits::LANGUAGE) != SubtagBits::EMPTY) {
339         subLocale.mLanguage = mLanguage;
340     } else {
341         subLocale.mLanguage = packLanguage("und");
342     }
343     if ((bits & SubtagBits::SCRIPT) != SubtagBits::EMPTY) {
344         subLocale.mScript = mScript;
345         subLocale.mSubScriptBits = mSubScriptBits;
346     }
347     if ((bits & SubtagBits::REGION) != SubtagBits::EMPTY) {
348         subLocale.mRegion = mRegion;
349     }
350     if ((bits & SubtagBits::VARIANT) != SubtagBits::EMPTY) {
351         subLocale.mVariant = mVariant;
352     }
353     if ((bits & SubtagBits::EMOJI) != SubtagBits::EMPTY) {
354         subLocale.mEmojiStyle = mEmojiStyle;
355     }
356     return subLocale;
357 }
358 
isEqualScript(const Locale & other) const359 bool Locale::isEqualScript(const Locale& other) const {
360     return other.mScript == mScript;
361 }
362 
363 // static
supportsScript(uint8_t providedBits,uint8_t requestedBits)364 bool Locale::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
365     return requestedBits != 0 && (providedBits & requestedBits) == requestedBits;
366 }
367 
supportsHbScript(hb_script_t script) const368 bool Locale::supportsHbScript(hb_script_t script) const {
369     static_assert(unpackScript(packScript('J', 'p', 'a', 'n')) == HB_TAG('J', 'p', 'a', 'n'),
370                   "The Minikin script and HarfBuzz hb_script_t have different encodings.");
371     uint32_t packedScript = packScript(script);
372     if (packedScript == mScript) return true;
373     return supportsScript(mSubScriptBits, scriptToSubScriptBits(packedScript));
374 }
375 
calcScoreFor(const LocaleList & supported) const376 int Locale::calcScoreFor(const LocaleList& supported) const {
377     bool languageScriptMatch = false;
378     bool subtagMatch = false;
379     bool scriptMatch = false;
380 
381     for (size_t i = 0; i < supported.size(); ++i) {
382         if (mEmojiStyle != EmojiStyle::EMPTY && mEmojiStyle == supported[i].mEmojiStyle) {
383             subtagMatch = true;
384             if (mLanguage == supported[i].mLanguage) {
385                 return 4;
386             }
387         }
388         if (isEqualScript(supported[i]) ||
389             supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
390             scriptMatch = true;
391             if (mLanguage == supported[i].mLanguage) {
392                 languageScriptMatch = true;
393             }
394         }
395     }
396 
397     if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
398         scriptMatch = true;
399         if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLocale()) {
400             return 3;
401         }
402     }
403 
404     if (languageScriptMatch) {
405         return 3;
406     } else if (subtagMatch) {
407         return 2;
408     } else if (scriptMatch) {
409         return 1;
410     }
411     return 0;
412 }
413 
buildHbLanguage(const Locale & locale)414 static hb_language_t buildHbLanguage(const Locale& locale) {
415     return locale.isSupported() ? hb_language_from_string(locale.getString().c_str(), -1)
416                                 : HB_LANGUAGE_INVALID;
417 }
418 
LocaleList(std::vector<Locale> && locales)419 LocaleList::LocaleList(std::vector<Locale>&& locales) : mLocales(std::move(locales)) {
420     mIsAllTheSameLocale = true;
421     mUnionOfSubScriptBits = 0u;
422     mHbLangs.reserve(mLocales.size());
423     mEmojiStyle = EmojiStyle::EMPTY;
424     const auto firstLanguage = mLocales.empty() ? NO_LANGUAGE : mLocales[0].mLanguage;
425     for (const Locale& locale : mLocales) {
426         mUnionOfSubScriptBits |= locale.mSubScriptBits;
427         if (mIsAllTheSameLocale && firstLanguage != locale.mLanguage) {
428             mIsAllTheSameLocale = false;
429         }
430         mHbLangs.push_back(buildHbLanguage(locale));
431         if (mEmojiStyle == EmojiStyle::EMPTY) {
432             mEmojiStyle = locale.getEmojiStyle();
433         }
434     }
435 }
436 
437 }  // namespace minikin
438