1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "Locale.h"
18 
19 #include <algorithm>
20 
21 #include <hb.h>
22 
23 #include "minikin/LocaleList.h"
24 
25 #include "LocaleListCache.h"
26 #include "MinikinInternal.h"
27 #include "StringPiece.h"
28 
29 namespace minikin {
30 
// Mask selecting the low five bits, i.e. one packed letter or digit of a language, script or
// region code.
constexpr uint32_t FIVE_BITS = (1u << 5) - 1u;
32 
registerLocaleList(const std::string & locales)33 uint32_t registerLocaleList(const std::string& locales) {
34     return LocaleListCache::getId(locales);
35 }
36 
getLocaleString(uint32_t localeId)37 std::string getLocaleString(uint32_t localeId) {
38     const LocaleList& localeList = LocaleListCache::getById(localeId);
39     std::string out;
40     for (size_t i = 0; i < localeList.size(); ++i) {
41         if (i != 0) {
42             out += ",";
43         }
44         out += localeList[i].getString();
45     }
46     return out;
47 }
48 
// Returns true when the text at buf begins with the given subtag and the subtag ends there, i.e.
// it is followed by the end of the buffer, a NUL character, '-' or '_'.
//
// buf/bufLen:       the text to examine and the number of readable characters in it.
// subtag/subtagLen: the expected subtag and its length.
static bool isSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) {
    const bool startsWithSubtag = bufLen >= subtagLen && strncmp(buf, subtag, subtagLen) == 0;
    if (!startsWithSubtag) {
        return false;
    }
    if (bufLen == subtagLen) {
        return true;  // The subtag is all that is left of the buffer.
    }
    // Something follows the match: it must be a terminator or a separator.
    switch (buf[subtagLen]) {
        case '\0':
        case '-':
        case '_':
            return true;
        default:
            return false;
    }
}
61 
62 // Pack the three letter code into 15 bits and stored to 16 bit integer. The highest bit is 0.
63 // For the region code, the letters must be all digits in three letter case, so the number of
64 // possible values are 10. For the language code, the letters must be all small alphabets, so the
65 // number of possible values are 26. Thus, 5 bits are sufficient for each case and we can pack the
66 // three letter language code or region code to 15 bits.
67 //
68 // In case of two letter code, use fullbit(0x1f) for the first letter instead.
packLanguageOrRegion(const StringPiece & in,uint8_t twoLetterBase,uint8_t threeLetterBase)69 static uint16_t packLanguageOrRegion(const StringPiece& in, uint8_t twoLetterBase,
70                                      uint8_t threeLetterBase) {
71     if (in.length() == 2) {
72         return 0x7c00u |  // 0x1fu << 10
73                (uint16_t)(in[0] - twoLetterBase) << 5 | (uint16_t)(in[1] - twoLetterBase);
74     } else {
75         return ((uint16_t)(in[0] - threeLetterBase) << 10) |
76                (uint16_t)(in[1] - threeLetterBase) << 5 | (uint16_t)(in[2] - threeLetterBase);
77     }
78 }
79 
unpackLanguageOrRegion(uint16_t in,char * out,uint8_t twoLetterBase,uint8_t threeLetterBase)80 static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase,
81                                      uint8_t threeLetterBase) {
82     uint8_t first = (in >> 10) & FIVE_BITS;
83     uint8_t second = (in >> 5) & FIVE_BITS;
84     uint8_t third = in & FIVE_BITS;
85 
86     if (first == 0x1f) {
87         out[0] = second + twoLetterBase;
88         out[1] = third + twoLetterBase;
89         return 2;
90     } else {
91         out[0] = first + threeLetterBase;
92         out[1] = second + threeLetterBase;
93         out[2] = third + threeLetterBase;
94         return 3;
95     }
96 }
97 
packLanguage(const StringPiece & in)98 static uint16_t packLanguage(const StringPiece& in) {
99     return packLanguageOrRegion(in, 'a', 'a');
100 }
101 
unpackLanguage(uint16_t in,char * out)102 static size_t unpackLanguage(uint16_t in, char* out) {
103     return unpackLanguageOrRegion(in, out, 'a', 'a');
104 }
105 
// Packs a four-letter script code into 20 bits, five bits per letter. The first letter is stored
// as its offset from 'A', the other three as their offsets from 'a'.
constexpr uint32_t packScript(char c1, char c2, char c3, char c4) {
    const uint32_t first = static_cast<uint32_t>(c1 - 'A');
    const uint32_t second = static_cast<uint32_t>(c2 - 'a');
    const uint32_t third = static_cast<uint32_t>(c3 - 'a');
    const uint32_t fourth = static_cast<uint32_t>(c4 - 'a');
    return (first << 15) | (second << 10) | (third << 5) | fourth;
}
112 
packScript(uint32_t script)113 constexpr uint32_t packScript(uint32_t script) {
114     return packScript(script >> 24, (script >> 16) & 0xff, (script >> 8) & 0xff, script & 0xff);
115 }
116 
unpackScript(uint32_t packedScript)117 constexpr uint32_t unpackScript(uint32_t packedScript) {
118     constexpr char FIRST_LETTER_BASE = 'A';
119     constexpr char REST_LETTER_BASE = 'a';
120     const uint32_t first = (packedScript >> 15) + FIRST_LETTER_BASE;
121     const uint32_t second = ((packedScript >> 10) & FIVE_BITS) + REST_LETTER_BASE;
122     const uint32_t third = ((packedScript >> 5) & FIVE_BITS) + REST_LETTER_BASE;
123     const uint32_t fourth = (packedScript & FIVE_BITS) + REST_LETTER_BASE;
124 
125     return first << 24 | second << 16 | third << 8 | fourth;
126 }
127 
packRegion(const StringPiece & in)128 static uint16_t packRegion(const StringPiece& in) {
129     return packLanguageOrRegion(in, 'A', '0');
130 }
131 
unpackRegion(uint16_t in,char * out)132 static size_t unpackRegion(uint16_t in, char* out) {
133     return unpackLanguageOrRegion(in, out, 'A', '0');
134 }
135 
// Returns true if c is in the range 'a'..'z'.
static inline bool isLowercase(char c) {
    if (c < 'a') {
        return false;
    }
    return c <= 'z';
}
139 
// Returns true if c is in the range 'A'..'Z'.
static inline bool isUppercase(char c) {
    if (c < 'A') {
        return false;
    }
    return c <= 'Z';
}
143 
// Returns true if c is in the range '0'..'9'.
static inline bool isDigit(char c) {
    if (c < '0') {
        return false;
    }
    return c <= '9';
}
147 
148 // Returns true if the buffer is valid for language code.
isValidLanguageCode(const StringPiece & buffer)149 static inline bool isValidLanguageCode(const StringPiece& buffer) {
150     if (buffer.length() != 2 && buffer.length() != 3) return false;
151     if (!isLowercase(buffer[0])) return false;
152     if (!isLowercase(buffer[1])) return false;
153     if (buffer.length() == 3 && !isLowercase(buffer[2])) return false;
154     return true;
155 }
156 
157 // Returns true if buffer is valid for script code. The length of buffer must be 4.
isValidScriptCode(const StringPiece & buffer)158 static inline bool isValidScriptCode(const StringPiece& buffer) {
159     return buffer.size() == 4 && isUppercase(buffer[0]) && isLowercase(buffer[1]) &&
160            isLowercase(buffer[2]) && isLowercase(buffer[3]);
161 }
162 
163 // Returns true if the buffer is valid for region code.
isValidRegionCode(const StringPiece & buffer)164 static inline bool isValidRegionCode(const StringPiece& buffer) {
165     return (buffer.size() == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) ||
166            (buffer.size() == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2]));
167 }
168 
169 // Parse BCP 47 language identifier into internal structure
Locale(const StringPiece & input)170 Locale::Locale(const StringPiece& input) : Locale() {
171     SplitIterator it(input, '-');
172 
173     StringPiece language = it.next();
174     if (isValidLanguageCode(language)) {
175         mLanguage = packLanguage(language);
176     } else {
177         // We don't understand anything other than two-letter or three-letter
178         // language codes, so we skip parsing the rest of the string.
179         return;
180     }
181 
182     if (!it.hasNext()) {
183         return;  // Language code only.
184     }
185     StringPiece token = it.next();
186 
187     if (isValidScriptCode(token)) {
188         mScript = packScript(token[0], token[1], token[2], token[3]);
189         mSubScriptBits = scriptToSubScriptBits(mScript);
190 
191         if (!it.hasNext()) {
192             goto finalize;  // No variant, emoji subtag and region code.
193         }
194         token = it.next();
195     }
196 
197     if (isValidRegionCode(token)) {
198         mRegion = packRegion(token);
199 
200         if (!it.hasNext()) {
201             goto finalize;  // No variant or emoji subtag.
202         }
203         token = it.next();
204     }
205 
206     if (language == "de") {  // We are only interested in German variants.
207         if (token == "1901") {
208             mVariant = Variant::GERMAN_1901_ORTHOGRAPHY;
209         } else if (token == "1996") {
210             mVariant = Variant::GERMAN_1996_ORTHOGRAPHY;
211         }
212 
213         if (mVariant != Variant::NO_VARIANT) {
214             if (!it.hasNext()) {
215                 goto finalize;  // No emoji subtag.
216             }
217 
218             token = it.next();
219         }
220     }
221 
222     resolveUnicodeExtension(input.data(), input.length());
223 
224 finalize:
225     if (mEmojiStyle == EmojiStyle::EMPTY) {
226         mEmojiStyle = scriptToEmojiStyle(mScript);
227     }
228 }
229 
resolveUnicodeExtension(const char * buf,size_t length)230 void Locale::resolveUnicodeExtension(const char* buf, size_t length) {
231     static const char kPrefix[] = "-u-";
232     const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
233     if (pos != buf + length) {
234         pos += strlen(kPrefix);
235         const size_t remainingLength = length - (pos - buf);
236         mEmojiStyle = resolveEmojiStyle(pos, remainingLength);
237     }
238 }
239 
240 // static
241 // Lookup emoji subtag and determine the emoji style.
resolveEmojiStyle(const char * buf,size_t length)242 EmojiStyle Locale::resolveEmojiStyle(const char* buf, size_t length) {
243     // 7 is the length of "-u-em-text", which is the shortest emoji subtag,
244     // unnecessary comparison can be avoided if total length is smaller than 10.
245     const size_t kMinSubtagLength = 7;
246     if (length >= kMinSubtagLength) {
247         static const char kPrefix[] = "em-";
248         const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
249         if (pos != buf + length) {  // found
250             pos += strlen(kPrefix);
251             const size_t remainingLength = length - (pos - buf);
252             if (isSubtag(pos, remainingLength, "emoji", 5)) {
253                 return EmojiStyle::EMOJI;
254             } else if (isSubtag(pos, remainingLength, "text", 4)) {
255                 return EmojiStyle::TEXT;
256             } else if (isSubtag(pos, remainingLength, "default", 7)) {
257                 return EmojiStyle::DEFAULT;
258             }
259         }
260     }
261     return EmojiStyle::EMPTY;
262 }
263 
scriptToEmojiStyle(uint32_t script)264 EmojiStyle Locale::scriptToEmojiStyle(uint32_t script) {
265     // If no emoji subtag was provided, resolve the emoji style from script code.
266     if (script == packScript('Z', 's', 'y', 'e')) {
267         return EmojiStyle::EMOJI;
268     } else if (script == packScript('Z', 's', 'y', 'm')) {
269         return EmojiStyle::TEXT;
270     }
271     return EmojiStyle::EMPTY;
272 }
273 
274 // static
scriptToSubScriptBits(uint32_t script)275 uint8_t Locale::scriptToSubScriptBits(uint32_t script) {
276     uint8_t subScriptBits = 0u;
277     switch (script) {
278         case packScript('B', 'o', 'p', 'o'):
279             subScriptBits = kBopomofoFlag;
280             break;
281         case packScript('H', 'a', 'n', 'g'):
282             subScriptBits = kHangulFlag;
283             break;
284         case packScript('H', 'a', 'n', 'b'):
285             // Bopomofo is almost exclusively used in Taiwan.
286             subScriptBits = kHanFlag | kBopomofoFlag;
287             break;
288         case packScript('H', 'a', 'n', 'i'):
289             subScriptBits = kHanFlag;
290             break;
291         case packScript('H', 'a', 'n', 's'):
292             subScriptBits = kHanFlag | kSimplifiedChineseFlag;
293             break;
294         case packScript('H', 'a', 'n', 't'):
295             subScriptBits = kHanFlag | kTraditionalChineseFlag;
296             break;
297         case packScript('H', 'i', 'r', 'a'):
298             subScriptBits = kHiraganaFlag;
299             break;
300         case packScript('H', 'r', 'k', 't'):
301             subScriptBits = kKatakanaFlag | kHiraganaFlag;
302             break;
303         case packScript('J', 'p', 'a', 'n'):
304             subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag;
305             break;
306         case packScript('K', 'a', 'n', 'a'):
307             subScriptBits = kKatakanaFlag;
308             break;
309         case packScript('K', 'o', 'r', 'e'):
310             subScriptBits = kHanFlag | kHangulFlag;
311             break;
312     }
313     return subScriptBits;
314 }
315 
getString() const316 std::string Locale::getString() const {
317     char buf[32];
318     int i = buildLocaleString(buf);
319     return std::string(buf, i);
320 }
321 
getStringWithLineBreakOption(LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle) const322 std::string Locale::getStringWithLineBreakOption(LineBreakStyle lbStyle,
323                                                  LineBreakWordStyle lbWordStyle) const {
324     char buf[48];
325     int i = buildLocaleString(buf);
326 
327     // Add line break unicode extension.
328     if (lbStyle != LineBreakStyle::None || lbWordStyle != LineBreakWordStyle::None) {
329         buf[i++] = '-';
330         buf[i++] = 'u';
331     }
332 
333     if (lbStyle != LineBreakStyle::None) {
334         buf[i++] = '-';
335         buf[i++] = 'l';
336         buf[i++] = 'b';
337         buf[i++] = '-';
338         switch (lbStyle) {
339             case LineBreakStyle::Loose:
340                 buf[i++] = 'l';
341                 buf[i++] = 'o';
342                 buf[i++] = 'o';
343                 buf[i++] = 's';
344                 buf[i++] = 'e';
345                 break;
346             case LineBreakStyle::Normal:
347                 buf[i++] = 'n';
348                 buf[i++] = 'o';
349                 buf[i++] = 'r';
350                 buf[i++] = 'm';
351                 buf[i++] = 'a';
352                 buf[i++] = 'l';
353                 break;
354             case LineBreakStyle::Strict:
355                 buf[i++] = 's';
356                 buf[i++] = 't';
357                 buf[i++] = 'r';
358                 buf[i++] = 'i';
359                 buf[i++] = 'c';
360                 buf[i++] = 't';
361                 break;
362             default:
363                 MINIKIN_ASSERT(false, "Must not reached.");
364         }
365     }
366 
367     if (lbWordStyle != LineBreakWordStyle::None) {
368         buf[i++] = '-';
369         buf[i++] = 'l';
370         buf[i++] = 'w';
371         buf[i++] = '-';
372         switch (lbWordStyle) {
373             case LineBreakWordStyle::Phrase:
374                 buf[i++] = 'p';
375                 buf[i++] = 'h';
376                 buf[i++] = 'r';
377                 buf[i++] = 'a';
378                 buf[i++] = 's';
379                 buf[i++] = 'e';
380                 break;
381             default:
382                 MINIKIN_ASSERT(false, "Must not reached.");
383         }
384     }
385     return std::string(buf, i);
386 }
387 
buildLocaleString(char * buf) const388 int Locale::buildLocaleString(char* buf) const {
389     size_t i;
390     if (mLanguage == NO_LANGUAGE) {
391         buf[0] = 'u';
392         buf[1] = 'n';
393         buf[2] = 'd';
394         i = 3;
395     } else {
396         i = unpackLanguage(mLanguage, buf);
397     }
398     if (mScript != NO_SCRIPT) {
399         uint32_t rawScript = unpackScript(mScript);
400         buf[i++] = '-';
401         buf[i++] = (rawScript >> 24) & 0xFFu;
402         buf[i++] = (rawScript >> 16) & 0xFFu;
403         buf[i++] = (rawScript >> 8) & 0xFFu;
404         buf[i++] = rawScript & 0xFFu;
405     }
406     if (mRegion != NO_REGION) {
407         buf[i++] = '-';
408         i += unpackRegion(mRegion, buf + i);
409     }
410     if (mVariant != Variant::NO_VARIANT) {
411         buf[i++] = '-';
412         buf[i++] = '1';
413         buf[i++] = '9';
414         switch (mVariant) {
415             case Variant::GERMAN_1901_ORTHOGRAPHY:
416                 buf[i++] = '0';
417                 buf[i++] = '1';
418                 break;
419             case Variant::GERMAN_1996_ORTHOGRAPHY:
420                 buf[i++] = '9';
421                 buf[i++] = '6';
422                 break;
423             default:
424                 MINIKIN_ASSERT(false, "Must not reached.");
425         }
426     }
427     return i;
428 }
429 
getPartialLocale(SubtagBits bits) const430 Locale Locale::getPartialLocale(SubtagBits bits) const {
431     Locale subLocale;
432     if ((bits & SubtagBits::LANGUAGE) != SubtagBits::EMPTY) {
433         subLocale.mLanguage = mLanguage;
434     } else {
435         subLocale.mLanguage = packLanguage("und");
436     }
437     if ((bits & SubtagBits::SCRIPT) != SubtagBits::EMPTY) {
438         subLocale.mScript = mScript;
439         subLocale.mSubScriptBits = mSubScriptBits;
440     }
441     if ((bits & SubtagBits::REGION) != SubtagBits::EMPTY) {
442         subLocale.mRegion = mRegion;
443     }
444     if ((bits & SubtagBits::VARIANT) != SubtagBits::EMPTY) {
445         subLocale.mVariant = mVariant;
446     }
447     if ((bits & SubtagBits::EMOJI) != SubtagBits::EMPTY) {
448         subLocale.mEmojiStyle = mEmojiStyle;
449     }
450     return subLocale;
451 }
452 
isEqualScript(const Locale & other) const453 bool Locale::isEqualScript(const Locale& other) const {
454     return other.mScript == mScript;
455 }
456 
457 // static
supportsScript(uint8_t providedBits,uint8_t requestedBits)458 bool Locale::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
459     return requestedBits != 0 && (providedBits & requestedBits) == requestedBits;
460 }
461 
supportsScript(uint32_t script) const462 bool Locale::supportsScript(uint32_t script) const {
463     static_assert(unpackScript(packScript('J', 'p', 'a', 'n')) == HB_TAG('J', 'p', 'a', 'n'),
464                   "The Minikin script and HarfBuzz hb_script_t have different encodings.");
465     uint32_t packedScript = packScript(script);
466     if (packedScript == mScript) return true;
467     return supportsScript(mSubScriptBits, scriptToSubScriptBits(packedScript));
468 }
469 
supportsScript(char c1,char c2,char c3,char c4) const470 bool Locale::supportsScript(char c1, char c2, char c3, char c4) const {
471     uint32_t packedScript = packScript(c1, c2, c3, c4);
472     if (packedScript == mScript) return true;
473     return supportsScript(mSubScriptBits, scriptToSubScriptBits(packedScript));
474 }
475 
calcScoreFor(const LocaleList & supported) const476 int Locale::calcScoreFor(const LocaleList& supported) const {
477     bool languageScriptMatch = false;
478     bool subtagMatch = false;
479     bool scriptMatch = false;
480 
481     for (size_t i = 0; i < supported.size(); ++i) {
482         if (mEmojiStyle != EmojiStyle::EMPTY && mEmojiStyle == supported[i].mEmojiStyle) {
483             subtagMatch = true;
484             if (mLanguage == supported[i].mLanguage) {
485                 return 4;
486             }
487         }
488         if (isEqualScript(supported[i]) ||
489             supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
490             scriptMatch = true;
491             if (mLanguage == supported[i].mLanguage) {
492                 languageScriptMatch = true;
493             }
494         }
495     }
496 
497     if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
498         scriptMatch = true;
499         if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLocale()) {
500             return 3;
501         }
502     }
503 
504     if (languageScriptMatch) {
505         return 3;
506     } else if (subtagMatch) {
507         return 2;
508     } else if (scriptMatch) {
509         return 1;
510     }
511     return 0;
512 }
513 
buildHbLanguage(const Locale & locale)514 static hb_language_t buildHbLanguage(const Locale& locale) {
515     return locale.isSupported() ? hb_language_from_string(locale.getString().c_str(), -1)
516                                 : HB_LANGUAGE_INVALID;
517 }
518 
LocaleList(std::vector<Locale> && locales)519 LocaleList::LocaleList(std::vector<Locale>&& locales) : mLocales(std::move(locales)) {
520     mIsAllTheSameLocale = true;
521     mUnionOfSubScriptBits = 0u;
522     mHbLangs.reserve(mLocales.size());
523     mEmojiStyle = EmojiStyle::EMPTY;
524     const auto firstLanguage = mLocales.empty() ? NO_LANGUAGE : mLocales[0].mLanguage;
525     for (const Locale& locale : mLocales) {
526         mUnionOfSubScriptBits |= locale.mSubScriptBits;
527         if (mIsAllTheSameLocale && firstLanguage != locale.mLanguage) {
528             mIsAllTheSameLocale = false;
529         }
530         mHbLangs.push_back(buildHbLanguage(locale));
531         if (mEmojiStyle == EmojiStyle::EMPTY) {
532             mEmojiStyle = locale.getEmojiStyle();
533         }
534     }
535 }
536 
atLeastOneScriptMatch(const LocaleList & list) const537 bool LocaleList::atLeastOneScriptMatch(const LocaleList& list) const {
538     if ((mUnionOfSubScriptBits & list.mUnionOfSubScriptBits) != 0) {
539         return true;
540     }
541 
542     for (const Locale& myLocale : mLocales) {
543         for (const Locale& otherLocale : list.mLocales) {
544             if (myLocale.isEqualScript(otherLocale)) {
545                 return true;
546             }
547         }
548     }
549 
550     return false;
551 }
552 
hasScript(char c1,char c2,char c3,char c4) const553 bool LocaleList::hasScript(char c1, char c2, char c3, char c4) const {
554     for (const Locale& locale : mLocales) {
555         if (locale.supportsScript(c1, c2, c3, c4)) {
556             return true;
557         }
558     }
559     return false;
560 }
561 
562 }  // namespace minikin
563