1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINIKIN_LOCALE_LIST_H
18 #define MINIKIN_LOCALE_LIST_H
19 
20 #include <hb.h>
21 
22 #include <string>
23 #include <vector>
24 
25 #include "StringPiece.h"
26 #include "minikin/LineBreakStyle.h"
27 
28 namespace minikin {
29 
// Maximum number of locales taken into account for font fallback.
// Due to the limits in font fallback score calculation, we can't use anything more than 12
// locales.
constexpr size_t FONT_LOCALE_LIMIT = 12;

// Sentinel values meaning "this subtag is not specified".
// The language or region code is encoded to 15 bits, so the sentinel is the all-ones 15 bit value.
constexpr uint16_t NO_LANGUAGE = 0x7fff;
constexpr uint16_t NO_REGION = 0x7fff;
// The script code is encoded to 20 bits, so the sentinel is the all-ones 20 bit value.
constexpr uint32_t NO_SCRIPT = 0xfffff;

// Forward declaration: Locale refers to LocaleList before LocaleList is defined.
class LocaleList;
40 
// Bit flags naming the subtags of a Locale. Used for making a sub-locale from a Locale
// (see Locale::getPartialLocale). Each flag occupies its own bit so flags can be combined.
enum class SubtagBits : uint8_t {
    EMPTY = 0,
    LANGUAGE = 1u << 0,
    SCRIPT = 1u << 1,
    REGION = 1u << 2,
    VARIANT = 1u << 3,
    EMOJI = 1u << 4,
    // Every flag above combined.
    ALL = LANGUAGE | SCRIPT | REGION | VARIANT | EMOJI,
};
51 
52 inline constexpr SubtagBits operator&(SubtagBits l, SubtagBits r) {
53     return static_cast<SubtagBits>(static_cast<uint8_t>(l) & static_cast<uint8_t>(r));
54 }
55 inline constexpr SubtagBits operator|(SubtagBits l, SubtagBits r) {
56     return static_cast<SubtagBits>(static_cast<uint8_t>(l) | static_cast<uint8_t>(r));
57 }
58 
// Emoji style carried by a locale.
// NOTE: Locale::getIdentifier() stores this value in a 2 bit field, so every enumerator must
// stay within [0, 3].
enum class EmojiStyle : uint8_t {
    // No emoji style is specified.
    EMPTY = 0,
    // Default emoji style is specified.
    DEFAULT = 1,
    // Emoji (color) emoji style is specified.
    EMOJI = 2,
    // Text (black/white) emoji style is specified.
    TEXT = 3,
};
66 
67 // Locale is a compact representation of a BCP 47 language tag.
68 // It does not capture all possible information, only what directly affects text layout:
69 // font rendering, hyphenation, word breaking, etc.
70 struct Locale {
71 public:
72     enum class Variant : uint16_t {
73         NO_VARIANT = 0x0000,
74         GERMAN_1901_ORTHOGRAPHY = 0x0001,
75         GERMAN_1996_ORTHOGRAPHY = 0x0002,
76     };
77 
78     // Default constructor creates the unsupported locale.
LocaleLocale79     Locale()
80             : mScript(NO_SCRIPT),
81               mLanguage(NO_LANGUAGE),
82               mRegion(NO_REGION),
83               mSubScriptBits(0ul),
84               mVariant(Variant::NO_VARIANT),
85               mEmojiStyle(EmojiStyle::EMPTY) {}
86 
87     // Parse from string
88     Locale(const StringPiece& buf);
89 
90     // Parse from identifier. See getIdentifier() for the identifier format.
LocaleLocale91     explicit Locale(uint64_t identifier)
92             : mScript(extractBits(identifier, 29, 20)),
93               mLanguage(extractBits(identifier, 49, 15)),
94               mRegion(extractBits(identifier, 14, 15)),
95               mSubScriptBits(scriptToSubScriptBits(mScript)),
96               mVariant(static_cast<Variant>(extractBits(identifier, 0, 2))),
97               mEmojiStyle(static_cast<EmojiStyle>(extractBits(identifier, 12, 2))) {}
98 
99     bool operator==(const Locale& other) const {
100         return !isUnsupported() && isEqualScript(other) && mLanguage == other.mLanguage &&
101                mRegion == other.mRegion && mVariant == other.mVariant &&
102                mEmojiStyle == other.mEmojiStyle;
103     }
104 
105     bool operator!=(const Locale other) const { return !(*this == other); }
106 
hasLanguageLocale107     inline bool hasLanguage() const { return mLanguage != NO_LANGUAGE; }
hasScriptLocale108     inline bool hasScript() const { return mScript != NO_SCRIPT; }
hasRegionLocale109     inline bool hasRegion() const { return mRegion != NO_REGION; }
hasVariantLocale110     inline bool hasVariant() const { return mVariant != Variant::NO_VARIANT; }
hasEmojiStyleLocale111     inline bool hasEmojiStyle() const { return mEmojiStyle != EmojiStyle::EMPTY; }
112 
isSupportedLocale113     inline bool isSupported() const {
114         return hasLanguage() || hasScript() || hasRegion() || hasVariant() || hasEmojiStyle();
115     }
116 
isUnsupportedLocale117     inline bool isUnsupported() const { return !isSupported(); }
118 
getEmojiStyleLocale119     EmojiStyle getEmojiStyle() const { return mEmojiStyle; }
120 
121     bool isEqualScript(const Locale& other) const;
122 
123     // Returns true if this script supports the given script. For example, ja-Jpan supports Hira,
124     // ja-Hira doesn't support Jpan.
125     bool supportsScript(uint32_t script) const;
126     bool supportsScript(char c1, char c2, char c3, char c4) const;
127 
128     std::string getString() const;
129 
130     std::string getStringWithLineBreakOption(LineBreakStyle lbStyle,
131                                              LineBreakWordStyle lbWordStyle) const;
132 
133     // Calculates a matching score. This score represents how well the input locales cover this
134     // locale. The maximum score in the locale list is returned.
135     // 0 = no match, 1 = script match, 2 = script and primary language match.
136     int calcScoreFor(const LocaleList& supported) const;
137 
138     // Identifier pattern:
139     // |-------|-------|-------|-------|-------|-------|-------|-------|
140     // lllllllllllllll                                                   Language Code (15 bits)
141     //                ssssssssssssssssssss                               Script Code (20 bits)
142     //                                    rrrrrrrrrrrrrrr                Region Code (15 bits)
143     //                                                   ee              Emoji Style (2 bits)
144     //                                                     XXXXXXXXXX    Free (10 bits)
145     //                                                               vv  German Variant (2 bits)
getIdentifierLocale146     uint64_t getIdentifier() const {
147         return ((uint64_t)mLanguage << 49) | ((uint64_t)mScript << 29) | ((uint64_t)mRegion << 14) |
148                ((uint64_t)mEmojiStyle << 12) | (uint64_t)mVariant;
149     }
150 
151     Locale getPartialLocale(SubtagBits bits) const;
152 
153 private:
154     friend class LocaleList;  // for LocaleList constructor
155 
156     // ISO 15924 compliant script code. The 4 chars script code are packed into a 20 bit integer.
157     // If not specified, this is kInvalidScript.
158     uint32_t mScript;
159 
160     // ISO 639-1 or ISO 639-2 compliant language code.
161     // The two- or three-letter language code is packed into a 15 bit integer.
162     // mLanguage = 0 means the Locale is unsupported.
163     uint16_t mLanguage;
164 
165     // ISO 3166-1 or UN M.49 compliant region code. The two-letter or three-digit region code is
166     // packed into a 15 bit integer.
167     uint16_t mRegion;
168 
169     // For faster comparing, use 7 bits for specific scripts.
170     static const uint8_t kBopomofoFlag = 1u;
171     static const uint8_t kHanFlag = 1u << 1;
172     static const uint8_t kHangulFlag = 1u << 2;
173     static const uint8_t kHiraganaFlag = 1u << 3;
174     static const uint8_t kKatakanaFlag = 1u << 4;
175     static const uint8_t kSimplifiedChineseFlag = 1u << 5;
176     static const uint8_t kTraditionalChineseFlag = 1u << 6;
177     uint8_t mSubScriptBits;
178 
179     Variant mVariant;
180 
181     EmojiStyle mEmojiStyle;
182 
183     void resolveUnicodeExtension(const char* buf, size_t length);
184 
extractBitsLocale185     inline static uint64_t extractBits(uint64_t value, uint8_t shift, uint8_t nBits) {
186         return (value >> shift) & ((1 << nBits) - 1);
187     }
188 
189     int buildLocaleString(char* buf) const;
190 
191     static uint8_t scriptToSubScriptBits(uint32_t rawScript);
192 
193     static EmojiStyle resolveEmojiStyle(const char* buf, size_t length);
194     static EmojiStyle scriptToEmojiStyle(uint32_t script);
195 
196     // Returns true if the provide subscript bits has the requested subscript bits.
197     // Note that this function returns false if the requested subscript bits are empty.
198     static bool supportsScript(uint8_t providedBits, uint8_t requestedBits);
199 };
200 
201 // An immutable list of locale.
202 class LocaleList {
203 public:
204     explicit LocaleList(std::vector<Locale>&& locales);
LocaleList()205     LocaleList()
206             : mUnionOfSubScriptBits(0),
207               mIsAllTheSameLocale(false),
208               mEmojiStyle(EmojiStyle::EMPTY) {}
209     LocaleList(LocaleList&&) = default;
210 
size()211     size_t size() const { return mLocales.size(); }
empty()212     bool empty() const { return mLocales.empty(); }
213     const Locale& operator[](size_t n) const { return mLocales[n]; }
214 
getHbLanguage(size_t n)215     hb_language_t getHbLanguage(size_t n) const { return mHbLangs[n]; }
216 
217     // Returns an effective emoji style of this locale list.
218     // The effective means the first non empty emoji style in the list.
getEmojiStyle()219     EmojiStyle getEmojiStyle() const { return mEmojiStyle; }
220 
221     bool atLeastOneScriptMatch(const LocaleList& list) const;
222 
hasJapanese()223     bool hasJapanese() const { return hasScript('J', 'p', 'a', 'n'); }
hasKorean()224     bool hasKorean() const { return hasScript('K', 'o', 'r', 'e'); }
225 
226 private:
227     friend struct Locale;  // for calcScoreFor
228 
229     std::vector<Locale> mLocales;
230 
231     // The languages to be passed to HarfBuzz shaper.
232     std::vector<hb_language_t> mHbLangs;
233     uint8_t mUnionOfSubScriptBits;
234     bool mIsAllTheSameLocale;
235     EmojiStyle mEmojiStyle;
236 
getUnionOfSubScriptBits()237     uint8_t getUnionOfSubScriptBits() const { return mUnionOfSubScriptBits; }
isAllTheSameLocale()238     bool isAllTheSameLocale() const { return mIsAllTheSameLocale; }
239 
240     bool hasScript(char c1, char c2, char c3, char c4) const;
241 
242     // Do not copy and assign.
243     LocaleList(const LocaleList&) = delete;
244     void operator=(const LocaleList&) = delete;
245 };
246 
247 }  // namespace minikin
248 
249 #endif  // MINIKIN_LOCALE_LIST_H
250