1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "minikin/Hyphenator.h"
18 
19 #include <unicode/uchar.h>
20 #include <unicode/uscript.h>
21 
22 #include <algorithm>
23 #include <memory>
24 #include <string>
25 #include <vector>
26 
27 #include "FeatureFlags.h"
28 #include "MinikinInternal.h"
29 #include "minikin/Characters.h"
30 
31 #ifndef _WIN32
32 #include "minikin_cxx_bridge.rs.h"
33 #endif  // _WIN32
34 
35 namespace minikin {
36 
37 // The following are structs that correspond to tables inside the hyb file format
38 
39 struct AlphabetTable0 {
40     uint32_t version;
41     uint32_t min_codepoint;
42     uint32_t max_codepoint;
43     uint8_t data[1];  // actually flexible array, size is known at runtime
44 };
45 
46 struct AlphabetTable1 {
47     uint32_t version;
48     uint32_t n_entries;
49     uint32_t data[1];  // actually flexible array, size is known at runtime
50 
codepointminikin::AlphabetTable151     static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
valueminikin::AlphabetTable152     static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
53 };
54 
55 struct Trie {
56     uint32_t version;
57     uint32_t char_mask;
58     uint32_t link_shift;
59     uint32_t link_mask;
60     uint32_t pattern_shift;
61     uint32_t n_entries;
62     uint32_t data[1];  // actually flexible array, size is known at runtime
63 };
64 
65 struct Pattern {
66     uint32_t version;
67     uint32_t n_entries;
68     uint32_t pattern_offset;
69     uint32_t pattern_size;
70     uint32_t data[1];  // actually flexible array, size is known at runtime
71 
72     // accessors
lenminikin::Pattern73     static uint32_t len(uint32_t entry) { return entry >> 26; }
shiftminikin::Pattern74     static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
bufminikin::Pattern75     const uint8_t* buf(uint32_t entry) const {
76         return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff);
77     }
78 };
79 
80 struct Header {
81     uint32_t magic;
82     uint32_t version;
83     uint32_t alphabet_offset;
84     uint32_t trie_offset;
85     uint32_t pattern_offset;
86     uint32_t file_size;
87 
88     // accessors
bytesminikin::Header89     const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); }
alphabetVersionminikin::Header90     uint32_t alphabetVersion() const {
91         return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
92     }
alphabetTable0minikin::Header93     const AlphabetTable0* alphabetTable0() const {
94         return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
95     }
alphabetTable1minikin::Header96     const AlphabetTable1* alphabetTable1() const {
97         return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
98     }
trieTableminikin::Header99     const Trie* trieTable() const { return reinterpret_cast<const Trie*>(bytes() + trie_offset); }
patternTableminikin::Header100     const Pattern* patternTable() const {
101         return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
102     }
103 };
104 
105 #ifndef _WIN32
106 class HyphenatorRust : public Hyphenator {
107 public:
HyphenatorRust(const uint8_t * patternData,size_t dataSize,size_t minPrefix,size_t minSuffix,const std::string & locale)108     HyphenatorRust(const uint8_t* patternData, size_t dataSize, size_t minPrefix, size_t minSuffix,
109                    const std::string& locale)
110             : mHyphenator(rust::load_hyphenator(::rust::cxxbridge1::Slice(patternData, dataSize),
111                                                 minPrefix, minSuffix, locale)) {}
112 
hyphenate(const U16StringPiece & word,HyphenationType * out) const113     virtual void hyphenate(const U16StringPiece& word, HyphenationType* out) const override {
114         static_assert(sizeof(HyphenationType) == sizeof(uint8_t),
115                       "HyphnationType must be uint8_t.");
116         rust::hyphenate(*mHyphenator, ::rust::cxxbridge1::Slice(word.data(), word.size()),
117                         ::rust::cxxbridge1::Slice(reinterpret_cast<uint8_t*>(out), word.size()));
118     }
119 
120 private:
121     ::rust::Box<rust::Hyphenator> mHyphenator;
122 };
123 #endif  // _WIN32
124 
125 // static
loadBinary(const uint8_t * patternData,size_t dataSize,size_t minPrefix,size_t minSuffix,const std::string & locale)126 Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData, size_t dataSize, size_t minPrefix,
127                                    size_t minSuffix, const std::string& locale) {
128 #ifdef _WIN32
129     return HyphenatorCXX::loadBinary(patternData, dataSize, minPrefix, minSuffix, locale);
130 #else   // _WIN32
131     if (features::rust_hyphenator()) {
132         return new HyphenatorRust(patternData, dataSize, minPrefix, minSuffix, locale);
133     } else {
134         return HyphenatorCXX::loadBinary(patternData, dataSize, minPrefix, minSuffix, locale);
135     }
136 #endif  // _WIN32
137 }
138 
139 #ifdef _WIN32
loadBinaryForRust(const uint8_t *,size_t,size_t,size_t,const std::string &)140 Hyphenator* Hyphenator::loadBinaryForRust(const uint8_t* /*patternData*/, size_t /*dataSize*/,
141                                           size_t /*minPrefix*/, size_t /*minSuffix*/,
142                                           const std::string& /*locale*/) {
143     MINIKIN_NOT_REACHED("Rust implementation is not available on Win32");
144 }
145 #else   // _WIN32
loadBinaryForRust(const uint8_t * patternData,size_t dataSize,size_t minPrefix,size_t minSuffix,const std::string & locale)146 Hyphenator* Hyphenator::loadBinaryForRust(const uint8_t* patternData, size_t dataSize,
147                                           size_t minPrefix, size_t minSuffix,
148                                           const std::string& locale) {
149     return new HyphenatorRust(patternData, dataSize, minPrefix, minSuffix, locale);
150 }
151 #endif  // _WIN32
152 // static
loadBinary(const uint8_t * patternData,size_t,size_t minPrefix,size_t minSuffix,const std::string & locale)153 Hyphenator* HyphenatorCXX::loadBinary(const uint8_t* patternData, size_t, size_t minPrefix,
154                                       size_t minSuffix, const std::string& locale) {
155     HyphenationLocale hyphenLocale = HyphenationLocale::OTHER;
156     if (locale == "pl") {
157         hyphenLocale = HyphenationLocale::POLISH;
158     } else if (locale == "ca") {
159         hyphenLocale = HyphenationLocale::CATALAN;
160     } else if (locale == "sl") {
161         hyphenLocale = HyphenationLocale::SLOVENIAN;
162     }
163     return new HyphenatorCXX(patternData, minPrefix, minSuffix, hyphenLocale);
164 }
165 
HyphenatorCXX(const uint8_t * patternData,size_t minPrefix,size_t minSuffix,HyphenationLocale hyphenLocale)166 HyphenatorCXX::HyphenatorCXX(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
167                              HyphenationLocale hyphenLocale)
168         : mPatternData(patternData),
169           mMinPrefix(minPrefix),
170           mMinSuffix(minSuffix),
171           mHyphenationLocale(hyphenLocale) {}
172 
hyphenate(const U16StringPiece & word,HyphenationType * out) const173 void HyphenatorCXX::hyphenate(const U16StringPiece& word, HyphenationType* out) const {
174     const size_t len = word.size();
175     const size_t paddedLen = len + 2;  // start and stop code each count for 1
176     if (mPatternData != nullptr && len >= mMinPrefix + mMinSuffix &&
177         paddedLen <= MAX_HYPHENATED_SIZE) {
178         uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
179         const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word);
180 
181         if (hyphenValue != HyphenationType::DONT_BREAK) {
182             hyphenateFromCodes(alpha_codes, paddedLen, hyphenValue, out);
183             return;
184         }
185         // TODO: try NFC normalization
186         // TODO: handle non-BMP Unicode (requires remapping of offsets)
187     }
188     // Note that we will always get here if the word contains a hyphen or a soft hyphen, because the
189     // alphabet is not expected to contain a hyphen or a soft hyphen character, so alphabetLookup
190     // would return DONT_BREAK.
191     hyphenateWithNoPatterns(word, out);
192 }
193 
194 // This function determines whether a character is like U+2010 HYPHEN in
195 // line breaking and usage: a character immediately after which line breaks
196 // are allowed, but words containing it should not be automatically
197 // hyphenated using patterns. This is a curated set, created by manually
198 // inspecting all the characters that have the Unicode line breaking
199 // property of BA or HY and seeing which ones are hyphens.
isLineBreakingHyphen(uint32_t c)200 bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
201     return (c == 0x002D ||  // HYPHEN-MINUS
202             c == 0x058A ||  // ARMENIAN HYPHEN
203             c == 0x05BE ||  // HEBREW PUNCTUATION MAQAF
204             c == 0x1400 ||  // CANADIAN SYLLABICS HYPHEN
205             c == 0x2010 ||  // HYPHEN
206             c == 0x2013 ||  // EN DASH
207             c == 0x2027 ||  // HYPHENATION POINT
208             c == 0x2E17 ||  // DOUBLE OBLIQUE HYPHEN
209             c == 0x2E40);   // DOUBLE HYPHEN
210 }
211 
editForThisLine(HyphenationType type)212 EndHyphenEdit editForThisLine(HyphenationType type) {
213     switch (type) {
214         case HyphenationType::BREAK_AND_INSERT_HYPHEN:
215             return EndHyphenEdit::INSERT_HYPHEN;
216         case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
217             return EndHyphenEdit::INSERT_ARMENIAN_HYPHEN;
218         case HyphenationType::BREAK_AND_INSERT_MAQAF:
219             return EndHyphenEdit::INSERT_MAQAF;
220         case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN:
221             return EndHyphenEdit::INSERT_UCAS_HYPHEN;
222         case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN:
223             return EndHyphenEdit::REPLACE_WITH_HYPHEN;
224         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
225             return EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN;
226         case HyphenationType::DONT_BREAK:  // Hyphen edit for non breaking case doesn't make sense.
227         default:
228             return EndHyphenEdit::NO_EDIT;
229     }
230 }
231 
editForNextLine(HyphenationType type)232 StartHyphenEdit editForNextLine(HyphenationType type) {
233     switch (type) {
234         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
235             return StartHyphenEdit::INSERT_HYPHEN;
236         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
237             return StartHyphenEdit::INSERT_ZWJ;
238         case HyphenationType::DONT_BREAK:  // Hyphen edit for non breaking case doesn't make sense.
239         default:
240             return StartHyphenEdit::NO_EDIT;
241     }
242 }
243 
getScript(uint32_t codePoint)244 static UScriptCode getScript(uint32_t codePoint) {
245     UErrorCode errorCode = U_ZERO_ERROR;
246     const UScriptCode script = uscript_getScript(static_cast<UChar32>(codePoint), &errorCode);
247     if (U_SUCCESS(errorCode)) {
248         return script;
249     } else {
250         return USCRIPT_INVALID_CODE;
251     }
252 }
253 
hyphenationTypeBasedOnScript(uint32_t codePoint)254 static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
255     // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
256     // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
257     // for now to be safe.  BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
258     // to maqaf for Hebrew, we can simply add a condition here.
259     const UScriptCode script = getScript(codePoint);
260     if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL ||
261         script == USCRIPT_TELUGU) {
262         // Grantha is not included, since we don't support non-BMP hyphenation yet.
263         return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
264     } else if (script == USCRIPT_ARMENIAN) {
265         return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
266     } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
267         return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
268     } else {
269         return HyphenationType::BREAK_AND_INSERT_HYPHEN;
270     }
271 }
272 
getJoiningType(UChar32 codepoint)273 static inline int32_t getJoiningType(UChar32 codepoint) {
274     return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
275 }
276 
277 // Assumption for caller: location must be >= 2 and word[location] == CHAR_SOFT_HYPHEN.
278 // This function decides if the letters before and after the hyphen should appear as joining.
getHyphTypeForArabic(const U16StringPiece & word,size_t location)279 static inline HyphenationType getHyphTypeForArabic(const U16StringPiece& word, size_t location) {
280     ssize_t i = location;
281     int32_t type = U_JT_NON_JOINING;
282     while (static_cast<size_t>(i) < word.size() &&
283            (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
284         i++;
285     }
286     if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || type == U_JT_JOIN_CAUSING) {
287         // The next character is of the type that may join the last character. See if the last
288         // character is also of the right type.
289         i = location - 2;  // Skip the soft hyphen
290         type = U_JT_NON_JOINING;
291         while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
292             i--;
293         }
294         if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || type == U_JT_JOIN_CAUSING) {
295             return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ;
296         }
297     }
298     return HyphenationType::BREAK_AND_INSERT_HYPHEN;
299 }
300 
301 // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words
302 // that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections
303 // 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen).
hyphenateWithNoPatterns(const U16StringPiece & word,HyphenationType * out) const304 void HyphenatorCXX::hyphenateWithNoPatterns(const U16StringPiece& word,
305                                             HyphenationType* out) const {
306     out[0] = HyphenationType::DONT_BREAK;
307     for (size_t i = 1; i < word.size(); i++) {
308         const uint16_t prevChar = word[i - 1];
309         if (i > 1 && isLineBreakingHyphen(prevChar)) {
310             // Break after hyphens, but only if they don't start the word.
311 
312             if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) &&
313                 (mHyphenationLocale == HyphenationLocale::POLISH ||
314                  mHyphenationLocale == HyphenationLocale::SLOVENIAN) &&
315                 getScript(word[i]) == USCRIPT_LATIN) {
316                 // In Polish and Slovenian, hyphens get repeated at the next line. To be safe,
317                 // we will do this only if the next character is Latin.
318                 out[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
319             } else {
320                 out[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
321             }
322         } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) {
323             // Break after soft hyphens, but only if they don't start the word (a soft hyphen
324             // starting the word doesn't give any useful break opportunities). The type of the break
325             // is based on the script of the character we break on.
326             if (getScript(word[i]) == USCRIPT_ARABIC) {
327                 // For Arabic, we need to look and see if the characters around the soft hyphen
328                 // actually join. If they don't, we'll just insert a normal hyphen.
329                 out[i] = getHyphTypeForArabic(word, i);
330             } else {
331                 out[i] = hyphenationTypeBasedOnScript(word[i]);
332             }
333         } else if (prevChar == CHAR_MIDDLE_DOT && mMinPrefix < i && i <= word.size() - mMinSuffix &&
334                    ((word[i - 2] == 'l' && word[i] == 'l') ||
335                     (word[i - 2] == 'L' && word[i] == 'L')) &&
336                    mHyphenationLocale == HyphenationLocale::CATALAN) {
337             // In Catalan, "l·l" should break as "l-" on the first line
338             // and "l" on the next line.
339             out[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN;
340         } else {
341             out[i] = HyphenationType::DONT_BREAK;
342         }
343     }
344 }
345 
alphabetLookup(uint16_t * alpha_codes,const U16StringPiece & word) const346 HyphenationType HyphenatorCXX::alphabetLookup(uint16_t* alpha_codes,
347                                               const U16StringPiece& word) const {
348     const Header* header = getHeader();
349     HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN;
350     // TODO: check header magic
351     uint32_t alphabetVersion = header->alphabetVersion();
352     if (alphabetVersion == 0) {
353         const AlphabetTable0* alphabet = header->alphabetTable0();
354         uint32_t min_codepoint = alphabet->min_codepoint;
355         uint32_t max_codepoint = alphabet->max_codepoint;
356         alpha_codes[0] = 0;  // word start
357         for (size_t i = 0; i < word.size(); i++) {
358             uint16_t c = word[i];
359             if (c < min_codepoint || c >= max_codepoint) {
360                 return HyphenationType::DONT_BREAK;
361             }
362             uint8_t code = alphabet->data[c - min_codepoint];
363             if (code == 0) {
364                 return HyphenationType::DONT_BREAK;
365             }
366             if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
367                 result = hyphenationTypeBasedOnScript(c);
368             }
369             alpha_codes[i + 1] = code;
370         }
371         alpha_codes[word.size() + 1] = 0;  // word termination
372         return result;
373     } else if (alphabetVersion == 1) {
374         const AlphabetTable1* alphabet = header->alphabetTable1();
375         size_t n_entries = alphabet->n_entries;
376         const uint32_t* begin = alphabet->data;
377         const uint32_t* end = begin + n_entries;
378         alpha_codes[0] = 0;
379         for (size_t i = 0; i < word.size(); i++) {
380             uint16_t c = word[i];
381             auto p = std::lower_bound(begin, end, c << 11);
382             if (p == end) {
383                 return HyphenationType::DONT_BREAK;
384             }
385             uint32_t entry = *p;
386             if (AlphabetTable1::codepoint(entry) != c) {
387                 return HyphenationType::DONT_BREAK;
388             }
389             if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
390                 result = hyphenationTypeBasedOnScript(c);
391             }
392             alpha_codes[i + 1] = AlphabetTable1::value(entry);
393         }
394         alpha_codes[word.size() + 1] = 0;
395         return result;
396     }
397     return HyphenationType::DONT_BREAK;
398 }
399 
400 /**
401  * Internal implementation, after conversion to codes. All case folding and normalization
402  * has been done by now, and all characters have been found in the alphabet.
403  * Note: len here is the padded length including 0 codes at start and end.
404  **/
hyphenateFromCodes(const uint16_t * codes,size_t len,HyphenationType hyphenValue,HyphenationType * out) const405 void HyphenatorCXX::hyphenateFromCodes(const uint16_t* codes, size_t len,
406                                        HyphenationType hyphenValue, HyphenationType* out) const {
407     static_assert(sizeof(HyphenationType) == sizeof(uint8_t), "HyphnationType must be uint8_t.");
408     // Reuse the result array as a buffer for calculating intermediate hyphenation numbers.
409     uint8_t* buffer = reinterpret_cast<uint8_t*>(out);
410 
411     const Header* header = getHeader();
412     const Trie* trie = header->trieTable();
413     const Pattern* pattern = header->patternTable();
414     uint32_t char_mask = trie->char_mask;
415     uint32_t link_shift = trie->link_shift;
416     uint32_t link_mask = trie->link_mask;
417     uint32_t pattern_shift = trie->pattern_shift;
418     size_t maxOffset = len - mMinSuffix - 1;
419     for (size_t i = 0; i < len - 1; i++) {
420         uint32_t node = 0;  // index into Trie table
421         for (size_t j = i; j < len; j++) {
422             uint16_t c = codes[j];
423             uint32_t entry = trie->data[node + c];
424             if ((entry & char_mask) == c) {
425                 node = (entry & link_mask) >> link_shift;
426             } else {
427                 break;
428             }
429             uint32_t pat_ix = trie->data[node] >> pattern_shift;
430             // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset
431             // into the buf pool. This is the pattern for the substring (i..j) we just matched,
432             // which we combine (via point-wise max) into the buffer vector.
433             if (pat_ix != 0) {
434                 uint32_t pat_entry = pattern->data[pat_ix];
435                 int pat_len = Pattern::len(pat_entry);
436                 int pat_shift = Pattern::shift(pat_entry);
437                 const uint8_t* pat_buf = pattern->buf(pat_entry);
438                 int offset = j + 1 - (pat_len + pat_shift);
439                 // offset is the index within buffer that lines up with the start of pat_buf
440                 int start = std::max((int)mMinPrefix - offset, 0);
441                 int end = std::min(pat_len, (int)maxOffset - offset);
442                 for (int k = start; k < end; k++) {
443                     buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]);
444                 }
445             }
446         }
447     }
448     // Since the above calculation does not modify values outside
449     // [mMinPrefix, len - mMinSuffix], they are left as 0 = DONT_BREAK.
450     for (size_t i = mMinPrefix; i < maxOffset; i++) {
451         // Hyphenation opportunities happen when the hyphenation numbers are odd.
452         out[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK;
453     }
454 }
455 
456 }  // namespace minikin
457