1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <vector>
18 #include <memory>
19 #include <algorithm>
20 #include <string>
21 #include <unicode/uchar.h>
22 
23 // HACK: for reading pattern file
24 #include <fcntl.h>
25 
26 #define LOG_TAG "Minikin"
27 #include "utils/Log.h"
28 
29 #include "minikin/Hyphenator.h"
30 
31 using std::vector;
32 
33 namespace android {
34 
35 static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
36 
37 // The following are structs that correspond to tables inside the hyb file format
38 
39 struct AlphabetTable0 {
40     uint32_t version;
41     uint32_t min_codepoint;
42     uint32_t max_codepoint;
43     uint8_t data[1];  // actually flexible array, size is known at runtime
44 };
45 
46 struct AlphabetTable1 {
47     uint32_t version;
48     uint32_t n_entries;
49     uint32_t data[1]; // actually flexible array, size is known at runtime
50 
codepointandroid::AlphabetTable151     static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
valueandroid::AlphabetTable152     static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
53 };
54 
55 struct Trie {
56     uint32_t version;
57     uint32_t char_mask;
58     uint32_t link_shift;
59     uint32_t link_mask;
60     uint32_t pattern_shift;
61     uint32_t n_entries;
62     uint32_t data[1];  // actually flexible array, size is known at runtime
63 };
64 
65 struct Pattern {
66     uint32_t version;
67     uint32_t n_entries;
68     uint32_t pattern_offset;
69     uint32_t pattern_size;
70     uint32_t data[1];  // actually flexible array, size is known at runtime
71 
72     // accessors
lenandroid::Pattern73     static uint32_t len(uint32_t entry) { return entry >> 26; }
shiftandroid::Pattern74     static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
bufandroid::Pattern75     const uint8_t* buf(uint32_t entry) const {
76         return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff);
77     }
78 };
79 
80 struct Header {
81     uint32_t magic;
82     uint32_t version;
83     uint32_t alphabet_offset;
84     uint32_t trie_offset;
85     uint32_t pattern_offset;
86     uint32_t file_size;
87 
88     // accessors
bytesandroid::Header89     const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); }
alphabetVersionandroid::Header90     uint32_t alphabetVersion() const {
91         return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
92     }
alphabetTable0android::Header93     const AlphabetTable0* alphabetTable0() const {
94         return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
95     }
alphabetTable1android::Header96     const AlphabetTable1* alphabetTable1() const {
97         return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
98     }
trieTableandroid::Header99     const Trie* trieTable() const {
100         return reinterpret_cast<const Trie*>(bytes() + trie_offset);
101     }
patternTableandroid::Header102     const Pattern* patternTable() const {
103         return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
104     }
105 };
106 
loadBinary(const uint8_t * patternData)107 Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData) {
108     Hyphenator* result = new Hyphenator;
109     result->patternData = patternData;
110     return result;
111 }
112 
hyphenate(vector<uint8_t> * result,const uint16_t * word,size_t len)113 void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) {
114     result->clear();
115     result->resize(len);
116     const size_t paddedLen = len + 2;  // start and stop code each count for 1
117     if (patternData != nullptr &&
118             (int)len >= MIN_PREFIX + MIN_SUFFIX && paddedLen <= MAX_HYPHENATED_SIZE) {
119         uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
120         if (alphabetLookup(alpha_codes, word, len)) {
121             hyphenateFromCodes(result->data(), alpha_codes, paddedLen);
122             return;
123         }
124         // TODO: try NFC normalization
125         // TODO: handle non-BMP Unicode (requires remapping of offsets)
126     }
127     hyphenateSoft(result->data(), word, len);
128 }
129 
130 // If any soft hyphen is present in the word, use soft hyphens to decide hyphenation,
131 // as recommended in UAX #14 (Use of Soft Hyphen)
hyphenateSoft(uint8_t * result,const uint16_t * word,size_t len)132 void Hyphenator::hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len) {
133     result[0] = 0;
134     for (size_t i = 1; i < len; i++) {
135         result[i] = word[i - 1] == CHAR_SOFT_HYPHEN;
136      }
137 }
138 
alphabetLookup(uint16_t * alpha_codes,const uint16_t * word,size_t len)139 bool Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len) {
140     const Header* header = getHeader();
141     // TODO: check header magic
142     uint32_t alphabetVersion = header->alphabetVersion();
143     if (alphabetVersion == 0) {
144         const AlphabetTable0* alphabet = header->alphabetTable0();
145         uint32_t min_codepoint = alphabet->min_codepoint;
146         uint32_t max_codepoint = alphabet->max_codepoint;
147         alpha_codes[0] = 0;  // word start
148         for (size_t i = 0; i < len; i++) {
149             uint16_t c = word[i];
150             if (c < min_codepoint || c >= max_codepoint) {
151                 return false;
152             }
153             uint8_t code = alphabet->data[c - min_codepoint];
154             if (code == 0) {
155                 return false;
156             }
157             alpha_codes[i + 1] = code;
158         }
159         alpha_codes[len + 1] = 0;  // word termination
160         return true;
161     } else if (alphabetVersion == 1) {
162         const AlphabetTable1* alphabet = header->alphabetTable1();
163         size_t n_entries = alphabet->n_entries;
164         const uint32_t* begin = alphabet->data;
165         const uint32_t* end = begin + n_entries;
166         alpha_codes[0] = 0;
167         for (size_t i = 0; i < len; i++) {
168             uint16_t c = word[i];
169             auto p = std::lower_bound(begin, end, c << 11);
170             if (p == end) {
171                 return false;
172             }
173             uint32_t entry = *p;
174             if (AlphabetTable1::codepoint(entry) != c) {
175                 return false;
176             }
177             alpha_codes[i + 1] = AlphabetTable1::value(entry);
178         }
179         alpha_codes[len + 1] = 0;
180         return true;
181     }
182     return false;
183 }
184 
185 /**
186  * Internal implementation, after conversion to codes. All case folding and normalization
187  * has been done by now, and all characters have been found in the alphabet.
188  * Note: len here is the padded length including 0 codes at start and end.
189  **/
hyphenateFromCodes(uint8_t * result,const uint16_t * codes,size_t len)190 void Hyphenator::hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len) {
191     const Header* header = getHeader();
192     const Trie* trie = header->trieTable();
193     const Pattern* pattern = header->patternTable();
194     uint32_t char_mask = trie->char_mask;
195     uint32_t link_shift = trie->link_shift;
196     uint32_t link_mask = trie->link_mask;
197     uint32_t pattern_shift = trie->pattern_shift;
198     size_t maxOffset = len - MIN_SUFFIX - 1;
199     for (size_t i = 0; i < len - 1; i++) {
200         uint32_t node = 0;  // index into Trie table
201         for (size_t j = i; j < len; j++) {
202             uint16_t c = codes[j];
203             uint32_t entry = trie->data[node + c];
204             if ((entry & char_mask) == c) {
205                 node = (entry & link_mask) >> link_shift;
206             } else {
207                 break;
208             }
209             uint32_t pat_ix = trie->data[node] >> pattern_shift;
210             // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset
211             // into the buf pool. This is the pattern for the substring (i..j) we just matched,
212             // which we combine (via point-wise max) into the result vector.
213             if (pat_ix != 0) {
214                 uint32_t pat_entry = pattern->data[pat_ix];
215                 int pat_len = Pattern::len(pat_entry);
216                 int pat_shift = Pattern::shift(pat_entry);
217                 const uint8_t* pat_buf = pattern->buf(pat_entry);
218                 int offset = j + 1 - (pat_len + pat_shift);
219                 // offset is the index within result that lines up with the start of pat_buf
220                 int start = std::max(MIN_PREFIX - offset, 0);
221                 int end = std::min(pat_len, (int)maxOffset - offset);
222                 for (int k = start; k < end; k++) {
223                     result[offset + k] = std::max(result[offset + k], pat_buf[k]);
224                 }
225             }
226         }
227     }
228     // Since the above calculation does not modify values outside
229     // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0.
230     for (size_t i = MIN_PREFIX; i < maxOffset; i++) {
231         result[i] &= 1;
232     }
233 }
234 
235 }  // namespace android
236