1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <vector>
18 #include <memory>
19 #include <algorithm>
20 #include <string>
21 #include <unicode/uchar.h>
22 
23 // HACK: for reading pattern file
24 #include <fcntl.h>
25 
26 #define LOG_TAG "Minikin"
27 #include "utils/Log.h"
28 
29 #include "minikin/Hyphenator.h"
30 
31 using std::vector;
32 
33 namespace android {
34 
35 static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
36 
addPattern(const uint16_t * pattern,size_t size)37 void Hyphenator::addPattern(const uint16_t* pattern, size_t size) {
38     vector<uint16_t> word;
39     vector<uint8_t> result;
40 
41     // start by parsing the Liang-format pattern into a word and a result vector, the
42     // vector right-aligned but without leading zeros. Examples:
43     // a1bc2d -> abcd [1, 0, 2, 0]
44     // abc1 -> abc [1]
45     // 1a2b3c4d5 -> abcd [1, 2, 3, 4, 5]
46     bool lastWasLetter = false;
47     bool haveSeenNumber = false;
48     for (size_t i = 0; i < size; i++) {
49         uint16_t c = pattern[i];
50         if (isdigit(c)) {
51             result.push_back(c - '0');
52             lastWasLetter = false;
53             haveSeenNumber = true;
54         } else {
55             word.push_back(c);
56             if (lastWasLetter && haveSeenNumber) {
57                 result.push_back(0);
58             }
59             lastWasLetter = true;
60         }
61     }
62     if (lastWasLetter) {
63         result.push_back(0);
64     }
65     Trie* t = &root;
66     for (size_t i = 0; i < word.size(); i++) {
67         t = &t->succ[word[i]];
68     }
69     t->result = result;
70 }
71 
72 // If any soft hyphen is present in the word, use soft hyphens to decide hyphenation,
73 // as recommended in UAX #14 (Use of Soft Hyphen)
hyphenateSoft(vector<uint8_t> * result,const uint16_t * word,size_t len)74 void Hyphenator::hyphenateSoft(vector<uint8_t>* result, const uint16_t* word, size_t len) {
75     (*result)[0] = 0;
76     for (size_t i = 1; i < len; i++) {
77         (*result)[i] = word[i - 1] == CHAR_SOFT_HYPHEN;
78     }
79 }
80 
hyphenate(vector<uint8_t> * result,const uint16_t * word,size_t len)81 void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) {
82     result->clear();
83     result->resize(len);
84     if (len < MIN_PREFIX + MIN_SUFFIX) return;
85     size_t maxOffset = len - MIN_SUFFIX + 1;
86     for (size_t i = 0; i < len + 1; i++) {
87         const Trie* node = &root;
88         for (size_t j = i; j < len + 2; j++) {
89             uint16_t c;
90             if (j == 0 || j == len + 1) {
91                 c = '.';  // word boundary character in pattern data files
92             } else {
93                 c = word[j - 1];
94                 if (c == CHAR_SOFT_HYPHEN) {
95                     hyphenateSoft(result, word, len);
96                     return;
97                 }
98                 // TODO: This uses ICU's simple character to character lowercasing, which ignores
99                 // the locale, and ignores cases when lowercasing a character results in more than
100                 // one character. It should be fixed to consider the locale (in order for it to work
101                 // correctly for Turkish and Azerbaijani), as well as support one-to-many, and
102                 // many-to-many case conversions (including non-BMP cases).
103                 if (c < 0x00C0) { // U+00C0 is the lowest uppercase non-ASCII character
104                     // Convert uppercase ASCII to lowercase ASCII, but keep other characters as-is
105                     if (0x0041 <= c && c <= 0x005A) {
106                         c += 0x0020;
107                     }
108                 } else {
109                     c = u_tolower(c);
110                 }
111             }
112             auto search = node->succ.find(c);
113             if (search != node->succ.end()) {
114                 node = &search->second;
115             } else {
116                 break;
117             }
118             if (!node->result.empty()) {
119                 int resultLen = node->result.size();
120                 int offset = j + 1 - resultLen;
121                 int start = std::max(MIN_PREFIX - offset, 0);
122                 int end = std::min(resultLen, (int)maxOffset - offset);
123                 // TODO performance: this inner loop can profitably be optimized
124                 for (int k = start; k < end; k++) {
125                     (*result)[offset + k] = std::max((*result)[offset + k], node->result[k]);
126                 }
127 #if 0
128                 // debug printing of matched patterns
129                 std::string dbg;
130                 for (size_t k = i; k <= j + 1; k++) {
131                     int off = k - j - 2 + resultLen;
132                     if (off >= 0 && node->result[off] != 0) {
133                         dbg.push_back((char)('0' + node->result[off]));
134                     }
135                     if (k < j + 1) {
136                         uint16_t c = (k == 0 || k == len + 1) ? '.' : word[k - 1];
137                         dbg.push_back((char)c);
138                     }
139                 }
140                 ALOGD("%d:%d %s", i, j, dbg.c_str());
141 #endif
142             }
143         }
144     }
145     // Since the above calculation does not modify values outside
146     // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0.
147     for (size_t i = MIN_PREFIX; i < maxOffset; i++) {
148         (*result)[i] &= 1;
149     }
150 }
151 
load(const uint16_t * patternData,size_t size)152 Hyphenator* Hyphenator::load(const uint16_t *patternData, size_t size) {
153     Hyphenator* result = new Hyphenator;
154     for (size_t i = 0; i < size; i++) {
155         size_t end = i;
156         while (patternData[end] != '\n') end++;
157         result->addPattern(patternData + i, end - i);
158         i = end;
159     }
160     return result;
161 }
162 
163 }  // namespace android
164