1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <vector>
18 #include <memory>
19 #include <algorithm>
20 #include <string>
21 #include <unicode/uchar.h>
22
23 // HACK: for reading pattern file
24 #include <fcntl.h>
25
26 #define LOG_TAG "Minikin"
27 #include "utils/Log.h"
28
29 #include "minikin/Hyphenator.h"
30
31 using std::vector;
32
33 namespace android {
34
35 static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
36
addPattern(const uint16_t * pattern,size_t size)37 void Hyphenator::addPattern(const uint16_t* pattern, size_t size) {
38 vector<uint16_t> word;
39 vector<uint8_t> result;
40
41 // start by parsing the Liang-format pattern into a word and a result vector, the
42 // vector right-aligned but without leading zeros. Examples:
43 // a1bc2d -> abcd [1, 0, 2, 0]
44 // abc1 -> abc [1]
45 // 1a2b3c4d5 -> abcd [1, 2, 3, 4, 5]
46 bool lastWasLetter = false;
47 bool haveSeenNumber = false;
48 for (size_t i = 0; i < size; i++) {
49 uint16_t c = pattern[i];
50 if (isdigit(c)) {
51 result.push_back(c - '0');
52 lastWasLetter = false;
53 haveSeenNumber = true;
54 } else {
55 word.push_back(c);
56 if (lastWasLetter && haveSeenNumber) {
57 result.push_back(0);
58 }
59 lastWasLetter = true;
60 }
61 }
62 if (lastWasLetter) {
63 result.push_back(0);
64 }
65 Trie* t = &root;
66 for (size_t i = 0; i < word.size(); i++) {
67 t = &t->succ[word[i]];
68 }
69 t->result = result;
70 }
71
72 // If any soft hyphen is present in the word, use soft hyphens to decide hyphenation,
73 // as recommended in UAX #14 (Use of Soft Hyphen)
hyphenateSoft(vector<uint8_t> * result,const uint16_t * word,size_t len)74 void Hyphenator::hyphenateSoft(vector<uint8_t>* result, const uint16_t* word, size_t len) {
75 (*result)[0] = 0;
76 for (size_t i = 1; i < len; i++) {
77 (*result)[i] = word[i - 1] == CHAR_SOFT_HYPHEN;
78 }
79 }
80
hyphenate(vector<uint8_t> * result,const uint16_t * word,size_t len)81 void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) {
82 result->clear();
83 result->resize(len);
84 if (len < MIN_PREFIX + MIN_SUFFIX) return;
85 size_t maxOffset = len - MIN_SUFFIX + 1;
86 for (size_t i = 0; i < len + 1; i++) {
87 const Trie* node = &root;
88 for (size_t j = i; j < len + 2; j++) {
89 uint16_t c;
90 if (j == 0 || j == len + 1) {
91 c = '.'; // word boundary character in pattern data files
92 } else {
93 c = word[j - 1];
94 if (c == CHAR_SOFT_HYPHEN) {
95 hyphenateSoft(result, word, len);
96 return;
97 }
98 // TODO: This uses ICU's simple character to character lowercasing, which ignores
99 // the locale, and ignores cases when lowercasing a character results in more than
100 // one character. It should be fixed to consider the locale (in order for it to work
101 // correctly for Turkish and Azerbaijani), as well as support one-to-many, and
102 // many-to-many case conversions (including non-BMP cases).
103 if (c < 0x00C0) { // U+00C0 is the lowest uppercase non-ASCII character
104 // Convert uppercase ASCII to lowercase ASCII, but keep other characters as-is
105 if (0x0041 <= c && c <= 0x005A) {
106 c += 0x0020;
107 }
108 } else {
109 c = u_tolower(c);
110 }
111 }
112 auto search = node->succ.find(c);
113 if (search != node->succ.end()) {
114 node = &search->second;
115 } else {
116 break;
117 }
118 if (!node->result.empty()) {
119 int resultLen = node->result.size();
120 int offset = j + 1 - resultLen;
121 int start = std::max(MIN_PREFIX - offset, 0);
122 int end = std::min(resultLen, (int)maxOffset - offset);
123 // TODO performance: this inner loop can profitably be optimized
124 for (int k = start; k < end; k++) {
125 (*result)[offset + k] = std::max((*result)[offset + k], node->result[k]);
126 }
127 #if 0
128 // debug printing of matched patterns
129 std::string dbg;
130 for (size_t k = i; k <= j + 1; k++) {
131 int off = k - j - 2 + resultLen;
132 if (off >= 0 && node->result[off] != 0) {
133 dbg.push_back((char)('0' + node->result[off]));
134 }
135 if (k < j + 1) {
136 uint16_t c = (k == 0 || k == len + 1) ? '.' : word[k - 1];
137 dbg.push_back((char)c);
138 }
139 }
140 ALOGD("%d:%d %s", i, j, dbg.c_str());
141 #endif
142 }
143 }
144 }
145 // Since the above calculation does not modify values outside
146 // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0.
147 for (size_t i = MIN_PREFIX; i < maxOffset; i++) {
148 (*result)[i] &= 1;
149 }
150 }
151
load(const uint16_t * patternData,size_t size)152 Hyphenator* Hyphenator::load(const uint16_t *patternData, size_t size) {
153 Hyphenator* result = new Hyphenator;
154 for (size_t i = 0; i < size; i++) {
155 size_t end = i;
156 while (patternData[end] != '\n') end++;
157 result->addPattern(patternData + i, end - i);
158 i = end;
159 }
160 return result;
161 }
162
163 } // namespace android
164