1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "Minikin"
18
19 #include <android/log.h>
20
21 #include <minikin/Emoji.h>
22 #include <minikin/Hyphenator.h>
23 #include <minikin/WordBreaker.h>
24 #include "MinikinInternal.h"
25
26 #include <unicode/uchar.h>
27 #include <unicode/utf16.h>
28
29 namespace minikin {
30
// U+00AD SOFT HYPHEN: isBreakValid() never allows a line break right after it.
constexpr uint32_t CHAR_SOFT_HYPHEN = 0xAD;
// U+200D ZERO WIDTH JOINER: isBreakValid() keeps it attached to a following emoji.
constexpr uint32_t CHAR_ZWJ = 0x200D;
33
setLocale(const icu::Locale & locale)34 void WordBreaker::setLocale(const icu::Locale& locale) {
35 UErrorCode status = U_ZERO_ERROR;
36 mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
37 // TODO: handle failure status
38 if (mText != nullptr) {
39 mBreakIterator->setText(&mUText, status);
40 }
41 mIteratorWasReset = true;
42 }
43
setText(const uint16_t * data,size_t size)44 void WordBreaker::setText(const uint16_t* data, size_t size) {
45 mText = data;
46 mTextSize = size;
47 mIteratorWasReset = false;
48 mLast = 0;
49 mCurrent = 0;
50 mScanOffset = 0;
51 mInEmailOrUrl = false;
52 UErrorCode status = U_ZERO_ERROR;
53 utext_openUChars(&mUText, data, size, &status);
54 mBreakIterator->setText(&mUText, status);
55 mBreakIterator->first();
56 }
57
current() const58 ssize_t WordBreaker::current() const {
59 return mCurrent;
60 }
61
62 /**
63 * Determine whether a line break at position i within the buffer buf is valid. This
64 * represents customization beyond the ICU behavior, because plain ICU provides some
65 * line break opportunities that we don't want.
66 **/
isBreakValid(const uint16_t * buf,size_t bufEnd,size_t i)67 static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
68 uint32_t codePoint;
69 size_t prev_offset = i;
70 U16_PREV(buf, 0, prev_offset, codePoint);
71 // Do not break on hard or soft hyphens. These are handled by automatic hyphenation.
72 if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) {
73 return false;
74 }
75 // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
76 // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
77 // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
78 // where no line break could be imagined, since the Myanmar virama is a pure stacker.
79 if (codePoint == 0x1039) { // MYANMAR SIGN VIRAMA
80 return false;
81 }
82
83 uint32_t next_codepoint;
84 size_t next_offset = i;
85 U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
86
87 // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher
88 // emoji data than ICU does.
89 if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
90 return false;
91 }
92
93 // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does.
94 if (isEmojiModifier(next_codepoint)) {
95 if (codePoint == 0xFE0F && prev_offset > 0) {
96 // skip over emoji variation selector
97 U16_PREV(buf, 0, prev_offset, codePoint);
98 }
99 if (isEmojiBase(codePoint)) {
100 return false;
101 }
102 }
103 return true;
104 }
105
106 // Customized iteratorNext that takes care of both resets and our modifications
107 // to ICU's behavior.
iteratorNext()108 int32_t WordBreaker::iteratorNext() {
109 int32_t result;
110 do {
111 if (mIteratorWasReset) {
112 result = mBreakIterator->following(mCurrent);
113 mIteratorWasReset = false;
114 } else {
115 result = mBreakIterator->next();
116 }
117 } while (!(result == icu::BreakIterator::DONE || (size_t)result == mTextSize
118 || isBreakValid(mText, mTextSize, result)));
119 return result;
120 }
121
// Returns true for the characters after which the Chicago Manual of Style recommends
// breaking in URLs and email addresses: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
126
// Returns true for the characters before which the Chicago Manual of Style recommends
// breaking in URLs and email addresses: '~', '.', ',', '-', '_', '?', '#', '%', '=' and '&'.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
132
// States of the email/URL scanner in WordBreaker::detectEmailOrUrl(). The three colon
// states are consecutive and in this order because the scanner moves from one to the next
// by adding one to the current state.
enum ScanState {
    START = 0,                  // nothing decisive seen yet
    SAW_AT = 1,                 // saw '@'
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://"
};
140
detectEmailOrUrl()141 void WordBreaker::detectEmailOrUrl() {
142 // scan forward from current ICU position for email address or URL
143 if (mLast >= mScanOffset) {
144 ScanState state = START;
145 size_t i;
146 for (i = mLast; i < mTextSize; i++) {
147 uint16_t c = mText[i];
148 // scan only ASCII characters, stop at space
149 if (!(' ' < c && c <= 0x007E)) {
150 break;
151 }
152 if (state == START && c == '@') {
153 state = SAW_AT;
154 } else if (state == START && c == ':') {
155 state = SAW_COLON;
156 } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
157 if (c == '/') {
158 state = static_cast<ScanState>((int)state + 1); // next state adds a slash
159 } else {
160 state = START;
161 }
162 }
163 }
164 if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
165 if (!mBreakIterator->isBoundary(i)) {
166 // If there are combining marks or such at the end of the URL or the email address,
167 // consider them a part of the URL or the email, and skip to the next actual
168 // boundary.
169 i = mBreakIterator->following(i);
170 }
171 mInEmailOrUrl = true;
172 mIteratorWasReset = true;
173 } else {
174 mInEmailOrUrl = false;
175 }
176 mScanOffset = i;
177 }
178 }
179
findNextBreakInEmailOrUrl()180 ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
181 // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
182 uint16_t lastChar = mText[mLast];
183 ssize_t i;
184 for (i = mLast + 1; i < mScanOffset; i++) {
185 if (breakAfter(lastChar)) {
186 break;
187 }
188 // break after double slash
189 if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
190 break;
191 }
192 const uint16_t thisChar = mText[i];
193 // never break after hyphen
194 if (lastChar != '-') {
195 if (breakBefore(thisChar)) {
196 break;
197 }
198 // break before single slash
199 if (thisChar == '/' && lastChar != '/' &&
200 !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
201 break;
202 }
203 }
204 lastChar = thisChar;
205 }
206 return i;
207 }
208
next()209 ssize_t WordBreaker::next() {
210 mLast = mCurrent;
211
212 detectEmailOrUrl();
213 if (mInEmailOrUrl) {
214 mCurrent = findNextBreakInEmailOrUrl();
215 } else { // Business as usual
216 mCurrent = (ssize_t) iteratorNext();
217 }
218 return mCurrent;
219 }
220
wordStart() const221 ssize_t WordBreaker::wordStart() const {
222 if (mInEmailOrUrl) {
223 return mLast;
224 }
225 ssize_t result = mLast;
226 while (result < mCurrent) {
227 UChar32 c;
228 ssize_t ix = result;
229 U16_NEXT(mText, ix, mCurrent, c);
230 const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
231 // strip leading punctuation, defined as OP and QU line breaking classes,
232 // see UAX #14
233 if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
234 break;
235 }
236 result = ix;
237 }
238 return result;
239 }
240
wordEnd() const241 ssize_t WordBreaker::wordEnd() const {
242 if (mInEmailOrUrl) {
243 return mLast;
244 }
245 ssize_t result = mCurrent;
246 while (result > mLast) {
247 UChar32 c;
248 ssize_t ix = result;
249 U16_PREV(mText, mLast, ix, c);
250 const int32_t gc_mask = U_GET_GC_MASK(c);
251 // strip trailing space and punctuation
252 if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
253 break;
254 }
255 result = ix;
256 }
257 return result;
258 }
259
breakBadness() const260 int WordBreaker::breakBadness() const {
261 return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
262 }
263
finish()264 void WordBreaker::finish() {
265 mText = nullptr;
266 // Note: calling utext_close multiply is safe
267 utext_close(&mUText);
268 }
269
270 } // namespace minikin
271