1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "Minikin"
18 
19 #include <android/log.h>
20 
21 #include <minikin/Emoji.h>
22 #include <minikin/Hyphenator.h>
23 #include <minikin/WordBreaker.h>
24 #include "MinikinInternal.h"
25 
26 #include <unicode/uchar.h>
27 #include <unicode/utf16.h>
28 
29 namespace minikin {
30 
// U+00AD SOFT HYPHEN: breaks after it are rejected by isBreakValid().
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
// U+200D ZERO WIDTH JOINER: used by isBreakValid() to keep emoji ZWJ sequences together.
constexpr uint32_t CHAR_ZWJ = 0x200D;
33 
setLocale(const icu::Locale & locale)34 void WordBreaker::setLocale(const icu::Locale& locale) {
35     UErrorCode status = U_ZERO_ERROR;
36     mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
37     // TODO: handle failure status
38     if (mText != nullptr) {
39         mBreakIterator->setText(&mUText, status);
40     }
41     mIteratorWasReset = true;
42 }
43 
setText(const uint16_t * data,size_t size)44 void WordBreaker::setText(const uint16_t* data, size_t size) {
45     mText = data;
46     mTextSize = size;
47     mIteratorWasReset = false;
48     mLast = 0;
49     mCurrent = 0;
50     mScanOffset = 0;
51     mInEmailOrUrl = false;
52     UErrorCode status = U_ZERO_ERROR;
53     utext_openUChars(&mUText, data, size, &status);
54     mBreakIterator->setText(&mUText, status);
55     mBreakIterator->first();
56 }
57 
current() const58 ssize_t WordBreaker::current() const {
59     return mCurrent;
60 }
61 
62 /**
63  * Determine whether a line break at position i within the buffer buf is valid. This
64  * represents customization beyond the ICU behavior, because plain ICU provides some
65  * line break opportunities that we don't want.
66  **/
isBreakValid(const uint16_t * buf,size_t bufEnd,size_t i)67 static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
68     uint32_t codePoint;
69     size_t prev_offset = i;
70     U16_PREV(buf, 0, prev_offset, codePoint);
71     // Do not break on hard or soft hyphens. These are handled by automatic hyphenation.
72     if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) {
73         return false;
74     }
75     // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
76     // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
77     // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
78     // where no line break could be imagined, since the Myanmar virama is a pure stacker.
79     if (codePoint == 0x1039) {  // MYANMAR SIGN VIRAMA
80         return false;
81     }
82 
83     uint32_t next_codepoint;
84     size_t next_offset = i;
85     U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
86 
87     // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher
88     // emoji data than ICU does.
89     if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
90         return false;
91     }
92 
93     // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does.
94     if (isEmojiModifier(next_codepoint)) {
95         if (codePoint == 0xFE0F && prev_offset > 0) {
96             // skip over emoji variation selector
97             U16_PREV(buf, 0, prev_offset, codePoint);
98         }
99         if (isEmojiBase(codePoint)) {
100             return false;
101         }
102     }
103     return true;
104 }
105 
106 // Customized iteratorNext that takes care of both resets and our modifications
107 // to ICU's behavior.
iteratorNext()108 int32_t WordBreaker::iteratorNext() {
109     int32_t result;
110     do {
111         if (mIteratorWasReset) {
112             result = mBreakIterator->following(mCurrent);
113             mIteratorWasReset = false;
114         } else {
115             result = mBreakIterator->next();
116         }
117     } while (!(result == icu::BreakIterator::DONE || (size_t)result == mTextSize
118             || isBreakValid(mText, mTextSize, result)));
119     return result;
120 }
121 
// Returns true for the characters after which the Chicago Manual of Style recommends breaking
// inside URLs and email addresses: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
126 
// Returns true for the characters before which the Chicago Manual of Style recommends breaking
// inside URLs and email addresses.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
132 
// States of the scanner in WordBreaker::detectEmailOrUrl().
// The three SAW_COLON* values must stay consecutive and in this order: the scanner moves from
// one to the next by adding 1 to the current state each time another '/' is seen.
enum ScanState {
    START = 0,                  // nothing interesting seen yet
    SAW_AT = 1,                 // saw '@'
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://"
};
140 
detectEmailOrUrl()141 void WordBreaker::detectEmailOrUrl() {
142     // scan forward from current ICU position for email address or URL
143     if (mLast >= mScanOffset) {
144         ScanState state = START;
145         size_t i;
146         for (i = mLast; i < mTextSize; i++) {
147             uint16_t c = mText[i];
148             // scan only ASCII characters, stop at space
149             if (!(' ' < c && c <= 0x007E)) {
150                 break;
151             }
152             if (state == START && c == '@') {
153                 state = SAW_AT;
154             } else if (state == START && c == ':') {
155                 state = SAW_COLON;
156             } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
157                 if (c == '/') {
158                     state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
159                 } else {
160                     state = START;
161                 }
162             }
163         }
164         if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
165             if (!mBreakIterator->isBoundary(i)) {
166                 // If there are combining marks or such at the end of the URL or the email address,
167                 // consider them a part of the URL or the email, and skip to the next actual
168                 // boundary.
169                 i = mBreakIterator->following(i);
170             }
171             mInEmailOrUrl = true;
172             mIteratorWasReset = true;
173         } else {
174             mInEmailOrUrl = false;
175         }
176         mScanOffset = i;
177     }
178 }
179 
findNextBreakInEmailOrUrl()180 ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
181     // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
182     uint16_t lastChar = mText[mLast];
183     ssize_t i;
184     for (i = mLast + 1; i < mScanOffset; i++) {
185         if (breakAfter(lastChar)) {
186             break;
187         }
188         // break after double slash
189         if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
190             break;
191         }
192         const uint16_t thisChar = mText[i];
193         // never break after hyphen
194         if (lastChar != '-') {
195             if (breakBefore(thisChar)) {
196                 break;
197             }
198             // break before single slash
199             if (thisChar == '/' && lastChar != '/' &&
200                         !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
201                 break;
202             }
203         }
204         lastChar = thisChar;
205     }
206     return i;
207 }
208 
next()209 ssize_t WordBreaker::next() {
210     mLast = mCurrent;
211 
212     detectEmailOrUrl();
213     if (mInEmailOrUrl) {
214         mCurrent = findNextBreakInEmailOrUrl();
215     } else {  // Business as usual
216         mCurrent = (ssize_t) iteratorNext();
217     }
218     return mCurrent;
219 }
220 
wordStart() const221 ssize_t WordBreaker::wordStart() const {
222     if (mInEmailOrUrl) {
223         return mLast;
224     }
225     ssize_t result = mLast;
226     while (result < mCurrent) {
227         UChar32 c;
228         ssize_t ix = result;
229         U16_NEXT(mText, ix, mCurrent, c);
230         const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
231         // strip leading punctuation, defined as OP and QU line breaking classes,
232         // see UAX #14
233         if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
234             break;
235         }
236         result = ix;
237     }
238     return result;
239 }
240 
wordEnd() const241 ssize_t WordBreaker::wordEnd() const {
242     if (mInEmailOrUrl) {
243         return mLast;
244     }
245     ssize_t result = mCurrent;
246     while (result > mLast) {
247         UChar32 c;
248         ssize_t ix = result;
249         U16_PREV(mText, mLast, ix, c);
250         const int32_t gc_mask = U_GET_GC_MASK(c);
251         // strip trailing space and punctuation
252         if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
253             break;
254         }
255         result = ix;
256     }
257     return result;
258 }
259 
breakBadness() const260 int WordBreaker::breakBadness() const {
261     return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
262 }
263 
finish()264 void WordBreaker::finish() {
265     mText = nullptr;
266     // Note: calling utext_close multiply is safe
267     utext_close(&mUText);
268 }
269 
270 }  // namespace minikin
271