1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "WordBreaker.h"
18 
19 #include <list>
20 #include <map>
21 
22 #include <unicode/uchar.h>
23 #include <unicode/utf16.h>
24 
25 #include "minikin/Emoji.h"
26 #include "minikin/Hyphenator.h"
27 
28 #include "Locale.h"
29 #include "MinikinInternal.h"
30 
31 namespace minikin {
32 
33 namespace {
createNewIterator(const Locale & locale)34 static icu::BreakIterator* createNewIterator(const Locale& locale) {
35     // TODO: handle failure status
36     UErrorCode status = U_ZERO_ERROR;
37     return icu::BreakIterator::createLineInstance(
38             locale.isUnsupported() ? icu::Locale::getRoot()
39                                    : icu::Locale::createFromName(locale.getString().c_str()),
40             status);
41 }
42 }  // namespace
43 
acquire(const Locale & locale)44 ICULineBreakerPool::Slot ICULineBreakerPoolImpl::acquire(const Locale& locale) {
45     const uint64_t id = locale.getIdentifier();
46     std::lock_guard<std::mutex> lock(mMutex);
47     for (auto i = mPool.begin(); i != mPool.end(); i++) {
48         if (i->localeId == id) {
49             Slot slot = std::move(*i);
50             mPool.erase(i);
51             return slot;
52         }
53     }
54 
55     // Not found in pool. Create new one.
56     return {id, std::unique_ptr<icu::BreakIterator>(createNewIterator(locale))};
57 }
58 
release(ICULineBreakerPool::Slot && slot)59 void ICULineBreakerPoolImpl::release(ICULineBreakerPool::Slot&& slot) {
60     if (slot.breaker.get() == nullptr) {
61         return;  // Already released slot. Do nothing.
62     }
63     std::lock_guard<std::mutex> lock(mMutex);
64     if (mPool.size() >= MAX_POOL_SIZE) {
65         // Pool is full. Move to local variable, so that the given slot will be released when the
66         // variable leaves the scope.
67         Slot localSlot = std::move(slot);
68         return;
69     }
70     mPool.push_front(std::move(slot));
71 }
72 
WordBreaker()73 WordBreaker::WordBreaker() : mPool(&ICULineBreakerPoolImpl::getInstance()) {}
74 
WordBreaker(ICULineBreakerPool * pool)75 WordBreaker::WordBreaker(ICULineBreakerPool* pool) : mPool(pool) {}
76 
followingWithLocale(const Locale & locale,size_t from)77 ssize_t WordBreaker::followingWithLocale(const Locale& locale, size_t from) {
78     mIcuBreaker = mPool->acquire(locale);
79     UErrorCode status = U_ZERO_ERROR;
80     MINIKIN_ASSERT(mText != nullptr, "setText must be called first");
81     // TODO: handle failure status
82     mIcuBreaker.breaker->setText(&mUText, status);
83     if (mInEmailOrUrl) {
84         // Note:
85         // Don't reset mCurrent, mLast, or mScanOffset for keeping email/URL context.
86         // The email/URL detection doesn't support following() functionality, so that we can't
87         // restart from the specific position. This means following() can not be supported in
88         // general, but keeping old email/URL context works for LineBreaker since it just wants to
89         // re-calculate the next break point with the new locale.
90     } else {
91         mCurrent = mLast = mScanOffset = from;
92         next();
93     }
94     return mCurrent;
95 }
96 
setText(const uint16_t * data,size_t size)97 void WordBreaker::setText(const uint16_t* data, size_t size) {
98     mText = data;
99     mTextSize = size;
100     mLast = 0;
101     mCurrent = 0;
102     mScanOffset = 0;
103     mInEmailOrUrl = false;
104     UErrorCode status = U_ZERO_ERROR;
105     utext_openUChars(&mUText, reinterpret_cast<const UChar*>(data), size, &status);
106 }
107 
current() const108 ssize_t WordBreaker::current() const {
109     return mCurrent;
110 }
111 
112 /**
113  * Determine whether a line break at position i within the buffer buf is valid. This
114  * represents customization beyond the ICU behavior, because plain ICU provides some
115  * line break opportunities that we don't want.
116  **/
isValidBreak(const uint16_t * buf,size_t bufEnd,int32_t i)117 static bool isValidBreak(const uint16_t* buf, size_t bufEnd, int32_t i) {
118     const size_t position = static_cast<size_t>(i);
119     if (i == icu::BreakIterator::DONE || position == bufEnd) {
120         // If the iterator reaches the end, treat as break.
121         return true;
122     }
123     uint32_t codePoint;
124     size_t prev_offset = position;
125     U16_PREV(buf, 0, prev_offset, codePoint);
126     // Do not break on hard or soft hyphens. These are handled by automatic hyphenation.
127     if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) {
128         return false;
129     }
130     // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
131     // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
132     // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
133     // where no line break could be imagined, since the Myanmar virama is a pure stacker.
134     if (codePoint == 0x1039) {  // MYANMAR SIGN VIRAMA
135         return false;
136     }
137 
138     uint32_t next_codepoint;
139     size_t next_offset = position;
140     U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
141 
142     // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher
143     // emoji data than ICU does.
144     if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
145         return false;
146     }
147 
148     // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does.
149     if (isEmojiModifier(next_codepoint)) {
150         if (codePoint == 0xFE0F && prev_offset > 0) {
151             // skip over emoji variation selector
152             U16_PREV(buf, 0, prev_offset, codePoint);
153         }
154         if (isEmojiBase(codePoint)) {
155             return false;
156         }
157     }
158     return true;
159 }
160 
161 // Customized iteratorNext that takes care of both resets and our modifications
162 // to ICU's behavior.
iteratorNext()163 int32_t WordBreaker::iteratorNext() {
164     int32_t result = mIcuBreaker.breaker->following(mCurrent);
165     while (!isValidBreak(mText, mTextSize, result)) {
166         result = mIcuBreaker.breaker->next();
167     }
168     return result;
169 }
170 
// Returns true for the characters after which the Chicago Manual of Style recommends breaking
// inside URLs and email addresses: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
175 
// Returns true for the characters before which the Chicago Manual of Style recommends breaking
// inside URLs and email addresses.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
181 
// States of the scanner that looks for an email address ('@') or a URL ("://").
// The three colon states must keep consecutive values: the scanner moves from one to the next
// by adding one for every '/' it sees.
enum ScanState {
    START = 0,                  // nothing interesting seen yet
    SAW_AT = 1,                 // saw '@'
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://"
};
189 
detectEmailOrUrl()190 void WordBreaker::detectEmailOrUrl() {
191     // scan forward from current ICU position for email address or URL
192     if (mLast >= mScanOffset) {
193         ScanState state = START;
194         size_t i;
195         for (i = mLast; i < mTextSize; i++) {
196             uint16_t c = mText[i];
197             // scan only ASCII characters, stop at space
198             if (!(' ' < c && c <= 0x007E)) {
199                 break;
200             }
201             if (state == START && c == '@') {
202                 state = SAW_AT;
203             } else if (state == START && c == ':') {
204                 state = SAW_COLON;
205             } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
206                 if (c == '/') {
207                     state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
208                 } else {
209                     state = START;
210                 }
211             }
212         }
213         if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
214             if (!mIcuBreaker.breaker->isBoundary(i)) {
215                 // If there are combining marks or such at the end of the URL or the email address,
216                 // consider them a part of the URL or the email, and skip to the next actual
217                 // boundary.
218                 i = mIcuBreaker.breaker->following(i);
219             }
220             mInEmailOrUrl = true;
221         } else {
222             mInEmailOrUrl = false;
223         }
224         mScanOffset = i;
225     }
226 }
227 
findNextBreakInEmailOrUrl()228 ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
229     // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
230     uint16_t lastChar = mText[mLast];
231     ssize_t i;
232     for (i = mLast + 1; i < mScanOffset; i++) {
233         if (breakAfter(lastChar)) {
234             break;
235         }
236         // break after double slash
237         if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
238             break;
239         }
240         const uint16_t thisChar = mText[i];
241         // never break after hyphen
242         if (lastChar != '-') {
243             if (breakBefore(thisChar)) {
244                 break;
245             }
246             // break before single slash
247             if (thisChar == '/' && lastChar != '/' &&
248                 !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
249                 break;
250             }
251         }
252         lastChar = thisChar;
253     }
254     return i;
255 }
256 
next()257 ssize_t WordBreaker::next() {
258     mLast = mCurrent;
259 
260     detectEmailOrUrl();
261     if (mInEmailOrUrl) {
262         mCurrent = findNextBreakInEmailOrUrl();
263     } else {  // Business as usual
264         mCurrent = (ssize_t)iteratorNext();
265     }
266     return mCurrent;
267 }
268 
wordStart() const269 ssize_t WordBreaker::wordStart() const {
270     if (mInEmailOrUrl) {
271         return mLast;
272     }
273     ssize_t result = mLast;
274     while (result < mCurrent) {
275         UChar32 c;
276         ssize_t ix = result;
277         U16_NEXT(mText, ix, mCurrent, c);
278         const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
279         // strip leading punctuation, defined as OP and QU line breaking classes,
280         // see UAX #14
281         if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
282             break;
283         }
284         result = ix;
285     }
286     return result;
287 }
288 
wordEnd() const289 ssize_t WordBreaker::wordEnd() const {
290     if (mInEmailOrUrl) {
291         return mLast;
292     }
293     ssize_t result = mCurrent;
294     while (result > mLast) {
295         UChar32 c;
296         ssize_t ix = result;
297         U16_PREV(mText, mLast, ix, c);
298         const int32_t gc_mask = U_GET_GC_MASK(c);
299         // strip trailing spaces, punctuation and control characters
300         if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK | U_GC_CC_MASK)) == 0) {
301             break;
302         }
303         result = ix;
304     }
305     return result;
306 }
307 
breakBadness() const308 int WordBreaker::breakBadness() const {
309     return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
310 }
311 
finish()312 void WordBreaker::finish() {
313     mText = nullptr;
314     // Note: calling utext_close multiply is safe
315     utext_close(&mUText);
316     mPool->release(std::move(mIcuBreaker));
317 }
318 
319 }  // namespace minikin
320