1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define VERBOSE_DEBUG 0
18 
19 #include <limits>
20 
21 #define LOG_TAG "Minikin"
22 #include <cutils/log.h>
23 
24 #include <minikin/Layout.h>
25 #include <minikin/LineBreaker.h>
26 
27 using std::vector;
28 
29 namespace android {
30 
31 const int CHAR_TAB = 0x0009;
32 const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
33 
34 // Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
35 // constants are larger than any reasonable actual width score.
36 const float SCORE_INFTY = std::numeric_limits<float>::max();
37 const float SCORE_OVERFULL = 1e12f;
38 const float SCORE_DESPERATE = 1e10f;
39 
40 // Multiplier for hyphen penalty on last line.
41 const float LAST_LINE_PENALTY_MULTIPLIER = 4.0f;
42 // Penalty assigned to each line break (to try to minimize number of lines)
43 // TODO: when we implement full justification (so spaces can shrink and stretch), this is
44 // probably not the most appropriate method.
45 const float LINE_PENALTY_MULTIPLIER = 2.0f;
46 
47 // Very long words trigger O(n^2) behavior in hyphenation, so we disable hyphenation for
48 // unreasonably long words. This is somewhat of a heuristic because extremely long words
49 // are possible in some languages. This does mean that very long real words can get
50 // broken by desperate breaks, with no hyphens.
51 const size_t LONGEST_HYPHENATED_WORD = 45;
52 
53 // When the text buffer is within this limit, capacity of vectors is retained at finish(),
54 // to avoid allocation.
55 const size_t MAX_TEXT_BUF_RETAIN = 32678;
56 
setLocale(const icu::Locale & locale,Hyphenator * hyphenator)57 void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) {
58     delete mBreakIterator;
59     UErrorCode status = U_ZERO_ERROR;
60     mBreakIterator = icu::BreakIterator::createLineInstance(locale, status);
61     // TODO: check status
62 
63     // TODO: load actual resource dependent on locale; letting Minikin do it is a hack
64     mHyphenator = hyphenator;
65 }
66 
setText()67 void LineBreaker::setText() {
68     UErrorCode status = U_ZERO_ERROR;
69     utext_openUChars(&mUText, mTextBuf.data(), mTextBuf.size(), &status);
70     mBreakIterator->setText(&mUText, status);
71     mBreakIterator->first();
72 
73     // handle initial break here because addStyleRun may never be called
74     mBreakIterator->next();
75     mCandidates.clear();
76     Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0};
77     mCandidates.push_back(cand);
78 
79     // reset greedy breaker state
80     mBreaks.clear();
81     mWidths.clear();
82     mFlags.clear();
83     mLastBreak = 0;
84     mBestBreak = 0;
85     mBestScore = SCORE_INFTY;
86     mPreBreak = 0;
87     mFirstTabIndex = INT_MAX;
88 }
89 
setLineWidths(float firstWidth,int firstWidthLineCount,float restWidth)90 void LineBreaker::setLineWidths(float firstWidth, int firstWidthLineCount, float restWidth) {
91     mLineWidths.setWidths(firstWidth, firstWidthLineCount, restWidth);
92 }
93 
94 
setIndents(const std::vector<float> & indents)95 void LineBreaker::setIndents(const std::vector<float>& indents) {
96     mLineWidths.setIndents(indents);
97 }
98 
99 // This function determines whether a character is a space that disappears at end of line.
100 // It is the Unicode set: [[:General_Category=Space_Separator:]-[:Line_Break=Glue:]],
101 // plus '\n'.
102 // Note: all such characters are in the BMP, so it's ok to use code units for this.
isLineEndSpace(uint16_t c)103 static bool isLineEndSpace(uint16_t c) {
104     return c == '\n' || c == ' ' || c == 0x1680 || (0x2000 <= c && c <= 0x200A && c != 0x2007) ||
105             c == 0x205F || c == 0x3000;
106 }
107 
108 // This function determines whether a character is like U+2010 HYPHEN in
109 // line breaking and usage: a character immediately after which line breaks
110 // are allowed, but words containing it should not be automatically
111 // hyphenated. This is a curated set, created by manually inspecting all
112 // the characters that have the Unicode line breaking property of BA or HY
113 // and seeing which ones are hyphens.
isLineBreakingHyphen(uint16_t c)114 static bool isLineBreakingHyphen(uint16_t c) {
115     return (c == 0x002D || // HYPHEN-MINUS
116             c == 0x058A || // ARMENIAN HYPHEN
117             c == 0x05BE || // HEBREW PUNCTUATION MAQAF
118             c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
119             c == 0x2010 || // HYPHEN
120             c == 0x2013 || // EN DASH
121             c == 0x2027 || // HYPHENATION POINT
122             c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
123             c == 0x2E40);  // DOUBLE HYPHEN
124 }
125 
126 // Ordinarily, this method measures the text in the range given. However, when paint
127 // is nullptr, it assumes the widths have already been calculated and stored in the
128 // width buffer.
129 // This method finds the candidate word breaks (using the ICU break iterator) and sends them
130 // to addCandidate.
addStyleRun(MinikinPaint * paint,const FontCollection * typeface,FontStyle style,size_t start,size_t end,bool isRtl)131 float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typeface,
132         FontStyle style, size_t start, size_t end, bool isRtl) {
133     Layout layout;  // performance TODO: move layout to self object to reduce allocation cost?
134     float width = 0.0f;
135     int bidiFlags = isRtl ? kBidi_Force_RTL : kBidi_Force_LTR;
136 
137     float hyphenPenalty = 0.0;
138     if (paint != nullptr) {
139         layout.setFontCollection(typeface);
140         layout.doLayout(mTextBuf.data(), start, end - start, mTextBuf.size(), bidiFlags, style,
141                 *paint);
142         layout.getAdvances(mCharWidths.data() + start);
143         width = layout.getAdvance();
144 
145         // a heuristic that seems to perform well
146         hyphenPenalty = 0.5 * paint->size * paint->scaleX * mLineWidths.getLineWidth(0);
147         if (mHyphenationFrequency == kHyphenationFrequency_Normal) {
148             hyphenPenalty *= 4.0; // TODO: Replace with a better value after some testing
149         }
150 
151         mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER);
152     }
153 
154     size_t current = (size_t)mBreakIterator->current();
155     size_t wordEnd = start;
156     size_t lastBreak = start;
157     ParaWidth lastBreakWidth = mWidth;
158     ParaWidth postBreak = mWidth;
159     bool temporarilySkipHyphenation = false;
160     for (size_t i = start; i < end; i++) {
161         uint16_t c = mTextBuf[i];
162         if (c == CHAR_TAB) {
163             mWidth = mPreBreak + mTabStops.nextTab(mWidth - mPreBreak);
164             if (mFirstTabIndex == INT_MAX) {
165                 mFirstTabIndex = (int)i;
166             }
167             // fall back to greedy; other modes don't know how to deal with tabs
168             mStrategy = kBreakStrategy_Greedy;
169         } else {
170             mWidth += mCharWidths[i];
171             if (!isLineEndSpace(c)) {
172                 postBreak = mWidth;
173                 wordEnd = i + 1;
174             }
175         }
176         if (i + 1 == current) {
177             // Override ICU's treatment of soft hyphen as a break opportunity, because we want it
178             // to be a hyphen break, with penalty and drawing behavior.
179             if (c != CHAR_SOFT_HYPHEN) {
180                 // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
181                 // we can pass the whole word down to Hyphenator like the soft hyphen case.
182                 bool wordEndsInHyphen = isLineBreakingHyphen(c);
183                 if (paint != nullptr && mHyphenator != nullptr &&
184                         mHyphenationFrequency != kHyphenationFrequency_None &&
185                         !wordEndsInHyphen && !temporarilySkipHyphenation &&
186                         wordEnd > lastBreak && wordEnd - lastBreak <= LONGEST_HYPHENATED_WORD) {
187                     mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[lastBreak], wordEnd - lastBreak);
188     #if VERBOSE_DEBUG
189                     std::string hyphenatedString;
190                     for (size_t j = lastBreak; j < wordEnd; j++) {
191                         if (mHyphBuf[j - lastBreak]) hyphenatedString.push_back('-');
192                         // Note: only works with ASCII, should do UTF-8 conversion here
193                         hyphenatedString.push_back(buffer()[j]);
194                     }
195                     ALOGD("hyphenated string: %s", hyphenatedString.c_str());
196     #endif
197 
198                     // measure hyphenated substrings
199                     for (size_t j = lastBreak; j < wordEnd; j++) {
200                         uint8_t hyph = mHyphBuf[j - lastBreak];
201                         if (hyph) {
202                             paint->hyphenEdit = hyph;
203                             layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
204                                     mTextBuf.size(), bidiFlags, style, *paint);
205                             ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
206                             paint->hyphenEdit = 0;
207                             layout.doLayout(mTextBuf.data(), j, wordEnd - j,
208                                     mTextBuf.size(), bidiFlags, style, *paint);
209                             ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
210                             addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
211                         }
212                     }
213                 }
214                 // Skip hyphenating the next word if and only if the present word ends in a hyphen
215                 temporarilySkipHyphenation = wordEndsInHyphen;
216 
217                 // Skip break for zero-width characters inside replacement span
218                 if (paint != nullptr || current == end || mCharWidths[current] > 0) {
219                     addWordBreak(current, mWidth, postBreak, 0.0, 0);
220                 }
221                 lastBreak = current;
222                 lastBreakWidth = mWidth;
223             }
224             current = (size_t)mBreakIterator->next();
225         }
226     }
227 
228     return width;
229 }
230 
231 // add a word break (possibly for a hyphenated fragment), and add desperate breaks if
232 // needed (ie when word exceeds current line width)
addWordBreak(size_t offset,ParaWidth preBreak,ParaWidth postBreak,float penalty,uint8_t hyph)233 void LineBreaker::addWordBreak(size_t offset, ParaWidth preBreak, ParaWidth postBreak,
234         float penalty, uint8_t hyph) {
235     Candidate cand;
236     ParaWidth width = mCandidates.back().preBreak;
237     if (postBreak - width > currentLineWidth()) {
238         // Add desperate breaks.
239         // Note: these breaks are based on the shaping of the (non-broken) original text; they
240         // are imprecise especially in the presence of kerning, ligatures, and Arabic shaping.
241         size_t i = mCandidates.back().offset;
242         width += mCharWidths[i++];
243         for (; i < offset; i++) {
244             float w = mCharWidths[i];
245             if (w > 0) {
246                 cand.offset = i;
247                 cand.preBreak = width;
248                 cand.postBreak = width;
249                 cand.penalty = SCORE_DESPERATE;
250                 cand.hyphenEdit = 0;
251 #if VERBOSE_DEBUG
252                 ALOGD("desperate cand: %d %g:%g",
253                         mCandidates.size(), cand.postBreak, cand.preBreak);
254 #endif
255                 addCandidate(cand);
256                 width += w;
257             }
258         }
259     }
260 
261     cand.offset = offset;
262     cand.preBreak = preBreak;
263     cand.postBreak = postBreak;
264     cand.penalty = penalty;
265     cand.hyphenEdit = hyph;
266 #if VERBOSE_DEBUG
267     ALOGD("cand: %d %g:%g", mCandidates.size(), cand.postBreak, cand.preBreak);
268 #endif
269     addCandidate(cand);
270 }
271 
272 // TODO performance: could avoid populating mCandidates if greedy only
addCandidate(Candidate cand)273 void LineBreaker::addCandidate(Candidate cand) {
274     size_t candIndex = mCandidates.size();
275     mCandidates.push_back(cand);
276     if (cand.postBreak - mPreBreak > currentLineWidth()) {
277         // This break would create an overfull line, pick the best break and break there (greedy)
278         if (mBestBreak == mLastBreak) {
279             mBestBreak = candIndex;
280         }
281         pushBreak(mCandidates[mBestBreak].offset, mCandidates[mBestBreak].postBreak - mPreBreak,
282                 mCandidates[mBestBreak].hyphenEdit);
283         mBestScore = SCORE_INFTY;
284 #if VERBOSE_DEBUG
285         ALOGD("break: %d %g", mBreaks.back(), mWidths.back());
286 #endif
287         mLastBreak = mBestBreak;
288         mPreBreak = mCandidates[mBestBreak].preBreak;
289     }
290     if (cand.penalty <= mBestScore) {
291         mBestBreak = candIndex;
292         mBestScore = cand.penalty;
293     }
294 }
295 
pushBreak(int offset,float width,uint8_t hyph)296 void LineBreaker::pushBreak(int offset, float width, uint8_t hyph) {
297     mBreaks.push_back(offset);
298     mWidths.push_back(width);
299     int flags = (mFirstTabIndex < mBreaks.back()) << kTab_Shift;
300     flags |= hyph;
301     mFlags.push_back(flags);
302     mFirstTabIndex = INT_MAX;
303 }
304 
addReplacement(size_t start,size_t end,float width)305 void LineBreaker::addReplacement(size_t start, size_t end, float width) {
306     mCharWidths[start] = width;
307     std::fill(&mCharWidths[start + 1], &mCharWidths[end], 0.0f);
308     addStyleRun(nullptr, nullptr, FontStyle(), start, end, false);
309 }
310 
currentLineWidth() const311 float LineBreaker::currentLineWidth() const {
312     return mLineWidths.getLineWidth(mBreaks.size());
313 }
314 
computeBreaksGreedy()315 void LineBreaker::computeBreaksGreedy() {
316     // All breaks but the last have been added in addCandidate already.
317     size_t nCand = mCandidates.size();
318     if (nCand == 1 || mLastBreak != nCand - 1) {
319         pushBreak(mCandidates[nCand - 1].offset, mCandidates[nCand - 1].postBreak - mPreBreak, 0);
320         // don't need to update mBestScore, because we're done
321 #if VERBOSE_DEBUG
322         ALOGD("final break: %d %g", mBreaks.back(), mWidths.back());
323 #endif
324     }
325 }
326 
327 // Follow "prev" links in mCandidates array, and copy to result arrays.
finishBreaksOptimal()328 void LineBreaker::finishBreaksOptimal() {
329     // clear existing greedy break result
330     mBreaks.clear();
331     mWidths.clear();
332     mFlags.clear();
333     size_t nCand = mCandidates.size();
334     size_t prev;
335     for (size_t i = nCand - 1; i > 0; i = prev) {
336         prev = mCandidates[i].prev;
337         mBreaks.push_back(mCandidates[i].offset);
338         mWidths.push_back(mCandidates[i].postBreak - mCandidates[prev].preBreak);
339         mFlags.push_back(mCandidates[i].hyphenEdit);
340     }
341     std::reverse(mBreaks.begin(), mBreaks.end());
342     std::reverse(mWidths.begin(), mWidths.end());
343     std::reverse(mFlags.begin(), mFlags.end());
344 }
345 
computeBreaksOptimal(bool isRectangle)346 void LineBreaker::computeBreaksOptimal(bool isRectangle) {
347     size_t active = 0;
348     size_t nCand = mCandidates.size();
349     float width = mLineWidths.getLineWidth(0);
350     for (size_t i = 1; i < nCand; i++) {
351         bool atEnd = i == nCand - 1;
352         float best = SCORE_INFTY;
353         size_t bestPrev = 0;
354         size_t lineNumberLast = 0;
355 
356         if (!isRectangle) {
357             size_t lineNumberLast = mCandidates[active].lineNumber;
358             width = mLineWidths.getLineWidth(lineNumberLast);
359         }
360         ParaWidth leftEdge = mCandidates[i].postBreak - width;
361         float bestHope = 0;
362 
363         for (size_t j = active; j < i; j++) {
364             if (!isRectangle) {
365                 size_t lineNumber = mCandidates[j].lineNumber;
366                 if (lineNumber != lineNumberLast) {
367                     float widthNew = mLineWidths.getLineWidth(lineNumber);
368                     if (widthNew != width) {
369                         leftEdge = mCandidates[i].postBreak - width;
370                         bestHope = 0;
371                         width = widthNew;
372                     }
373                     lineNumberLast = lineNumber;
374                 }
375             }
376             float jScore = mCandidates[j].score;
377             if (jScore + bestHope >= best) continue;
378             float delta = mCandidates[j].preBreak - leftEdge;
379 
380             // compute width score for line
381 
382             // Note: the "bestHope" optimization makes the assumption that, when delta is
383             // non-negative, widthScore will increase monotonically as successive candidate
384             // breaks are considered.
385             float widthScore = 0.0f;
386             float additionalPenalty = 0.0f;
387             if (delta < 0) {
388                 widthScore = SCORE_OVERFULL;
389             } else if (atEnd && mStrategy != kBreakStrategy_Balanced) {
390                 // increase penalty for hyphen on last line
391                 additionalPenalty = LAST_LINE_PENALTY_MULTIPLIER * mCandidates[j].penalty;
392             } else {
393                 widthScore = delta * delta;
394             }
395 
396             if (delta < 0) {
397                 active = j + 1;
398             } else {
399                 bestHope = widthScore;
400             }
401 
402             float score = jScore + widthScore + additionalPenalty;
403             if (score <= best) {
404                 best = score;
405                 bestPrev = j;
406             }
407         }
408         mCandidates[i].score = best + mCandidates[i].penalty + mLinePenalty;
409         mCandidates[i].prev = bestPrev;
410         mCandidates[i].lineNumber = mCandidates[bestPrev].lineNumber + 1;
411 #if VERBOSE_DEBUG
412         ALOGD("break %d: score=%g, prev=%d", i, mCandidates[i].score, mCandidates[i].prev);
413 #endif
414     }
415     finishBreaksOptimal();
416 }
417 
computeBreaks()418 size_t LineBreaker::computeBreaks() {
419     if (mStrategy == kBreakStrategy_Greedy) {
420         computeBreaksGreedy();
421     } else {
422         computeBreaksOptimal(mLineWidths.isConstant());
423     }
424     return mBreaks.size();
425 }
426 
finish()427 void LineBreaker::finish() {
428     mWidth = 0;
429     mCandidates.clear();
430     mBreaks.clear();
431     mWidths.clear();
432     mFlags.clear();
433     if (mTextBuf.size() > MAX_TEXT_BUF_RETAIN) {
434         mTextBuf.clear();
435         mTextBuf.shrink_to_fit();
436         mCharWidths.clear();
437         mCharWidths.shrink_to_fit();
438         mHyphBuf.clear();
439         mHyphBuf.shrink_to_fit();
440         mCandidates.shrink_to_fit();
441         mBreaks.shrink_to_fit();
442         mWidths.shrink_to_fit();
443         mFlags.shrink_to_fit();
444     }
445     mStrategy = kBreakStrategy_Greedy;
446     mHyphenationFrequency = kHyphenationFrequency_Normal;
447     mLinePenalty = 0.0f;
448 }
449 
450 }  // namespace android
451