1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2016, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_BREAK_ITERATION
13 
14 #include "brkeng.h"
15 #include "dictbe.h"
16 #include "unicode/uniset.h"
17 #include "unicode/chariter.h"
18 #include "unicode/ubrk.h"
19 #include "uvectr32.h"
20 #include "uvector.h"
21 #include "uassert.h"
22 #include "unicode/normlzr.h"
23 #include "cmemory.h"
24 #include "dictionarydata.h"
25 
26 U_NAMESPACE_BEGIN
27 
28 /*
29  ******************************************************************
30  */
31 
DictionaryBreakEngine()32 DictionaryBreakEngine::DictionaryBreakEngine() {
33 }
34 
~DictionaryBreakEngine()35 DictionaryBreakEngine::~DictionaryBreakEngine() {
36 }
37 
38 UBool
handles(UChar32 c) const39 DictionaryBreakEngine::handles(UChar32 c) const {
40     return fSet.contains(c);
41 }
42 
43 int32_t
findBreaks(UText * text,int32_t startPos,int32_t endPos,UVector32 & foundBreaks) const44 DictionaryBreakEngine::findBreaks( UText *text,
45                                  int32_t startPos,
46                                  int32_t endPos,
47                                  UVector32 &foundBreaks ) const {
48     (void)startPos;            // TODO: remove this param?
49     int32_t result = 0;
50 
51     // Find the span of characters included in the set.
52     //   The span to break begins at the current position in the text, and
53     //   extends towards the start or end of the text, depending on 'reverse'.
54 
55     int32_t start = (int32_t)utext_getNativeIndex(text);
56     int32_t current;
57     int32_t rangeStart;
58     int32_t rangeEnd;
59     UChar32 c = utext_current32(text);
60     while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
61         utext_next32(text);         // TODO:  recast loop for postincrement
62         c = utext_current32(text);
63     }
64     rangeStart = start;
65     rangeEnd = current;
66     result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
67     utext_setNativeIndex(text, current);
68 
69     return result;
70 }
71 
72 void
setCharacters(const UnicodeSet & set)73 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
74     fSet = set;
75     // Compact for caching
76     fSet.compact();
77 }
78 
79 /*
80  ******************************************************************
81  * PossibleWord
82  */
83 
84 // Helper class for improving readability of the Thai/Lao/Khmer word break
85 // algorithm. The implementation is completely inline.
86 
87 // List size, limited by the maximum number of words in the dictionary
88 // that form a nested sequence.
89 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
90 
91 class PossibleWord {
92 private:
93     // list of word candidate lengths, in increasing length order
94     // TODO: bytes would be sufficient for word lengths.
95     int32_t   count;      // Count of candidates
96     int32_t   prefix;     // The longest match with a dictionary word
97     int32_t   offset;     // Offset in the text of these candidates
98     int32_t   mark;       // The preferred candidate's offset
99     int32_t   current;    // The candidate we're currently looking at
100     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
101     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
102 
103 public:
PossibleWord()104     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
~PossibleWord()105     ~PossibleWord() {};
106 
107     // Fill the list of candidates if needed, select the longest, and return the number found
108     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
109 
110     // Select the currently marked candidate, point after it in the text, and invalidate self
111     int32_t   acceptMarked( UText *text );
112 
113     // Back up from the current candidate to the next shorter one; return TRUE if that exists
114     // and point the text after it
115     UBool     backUp( UText *text );
116 
117     // Return the longest prefix this candidate location shares with a dictionary word
118     // Return value is in code points.
longestPrefix()119     int32_t   longestPrefix() { return prefix; };
120 
121     // Mark the current candidate as the one we like
markCurrent()122     void      markCurrent() { mark = current; };
123 
124     // Get length in code points of the marked word.
markedCPLength()125     int32_t   markedCPLength() { return cpLengths[mark]; };
126 };
127 
128 
candidates(UText * text,DictionaryMatcher * dict,int32_t rangeEnd)129 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
130     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
131     int32_t start = (int32_t)utext_getNativeIndex(text);
132     if (start != offset) {
133         offset = start;
134         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
135         // Dictionary leaves text after longest prefix, not longest word. Back up.
136         if (count <= 0) {
137             utext_setNativeIndex(text, start);
138         }
139     }
140     if (count > 0) {
141         utext_setNativeIndex(text, start+cuLengths[count-1]);
142     }
143     current = count-1;
144     mark = current;
145     return count;
146 }
147 
148 int32_t
acceptMarked(UText * text)149 PossibleWord::acceptMarked( UText *text ) {
150     utext_setNativeIndex(text, offset + cuLengths[mark]);
151     return cuLengths[mark];
152 }
153 
154 
155 UBool
backUp(UText * text)156 PossibleWord::backUp( UText *text ) {
157     if (current > 0) {
158         utext_setNativeIndex(text, offset + cuLengths[--current]);
159         return TRUE;
160     }
161     return FALSE;
162 }
163 
164 /*
165  ******************************************************************
166  * ThaiBreakEngine
167  */
168 
169 // How many words in a row are "good enough"?
170 static const int32_t THAI_LOOKAHEAD = 3;
171 
172 // Will not combine a non-word with a preceding dictionary word longer than this
173 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
174 
175 // Will not combine a non-word that shares at least this much prefix with a
176 // dictionary word, with a preceding word
177 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
178 
179 // Ellision character
180 static const int32_t THAI_PAIYANNOI = 0x0E2F;
181 
182 // Repeat character
183 static const int32_t THAI_MAIYAMOK = 0x0E46;
184 
185 // Minimum word size
186 static const int32_t THAI_MIN_WORD = 2;
187 
188 // Minimum number of characters for two words
189 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
190 
ThaiBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)191 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
192     : DictionaryBreakEngine(),
193       fDictionary(adoptDictionary)
194 {
195     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
196     if (U_SUCCESS(status)) {
197         setCharacters(fThaiWordSet);
198     }
199     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
200     fMarkSet.add(0x0020);
201     fEndWordSet = fThaiWordSet;
202     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
203     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
204     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
205     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
206     fSuffixSet.add(THAI_PAIYANNOI);
207     fSuffixSet.add(THAI_MAIYAMOK);
208 
209     // Compact for caching.
210     fMarkSet.compact();
211     fEndWordSet.compact();
212     fBeginWordSet.compact();
213     fSuffixSet.compact();
214 }
215 
~ThaiBreakEngine()216 ThaiBreakEngine::~ThaiBreakEngine() {
217     delete fDictionary;
218 }
219 
220 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const221 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
222                                                 int32_t rangeStart,
223                                                 int32_t rangeEnd,
224                                                 UVector32 &foundBreaks ) const {
225     utext_setNativeIndex(text, rangeStart);
226     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
227     if (utext_getNativeIndex(text) >= rangeEnd) {
228         return 0;       // Not enough characters for two words
229     }
230     utext_setNativeIndex(text, rangeStart);
231 
232 
233     uint32_t wordsFound = 0;
234     int32_t cpWordLength = 0;    // Word Length in Code Points.
235     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
236     int32_t current;
237     UErrorCode status = U_ZERO_ERROR;
238     PossibleWord words[THAI_LOOKAHEAD];
239 
240     utext_setNativeIndex(text, rangeStart);
241 
242     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
243         cpWordLength = 0;
244         cuWordLength = 0;
245 
246         // Look for candidate words at the current position
247         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
248 
249         // If we found exactly one, use that
250         if (candidates == 1) {
251             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
252             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
253             wordsFound += 1;
254         }
255         // If there was more than one, see which one can take us forward the most words
256         else if (candidates > 1) {
257             // If we're already at the end of the range, we're done
258             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
259                 goto foundBest;
260             }
261             do {
262                 int32_t wordsMatched = 1;
263                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
264                     if (wordsMatched < 2) {
265                         // Followed by another dictionary word; mark first word as a good candidate
266                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
267                         wordsMatched = 2;
268                     }
269 
270                     // If we're already at the end of the range, we're done
271                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
272                         goto foundBest;
273                     }
274 
275                     // See if any of the possible second words is followed by a third word
276                     do {
277                         // If we find a third word, stop right away
278                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
279                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
280                             goto foundBest;
281                         }
282                     }
283                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
284                 }
285             }
286             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
287 foundBest:
288             // Set UText position to after the accepted word.
289             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
290             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
291             wordsFound += 1;
292         }
293 
294         // We come here after having either found a word or not. We look ahead to the
295         // next word. If it's not a dictionary word, we will combine it with the word we
296         // just found (if there is one), but only if the preceding word does not exceed
297         // the threshold.
298         // The text iterator should now be positioned at the end of the word we found.
299 
300         UChar32 uc = 0;
301         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
302             // if it is a dictionary word, do nothing. If it isn't, then if there is
303             // no preceding word, or the non-word shares less than the minimum threshold
304             // of characters with a dictionary word, then scan to resynchronize
305             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
306                   && (cuWordLength == 0
307                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
308                 // Look for a plausible word boundary
309                 int32_t remaining = rangeEnd - (current+cuWordLength);
310                 UChar32 pc;
311                 int32_t chars = 0;
312                 for (;;) {
313                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
314                     pc = utext_next32(text);
315                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
316                     chars += pcSize;
317                     remaining -= pcSize;
318                     if (remaining <= 0) {
319                         break;
320                     }
321                     uc = utext_current32(text);
322                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
323                         // Maybe. See if it's in the dictionary.
324                         // NOTE: In the original Apple code, checked that the next
325                         // two characters after uc were not 0x0E4C THANTHAKHAT before
326                         // checking the dictionary. That is just a performance filter,
327                         // but it's not clear it's faster than checking the trie.
328                         int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
329                         utext_setNativeIndex(text, current + cuWordLength + chars);
330                         if (num_candidates > 0) {
331                             break;
332                         }
333                     }
334                 }
335 
336                 // Bump the word count if there wasn't already one
337                 if (cuWordLength <= 0) {
338                     wordsFound += 1;
339                 }
340 
341                 // Update the length with the passed-over characters
342                 cuWordLength += chars;
343             }
344             else {
345                 // Back up to where we were for next iteration
346                 utext_setNativeIndex(text, current+cuWordLength);
347             }
348         }
349 
350         // Never stop before a combining mark.
351         int32_t currPos;
352         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
353             utext_next32(text);
354             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
355         }
356 
357         // Look ahead for possible suffixes if a dictionary word does not follow.
358         // We do this in code rather than using a rule so that the heuristic
359         // resynch continues to function. For example, one of the suffix characters
360         // could be a typo in the middle of a word.
361         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
362             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
363                 && fSuffixSet.contains(uc = utext_current32(text))) {
364                 if (uc == THAI_PAIYANNOI) {
365                     if (!fSuffixSet.contains(utext_previous32(text))) {
366                         // Skip over previous end and PAIYANNOI
367                         utext_next32(text);
368                         int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
369                         utext_next32(text);
370                         cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
371                         uc = utext_current32(text);     // Fetch next character
372                     }
373                     else {
374                         // Restore prior position
375                         utext_next32(text);
376                     }
377                 }
378                 if (uc == THAI_MAIYAMOK) {
379                     if (utext_previous32(text) != THAI_MAIYAMOK) {
380                         // Skip over previous end and MAIYAMOK
381                         utext_next32(text);
382                         int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
383                         utext_next32(text);
384                         cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
385                     }
386                     else {
387                         // Restore prior position
388                         utext_next32(text);
389                     }
390                 }
391             }
392             else {
393                 utext_setNativeIndex(text, current+cuWordLength);
394             }
395         }
396 
397         // Did we find a word on this iteration? If so, push it on the break stack
398         if (cuWordLength > 0) {
399             foundBreaks.push((current+cuWordLength), status);
400         }
401     }
402 
403     // Don't return a break for the end of the dictionary range if there is one there.
404     if (foundBreaks.peeki() >= rangeEnd) {
405         (void) foundBreaks.popi();
406         wordsFound -= 1;
407     }
408 
409     return wordsFound;
410 }
411 
412 /*
413  ******************************************************************
414  * LaoBreakEngine
415  */
416 
417 // How many words in a row are "good enough"?
418 static const int32_t LAO_LOOKAHEAD = 3;
419 
420 // Will not combine a non-word with a preceding dictionary word longer than this
421 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
422 
423 // Will not combine a non-word that shares at least this much prefix with a
424 // dictionary word, with a preceding word
425 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
426 
427 // Minimum word size
428 static const int32_t LAO_MIN_WORD = 2;
429 
430 // Minimum number of characters for two words
431 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
432 
LaoBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)433 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
434     : DictionaryBreakEngine(),
435       fDictionary(adoptDictionary)
436 {
437     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
438     if (U_SUCCESS(status)) {
439         setCharacters(fLaoWordSet);
440     }
441     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
442     fMarkSet.add(0x0020);
443     fEndWordSet = fLaoWordSet;
444     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
445     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
446     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
447     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
448 
449     // Compact for caching.
450     fMarkSet.compact();
451     fEndWordSet.compact();
452     fBeginWordSet.compact();
453 }
454 
~LaoBreakEngine()455 LaoBreakEngine::~LaoBreakEngine() {
456     delete fDictionary;
457 }
458 
459 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const460 LaoBreakEngine::divideUpDictionaryRange( UText *text,
461                                                 int32_t rangeStart,
462                                                 int32_t rangeEnd,
463                                                 UVector32 &foundBreaks ) const {
464     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
465         return 0;       // Not enough characters for two words
466     }
467 
468     uint32_t wordsFound = 0;
469     int32_t cpWordLength = 0;
470     int32_t cuWordLength = 0;
471     int32_t current;
472     UErrorCode status = U_ZERO_ERROR;
473     PossibleWord words[LAO_LOOKAHEAD];
474 
475     utext_setNativeIndex(text, rangeStart);
476 
477     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
478         cuWordLength = 0;
479         cpWordLength = 0;
480 
481         // Look for candidate words at the current position
482         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
483 
484         // If we found exactly one, use that
485         if (candidates == 1) {
486             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
487             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
488             wordsFound += 1;
489         }
490         // If there was more than one, see which one can take us forward the most words
491         else if (candidates > 1) {
492             // If we're already at the end of the range, we're done
493             if (utext_getNativeIndex(text) >= rangeEnd) {
494                 goto foundBest;
495             }
496             do {
497                 int32_t wordsMatched = 1;
498                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
499                     if (wordsMatched < 2) {
500                         // Followed by another dictionary word; mark first word as a good candidate
501                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
502                         wordsMatched = 2;
503                     }
504 
505                     // If we're already at the end of the range, we're done
506                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
507                         goto foundBest;
508                     }
509 
510                     // See if any of the possible second words is followed by a third word
511                     do {
512                         // If we find a third word, stop right away
513                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
514                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
515                             goto foundBest;
516                         }
517                     }
518                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
519                 }
520             }
521             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
522 foundBest:
523             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
524             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
525             wordsFound += 1;
526         }
527 
528         // We come here after having either found a word or not. We look ahead to the
529         // next word. If it's not a dictionary word, we will combine it withe the word we
530         // just found (if there is one), but only if the preceding word does not exceed
531         // the threshold.
532         // The text iterator should now be positioned at the end of the word we found.
533         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
534             // if it is a dictionary word, do nothing. If it isn't, then if there is
535             // no preceding word, or the non-word shares less than the minimum threshold
536             // of characters with a dictionary word, then scan to resynchronize
537             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
538                   && (cuWordLength == 0
539                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
540                 // Look for a plausible word boundary
541                 int32_t remaining = rangeEnd - (current + cuWordLength);
542                 UChar32 pc;
543                 UChar32 uc;
544                 int32_t chars = 0;
545                 for (;;) {
546                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
547                     pc = utext_next32(text);
548                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
549                     chars += pcSize;
550                     remaining -= pcSize;
551                     if (remaining <= 0) {
552                         break;
553                     }
554                     uc = utext_current32(text);
555                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
556                         // Maybe. See if it's in the dictionary.
557                         // TODO: this looks iffy; compare with old code.
558                         int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
559                         utext_setNativeIndex(text, current + cuWordLength + chars);
560                         if (num_candidates > 0) {
561                             break;
562                         }
563                     }
564                 }
565 
566                 // Bump the word count if there wasn't already one
567                 if (cuWordLength <= 0) {
568                     wordsFound += 1;
569                 }
570 
571                 // Update the length with the passed-over characters
572                 cuWordLength += chars;
573             }
574             else {
575                 // Back up to where we were for next iteration
576                 utext_setNativeIndex(text, current + cuWordLength);
577             }
578         }
579 
580         // Never stop before a combining mark.
581         int32_t currPos;
582         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
583             utext_next32(text);
584             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
585         }
586 
587         // Look ahead for possible suffixes if a dictionary word does not follow.
588         // We do this in code rather than using a rule so that the heuristic
589         // resynch continues to function. For example, one of the suffix characters
590         // could be a typo in the middle of a word.
591         // NOT CURRENTLY APPLICABLE TO LAO
592 
593         // Did we find a word on this iteration? If so, push it on the break stack
594         if (cuWordLength > 0) {
595             foundBreaks.push((current+cuWordLength), status);
596         }
597     }
598 
599     // Don't return a break for the end of the dictionary range if there is one there.
600     if (foundBreaks.peeki() >= rangeEnd) {
601         (void) foundBreaks.popi();
602         wordsFound -= 1;
603     }
604 
605     return wordsFound;
606 }
607 
608 /*
609  ******************************************************************
610  * BurmeseBreakEngine
611  */
612 
613 // How many words in a row are "good enough"?
614 static const int32_t BURMESE_LOOKAHEAD = 3;
615 
616 // Will not combine a non-word with a preceding dictionary word longer than this
617 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
618 
619 // Will not combine a non-word that shares at least this much prefix with a
620 // dictionary word, with a preceding word
621 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
622 
623 // Minimum word size
624 static const int32_t BURMESE_MIN_WORD = 2;
625 
626 // Minimum number of characters for two words
627 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
628 
BurmeseBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)629 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
630     : DictionaryBreakEngine(),
631       fDictionary(adoptDictionary)
632 {
633     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
634     if (U_SUCCESS(status)) {
635         setCharacters(fBurmeseWordSet);
636     }
637     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
638     fMarkSet.add(0x0020);
639     fEndWordSet = fBurmeseWordSet;
640     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
641 
642     // Compact for caching.
643     fMarkSet.compact();
644     fEndWordSet.compact();
645     fBeginWordSet.compact();
646 }
647 
~BurmeseBreakEngine()648 BurmeseBreakEngine::~BurmeseBreakEngine() {
649     delete fDictionary;
650 }
651 
652 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const653 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
654                                                 int32_t rangeStart,
655                                                 int32_t rangeEnd,
656                                                 UVector32 &foundBreaks ) const {
657     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
658         return 0;       // Not enough characters for two words
659     }
660 
661     uint32_t wordsFound = 0;
662     int32_t cpWordLength = 0;
663     int32_t cuWordLength = 0;
664     int32_t current;
665     UErrorCode status = U_ZERO_ERROR;
666     PossibleWord words[BURMESE_LOOKAHEAD];
667 
668     utext_setNativeIndex(text, rangeStart);
669 
670     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
671         cuWordLength = 0;
672         cpWordLength = 0;
673 
674         // Look for candidate words at the current position
675         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
676 
677         // If we found exactly one, use that
678         if (candidates == 1) {
679             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
680             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
681             wordsFound += 1;
682         }
683         // If there was more than one, see which one can take us forward the most words
684         else if (candidates > 1) {
685             // If we're already at the end of the range, we're done
686             if (utext_getNativeIndex(text) >= rangeEnd) {
687                 goto foundBest;
688             }
689             do {
690                 int32_t wordsMatched = 1;
691                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
692                     if (wordsMatched < 2) {
693                         // Followed by another dictionary word; mark first word as a good candidate
694                         words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
695                         wordsMatched = 2;
696                     }
697 
698                     // If we're already at the end of the range, we're done
699                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
700                         goto foundBest;
701                     }
702 
703                     // See if any of the possible second words is followed by a third word
704                     do {
705                         // If we find a third word, stop right away
706                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
707                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
708                             goto foundBest;
709                         }
710                     }
711                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
712                 }
713             }
714             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
715 foundBest:
716             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
717             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
718             wordsFound += 1;
719         }
720 
721         // We come here after having either found a word or not. We look ahead to the
722         // next word. If it's not a dictionary word, we will combine it withe the word we
723         // just found (if there is one), but only if the preceding word does not exceed
724         // the threshold.
725         // The text iterator should now be positioned at the end of the word we found.
726         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
727             // if it is a dictionary word, do nothing. If it isn't, then if there is
728             // no preceding word, or the non-word shares less than the minimum threshold
729             // of characters with a dictionary word, then scan to resynchronize
730             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
731                   && (cuWordLength == 0
732                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
733                 // Look for a plausible word boundary
734                 int32_t remaining = rangeEnd - (current + cuWordLength);
735                 UChar32 pc;
736                 UChar32 uc;
737                 int32_t chars = 0;
738                 for (;;) {
739                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
740                     pc = utext_next32(text);
741                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
742                     chars += pcSize;
743                     remaining -= pcSize;
744                     if (remaining <= 0) {
745                         break;
746                     }
747                     uc = utext_current32(text);
748                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
749                         // Maybe. See if it's in the dictionary.
750                         // TODO: this looks iffy; compare with old code.
751                         int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
752                         utext_setNativeIndex(text, current + cuWordLength + chars);
753                         if (num_candidates > 0) {
754                             break;
755                         }
756                     }
757                 }
758 
759                 // Bump the word count if there wasn't already one
760                 if (cuWordLength <= 0) {
761                     wordsFound += 1;
762                 }
763 
764                 // Update the length with the passed-over characters
765                 cuWordLength += chars;
766             }
767             else {
768                 // Back up to where we were for next iteration
769                 utext_setNativeIndex(text, current + cuWordLength);
770             }
771         }
772 
773         // Never stop before a combining mark.
774         int32_t currPos;
775         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
776             utext_next32(text);
777             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
778         }
779 
780         // Look ahead for possible suffixes if a dictionary word does not follow.
781         // We do this in code rather than using a rule so that the heuristic
782         // resynch continues to function. For example, one of the suffix characters
783         // could be a typo in the middle of a word.
784         // NOT CURRENTLY APPLICABLE TO BURMESE
785 
786         // Did we find a word on this iteration? If so, push it on the break stack
787         if (cuWordLength > 0) {
788             foundBreaks.push((current+cuWordLength), status);
789         }
790     }
791 
792     // Don't return a break for the end of the dictionary range if there is one there.
793     if (foundBreaks.peeki() >= rangeEnd) {
794         (void) foundBreaks.popi();
795         wordsFound -= 1;
796     }
797 
798     return wordsFound;
799 }
800 
801 /*
802  ******************************************************************
803  * KhmerBreakEngine
804  */
805 
806 // How many words in a row are "good enough"?
807 static const int32_t KHMER_LOOKAHEAD = 3;
808 
809 // Will not combine a non-word with a preceding dictionary word longer than this
810 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
811 
812 // Will not combine a non-word that shares at least this much prefix with a
813 // dictionary word, with a preceding word
814 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
815 
816 // Minimum word size
817 static const int32_t KHMER_MIN_WORD = 2;
818 
819 // Minimum number of characters for two words
820 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
821 
KhmerBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)822 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
823     : DictionaryBreakEngine(),
824       fDictionary(adoptDictionary)
825 {
826     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
827     if (U_SUCCESS(status)) {
828         setCharacters(fKhmerWordSet);
829     }
830     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
831     fMarkSet.add(0x0020);
832     fEndWordSet = fKhmerWordSet;
833     fBeginWordSet.add(0x1780, 0x17B3);
834     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
835     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
836     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
837     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
838     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
839 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
840 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
841 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
842 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
843 //    fSuffixSet.add(THAI_PAIYANNOI);
844 //    fSuffixSet.add(THAI_MAIYAMOK);
845 
846     // Compact for caching.
847     fMarkSet.compact();
848     fEndWordSet.compact();
849     fBeginWordSet.compact();
850 //    fSuffixSet.compact();
851 }
852 
~KhmerBreakEngine()853 KhmerBreakEngine::~KhmerBreakEngine() {
854     delete fDictionary;
855 }
856 
857 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const858 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
859                                                 int32_t rangeStart,
860                                                 int32_t rangeEnd,
861                                                 UVector32 &foundBreaks ) const {
862     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
863         return 0;       // Not enough characters for two words
864     }
865 
866     uint32_t wordsFound = 0;
867     int32_t cpWordLength = 0;
868     int32_t cuWordLength = 0;
869     int32_t current;
870     UErrorCode status = U_ZERO_ERROR;
871     PossibleWord words[KHMER_LOOKAHEAD];
872 
873     utext_setNativeIndex(text, rangeStart);
874 
875     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
876         cuWordLength = 0;
877         cpWordLength = 0;
878 
879         // Look for candidate words at the current position
880         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
881 
882         // If we found exactly one, use that
883         if (candidates == 1) {
884             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
885             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
886             wordsFound += 1;
887         }
888 
889         // If there was more than one, see which one can take us forward the most words
890         else if (candidates > 1) {
891             // If we're already at the end of the range, we're done
892             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
893                 goto foundBest;
894             }
895             do {
896                 int32_t wordsMatched = 1;
897                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
898                     if (wordsMatched < 2) {
899                         // Followed by another dictionary word; mark first word as a good candidate
900                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
901                         wordsMatched = 2;
902                     }
903 
904                     // If we're already at the end of the range, we're done
905                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
906                         goto foundBest;
907                     }
908 
909                     // See if any of the possible second words is followed by a third word
910                     do {
911                         // If we find a third word, stop right away
912                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
913                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
914                             goto foundBest;
915                         }
916                     }
917                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
918                 }
919             }
920             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
921 foundBest:
922             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
923             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
924             wordsFound += 1;
925         }
926 
927         // We come here after having either found a word or not. We look ahead to the
928         // next word. If it's not a dictionary word, we will combine it with the word we
929         // just found (if there is one), but only if the preceding word does not exceed
930         // the threshold.
931         // The text iterator should now be positioned at the end of the word we found.
932         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
933             // if it is a dictionary word, do nothing. If it isn't, then if there is
934             // no preceding word, or the non-word shares less than the minimum threshold
935             // of characters with a dictionary word, then scan to resynchronize
936             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
937                   && (cuWordLength == 0
938                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
939                 // Look for a plausible word boundary
940                 int32_t remaining = rangeEnd - (current+cuWordLength);
941                 UChar32 pc;
942                 UChar32 uc;
943                 int32_t chars = 0;
944                 for (;;) {
945                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
946                     pc = utext_next32(text);
947                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
948                     chars += pcSize;
949                     remaining -= pcSize;
950                     if (remaining <= 0) {
951                         break;
952                     }
953                     uc = utext_current32(text);
954                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
955                         // Maybe. See if it's in the dictionary.
956                         int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
957                         utext_setNativeIndex(text, current+cuWordLength+chars);
958                         if (num_candidates > 0) {
959                             break;
960                         }
961                     }
962                 }
963 
964                 // Bump the word count if there wasn't already one
965                 if (cuWordLength <= 0) {
966                     wordsFound += 1;
967                 }
968 
969                 // Update the length with the passed-over characters
970                 cuWordLength += chars;
971             }
972             else {
973                 // Back up to where we were for next iteration
974                 utext_setNativeIndex(text, current+cuWordLength);
975             }
976         }
977 
978         // Never stop before a combining mark.
979         int32_t currPos;
980         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
981             utext_next32(text);
982             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
983         }
984 
985         // Look ahead for possible suffixes if a dictionary word does not follow.
986         // We do this in code rather than using a rule so that the heuristic
987         // resynch continues to function. For example, one of the suffix characters
988         // could be a typo in the middle of a word.
989 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
990 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
991 //                && fSuffixSet.contains(uc = utext_current32(text))) {
992 //                if (uc == KHMER_PAIYANNOI) {
993 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
994 //                        // Skip over previous end and PAIYANNOI
995 //                        utext_next32(text);
996 //                        utext_next32(text);
997 //                        wordLength += 1;            // Add PAIYANNOI to word
998 //                        uc = utext_current32(text);     // Fetch next character
999 //                    }
1000 //                    else {
1001 //                        // Restore prior position
1002 //                        utext_next32(text);
1003 //                    }
1004 //                }
1005 //                if (uc == KHMER_MAIYAMOK) {
1006 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1007 //                        // Skip over previous end and MAIYAMOK
1008 //                        utext_next32(text);
1009 //                        utext_next32(text);
1010 //                        wordLength += 1;            // Add MAIYAMOK to word
1011 //                    }
1012 //                    else {
1013 //                        // Restore prior position
1014 //                        utext_next32(text);
1015 //                    }
1016 //                }
1017 //            }
1018 //            else {
1019 //                utext_setNativeIndex(text, current+wordLength);
1020 //            }
1021 //        }
1022 
1023         // Did we find a word on this iteration? If so, push it on the break stack
1024         if (cuWordLength > 0) {
1025             foundBreaks.push((current+cuWordLength), status);
1026         }
1027     }
1028 
1029     // Don't return a break for the end of the dictionary range if there is one there.
1030     if (foundBreaks.peeki() >= rangeEnd) {
1031         (void) foundBreaks.popi();
1032         wordsFound -= 1;
1033     }
1034 
1035     return wordsFound;
1036 }
1037 
1038 #if !UCONFIG_NO_NORMALIZATION
1039 /*
1040  ******************************************************************
1041  * CjkBreakEngine
1042  */
1043 static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine(DictionaryMatcher * adoptDictionary,LanguageType type,UErrorCode & status)1044 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1045 : DictionaryBreakEngine(), fDictionary(adoptDictionary) {
1046     // Korean dictionary only includes Hangul syllables
1047     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1048     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1049     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1050     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1051     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1052 
1053     if (U_SUCCESS(status)) {
1054         // handle Korean and Japanese/Chinese using different dictionaries
1055         if (type == kKorean) {
1056             setCharacters(fHangulWordSet);
1057         } else { //Chinese and Japanese
1058             UnicodeSet cjSet;
1059             cjSet.addAll(fHanWordSet);
1060             cjSet.addAll(fKatakanaWordSet);
1061             cjSet.addAll(fHiraganaWordSet);
1062             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1063             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1064             setCharacters(cjSet);
1065         }
1066     }
1067 }
1068 
~CjkBreakEngine()1069 CjkBreakEngine::~CjkBreakEngine(){
1070     delete fDictionary;
1071 }
1072 
1073 // The katakanaCost values below are based on the length frequencies of all
1074 // katakana phrases in the dictionary
1075 static const int32_t kMaxKatakanaLength = 8;
1076 static const int32_t kMaxKatakanaGroupLength = 20;
1077 static const uint32_t maxSnlp = 255;
1078 
getKatakanaCost(int32_t wordLength)1079 static inline uint32_t getKatakanaCost(int32_t wordLength){
1080     //TODO: fill array with actual values from dictionary!
1081     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1082                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1083     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1084 }
1085 
isKatakana(UChar32 value)1086 static inline bool isKatakana(UChar32 value) {
1087     return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1088             (value >= 0xFF66 && value <= 0xFF9f);
1089 }
1090 
1091 
1092 // Function for accessing internal utext flags.
1093 //   Replicates an internal UText function.
1094 
utext_i32_flag(int32_t bitIndex)1095 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1096     return (int32_t)1 << bitIndex;
1097 }
1098 
1099 
1100 /*
1101  * @param text A UText representing the text
1102  * @param rangeStart The start of the range of dictionary characters
1103  * @param rangeEnd The end of the range of dictionary characters
1104  * @param foundBreaks vector<int32> to receive the break positions
1105  * @return The number of breaks found
1106  */
1107 int32_t
divideUpDictionaryRange(UText * inText,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const1108 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1109         int32_t rangeStart,
1110         int32_t rangeEnd,
1111         UVector32 &foundBreaks ) const {
1112     if (rangeStart >= rangeEnd) {
1113         return 0;
1114     }
1115 
1116     // UnicodeString version of input UText, NFKC normalized if necessary.
1117     UnicodeString inString;
1118 
1119     // inputMap[inStringIndex] = corresponding native index from UText inText.
1120     // If NULL then mapping is 1:1
1121     LocalPointer<UVector32>     inputMap;
1122 
1123     UErrorCode     status      = U_ZERO_ERROR;
1124 
1125 
1126     // if UText has the input string as one contiguous UTF-16 chunk
1127     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1128          inText->chunkNativeStart <= rangeStart &&
1129          inText->chunkNativeLimit >= rangeEnd   &&
1130          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1131 
1132         // Input UText is in one contiguous UTF-16 chunk.
1133         // Use Read-only aliasing UnicodeString.
1134         inString.setTo(FALSE,
1135                        inText->chunkContents + rangeStart - inText->chunkNativeStart,
1136                        rangeEnd - rangeStart);
1137     } else {
1138         // Copy the text from the original inText (UText) to inString (UnicodeString).
1139         // Create a map from UnicodeString indices -> UText offsets.
1140         utext_setNativeIndex(inText, rangeStart);
1141         int32_t limit = rangeEnd;
1142         U_ASSERT(limit <= utext_nativeLength(inText));
1143         if (limit > utext_nativeLength(inText)) {
1144             limit = (int32_t)utext_nativeLength(inText);
1145         }
1146         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1147         if (U_FAILURE(status)) {
1148             return 0;
1149         }
1150         while (utext_getNativeIndex(inText) < limit) {
1151             int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1152             UChar32 c = utext_next32(inText);
1153             U_ASSERT(c != U_SENTINEL);
1154             inString.append(c);
1155             while (inputMap->size() < inString.length()) {
1156                 inputMap->addElement(nativePosition, status);
1157             }
1158         }
1159         inputMap->addElement(limit, status);
1160     }
1161 
1162 
1163     if (!nfkcNorm2->isNormalized(inString, status)) {
1164         UnicodeString normalizedInput;
1165         //  normalizedMap[normalizedInput position] ==  original UText position.
1166         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1167         if (U_FAILURE(status)) {
1168             return 0;
1169         }
1170 
1171         UnicodeString fragment;
1172         UnicodeString normalizedFragment;
1173         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1174             fragment.remove();
1175             int32_t fragmentStartI = srcI;
1176             UChar32 c = inString.char32At(srcI);
1177             for (;;) {
1178                 fragment.append(c);
1179                 srcI = inString.moveIndex32(srcI, 1);
1180                 if (srcI == inString.length()) {
1181                     break;
1182                 }
1183                 c = inString.char32At(srcI);
1184                 if (nfkcNorm2->hasBoundaryBefore(c)) {
1185                     break;
1186                 }
1187             }
1188             nfkcNorm2->normalize(fragment, normalizedFragment, status);
1189             normalizedInput.append(normalizedFragment);
1190 
1191             // Map every position in the normalized chunk to the start of the chunk
1192             //   in the original input.
1193             int32_t fragmentOriginalStart = inputMap.isValid() ?
1194                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1195             while (normalizedMap->size() < normalizedInput.length()) {
1196                 normalizedMap->addElement(fragmentOriginalStart, status);
1197                 if (U_FAILURE(status)) {
1198                     break;
1199                 }
1200             }
1201         }
1202         U_ASSERT(normalizedMap->size() == normalizedInput.length());
1203         int32_t nativeEnd = inputMap.isValid() ?
1204                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1205         normalizedMap->addElement(nativeEnd, status);
1206 
1207         inputMap.moveFrom(normalizedMap);
1208         inString.moveFrom(normalizedInput);
1209     }
1210 
1211     int32_t numCodePts = inString.countChar32();
1212     if (numCodePts != inString.length()) {
1213         // There are supplementary characters in the input.
1214         // The dictionary will produce boundary positions in terms of code point indexes,
1215         //   not in terms of code unit string indexes.
1216         // Use the inputMap mechanism to take care of this in addition to indexing differences
1217         //    from normalization and/or UTF-8 input.
1218         UBool hadExistingMap = inputMap.isValid();
1219         if (!hadExistingMap) {
1220             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1221             if (U_FAILURE(status)) {
1222                 return 0;
1223             }
1224         }
1225         int32_t cpIdx = 0;
1226         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1227             U_ASSERT(cuIdx >= cpIdx);
1228             if (hadExistingMap) {
1229                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1230             } else {
1231                 inputMap->addElement(cuIdx+rangeStart, status);
1232             }
1233             cpIdx++;
1234             if (cuIdx == inString.length()) {
1235                break;
1236             }
1237         }
1238     }
1239 
1240     // bestSnlp[i] is the snlp of the best segmentation of the first i
1241     // code points in the range to be matched.
1242     UVector32 bestSnlp(numCodePts + 1, status);
1243     bestSnlp.addElement(0, status);
1244     for(int32_t i = 1; i <= numCodePts; i++) {
1245         bestSnlp.addElement(kuint32max, status);
1246     }
1247 
1248 
1249     // prev[i] is the index of the last CJK code point in the previous word in
1250     // the best segmentation of the first i characters.
1251     UVector32 prev(numCodePts + 1, status);
1252     for(int32_t i = 0; i <= numCodePts; i++){
1253         prev.addElement(-1, status);
1254     }
1255 
1256     const int32_t maxWordSize = 20;
1257     UVector32 values(numCodePts, status);
1258     values.setSize(numCodePts);
1259     UVector32 lengths(numCodePts, status);
1260     lengths.setSize(numCodePts);
1261 
1262     UText fu = UTEXT_INITIALIZER;
1263     utext_openUnicodeString(&fu, &inString, &status);
1264 
1265     // Dynamic programming to find the best segmentation.
1266 
1267     // In outer loop, i  is the code point index,
1268     //                ix is the corresponding string (code unit) index.
1269     //    They differ when the string contains supplementary characters.
1270     int32_t ix = 0;
1271     bool is_prev_katakana = false;
1272     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1273         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1274             continue;
1275         }
1276 
1277         int32_t count;
1278         utext_setNativeIndex(&fu, ix);
1279         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1280                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1281                              // Note: lengths is filled with code point lengths
1282                              //       The NULL parameter is the ignored code unit lengths.
1283 
1284         // if there are no single character matches found in the dictionary
1285         // starting with this character, treat character as a 1-character word
1286         // with the highest value possible, i.e. the least likely to occur.
1287         // Exclude Korean characters from this treatment, as they should be left
1288         // together by default.
1289         if ((count == 0 || lengths.elementAti(0) != 1) &&
1290                 !fHangulWordSet.contains(inString.char32At(ix))) {
1291             values.setElementAt(maxSnlp, count);   // 255
1292             lengths.setElementAt(1, count++);
1293         }
1294 
1295         for (int32_t j = 0; j < count; j++) {
1296             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1297             int32_t ln_j_i = lengths.elementAti(j) + i;
1298             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1299                 bestSnlp.setElementAt(newSnlp, ln_j_i);
1300                 prev.setElementAt(i, ln_j_i);
1301             }
1302         }
1303 
1304         // In Japanese,
1305         // Katakana word in single character is pretty rare. So we apply
1306         // the following heuristic to Katakana: any continuous run of Katakana
1307         // characters is considered a candidate word with a default cost
1308         // specified in the katakanaCost table according to its length.
1309 
1310         bool is_katakana = isKatakana(inString.char32At(ix));
1311         int32_t katakanaRunLength = 1;
1312         if (!is_prev_katakana && is_katakana) {
1313             int32_t j = inString.moveIndex32(ix, 1);
1314             // Find the end of the continuous run of Katakana characters
1315             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1316                     isKatakana(inString.char32At(j))) {
1317                 j = inString.moveIndex32(j, 1);
1318                 katakanaRunLength++;
1319             }
1320             if (katakanaRunLength < kMaxKatakanaGroupLength) {
1321                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1322                 if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1323                     bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1324                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1325                 }
1326             }
1327         }
1328         is_prev_katakana = is_katakana;
1329     }
1330     utext_close(&fu);
1331 
1332     // Start pushing the optimal offset index into t_boundary (t for tentative).
1333     // prev[numCodePts] is guaranteed to be meaningful.
1334     // We'll first push in the reverse order, i.e.,
1335     // t_boundary[0] = numCodePts, and afterwards do a swap.
1336     UVector32 t_boundary(numCodePts+1, status);
1337 
1338     int32_t numBreaks = 0;
1339     // No segmentation found, set boundary to end of range
1340     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1341         t_boundary.addElement(numCodePts, status);
1342         numBreaks++;
1343     } else {
1344         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1345             t_boundary.addElement(i, status);
1346             numBreaks++;
1347         }
1348         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1349     }
1350 
1351     // Add a break for the start of the dictionary range if there is not one
1352     // there already.
1353     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1354         t_boundary.addElement(0, status);
1355         numBreaks++;
1356     }
1357 
1358     // Now that we're done, convert positions in t_boundary[] (indices in
1359     // the normalized input string) back to indices in the original input UText
1360     // while reversing t_boundary and pushing values to foundBreaks.
1361     int32_t prevCPPos = -1;
1362     int32_t prevUTextPos = -1;
1363     for (int32_t i = numBreaks-1; i >= 0; i--) {
1364         int32_t cpPos = t_boundary.elementAti(i);
1365         U_ASSERT(cpPos > prevCPPos);
1366         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1367         U_ASSERT(utextPos >= prevUTextPos);
1368         if (utextPos > prevUTextPos) {
1369             // Boundaries are added to foundBreaks output in ascending order.
1370             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1371             foundBreaks.push(utextPos, status);
1372         } else {
1373             // Normalization expanded the input text, the dictionary found a boundary
1374             // within the expansion, giving two boundaries with the same index in the
1375             // original text. Ignore the second. See ticket #12918.
1376             --numBreaks;
1377         }
1378         prevCPPos = cpPos;
1379         prevUTextPos = utextPos;
1380     }
1381     (void)prevCPPos; // suppress compiler warnings about unused variable
1382 
1383     // inString goes out of scope
1384     // inputMap goes out of scope
1385     return numBreaks;
1386 }
1387 #endif
1388 
1389 U_NAMESPACE_END
1390 
1391 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1392 
1393