1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2014, International Business Machines Corporation and         *
6  * others. All Rights Reserved.                                                *
7  *******************************************************************************
8  */
9 package com.ibm.icu.text;
10 
11 import java.io.IOException;
12 import java.text.CharacterIterator;
13 
14 import com.ibm.icu.lang.UCharacter;
15 import com.ibm.icu.lang.UProperty;
16 import com.ibm.icu.lang.UScript;
17 
18 class ThaiBreakEngine extends DictionaryBreakEngine {
19 
20     // Constants for ThaiBreakIterator
21     // How many words in a row are "good enough"?
22     private static final byte THAI_LOOKAHEAD = 3;
23     // Will not combine a non-word with a preceding dictionary word longer than this
24     private static final byte THAI_ROOT_COMBINE_THRESHOLD = 3;
25     // Will not combine a non-word that shares at least this much prefix with a
26     // dictionary word with a preceding word
27     private static final byte THAI_PREFIX_COMBINE_THRESHOLD = 3;
28     // Ellision character
29     private static final char THAI_PAIYANNOI = 0x0E2F;
30     // Repeat character
31     private static final char THAI_MAIYAMOK = 0x0E46;
32     // Minimum word size
33     private static final byte THAI_MIN_WORD = 2;
34     // Minimum number of characters for two words
35     private static final byte THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
36 
37     private DictionaryMatcher fDictionary;
38     private static UnicodeSet fThaiWordSet;
39     private static UnicodeSet fEndWordSet;
40     private static UnicodeSet fBeginWordSet;
41     private static UnicodeSet fSuffixSet;
42     private static UnicodeSet fMarkSet;
43 
44     static {
45         // Initialize UnicodeSets
46         fThaiWordSet = new UnicodeSet();
47         fMarkSet = new UnicodeSet();
48         fBeginWordSet = new UnicodeSet();
49         fSuffixSet = new UnicodeSet();
50 
51         fThaiWordSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]]");
fThaiWordSet.compact()52         fThaiWordSet.compact();
53 
54         fMarkSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
55         fMarkSet.add(0x0020);
56         fEndWordSet = new UnicodeSet(fThaiWordSet);
57         fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
58         fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
59         fBeginWordSet.add(0x0E01, 0x0E2E); //KO KAI through HO NOKHUK
60         fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
61         fSuffixSet.add(THAI_PAIYANNOI);
62         fSuffixSet.add(THAI_MAIYAMOK);
63 
64         // Compact for caching
fMarkSet.compact()65         fMarkSet.compact();
fEndWordSet.compact()66         fEndWordSet.compact();
fBeginWordSet.compact()67         fBeginWordSet.compact();
fSuffixSet.compact()68         fSuffixSet.compact();
69 
70         // Freeze the static UnicodeSet
fThaiWordSet.freeze()71         fThaiWordSet.freeze();
fMarkSet.freeze()72         fMarkSet.freeze();
fEndWordSet.freeze()73         fEndWordSet.freeze();
fBeginWordSet.freeze()74         fBeginWordSet.freeze();
fSuffixSet.freeze()75         fSuffixSet.freeze();
76     }
77 
ThaiBreakEngine()78     public ThaiBreakEngine() throws IOException {
79         setCharacters(fThaiWordSet);
80         // Initialize dictionary
81         fDictionary = DictionaryData.loadDictionaryFor("Thai");
82     }
83 
84     @Override
equals(Object obj)85     public boolean equals(Object obj) {
86         // Normally is a singleton, but it's possible to have duplicates
87         //   during initialization. All are equivalent.
88         return obj instanceof ThaiBreakEngine;
89     }
90 
91     @Override
hashCode()92     public int hashCode() {
93         return getClass().hashCode();
94     }
95 
96     @Override
handles(int c)97     public boolean handles(int c) {
98         int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
99         return (script == UScript.THAI);
100     }
101 
102     @Override
divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd, DequeI foundBreaks)103     public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
104             DequeI foundBreaks) {
105 
106         if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
107             return 0;  // Not enough characters for word
108         }
109         int wordsFound = 0;
110         int wordLength;
111         PossibleWord words[] = new PossibleWord[THAI_LOOKAHEAD];
112         for (int i = 0; i < THAI_LOOKAHEAD; i++) {
113             words[i] = new PossibleWord();
114         }
115 
116         int uc;
117         fIter.setIndex(rangeStart);
118         int current;
119         while ((current = fIter.getIndex()) < rangeEnd) {
120             wordLength = 0;
121 
122             //Look for candidate words at the current position
123             int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
124 
125             // If we found exactly one, use that
126             if (candidates == 1) {
127                 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
128                 wordsFound += 1;
129             }
130 
131             // If there was more than one, see which one can take us forward the most words
132             else if (candidates > 1) {
133                 // If we're already at the end of the range, we're done
134                 if (fIter.getIndex() < rangeEnd) {
135                   foundBest:
136                     do {
137                         int wordsMatched = 1;
138                         if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
139                             if (wordsMatched < 2) {
140                                 // Followed by another dictionary word; mark first word as a good candidate
141                                 words[wordsFound%THAI_LOOKAHEAD].markCurrent();
142                                 wordsMatched = 2;
143                             }
144 
145                             // If we're already at the end of the range, we're done
146                             if (fIter.getIndex() >= rangeEnd) {
147                                 break foundBest;
148                             }
149 
150                             // See if any of the possible second words is followed by a third word
151                             do {
152                                 // If we find a third word, stop right away
153                                 if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
154                                     words[wordsFound%THAI_LOOKAHEAD].markCurrent();
155                                     break foundBest;
156                                 }
157                             } while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(fIter));
158                         }
159                     }
160                     while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter));
161                     // foundBest: end of loop
162                 }
163                 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
164                 wordsFound += 1;
165             }
166 
167             // We come here after having either found a word or not. We look ahead to the
168             // next word. If it's not a dictionary word, we will combine it with the word we
169             // just found (if there is one), but only if the preceding word does not exceed
170             // the threshold.
171             // The text iterator should now be positioned at the end of the word we found.
172             if (fIter.getIndex() < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
173                 // If it is a dictionary word, do nothing. If it isn't, then if there is
174                 // no preceding word, or the non-word shares less than the minimum threshold
175                 // of characters with a dictionary word, then scan to resynchronize
176                 if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
177                         (wordLength == 0 ||
178                                 words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
179                     // Look for a plausible word boundary
180                     int remaining = rangeEnd - (current + wordLength);
181                     int pc = fIter.current();
182                     int chars = 0;
183                     for (;;) {
184                         fIter.next();
185                         uc = fIter.current();
186                         chars += 1;
187                         if (--remaining <= 0) {
188                             break;
189                         }
190                         if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
191                             // Maybe. See if it's in the dictionary.
192                             // Note: In the original Apple code, checked that the next
193                             // two characters after uc were not 0x0E4C THANTHAKHAT before
194                             // checking the dictionary. That is just a performance filter,
195                             // but it's not clear it's faster than checking the trie
196                             int candidate = words[(wordsFound + 1) %THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
197                             fIter.setIndex(current + wordLength + chars);
198                             if (candidate > 0) {
199                                 break;
200                             }
201                         }
202                         pc = uc;
203                     }
204 
205                     // Bump the word count if there wasn't already one
206                     if (wordLength <= 0) {
207                         wordsFound += 1;
208                     }
209 
210                     // Update the length with the passed-over characters
211                     wordLength += chars;
212                 } else {
213                     // Backup to where we were for next iteration
214                     fIter.setIndex(current+wordLength);
215                 }
216             }
217 
218             // Never stop before a combining mark.
219             int currPos;
220             while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) {
221                 fIter.next();
222                 wordLength += fIter.getIndex() - currPos;
223             }
224 
225             // Look ahead for possible suffixes if a dictionary word does not follow.
226             // We do this in code rather than using a rule so that the heuristic
227             // resynch continues to function. For example, one of the suffix characters
228             // could be a typo in the middle of a word.
229             if (fIter.getIndex() < rangeEnd && wordLength > 0) {
230                 if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
231                         fSuffixSet.contains(uc = fIter.current())) {
232                     if (uc == THAI_PAIYANNOI) {
233                         if (!fSuffixSet.contains(fIter.previous())) {
234                             // Skip over previous end and PAIYANNOI
235                             fIter.next();
236                             fIter.next();
237                             wordLength += 1;
238                             uc = fIter.current();
239                         } else {
240                             // Restore prior position
241                             fIter.next();
242                         }
243                     }
244                     if (uc == THAI_MAIYAMOK) {
245                         if (fIter.previous() != THAI_MAIYAMOK) {
246                             // Skip over previous end and MAIYAMOK
247                             fIter.next();
248                             fIter.next();
249                             wordLength += 1;
250                         } else {
251                             // restore prior position
252                             fIter.next();
253                         }
254                     }
255                 } else {
256                     fIter.setIndex(current + wordLength);
257                 }
258             }
259 
260             // Did we find a word on this iteration? If so, push it on the break stack
261             if (wordLength > 0) {
262                 foundBreaks.push(Integer.valueOf(current + wordLength));
263             }
264         }
265 
266         // Don't return a break for the end of the dictionary range if there is one there
267         if (foundBreaks.peek() >= rangeEnd) {
268             foundBreaks.pop();
269             wordsFound -= 1;
270         }
271 
272         return wordsFound;
273     }
274 
275 }
276