1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2014, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.io.IOException; 12 import java.text.CharacterIterator; 13 14 import com.ibm.icu.lang.UCharacter; 15 import com.ibm.icu.lang.UProperty; 16 import com.ibm.icu.lang.UScript; 17 18 class ThaiBreakEngine extends DictionaryBreakEngine { 19 20 // Constants for ThaiBreakIterator 21 // How many words in a row are "good enough"? 22 private static final byte THAI_LOOKAHEAD = 3; 23 // Will not combine a non-word with a preceding dictionary word longer than this 24 private static final byte THAI_ROOT_COMBINE_THRESHOLD = 3; 25 // Will not combine a non-word that shares at least this much prefix with a 26 // dictionary word with a preceding word 27 private static final byte THAI_PREFIX_COMBINE_THRESHOLD = 3; 28 // Ellision character 29 private static final char THAI_PAIYANNOI = 0x0E2F; 30 // Repeat character 31 private static final char THAI_MAIYAMOK = 0x0E46; 32 // Minimum word size 33 private static final byte THAI_MIN_WORD = 2; 34 // Minimum number of characters for two words 35 private static final byte THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2; 36 37 private DictionaryMatcher fDictionary; 38 private static UnicodeSet fThaiWordSet; 39 private static UnicodeSet fEndWordSet; 40 private static UnicodeSet fBeginWordSet; 41 private static UnicodeSet fSuffixSet; 42 private static UnicodeSet fMarkSet; 43 44 static { 45 // Initialize UnicodeSets 46 fThaiWordSet = new UnicodeSet(); 47 fMarkSet = new UnicodeSet(); 48 fBeginWordSet = new UnicodeSet(); 49 fSuffixSet = new UnicodeSet(); 50 51 fThaiWordSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]]"); fThaiWordSet.compact()52 fThaiWordSet.compact(); 53 54 fMarkSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"); 55 fMarkSet.add(0x0020); 56 fEndWordSet = new UnicodeSet(fThaiWordSet); 57 fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 58 fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 59 fBeginWordSet.add(0x0E01, 0x0E2E); //KO KAI through HO NOKHUK 60 fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 61 fSuffixSet.add(THAI_PAIYANNOI); 62 fSuffixSet.add(THAI_MAIYAMOK); 63 64 // Compact for caching fMarkSet.compact()65 fMarkSet.compact(); fEndWordSet.compact()66 fEndWordSet.compact(); fBeginWordSet.compact()67 fBeginWordSet.compact(); fSuffixSet.compact()68 fSuffixSet.compact(); 69 70 // Freeze the static UnicodeSet fThaiWordSet.freeze()71 fThaiWordSet.freeze(); fMarkSet.freeze()72 fMarkSet.freeze(); fEndWordSet.freeze()73 fEndWordSet.freeze(); fBeginWordSet.freeze()74 fBeginWordSet.freeze(); fSuffixSet.freeze()75 fSuffixSet.freeze(); 76 } 77 ThaiBreakEngine()78 public ThaiBreakEngine() throws IOException { 79 setCharacters(fThaiWordSet); 80 // Initialize dictionary 81 fDictionary = DictionaryData.loadDictionaryFor("Thai"); 82 } 83 84 @Override equals(Object obj)85 public boolean equals(Object obj) { 86 // Normally is a singleton, but it's possible to have duplicates 87 // during initialization. All are equivalent. 88 return obj instanceof ThaiBreakEngine; 89 } 90 91 @Override hashCode()92 public int hashCode() { 93 return getClass().hashCode(); 94 } 95 96 @Override handles(int c)97 public boolean handles(int c) { 98 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 99 return (script == UScript.THAI); 100 } 101 102 @Override divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd, DequeI foundBreaks)103 public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd, 104 DequeI foundBreaks) { 105 106 if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) { 107 return 0; // Not enough characters for word 108 } 109 int wordsFound = 0; 110 int wordLength; 111 PossibleWord words[] = new PossibleWord[THAI_LOOKAHEAD]; 112 for (int i = 0; i < THAI_LOOKAHEAD; i++) { 113 words[i] = new PossibleWord(); 114 } 115 116 int uc; 117 fIter.setIndex(rangeStart); 118 int current; 119 while ((current = fIter.getIndex()) < rangeEnd) { 120 wordLength = 0; 121 122 //Look for candidate words at the current position 123 int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd); 124 125 // If we found exactly one, use that 126 if (candidates == 1) { 127 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter); 128 wordsFound += 1; 129 } 130 131 // If there was more than one, see which one can take us forward the most words 132 else if (candidates > 1) { 133 // If we're already at the end of the range, we're done 134 if (fIter.getIndex() < rangeEnd) { 135 foundBest: 136 do { 137 int wordsMatched = 1; 138 if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) { 139 if (wordsMatched < 2) { 140 // Followed by another dictionary word; mark first word as a good candidate 141 words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 142 wordsMatched = 2; 143 } 144 145 // If we're already at the end of the range, we're done 146 if (fIter.getIndex() >= rangeEnd) { 147 break foundBest; 148 } 149 150 // See if any of the possible second words is followed by a third word 151 do { 152 // If we find a third word, stop right away 153 if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) { 154 words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 155 break foundBest; 156 } 157 } while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(fIter)); 158 } 159 } 160 while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter)); 161 // foundBest: end of loop 162 } 163 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter); 164 wordsFound += 1; 165 } 166 167 // We come here after having either found a word or not. We look ahead to the 168 // next word. If it's not a dictionary word, we will combine it with the word we 169 // just found (if there is one), but only if the preceding word does not exceed 170 // the threshold. 171 // The text iterator should now be positioned at the end of the word we found. 172 if (fIter.getIndex() < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) { 173 // If it is a dictionary word, do nothing. If it isn't, then if there is 174 // no preceding word, or the non-word shares less than the minimum threshold 175 // of characters with a dictionary word, then scan to resynchronize 176 if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 && 177 (wordLength == 0 || 178 words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { 179 // Look for a plausible word boundary 180 int remaining = rangeEnd - (current + wordLength); 181 int pc = fIter.current(); 182 int chars = 0; 183 for (;;) { 184 fIter.next(); 185 uc = fIter.current(); 186 chars += 1; 187 if (--remaining <= 0) { 188 break; 189 } 190 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 191 // Maybe. See if it's in the dictionary. 192 // Note: In the original Apple code, checked that the next 193 // two characters after uc were not 0x0E4C THANTHAKHAT before 194 // checking the dictionary. That is just a performance filter, 195 // but it's not clear it's faster than checking the trie 196 int candidate = words[(wordsFound + 1) %THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd); 197 fIter.setIndex(current + wordLength + chars); 198 if (candidate > 0) { 199 break; 200 } 201 } 202 pc = uc; 203 } 204 205 // Bump the word count if there wasn't already one 206 if (wordLength <= 0) { 207 wordsFound += 1; 208 } 209 210 // Update the length with the passed-over characters 211 wordLength += chars; 212 } else { 213 // Backup to where we were for next iteration 214 fIter.setIndex(current+wordLength); 215 } 216 } 217 218 // Never stop before a combining mark. 219 int currPos; 220 while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) { 221 fIter.next(); 222 wordLength += fIter.getIndex() - currPos; 223 } 224 225 // Look ahead for possible suffixes if a dictionary word does not follow. 226 // We do this in code rather than using a rule so that the heuristic 227 // resynch continues to function. For example, one of the suffix characters 228 // could be a typo in the middle of a word. 229 if (fIter.getIndex() < rangeEnd && wordLength > 0) { 230 if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 && 231 fSuffixSet.contains(uc = fIter.current())) { 232 if (uc == THAI_PAIYANNOI) { 233 if (!fSuffixSet.contains(fIter.previous())) { 234 // Skip over previous end and PAIYANNOI 235 fIter.next(); 236 fIter.next(); 237 wordLength += 1; 238 uc = fIter.current(); 239 } else { 240 // Restore prior position 241 fIter.next(); 242 } 243 } 244 if (uc == THAI_MAIYAMOK) { 245 if (fIter.previous() != THAI_MAIYAMOK) { 246 // Skip over previous end and MAIYAMOK 247 fIter.next(); 248 fIter.next(); 249 wordLength += 1; 250 } else { 251 // restore prior position 252 fIter.next(); 253 } 254 } 255 } else { 256 fIter.setIndex(current + wordLength); 257 } 258 } 259 260 // Did we find a word on this iteration? If so, push it on the break stack 261 if (wordLength > 0) { 262 foundBreaks.push(Integer.valueOf(current + wordLength)); 263 } 264 } 265 266 // Don't return a break for the end of the dictionary range if there is one there 267 if (foundBreaks.peek() >= rangeEnd) { 268 foundBreaks.pop(); 269 wordsFound -= 1; 270 } 271 272 return wordsFound; 273 } 274 275 } 276