1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2012-2016, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import static com.ibm.icu.impl.CharacterIteration.DONE32; 12 import static com.ibm.icu.impl.CharacterIteration.current32; 13 import static com.ibm.icu.impl.CharacterIteration.next32; 14 15 import java.io.IOException; 16 import java.text.CharacterIterator; 17 18 import com.ibm.icu.impl.Assert; 19 20 class CjkBreakEngine extends DictionaryBreakEngine { 21 private static final UnicodeSet fHangulWordSet = new UnicodeSet(); 22 private static final UnicodeSet fHanWordSet = new UnicodeSet(); 23 private static final UnicodeSet fKatakanaWordSet = new UnicodeSet(); 24 private static final UnicodeSet fHiraganaWordSet = new UnicodeSet(); 25 static { 26 fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]"); 27 fHanWordSet.applyPattern("[:Han:]"); 28 fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]"); 29 fHiraganaWordSet.applyPattern("[:Hiragana:]"); 30 31 // freeze them all fHangulWordSet.freeze()32 fHangulWordSet.freeze(); fHanWordSet.freeze()33 fHanWordSet.freeze(); fKatakanaWordSet.freeze()34 fKatakanaWordSet.freeze(); fHiraganaWordSet.freeze()35 fHiraganaWordSet.freeze(); 36 } 37 38 private DictionaryMatcher fDictionary = null; 39 CjkBreakEngine(boolean korean)40 public CjkBreakEngine(boolean korean) throws IOException { 41 fDictionary = DictionaryData.loadDictionaryFor("Hira"); 42 if (korean) { 43 setCharacters(fHangulWordSet); 44 } else { //Chinese and Japanese 45 UnicodeSet cjSet = new UnicodeSet(); 46 cjSet.addAll(fHanWordSet); 47 cjSet.addAll(fKatakanaWordSet); 48 cjSet.addAll(fHiraganaWordSet); 49 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK 50 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK 51 setCharacters(cjSet); 52 } 53 } 54 55 @Override equals(Object obj)56 public boolean equals(Object obj) { 57 if (obj instanceof CjkBreakEngine) { 58 CjkBreakEngine other = (CjkBreakEngine)obj; 59 return this.fSet.equals(other.fSet); 60 } 61 return false; 62 } 63 64 @Override hashCode()65 public int hashCode() { 66 return getClass().hashCode(); 67 } 68 69 private static final int kMaxKatakanaLength = 8; 70 private static final int kMaxKatakanaGroupLength = 20; 71 private static final int maxSnlp = 255; 72 private static final int kint32max = Integer.MAX_VALUE; getKatakanaCost(int wordlength)73 private static int getKatakanaCost(int wordlength) { 74 int katakanaCost[] = new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 }; 75 return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength]; 76 } 77 isKatakana(int value)78 private static boolean isKatakana(int value) { 79 return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) || 80 (value >= 0xFF66 && value <= 0xFF9F); 81 } 82 83 @Override divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, DequeI foundBreaks)84 public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, 85 DequeI foundBreaks) { 86 if (startPos >= endPos) { 87 return 0; 88 } 89 90 inText.setIndex(startPos); 91 92 int inputLength = endPos - startPos; 93 int[] charPositions = new int[inputLength + 1]; 94 StringBuffer s = new StringBuffer(""); 95 inText.setIndex(startPos); 96 while (inText.getIndex() < endPos) { 97 s.append(inText.current()); 98 inText.next(); 99 } 100 String prenormstr = s.toString(); 101 boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES || 102 Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0); 103 CharacterIterator text; 104 int numCodePts = 0; 105 if (isNormalized) { 106 text = new java.text.StringCharacterIterator(prenormstr); 107 int index = 0; 108 charPositions[0] = 0; 109 while (index < prenormstr.length()) { 110 int codepoint = prenormstr.codePointAt(index); 111 index += Character.charCount(codepoint); 112 numCodePts++; 113 charPositions[numCodePts] = index; 114 } 115 } else { 116 String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC); 117 text = new java.text.StringCharacterIterator(normStr); 118 charPositions = new int[normStr.length() + 1]; 119 Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0); 120 int index = 0; 121 charPositions[0] = 0; 122 while (index < normalizer.endIndex()) { 123 normalizer.next(); 124 numCodePts++; 125 index = normalizer.getIndex(); 126 charPositions[numCodePts] = index; 127 } 128 } 129 130 // From here on out, do the algorithm. Note that our indices 131 // refer to indices within the normalized string. 132 int[] bestSnlp = new int[numCodePts + 1]; 133 bestSnlp[0] = 0; 134 for (int i = 1; i <= numCodePts; i++) { 135 bestSnlp[i] = kint32max; 136 } 137 138 int[] prev = new int[numCodePts + 1]; 139 for (int i = 0; i <= numCodePts; i++) { 140 prev[i] = -1; 141 } 142 143 final int maxWordSize = 20; 144 int values[] = new int[numCodePts]; 145 int lengths[] = new int[numCodePts]; 146 // dynamic programming to find the best segmentation 147 148 // In outer loop, i is the code point index, 149 // ix is the corresponding code unit index. 150 // They differ when the string contains supplementary characters. 151 int ix = 0; 152 text.setIndex(ix); 153 boolean is_prev_katakana = false; 154 for (int i = 0; i < numCodePts; i++, text.setIndex(ix), next32(text)) { 155 ix = text.getIndex(); 156 if (bestSnlp[i] == kint32max) { 157 continue; 158 } 159 160 int maxSearchLength = (i + maxWordSize < numCodePts) ? maxWordSize : (numCodePts - i); 161 int[] count_ = new int[1]; 162 fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values); 163 int count = count_[0]; 164 165 // if there are no single character matches found in the dictionary 166 // starting with this character, treat character as a 1-character word 167 // with the highest value possible (i.e. the least likely to occur). 168 // Exclude Korean characters from this treatment, as they should be 169 // left together by default. 170 text.setIndex(ix); // fDictionary.matches() advances the text position; undo that. 171 if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) { 172 values[count] = maxSnlp; 173 lengths[count] = 1; 174 count++; 175 } 176 177 for (int j = 0; j < count; j++) { 178 int newSnlp = bestSnlp[i] + values[j]; 179 if (newSnlp < bestSnlp[lengths[j] + i]) { 180 bestSnlp[lengths[j] + i] = newSnlp; 181 prev[lengths[j] + i] = i; 182 } 183 } 184 185 // In Japanese, single-character Katakana words are pretty rare. 186 // So we apply the following heuristic to Katakana: any continuous 187 // run of Katakana characters is considered a candidate word with 188 // a default cost specified in the katakanaCost table according 189 // to its length. 190 boolean is_katakana = isKatakana(current32(text)); 191 if (!is_prev_katakana && is_katakana) { 192 int j = i + 1; 193 next32(text); 194 while (j < numCodePts && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) { 195 next32(text); 196 ++j; 197 } 198 199 if ((j - i) < kMaxKatakanaGroupLength) { 200 int newSnlp = bestSnlp[i] + getKatakanaCost(j - i); 201 if (newSnlp < bestSnlp[j]) { 202 bestSnlp[j] = newSnlp; 203 prev[j] = i; 204 } 205 } 206 } 207 is_prev_katakana = is_katakana; 208 } 209 210 int t_boundary[] = new int[numCodePts + 1]; 211 int numBreaks = 0; 212 if (bestSnlp[numCodePts] == kint32max) { 213 t_boundary[numBreaks] = numCodePts; 214 numBreaks++; 215 } else { 216 for (int i = numCodePts; i > 0; i = prev[i]) { 217 t_boundary[numBreaks] = i; 218 numBreaks++; 219 } 220 Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0); 221 } 222 223 if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) { 224 t_boundary[numBreaks++] = 0; 225 } 226 227 int correctedNumBreaks = 0; 228 for (int i = numBreaks - 1; i >= 0; i--) { 229 int pos = charPositions[t_boundary[i]] + startPos; 230 if (!(foundBreaks.contains(pos) || pos == startPos)) { 231 foundBreaks.push(charPositions[t_boundary[i]] + startPos); 232 correctedNumBreaks++; 233 } 234 } 235 236 if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) { 237 foundBreaks.pop(); 238 correctedNumBreaks--; 239 } 240 if (!foundBreaks.isEmpty()) 241 inText.setIndex(foundBreaks.peek()); 242 return correctedNumBreaks; 243 } 244 } 245