1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2012-2016, International Business Machines Corporation and         *
6  * others. All Rights Reserved.                                                *
7  *******************************************************************************
8  */
9 package com.ibm.icu.text;
10 
11 import static com.ibm.icu.impl.CharacterIteration.DONE32;
12 import static com.ibm.icu.impl.CharacterIteration.current32;
13 import static com.ibm.icu.impl.CharacterIteration.next32;
14 
15 import java.io.IOException;
16 import java.text.CharacterIterator;
17 
18 import com.ibm.icu.impl.Assert;
19 
20 class CjkBreakEngine extends DictionaryBreakEngine {
21     private static final UnicodeSet fHangulWordSet = new UnicodeSet();
22     private static final UnicodeSet fHanWordSet = new UnicodeSet();
23     private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
24     private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
25     static {
26         fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
27         fHanWordSet.applyPattern("[:Han:]");
28         fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
29         fHiraganaWordSet.applyPattern("[:Hiragana:]");
30 
31         // freeze them all
fHangulWordSet.freeze()32         fHangulWordSet.freeze();
fHanWordSet.freeze()33         fHanWordSet.freeze();
fKatakanaWordSet.freeze()34         fKatakanaWordSet.freeze();
fHiraganaWordSet.freeze()35         fHiraganaWordSet.freeze();
36     }
37 
38     private DictionaryMatcher fDictionary = null;
39 
CjkBreakEngine(boolean korean)40     public CjkBreakEngine(boolean korean) throws IOException {
41         fDictionary = DictionaryData.loadDictionaryFor("Hira");
42         if (korean) {
43             setCharacters(fHangulWordSet);
44         } else { //Chinese and Japanese
45             UnicodeSet cjSet = new UnicodeSet();
46             cjSet.addAll(fHanWordSet);
47             cjSet.addAll(fKatakanaWordSet);
48             cjSet.addAll(fHiraganaWordSet);
49             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
50             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
51             setCharacters(cjSet);
52         }
53     }
54 
55     @Override
equals(Object obj)56     public boolean equals(Object obj) {
57         if (obj instanceof CjkBreakEngine) {
58             CjkBreakEngine other = (CjkBreakEngine)obj;
59             return this.fSet.equals(other.fSet);
60         }
61         return false;
62     }
63 
64     @Override
hashCode()65     public int hashCode() {
66         return getClass().hashCode();
67     }
68 
69     private static final int kMaxKatakanaLength = 8;
70     private static final int kMaxKatakanaGroupLength = 20;
71     private static final int maxSnlp = 255;
72     private static final int kint32max = Integer.MAX_VALUE;
getKatakanaCost(int wordlength)73     private static int getKatakanaCost(int wordlength) {
74         int katakanaCost[] =  new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 };
75         return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength];
76     }
77 
isKatakana(int value)78     private static boolean isKatakana(int value) {
79         return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
80                 (value >= 0xFF66 && value <= 0xFF9F);
81     }
82 
83     @Override
divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, DequeI foundBreaks)84     public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos,
85             DequeI foundBreaks) {
86         if (startPos >= endPos) {
87             return 0;
88         }
89 
90         inText.setIndex(startPos);
91 
92         int inputLength = endPos - startPos;
93         int[] charPositions = new int[inputLength + 1];
94         StringBuffer s = new StringBuffer("");
95         inText.setIndex(startPos);
96         while (inText.getIndex() < endPos) {
97             s.append(inText.current());
98             inText.next();
99         }
100         String prenormstr = s.toString();
101         boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
102                                Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
103         CharacterIterator text;
104         int numCodePts = 0;
105         if (isNormalized) {
106             text = new java.text.StringCharacterIterator(prenormstr);
107             int index = 0;
108             charPositions[0] = 0;
109             while (index < prenormstr.length()) {
110                 int codepoint = prenormstr.codePointAt(index);
111                 index += Character.charCount(codepoint);
112                 numCodePts++;
113                 charPositions[numCodePts] = index;
114             }
115         } else {
116             String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
117             text = new java.text.StringCharacterIterator(normStr);
118             charPositions = new int[normStr.length() + 1];
119             Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0);
120             int index = 0;
121             charPositions[0] = 0;
122             while (index < normalizer.endIndex()) {
123                 normalizer.next();
124                 numCodePts++;
125                 index = normalizer.getIndex();
126                 charPositions[numCodePts] = index;
127             }
128         }
129 
130         // From here on out, do the algorithm. Note that our indices
131         // refer to indices within the normalized string.
132         int[] bestSnlp = new int[numCodePts + 1];
133         bestSnlp[0] = 0;
134         for (int i = 1; i <= numCodePts; i++) {
135             bestSnlp[i] = kint32max;
136         }
137 
138         int[] prev = new int[numCodePts + 1];
139         for (int i = 0; i <= numCodePts; i++) {
140             prev[i] = -1;
141         }
142 
143         final int maxWordSize = 20;
144         int values[] = new int[numCodePts];
145         int lengths[] = new int[numCodePts];
146         // dynamic programming to find the best segmentation
147 
148         // In outer loop, i  is the code point index,
149         //                ix is the corresponding code unit index.
150         //    They differ when the string contains supplementary characters.
151         int ix = 0;
152         text.setIndex(ix);
153         boolean is_prev_katakana = false;
154         for (int i = 0; i < numCodePts; i++, text.setIndex(ix), next32(text)) {
155             ix = text.getIndex();
156             if (bestSnlp[i] == kint32max) {
157                 continue;
158             }
159 
160             int maxSearchLength = (i + maxWordSize < numCodePts) ? maxWordSize : (numCodePts - i);
161             int[] count_ = new int[1];
162             fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
163             int count = count_[0];
164 
165             // if there are no single character matches found in the dictionary
166             // starting with this character, treat character as a 1-character word
167             // with the highest value possible (i.e. the least likely to occur).
168             // Exclude Korean characters from this treatment, as they should be
169             // left together by default.
170             text.setIndex(ix);  // fDictionary.matches() advances the text position; undo that.
171             if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
172                 values[count] = maxSnlp;
173                 lengths[count] = 1;
174                 count++;
175             }
176 
177             for (int j = 0; j < count; j++) {
178                 int newSnlp = bestSnlp[i] + values[j];
179                 if (newSnlp < bestSnlp[lengths[j] + i]) {
180                     bestSnlp[lengths[j] + i] = newSnlp;
181                     prev[lengths[j] + i] = i;
182                 }
183             }
184 
185             // In Japanese, single-character Katakana words are pretty rare.
186             // So we apply the following heuristic to Katakana: any continuous
187             // run of Katakana characters is considered a candidate word with
188             // a default cost specified in the katakanaCost table according
189             // to its length.
190             boolean is_katakana = isKatakana(current32(text));
191             if (!is_prev_katakana && is_katakana) {
192                 int j = i + 1;
193                 next32(text);
194                 while (j < numCodePts && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
195                     next32(text);
196                     ++j;
197                 }
198 
199                 if ((j - i) < kMaxKatakanaGroupLength) {
200                     int newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
201                     if (newSnlp < bestSnlp[j]) {
202                         bestSnlp[j] = newSnlp;
203                         prev[j] = i;
204                     }
205                 }
206             }
207             is_prev_katakana = is_katakana;
208         }
209 
210         int t_boundary[] = new int[numCodePts + 1];
211         int numBreaks = 0;
212         if (bestSnlp[numCodePts] == kint32max) {
213             t_boundary[numBreaks] = numCodePts;
214             numBreaks++;
215         } else {
216             for (int i = numCodePts; i > 0; i = prev[i]) {
217                 t_boundary[numBreaks] = i;
218                 numBreaks++;
219             }
220             Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0);
221         }
222 
223         if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) {
224             t_boundary[numBreaks++] = 0;
225         }
226 
227         int correctedNumBreaks = 0;
228         for (int i = numBreaks - 1; i >= 0; i--) {
229             int pos = charPositions[t_boundary[i]] + startPos;
230             if (!(foundBreaks.contains(pos) || pos == startPos)) {
231                 foundBreaks.push(charPositions[t_boundary[i]] + startPos);
232                 correctedNumBreaks++;
233             }
234         }
235 
236         if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {
237             foundBreaks.pop();
238             correctedNumBreaks--;
239         }
240         if (!foundBreaks.isEmpty())
241             inText.setIndex(foundBreaks.peek());
242         return correctedNumBreaks;
243     }
244 }
245