1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2014, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * FCDIterCollationIterator.java, ported from uitercollationiterator.h/.cpp
9 *
10 * C++ version created on: 2012sep23 (from utf16collationiterator.h)
11 * created by: Markus W. Scherer
12 */
13 
14 package com.ibm.icu.impl.coll;
15 
16 import com.ibm.icu.impl.Normalizer2Impl;
17 import com.ibm.icu.text.UCharacterIterator;
18 
19 /**
20  * Incrementally checks the input text for FCD and normalizes where necessary.
21  */
22 public final class FCDIterCollationIterator extends IterCollationIterator {
FCDIterCollationIterator(CollationData data, boolean numeric, UCharacterIterator ui, int startIndex)23     public FCDIterCollationIterator(CollationData data, boolean numeric,
24             UCharacterIterator ui, int startIndex) {
25         super(data, numeric, ui);
26         state = State.ITER_CHECK_FWD;
27         start = startIndex;
28         nfcImpl = data.nfcImpl;
29     }
30 
31     @Override
resetToOffset(int newOffset)32     public void resetToOffset(int newOffset) {
33         super.resetToOffset(newOffset);
34         start = newOffset;
35         state = State.ITER_CHECK_FWD;
36     }
37 
38     @Override
getOffset()39     public int getOffset() {
40         if(state.compareTo(State.ITER_CHECK_BWD) <= 0) {
41             return iter.getIndex();
42         } else if(state == State.ITER_IN_FCD_SEGMENT) {
43             return pos;
44         } else if(pos == 0) {
45             return start;
46         } else {
47             return limit;
48         }
49     }
50 
    /**
     * Returns the next code point and moves past it.
     *
     * <p>Loops until the current state can deliver a code point:
     * while checking forward it reads from the text iterator and, when the FCD quick-check
     * flags the code unit, hands over to {@link #nextSegment()}; inside an FCD segment it reads
     * from the text iterator while tracking {@code pos}; inside a normalized segment it reads
     * from the {@code normalized} buffer; in any other situation it calls
     * {@link #switchToForward()} and retries.
     *
     * @return the next code point; the negative value from the text iterator when it is
     *         exhausted; or {@code Collation.SENTINEL_CP} if {@code nextSegment()} reports failure
     */
    @Override
    public int nextCodePoint() {
        int c;
        for(;;) {
            if(state == State.ITER_CHECK_FWD) {
                // Read one code unit from the text and quick-check it.
                c = iter.next();
                if(c < 0) {
                    return c;
                }
                if(CollationFCD.hasTccc(c)) {
                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
                            CollationFCD.hasLccc(iter.current())) {
                        // The quick check cannot clear this spot:
                        // un-read the unit and let nextSegment() examine the text from here.
                        iter.previous();
                        if(!nextSegment()) {
                            return Collation.SENTINEL_CP;
                        }
                        // The state has changed; deliver the code point from the new state.
                        continue;
                    }
                }
                if(isLeadSurrogate(c)) {
                    // Try to complete a surrogate pair.
                    int trail = iter.next();
                    if(isTrailSurrogate(trail)) {
                        return Character.toCodePoint((char)c, (char)trail);
                    } else if(trail >= 0) {
                        // Not a trail surrogate: un-read it and return the lone lead surrogate.
                        iter.previous();
                    }
                }
                return c;
            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
                // Inside an already-checked segment: read from the text, keep pos in sync.
                c = iter.nextCodePoint();
                pos += Character.charCount(c);
                assert(c >= 0);
                return c;
            } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
                    pos != normalized.length()) {
                // Inside a normalized segment: read from the normalized buffer.
                c = normalized.codePointAt(pos);
                pos += Character.charCount(c);
                return c;
            } else {
                // At the end of the current segment, or currently checking backward.
                switchToForward();
            }
        }
    }
94 
    /**
     * Returns the previous code point and moves before it.
     *
     * <p>Mirror image of {@link #nextCodePoint()}: while checking backward it reads from the
     * text iterator and, when the FCD quick-check flags the code unit, hands over to
     * {@link #previousSegment()}; inside an FCD segment it reads from the text iterator while
     * tracking {@code pos}; inside a normalized segment it reads from the {@code normalized}
     * buffer; in any other situation it calls {@link #switchToBackward()} and retries.
     *
     * @return the previous code point, or {@code Collation.SENTINEL_CP} when the text iterator
     *         has nothing before the current position or {@code previousSegment()} reports failure
     */
    @Override
    public int previousCodePoint() {
        int c;
        for(;;) {
            if(state == State.ITER_CHECK_BWD) {
                c = iter.previous();
                if(c < 0) {
                    // Nothing before the current position: record an FCD segment starting at 0.
                    start = pos = 0;
                    state = State.ITER_IN_FCD_SEGMENT;
                    return Collation.SENTINEL_CP;
                }
                if(CollationFCD.hasLccc(c)) {
                    // prev stays negative unless the code unit before c was actually read below.
                    int prev = Collation.SENTINEL_CP;
                    if(CollationFCD.maybeTibetanCompositeVowel(c) ||
                            CollationFCD.hasTccc(prev = iter.previous())) {
                        // The quick check cannot clear this spot.
                        // Step forward over c again, and over prev if one was read,
                        // so that previousSegment() starts after c.
                        iter.next();
                        if(prev >= 0) {
                            iter.next();
                        }
                        if(!previousSegment()) {
                            return Collation.SENTINEL_CP;
                        }
                        // The state has changed; deliver the code point from the new state.
                        continue;
                    }
                    // hasLccc(trail)=true for all trail surrogates
                    if(isTrailSurrogate(c)) {
                        if(prev < 0) {
                            prev = iter.previous();
                        }
                        if(isLeadSurrogate(prev)) {
                            // Complete surrogate pair; the iterator is already before the lead.
                            return Character.toCodePoint((char)prev, (char)c);
                        }
                    }
                    if(prev >= 0) {
                        // prev was read but is not part of the result: step back over it.
                        iter.next();
                    }
                }
                return c;
            } else if(state == State.ITER_IN_FCD_SEGMENT && pos != start) {
                // Inside an already-checked segment: read from the text, keep pos in sync.
                c = iter.previousCodePoint();
                pos -= Character.charCount(c);
                assert(c >= 0);
                return c;
            } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != 0) {
                // Inside a normalized segment: read from the normalized buffer.
                c = normalized.codePointBefore(pos);
                pos -= Character.charCount(c);
                return c;
            } else {
                // At the start of the current segment, or currently checking forward.
                switchToBackward();
            }
        }
    }
147 
148     @Override
handleNextCE32()149     protected long handleNextCE32() {
150         int c;
151         for(;;) {
152             if(state == State.ITER_CHECK_FWD) {
153                 c = iter.next();
154                 if(c < 0) {
155                     return NO_CP_AND_CE32;
156                 }
157                 if(CollationFCD.hasTccc(c)) {
158                     if(CollationFCD.maybeTibetanCompositeVowel(c) ||
159                             CollationFCD.hasLccc(iter.current())) {
160                         iter.previous();
161                         if(!nextSegment()) {
162                             c = Collation.SENTINEL_CP;
163                             return Collation.FALLBACK_CE32;
164                         }
165                         continue;
166                     }
167                 }
168                 break;
169             } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
170                 c = iter.next();
171                 ++pos;
172                 assert(c >= 0);
173                 break;
174             } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
175                     pos != normalized.length()) {
176                 c = normalized.charAt(pos++);
177                 break;
178             } else {
179                 switchToForward();
180             }
181         }
182         return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char)c));
183     }
184 
185     @Override
handleGetTrailSurrogate()186     protected char handleGetTrailSurrogate() {
187         if(state.compareTo(State.ITER_IN_FCD_SEGMENT) <= 0) {
188             int trail = iter.next();
189             if(isTrailSurrogate(trail)) {
190                 if(state == State.ITER_IN_FCD_SEGMENT) { ++pos; }
191             } else if(trail >= 0) {
192                 iter.previous();
193             }
194             return (char)trail;
195         } else {
196             assert(pos < normalized.length());
197             char trail;
198             if(Character.isLowSurrogate(trail = normalized.charAt(pos))) { ++pos; }
199             return trail;
200         }
201     }
202 
203     @Override
204     protected void forwardNumCodePoints(int num) {
205         // Specify the class to avoid a virtual-function indirection.
206         // In Java, we would declare this class final.
207         while(num > 0 && nextCodePoint() >= 0) {
208             --num;
209         }
210     }
211 
212     @Override
backwardNumCodePoints(int num)213     protected void backwardNumCodePoints(int num) {
214         // Specify the class to avoid a virtual-function indirection.
215         // In Java, we would declare this class final.
216         while(num > 0 && previousCodePoint() >= 0) {
217             --num;
218         }
219     }
220 
221     /**
222      * Switches to forward checking if possible.
223      */
switchToForward()224     private void switchToForward() {
225         assert(state == State.ITER_CHECK_BWD ||
226                 (state == State.ITER_IN_FCD_SEGMENT && pos == limit) ||
227                 (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == normalized.length()));
228         if(state == State.ITER_CHECK_BWD) {
229             // Turn around from backward checking.
230             start = pos = iter.getIndex();
231             if(pos == limit) {
232                 state = State.ITER_CHECK_FWD;  // Check forward.
233             } else {  // pos < limit
234                 state = State.ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
235             }
236         } else {
237             // Reached the end of the FCD segment.
238             if(state == State.ITER_IN_FCD_SEGMENT) {
239                 // The input text segment is FCD, extend it forward.
240             } else {
241                 // The input text segment needed to be normalized.
242                 // Switch to checking forward from it.
243                 if(state == State.IN_NORM_ITER_AT_START) {
244                     iter.moveIndex(limit - start);
245                 }
246                 start = limit;
247             }
248             state = State.ITER_CHECK_FWD;
249         }
250     }
251 
    /**
     * Extends the FCD text segment forward or normalizes around pos.
     *
     * <p>Starting at the text iterator's current index, reads code points up to the next FCD
     * boundary. If they pass the check, the state becomes {@code ITER_IN_FCD_SEGMENT} with the
     * text iterator moved back to the segment start. If they fail, reading continues to the
     * next boundary, the collected text is decomposed into {@code normalized}, and the state
     * becomes {@code IN_NORM_ITER_AT_LIMIT} with {@code pos} at 0.
     *
     * @return true if success; this implementation always returns true
     */
    private boolean nextSegment() {
        assert(state == State.ITER_CHECK_FWD);
        // The input text [start..(iter index)[ passes the FCD check.
        pos = iter.getIndex();
        // Collect the characters being checked, in case they need to be normalized.
        if(s == null) {
            s = new StringBuilder();
        } else {
            s.setLength(0);
        }
        // Low byte of the previous character's fcd16 value; 0 before the first character.
        int prevCC = 0;
        for(;;) {
            // Fetch the next character and its fcd16 value.
            int c = iter.nextCodePoint();
            if(c < 0) { break; }
            int fcd16 = nfcImpl.getFCD16(c);
            // High byte of fcd16; compared against the previous character's low byte.
            int leadCC = fcd16 >> 8;
            if(leadCC == 0 && s.length() != 0) {
                // FCD boundary before this character.
                iter.previousCodePoint();
                break;
            }
            s.appendCodePoint(c);
            if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
                // Fails FCD check. Find the next FCD boundary and normalize.
                for(;;) {
                    c = iter.nextCodePoint();
                    if(c < 0) { break; }
                    // fcd16 <= 0xff means the high byte is 0: boundary before this character.
                    if(nfcImpl.getFCD16(c) <= 0xff) {
                        iter.previousCodePoint();
                        break;
                    }
                    s.appendCodePoint(c);
                }
                normalize(s);
                // pos still holds the text index where this segment began.
                start = pos;
                limit = pos + s.length();
                state = State.IN_NORM_ITER_AT_LIMIT;
                // From here on pos is an index into the normalized string.
                pos = 0;
                return true;
            }
            prevCC = fcd16 & 0xff;
            if(prevCC == 0) {
                // FCD boundary after the last character.
                break;
            }
        }
        limit = pos + s.length();
        assert(pos != limit);
        // Rewind the text iterator to the segment start so the segment can be read again.
        iter.moveIndex(-s.length());
        state = State.ITER_IN_FCD_SEGMENT;
        return true;
    }
309 
310     /**
311      * Switches to backward checking.
312      */
switchToBackward()313     private void switchToBackward() {
314         assert(state == State.ITER_CHECK_FWD ||
315                 (state == State.ITER_IN_FCD_SEGMENT && pos == start) ||
316                 (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == 0));
317         if(state == State.ITER_CHECK_FWD) {
318             // Turn around from forward checking.
319             limit = pos = iter.getIndex();
320             if(pos == start) {
321                 state = State.ITER_CHECK_BWD;  // Check backward.
322             } else {  // pos > start
323                 state = State.ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
324             }
325         } else {
326             // Reached the start of the FCD segment.
327             if(state == State.ITER_IN_FCD_SEGMENT) {
328                 // The input text segment is FCD, extend it backward.
329             } else {
330                 // The input text segment needed to be normalized.
331                 // Switch to checking backward from it.
332                 if(state == State.IN_NORM_ITER_AT_LIMIT) {
333                     iter.moveIndex(start - limit);
334                 }
335                 limit = start;
336             }
337             state = State.ITER_CHECK_BWD;
338         }
339     }
340 
    /**
     * Extends the FCD text segment backward or normalizes around pos.
     *
     * <p>Starting at the text iterator's current index, reads code points backward to the
     * previous FCD boundary. If they pass the check, the state becomes
     * {@code ITER_IN_FCD_SEGMENT} with the text iterator moved forward again to the segment
     * limit. If they fail, reading continues backward to a boundary, the collected text is
     * put back into text order and decomposed into {@code normalized}, and the state becomes
     * {@code IN_NORM_ITER_AT_START} with {@code pos} at the end of the normalized string.
     *
     * @return true if success; this implementation always returns true
     */
    private boolean previousSegment() {
        assert(state == State.ITER_CHECK_BWD);
        // The input text [(iter index)..limit[ passes the FCD check.
        pos = iter.getIndex();
        // Collect the characters being checked, in case they need to be normalized.
        // They are appended in reverse text order because the text is read backward.
        if(s == null) {
            s = new StringBuilder();
        } else {
            s.setLength(0);
        }
        // High byte of the following character's fcd16 value; 0 before the first character read.
        int nextCC = 0;
        for(;;) {
            // Fetch the previous character and its fcd16 value.
            int c = iter.previousCodePoint();
            if(c < 0) { break; }
            int fcd16 = nfcImpl.getFCD16(c);
            // Low byte of fcd16; compared against the following character's high byte.
            int trailCC = fcd16 & 0xff;
            if(trailCC == 0 && s.length() != 0) {
                // FCD boundary after this character.
                iter.nextCodePoint();
                break;
            }
            s.appendCodePoint(c);
            if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
                                CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
                // Fails FCD check. Find the previous FCD boundary and normalize.
                // Keep going back while the most recently collected character has a
                // non-zero high byte in its fcd16 value.
                while(fcd16 > 0xff) {
                    c = iter.previousCodePoint();
                    if(c < 0) { break; }
                    fcd16 = nfcImpl.getFCD16(c);
                    if(fcd16 == 0) {
                        // This character is not part of the segment: step forward over it again.
                        iter.nextCodePoint();
                        break;
                    }
                    s.appendCodePoint(c);
                }
                // Restore text order before normalizing.
                s.reverse();
                normalize(s);
                // pos still holds the text index where the backward scan began.
                limit = pos;
                start = pos - s.length();
                state = State.IN_NORM_ITER_AT_START;
                // From here on pos is an index into the normalized string.
                pos = normalized.length();
                return true;
            }
            nextCC = fcd16 >> 8;
            if(nextCC == 0) {
                // FCD boundary before the following character.
                break;
            }
        }
        // Only the length of s is needed here, so its reversed order does not matter.
        start = pos - s.length();
        assert(pos != start);
        // Move the text iterator forward again to the segment limit.
        iter.moveIndex(s.length());
        state = State.ITER_IN_FCD_SEGMENT;
        return true;
    }
401 
normalize(CharSequence s)402     private void normalize(CharSequence s) {
403         if(normalized == null) {
404             normalized = new StringBuilder();
405         }
406         // NFD without argument checking.
407         nfcImpl.decompose(s, normalized);
408     }
409 
    /**
     * Iteration states.
     *
     * <p>The declaration order is significant and must not be changed: the enclosing class
     * compares states with {@code compareTo()}, treating everything up to and including
     * {@code ITER_CHECK_BWD} as "checking incrementally", everything up to and including
     * {@code ITER_IN_FCD_SEGMENT} as "reading from the text iterator", and everything from
     * {@code IN_NORM_ITER_AT_LIMIT} onward as "reading from the normalized string".
     */
    private enum State {
        /**
         * The input text [start..(iter index)[ passes the FCD check.
         * Moving forward checks incrementally.
         * pos & limit are undefined.
         */
        ITER_CHECK_FWD,
        /**
         * The input text [(iter index)..limit[ passes the FCD check.
         * Moving backward checks incrementally.
         * start & pos are undefined.
         */
        ITER_CHECK_BWD,
        /**
         * The input text [start..limit[ passes the FCD check.
         * pos tracks the current text index.
         */
        ITER_IN_FCD_SEGMENT,
        /**
         * The input text [start..limit[ failed the FCD check and was normalized.
         * pos tracks the current index in the normalized string.
         * The text iterator is at the limit index.
         */
        IN_NORM_ITER_AT_LIMIT,
        /**
         * The input text [start..limit[ failed the FCD check and was normalized.
         * pos tracks the current index in the normalized string.
         * The text iterator is at the start index.
         */
        IN_NORM_ITER_AT_START
    }
441 
    /** Current iteration state; determines which of start, pos and limit are meaningful. */
    private State state;

    /** Text index of the start of the current segment (see {@link State} for when it is defined). */
    private int start;
    /**
     * Current position: a text index while in an FCD segment,
     * an index into {@link #normalized} while in a normalized segment.
     */
    private int pos;
    /** Text index of the limit of the current segment (see {@link State} for when it is defined). */
    private int limit;

    /** Normalizer implementation taken from the collation data; used for fcd16 lookups and decomposition. */
    private final Normalizer2Impl nfcImpl;
    /** Scratch buffer collecting the characters of the segment being checked; created on first use. */
    private StringBuilder s;
    /** Holds the decomposed form of a segment that failed the FCD check; created on first use. */
    private StringBuilder normalized;
451 }
452