1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2014, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * FCDUTF16CollationIterator.java, ported from utf16collationiterator.h/.cpp
9 *
10 * C++ version created on: 2010oct27
11 * created by: Markus W. Scherer
12 */
13 
14 package com.ibm.icu.impl.coll;
15 
16 import com.ibm.icu.impl.Normalizer2Impl;
17 
18 /**
19  * Incrementally checks the input text for FCD and normalizes where necessary.
20  */
21 public final class FCDUTF16CollationIterator extends UTF16CollationIterator {
22     /**
23      * Partial constructor, see {@link CollationIterator#CollationIterator(CollationData)}.
24      */
FCDUTF16CollationIterator(CollationData d)25     public FCDUTF16CollationIterator(CollationData d) {
26         super(d);
27         nfcImpl = d.nfcImpl;
28     }
29 
FCDUTF16CollationIterator(CollationData data, boolean numeric, CharSequence s, int p)30     public FCDUTF16CollationIterator(CollationData data, boolean numeric, CharSequence s, int p) {
31         super(data, numeric, s, p);
32         rawSeq = s;
33         segmentStart = p;
34         rawLimit = s.length();
35         nfcImpl = data.nfcImpl;
36         checkDir = 1;
37     }
38 
39     @Override
equals(Object other)40     public boolean equals(Object other) {
41         // Skip the UTF16CollationIterator and call its parent.
42         if (!(other instanceof CollationIterator)
43             || !((CollationIterator)this).equals(other)
44             || !(other instanceof FCDUTF16CollationIterator))
45         {
46             return false;
47         }
48         FCDUTF16CollationIterator o = (FCDUTF16CollationIterator)other;
49         // Compare the iterator state but not the text: Assume that the caller does that.
50         if (checkDir != o.checkDir) {
51             return false;
52         }
53         if (checkDir == 0 && (seq == rawSeq) != (o.seq == o.rawSeq)) {
54             return false;
55         }
56         if (checkDir != 0 || seq == rawSeq) {
57             return (pos - rawStart) == (o.pos - /*o.*/ rawStart);
58         }
59         else {
60             return (segmentStart - rawStart) == (o.segmentStart - /*o.*/ rawStart) &&
61                     (pos - start) == (o.pos - o.start);
62         }
63     }
64 
65     @Override
hashCode()66     public int hashCode() {
67         assert false : "hashCode not designed";
68         return 42; // any arbitrary constant will do
69     }
70 
71     @Override
resetToOffset(int newOffset)72     public void resetToOffset(int newOffset) {
73         reset();
74         seq = rawSeq;
75         start = segmentStart = pos = rawStart + newOffset;
76         limit = rawLimit;
77         checkDir = 1;
78     }
79 
80     @Override
getOffset()81     public int getOffset() {
82         if(checkDir != 0 || seq == rawSeq) {
83             return pos - rawStart;
84         } else if(pos == start) {
85             return segmentStart - rawStart;
86         } else {
87             return segmentLimit - rawStart;
88         }
89     }
90 
91     @Override
setText(boolean numeric, CharSequence s, int p)92     public void setText(boolean numeric, CharSequence s, int p) {
93         super.setText(numeric, s, p);
94         rawSeq = s;
95         segmentStart = p;
96         rawLimit = limit = s.length();
97         checkDir = 1;
98     }
99 
100     @Override
nextCodePoint()101     public int nextCodePoint() {
102         char c;
103         for(;;) {
104             if(checkDir > 0) {
105                 if(pos == limit) {
106                     return Collation.SENTINEL_CP;
107                 }
108                 c = seq.charAt(pos++);
109                 if(CollationFCD.hasTccc(c)) {
110                     if(CollationFCD.maybeTibetanCompositeVowel(c) ||
111                             (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) {
112                         --pos;
113                         nextSegment();
114                         c = seq.charAt(pos++);
115                     }
116                 }
117                 break;
118             } else if(checkDir == 0 && pos != limit) {
119                 c = seq.charAt(pos++);
120                 break;
121             } else {
122                 switchToForward();
123             }
124         }
125         char trail;
126         if(Character.isHighSurrogate(c) && pos != limit &&
127                 Character.isLowSurrogate(trail = seq.charAt(pos))) {
128             ++pos;
129             return Character.toCodePoint(c, trail);
130         } else {
131             return c;
132         }
133     }
134 
135     @Override
previousCodePoint()136     public int previousCodePoint() {
137         char c;
138         for(;;) {
139             if(checkDir < 0) {
140                 if(pos == start) {
141                     return Collation.SENTINEL_CP;
142                 }
143                 c = seq.charAt(--pos);
144                 if(CollationFCD.hasLccc(c)) {
145                     if(CollationFCD.maybeTibetanCompositeVowel(c) ||
146                             (pos != start && CollationFCD.hasTccc(seq.charAt(pos - 1)))) {
147                         ++pos;
148                         previousSegment();
149                         c = seq.charAt(--pos);
150                     }
151                 }
152                 break;
153             } else if(checkDir == 0 && pos != start) {
154                 c = seq.charAt(--pos);
155                 break;
156             } else {
157                 switchToBackward();
158             }
159         }
160         char lead;
161         if(Character.isLowSurrogate(c) && pos != start &&
162                 Character.isHighSurrogate(lead = seq.charAt(pos - 1))) {
163             --pos;
164             return Character.toCodePoint(lead, c);
165         } else {
166             return c;
167         }
168     }
169 
170     @Override
handleNextCE32()171     protected long handleNextCE32() {
172         char c;
173         for(;;) {
174             if(checkDir > 0) {
175                 if(pos == limit) {
176                     return NO_CP_AND_CE32;
177                 }
178                 c = seq.charAt(pos++);
179                 if(CollationFCD.hasTccc(c)) {
180                     if(CollationFCD.maybeTibetanCompositeVowel(c) ||
181                             (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) {
182                         --pos;
183                         nextSegment();
184                         c = seq.charAt(pos++);
185                     }
186                 }
187                 break;
188             } else if(checkDir == 0 && pos != limit) {
189                 c = seq.charAt(pos++);
190                 break;
191             } else {
192                 switchToForward();
193             }
194         }
195         return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead(c));
196     }
197 
198     /* boolean foundNULTerminator(); */
199 
200     @Override
forwardNumCodePoints(int num)201     protected void forwardNumCodePoints(int num) {
202         // Specify the class to avoid a virtual-function indirection.
203         // In Java, we would declare this class final.
204         while(num > 0 && nextCodePoint() >= 0) {
205             --num;
206         }
207     }
208 
209     @Override
backwardNumCodePoints(int num)210     protected void backwardNumCodePoints(int num) {
211         // Specify the class to avoid a virtual-function indirection.
212         // In Java, we would declare this class final.
213         while(num > 0 && previousCodePoint() >= 0) {
214             --num;
215         }
216     }
217 
218     /**
219      * Switches to forward checking if possible.
220      * To be called when checkDir < 0 || (checkDir == 0 && pos == limit).
221      * Returns with checkDir > 0 || (checkDir == 0 && pos != limit).
222      */
switchToForward()223     private void switchToForward() {
224         assert((checkDir < 0 && seq == rawSeq) || (checkDir == 0 && pos == limit));
225         if(checkDir < 0) {
226             // Turn around from backward checking.
227             start = segmentStart = pos;
228             if(pos == segmentLimit) {
229                 limit = rawLimit;
230                 checkDir = 1;  // Check forward.
231             } else {  // pos < segmentLimit
232                 checkDir = 0;  // Stay in FCD segment.
233             }
234         } else {
235             // Reached the end of the FCD segment.
236             if(seq == rawSeq) {
237                 // The input text segment is FCD, extend it forward.
238             } else {
239                 // The input text segment needed to be normalized.
240                 // Switch to checking forward from it.
241                 seq = rawSeq;
242                 pos = start = segmentStart = segmentLimit;
243                 // Note: If this segment is at the end of the input text,
244                 // then it might help to return false to indicate that, so that
245                 // we do not have to re-check and normalize when we turn around and go backwards.
246                 // However, that would complicate the call sites for an optimization of an unusual case.
247             }
248             limit = rawLimit;
249             checkDir = 1;
250         }
251     }
252 
253     /**
254      * Extend the FCD text segment forward or normalize around pos.
255      * To be called when checkDir > 0 && pos != limit.
256      * Returns with checkDir == 0 and pos != limit.
257      */
258     private void nextSegment() {
259         assert(checkDir > 0 && seq == rawSeq && pos != limit);
260         // The input text [segmentStart..pos[ passes the FCD check.
261         int p = pos;
262         int prevCC = 0;
263         for(;;) {
264             // Fetch the next character's fcd16 value.
265             int q = p;
266             int c = Character.codePointAt(seq, p);
267             p += Character.charCount(c);
268             int fcd16 = nfcImpl.getFCD16(c);
269             int leadCC = fcd16 >> 8;
270             if(leadCC == 0 && q != pos) {
271                 // FCD boundary before the [q, p[ character.
272                 limit = segmentLimit = q;
273                 break;
274             }
275             if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
276                 // Fails FCD check. Find the next FCD boundary and normalize.
277                 do {
278                     q = p;
279                     if(p == rawLimit) { break; }
280                     c = Character.codePointAt(seq, p);
281                     p += Character.charCount(c);
282                 } while(nfcImpl.getFCD16(c) > 0xff);
283                 normalize(pos, q);
284                 pos = start;
285                 break;
286             }
287             prevCC = fcd16 & 0xff;
288             if(p == rawLimit || prevCC == 0) {
289                 // FCD boundary after the last character.
290                 limit = segmentLimit = p;
291                 break;
292             }
293         }
294         assert(pos != limit);
295         checkDir = 0;
296     }
297 
298     /**
299      * Switches to backward checking.
300      * To be called when checkDir > 0 || (checkDir == 0 && pos == start).
301      * Returns with checkDir < 0 || (checkDir == 0 && pos != start).
302      */
303     private void switchToBackward() {
304         assert((checkDir > 0 && seq == rawSeq) || (checkDir == 0 && pos == start));
305         if(checkDir > 0) {
306             // Turn around from forward checking.
307             limit = segmentLimit = pos;
308             if(pos == segmentStart) {
309                 start = rawStart;
310                 checkDir = -1;  // Check backward.
311             } else {  // pos > segmentStart
312                 checkDir = 0;  // Stay in FCD segment.
313             }
314         } else {
315             // Reached the start of the FCD segment.
316             if(seq == rawSeq) {
317                 // The input text segment is FCD, extend it backward.
318             } else {
319                 // The input text segment needed to be normalized.
320                 // Switch to checking backward from it.
321                 seq = rawSeq;
322                 pos = limit = segmentLimit = segmentStart;
323             }
324             start = rawStart;
325             checkDir = -1;
326         }
327     }
328 
329     /**
330      * Extend the FCD text segment backward or normalize around pos.
331      * To be called when checkDir < 0 && pos != start.
332      * Returns with checkDir == 0 and pos != start.
333      */
334     private void previousSegment() {
335         assert(checkDir < 0 && seq == rawSeq && pos != start);
336         // The input text [pos..segmentLimit[ passes the FCD check.
337         int p = pos;
338         int nextCC = 0;
339         for(;;) {
340             // Fetch the previous character's fcd16 value.
341             int q = p;
342             int c = Character.codePointBefore(seq, p);
343             p -= Character.charCount(c);
344             int fcd16 = nfcImpl.getFCD16(c);
345             int trailCC = fcd16 & 0xff;
346             if(trailCC == 0 && q != pos) {
347                 // FCD boundary after the [p, q[ character.
348                 start = segmentStart = q;
349                 break;
350             }
351             if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
352                                 CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
353                 // Fails FCD check. Find the previous FCD boundary and normalize.
354                 do {
355                     q = p;
356                     if(fcd16 <= 0xff || p == rawStart) { break; }
357                     c = Character.codePointBefore(seq, p);
358                     p -= Character.charCount(c);
359                 } while((fcd16 = nfcImpl.getFCD16(c)) != 0);
360                 normalize(q, pos);
361                 pos = limit;
362                 break;
363             }
364             nextCC = fcd16 >> 8;
365             if(p == rawStart || nextCC == 0) {
366                 // FCD boundary before the following character.
367                 start = segmentStart = p;
368                 break;
369             }
370         }
371         assert(pos != start);
372         checkDir = 0;
373     }
374 
375     private void normalize(int from, int to) {
376         if(normalized == null) {
377             normalized = new StringBuilder();
378         }
379         // NFD without argument checking.
380         nfcImpl.decompose(rawSeq, from, to, normalized, to - from);
381         // Switch collation processing into the FCD buffer
382         // with the result of normalizing [segmentStart, segmentLimit[.
383         segmentStart = from;
384         segmentLimit = to;
385         seq = normalized;
386         start = 0;
387         limit = start + normalized.length();
388     }
389 
390     // Text pointers: The input text is rawSeq[rawStart, rawLimit[.
391     // (In C++, these are const UChar * pointers.
392     // In Java, we use CharSequence rawSeq and the parent class' seq
393     // together with int indexes.)
394     //
395     // checkDir > 0:
396     //
397     // The input text rawSeq[segmentStart..pos[ passes the FCD check.
398     // Moving forward checks incrementally.
399     // segmentLimit is undefined. seq == rawSeq. limit == rawLimit.
400     //
401     // checkDir < 0:
402     // The input text rawSeq[pos..segmentLimit[ passes the FCD check.
403     // Moving backward checks incrementally.
404     // segmentStart is undefined. seq == rawSeq. start == rawStart.
405     //
406     // checkDir == 0:
407     //
408     // The input text rawSeq[segmentStart..segmentLimit[ is being processed.
409     // These pointers are at FCD boundaries.
410     // Either this text segment already passes the FCD check
411     // and seq==rawSeq && segmentStart==start<=pos<=limit==segmentLimit,
412     // or the current segment had to be normalized so that
413     // rawSeq[segmentStart..segmentLimit[ turned into the normalized string,
414     // corresponding to seq==normalized && 0==start<=pos<=limit==start+normalized.length().
415     private CharSequence rawSeq;
416     private static final int rawStart = 0;
417     private int segmentStart;
418     private int segmentLimit;
419     private int rawLimit;
420 
421     private final Normalizer2Impl nfcImpl;
422     private StringBuilder normalized;
423     // Direction of incremental FCD check. See comments before rawStart.
424     private int checkDir;
425 }
426