1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2014, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * uitercollationiterator.cpp
7 *
8 * created on: 2012sep23 (from utf16collationiterator.cpp)
9 * created by: Markus W. Scherer
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/uiter.h"
17 #include "charstr.h"
18 #include "cmemory.h"
19 #include "collation.h"
20 #include "collationdata.h"
21 #include "collationfcd.h"
22 #include "collationiterator.h"
23 #include "normalizer2impl.h"
24 #include "uassert.h"
25 #include "uitercollationiterator.h"
26 
27 U_NAMESPACE_BEGIN
28 
~UIterCollationIterator()29 UIterCollationIterator::~UIterCollationIterator() {}
30 
31 void
resetToOffset(int32_t newOffset)32 UIterCollationIterator::resetToOffset(int32_t newOffset) {
33     reset();
34     iter.move(&iter, newOffset, UITER_START);
35 }
36 
37 int32_t
getOffset() const38 UIterCollationIterator::getOffset() const {
39     return iter.getIndex(&iter, UITER_CURRENT);
40 }
41 
42 uint32_t
handleNextCE32(UChar32 & c,UErrorCode &)43 UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
44     c = iter.next(&iter);
45     if(c < 0) {
46         return Collation::FALLBACK_CE32;
47     }
48     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
49 }
50 
51 UChar
handleGetTrailSurrogate()52 UIterCollationIterator::handleGetTrailSurrogate() {
53     UChar32 trail = iter.next(&iter);
54     if(!U16_IS_TRAIL(trail) && trail >= 0) { iter.previous(&iter); }
55     return (UChar)trail;
56 }
57 
58 UChar32
nextCodePoint(UErrorCode &)59 UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
60     return uiter_next32(&iter);
61 }
62 
63 UChar32
previousCodePoint(UErrorCode &)64 UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
65     return uiter_previous32(&iter);
66 }
67 
68 void
forwardNumCodePoints(int32_t num,UErrorCode &)69 UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
70     while(num > 0 && (uiter_next32(&iter)) >= 0) {
71         --num;
72     }
73 }
74 
75 void
backwardNumCodePoints(int32_t num,UErrorCode &)76 UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
77     while(num > 0 && (uiter_previous32(&iter)) >= 0) {
78         --num;
79     }
80 }
81 
82 // FCDUIterCollationIterator ----------------------------------------------- ***
83 
~FCDUIterCollationIterator()84 FCDUIterCollationIterator::~FCDUIterCollationIterator() {}
85 
86 void
resetToOffset(int32_t newOffset)87 FCDUIterCollationIterator::resetToOffset(int32_t newOffset) {
88     UIterCollationIterator::resetToOffset(newOffset);
89     start = newOffset;
90     state = ITER_CHECK_FWD;
91 }
92 
93 int32_t
getOffset() const94 FCDUIterCollationIterator::getOffset() const {
95     if(state <= ITER_CHECK_BWD) {
96         return iter.getIndex(&iter, UITER_CURRENT);
97     } else if(state == ITER_IN_FCD_SEGMENT) {
98         return pos;
99     } else if(pos == 0) {
100         return start;
101     } else {
102         return limit;
103     }
104 }
105 
106 uint32_t
handleNextCE32(UChar32 & c,UErrorCode & errorCode)107 FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
108     for(;;) {
109         if(state == ITER_CHECK_FWD) {
110             c = iter.next(&iter);
111             if(c < 0) {
112                 return Collation::FALLBACK_CE32;
113             }
114             if(CollationFCD::hasTccc(c)) {
115                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
116                         CollationFCD::hasLccc(iter.current(&iter))) {
117                     iter.previous(&iter);
118                     if(!nextSegment(errorCode)) {
119                         c = U_SENTINEL;
120                         return Collation::FALLBACK_CE32;
121                     }
122                     continue;
123                 }
124             }
125             break;
126         } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
127             c = iter.next(&iter);
128             ++pos;
129             U_ASSERT(c >= 0);
130             break;
131         } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
132             c = normalized[pos++];
133             break;
134         } else {
135             switchToForward();
136         }
137     }
138     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
139 }
140 
141 UChar
handleGetTrailSurrogate()142 FCDUIterCollationIterator::handleGetTrailSurrogate() {
143     if(state <= ITER_IN_FCD_SEGMENT) {
144         UChar32 trail = iter.next(&iter);
145         if(U16_IS_TRAIL(trail)) {
146             if(state == ITER_IN_FCD_SEGMENT) { ++pos; }
147         } else if(trail >= 0) {
148             iter.previous(&iter);
149         }
150         return (UChar)trail;
151     } else {
152         U_ASSERT(pos < normalized.length());
153         UChar trail;
154         if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
155         return trail;
156     }
157 }
158 
159 UChar32
nextCodePoint(UErrorCode & errorCode)160 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
161     UChar32 c;
162     for(;;) {
163         if(state == ITER_CHECK_FWD) {
164             c = iter.next(&iter);
165             if(c < 0) {
166                 return c;
167             }
168             if(CollationFCD::hasTccc(c)) {
169                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
170                         CollationFCD::hasLccc(iter.current(&iter))) {
171                     iter.previous(&iter);
172                     if(!nextSegment(errorCode)) {
173                         return U_SENTINEL;
174                     }
175                     continue;
176                 }
177             }
178             if(U16_IS_LEAD(c)) {
179                 UChar32 trail = iter.next(&iter);
180                 if(U16_IS_TRAIL(trail)) {
181                     return U16_GET_SUPPLEMENTARY(c, trail);
182                 } else if(trail >= 0) {
183                     iter.previous(&iter);
184                 }
185             }
186             return c;
187         } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
188             c = uiter_next32(&iter);
189             pos += U16_LENGTH(c);
190             U_ASSERT(c >= 0);
191             return c;
192         } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
193             c = normalized.char32At(pos);
194             pos += U16_LENGTH(c);
195             return c;
196         } else {
197             switchToForward();
198         }
199     }
200 }
201 
202 UChar32
previousCodePoint(UErrorCode & errorCode)203 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
204     UChar32 c;
205     for(;;) {
206         if(state == ITER_CHECK_BWD) {
207             c = iter.previous(&iter);
208             if(c < 0) {
209                 start = pos = 0;
210                 state = ITER_IN_FCD_SEGMENT;
211                 return U_SENTINEL;
212             }
213             if(CollationFCD::hasLccc(c)) {
214                 UChar32 prev = U_SENTINEL;
215                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
216                         CollationFCD::hasTccc(prev = iter.previous(&iter))) {
217                     iter.next(&iter);
218                     if(prev >= 0) {
219                         iter.next(&iter);
220                     }
221                     if(!previousSegment(errorCode)) {
222                         return U_SENTINEL;
223                     }
224                     continue;
225                 }
226                 // hasLccc(trail)=true for all trail surrogates
227                 if(U16_IS_TRAIL(c)) {
228                     if(prev < 0) {
229                         prev = iter.previous(&iter);
230                     }
231                     if(U16_IS_LEAD(prev)) {
232                         return U16_GET_SUPPLEMENTARY(prev, c);
233                     }
234                 }
235                 if(prev >= 0) {
236                     iter.next(&iter);
237                 }
238             }
239             return c;
240         } else if(state == ITER_IN_FCD_SEGMENT && pos != start) {
241             c = uiter_previous32(&iter);
242             pos -= U16_LENGTH(c);
243             U_ASSERT(c >= 0);
244             return c;
245         } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
246             c = normalized.char32At(pos - 1);
247             pos -= U16_LENGTH(c);
248             return c;
249         } else {
250             switchToBackward();
251         }
252     }
253 }
254 
255 void
forwardNumCodePoints(int32_t num,UErrorCode & errorCode)256 FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
257     // Specify the class to avoid a virtual-function indirection.
258     // In Java, we would declare this class final.
259     while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) {
260         --num;
261     }
262 }
263 
264 void
backwardNumCodePoints(int32_t num,UErrorCode & errorCode)265 FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
266     // Specify the class to avoid a virtual-function indirection.
267     // In Java, we would declare this class final.
268     while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >= 0) {
269         --num;
270     }
271 }
272 
273 void
switchToForward()274 FCDUIterCollationIterator::switchToForward() {
275     U_ASSERT(state == ITER_CHECK_BWD ||
276              (state == ITER_IN_FCD_SEGMENT && pos == limit) ||
277              (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length()));
278     if(state == ITER_CHECK_BWD) {
279         // Turn around from backward checking.
280         start = pos = iter.getIndex(&iter, UITER_CURRENT);
281         if(pos == limit) {
282             state = ITER_CHECK_FWD;  // Check forward.
283         } else {  // pos < limit
284             state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
285         }
286     } else {
287         // Reached the end of the FCD segment.
288         if(state == ITER_IN_FCD_SEGMENT) {
289             // The input text segment is FCD, extend it forward.
290         } else {
291             // The input text segment needed to be normalized.
292             // Switch to checking forward from it.
293             if(state == IN_NORM_ITER_AT_START) {
294                 iter.move(&iter, limit - start, UITER_CURRENT);
295             }
296             start = limit;
297         }
298         state = ITER_CHECK_FWD;
299     }
300 }
301 
302 UBool
nextSegment(UErrorCode & errorCode)303 FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) {
304     if(U_FAILURE(errorCode)) { return FALSE; }
305     U_ASSERT(state == ITER_CHECK_FWD);
306     // The input text [start..(iter index)[ passes the FCD check.
307     pos = iter.getIndex(&iter, UITER_CURRENT);
308     // Collect the characters being checked, in case they need to be normalized.
309     UnicodeString s;
310     uint8_t prevCC = 0;
311     for(;;) {
312         // Fetch the next character and its fcd16 value.
313         UChar32 c = uiter_next32(&iter);
314         if(c < 0) { break; }
315         uint16_t fcd16 = nfcImpl.getFCD16(c);
316         uint8_t leadCC = (uint8_t)(fcd16 >> 8);
317         if(leadCC == 0 && !s.isEmpty()) {
318             // FCD boundary before this character.
319             uiter_previous32(&iter);
320             break;
321         }
322         s.append(c);
323         if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
324             // Fails FCD check. Find the next FCD boundary and normalize.
325             for(;;) {
326                 c = uiter_next32(&iter);
327                 if(c < 0) { break; }
328                 if(nfcImpl.getFCD16(c) <= 0xff) {
329                     uiter_previous32(&iter);
330                     break;
331                 }
332                 s.append(c);
333             }
334             if(!normalize(s, errorCode)) { return FALSE; }
335             start = pos;
336             limit = pos + s.length();
337             state = IN_NORM_ITER_AT_LIMIT;
338             pos = 0;
339             return TRUE;
340         }
341         prevCC = (uint8_t)fcd16;
342         if(prevCC == 0) {
343             // FCD boundary after the last character.
344             break;
345         }
346     }
347     limit = pos + s.length();
348     U_ASSERT(pos != limit);
349     iter.move(&iter, -s.length(), UITER_CURRENT);
350     state = ITER_IN_FCD_SEGMENT;
351     return TRUE;
352 }
353 
354 void
switchToBackward()355 FCDUIterCollationIterator::switchToBackward() {
356     U_ASSERT(state == ITER_CHECK_FWD ||
357              (state == ITER_IN_FCD_SEGMENT && pos == start) ||
358              (state >= IN_NORM_ITER_AT_LIMIT && pos == 0));
359     if(state == ITER_CHECK_FWD) {
360         // Turn around from forward checking.
361         limit = pos = iter.getIndex(&iter, UITER_CURRENT);
362         if(pos == start) {
363             state = ITER_CHECK_BWD;  // Check backward.
364         } else {  // pos > start
365             state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
366         }
367     } else {
368         // Reached the start of the FCD segment.
369         if(state == ITER_IN_FCD_SEGMENT) {
370             // The input text segment is FCD, extend it backward.
371         } else {
372             // The input text segment needed to be normalized.
373             // Switch to checking backward from it.
374             if(state == IN_NORM_ITER_AT_LIMIT) {
375                 iter.move(&iter, start - limit, UITER_CURRENT);
376             }
377             limit = start;
378         }
379         state = ITER_CHECK_BWD;
380     }
381 }
382 
383 UBool
previousSegment(UErrorCode & errorCode)384 FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) {
385     if(U_FAILURE(errorCode)) { return FALSE; }
386     U_ASSERT(state == ITER_CHECK_BWD);
387     // The input text [(iter index)..limit[ passes the FCD check.
388     pos = iter.getIndex(&iter, UITER_CURRENT);
389     // Collect the characters being checked, in case they need to be normalized.
390     UnicodeString s;
391     uint8_t nextCC = 0;
392     for(;;) {
393         // Fetch the previous character and its fcd16 value.
394         UChar32 c = uiter_previous32(&iter);
395         if(c < 0) { break; }
396         uint16_t fcd16 = nfcImpl.getFCD16(c);
397         uint8_t trailCC = (uint8_t)fcd16;
398         if(trailCC == 0 && !s.isEmpty()) {
399             // FCD boundary after this character.
400             uiter_next32(&iter);
401             break;
402         }
403         s.append(c);
404         if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
405                             CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
406             // Fails FCD check. Find the previous FCD boundary and normalize.
407             while(fcd16 > 0xff) {
408                 c = uiter_previous32(&iter);
409                 if(c < 0) { break; }
410                 fcd16 = nfcImpl.getFCD16(c);
411                 if(fcd16 == 0) {
412                     (void)uiter_next32(&iter);
413                     break;
414                 }
415                 s.append(c);
416             }
417             s.reverse();
418             if(!normalize(s, errorCode)) { return FALSE; }
419             limit = pos;
420             start = pos - s.length();
421             state = IN_NORM_ITER_AT_START;
422             pos = normalized.length();
423             return TRUE;
424         }
425         nextCC = (uint8_t)(fcd16 >> 8);
426         if(nextCC == 0) {
427             // FCD boundary before the following character.
428             break;
429         }
430     }
431     start = pos - s.length();
432     U_ASSERT(pos != start);
433     iter.move(&iter, s.length(), UITER_CURRENT);
434     state = ITER_IN_FCD_SEGMENT;
435     return TRUE;
436 }
437 
438 UBool
normalize(const UnicodeString & s,UErrorCode & errorCode)439 FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
440     // NFD without argument checking.
441     U_ASSERT(U_SUCCESS(errorCode));
442     nfcImpl.decompose(s, normalized, errorCode);
443     return U_SUCCESS(errorCode);
444 }
445 
446 U_NAMESPACE_END
447 
448 #endif  // !UCONFIG_NO_COLLATION
449