1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2014, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * utf16collationiterator.cpp
7 *
8 * created on: 2010oct27
9 * created by: Markus W. Scherer
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "charstr.h"
17 #include "cmemory.h"
18 #include "collation.h"
19 #include "collationdata.h"
20 #include "collationfcd.h"
21 #include "collationiterator.h"
22 #include "normalizer2impl.h"
23 #include "uassert.h"
24 #include "utf16collationiterator.h"
25 
26 U_NAMESPACE_BEGIN
27 
// Copy-constructs from `other` while re-basing all text pointers onto newText:
// each pointer keeps the same distance from the text start that it has in `other`.
// A NULL limit in `other` stays NULL.
UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &other,
                                               const UChar *newText)
        : CollationIterator(other),
          start(newText),
          pos(newText + (other.pos - other.start)),
          limit(other.limit != NULL ? newText + (other.limit - other.start) : NULL) {
    // Nothing else to do: all state is set up in the initializer list.
}
35 
~UTF16CollationIterator()36 UTF16CollationIterator::~UTF16CollationIterator() {}
37 
38 UBool
operator ==(const CollationIterator & other) const39 UTF16CollationIterator::operator==(const CollationIterator &other) const {
40     if(!CollationIterator::operator==(other)) { return FALSE; }
41     const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &>(other);
42     // Compare the iterator state but not the text: Assume that the caller does that.
43     return (pos - start) == (o.pos - o.start);
44 }
45 
46 void
resetToOffset(int32_t newOffset)47 UTF16CollationIterator::resetToOffset(int32_t newOffset) {
48     reset();
49     pos = start + newOffset;
50 }
51 
52 int32_t
getOffset() const53 UTF16CollationIterator::getOffset() const {
54     return (int32_t)(pos - start);
55 }
56 
57 uint32_t
handleNextCE32(UChar32 & c,UErrorCode &)58 UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
59     if(pos == limit) {
60         c = U_SENTINEL;
61         return Collation::FALLBACK_CE32;
62     }
63     c = *pos++;
64     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
65 }
66 
67 UChar
handleGetTrailSurrogate()68 UTF16CollationIterator::handleGetTrailSurrogate() {
69     if(pos == limit) { return 0; }
70     UChar trail;
71     if(U16_IS_TRAIL(trail = *pos)) { ++pos; }
72     return trail;
73 }
74 
75 UBool
foundNULTerminator()76 UTF16CollationIterator::foundNULTerminator() {
77     if(limit == NULL) {
78         limit = --pos;
79         return TRUE;
80     } else {
81         return FALSE;
82     }
83 }
84 
85 UChar32
nextCodePoint(UErrorCode &)86 UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
87     if(pos == limit) {
88         return U_SENTINEL;
89     }
90     UChar32 c = *pos;
91     if(c == 0 && limit == NULL) {
92         limit = pos;
93         return U_SENTINEL;
94     }
95     ++pos;
96     UChar trail;
97     if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
98         ++pos;
99         return U16_GET_SUPPLEMENTARY(c, trail);
100     } else {
101         return c;
102     }
103 }
104 
105 UChar32
previousCodePoint(UErrorCode &)106 UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
107     if(pos == start) {
108         return U_SENTINEL;
109     }
110     UChar32 c = *--pos;
111     UChar lead;
112     if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
113         --pos;
114         return U16_GET_SUPPLEMENTARY(lead, c);
115     } else {
116         return c;
117     }
118 }
119 
120 void
forwardNumCodePoints(int32_t num,UErrorCode &)121 UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
122     while(num > 0 && pos != limit) {
123         UChar32 c = *pos;
124         if(c == 0 && limit == NULL) {
125             limit = pos;
126             break;
127         }
128         ++pos;
129         --num;
130         if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) {
131             ++pos;
132         }
133     }
134 }
135 
136 void
backwardNumCodePoints(int32_t num,UErrorCode &)137 UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
138     while(num > 0 && pos != start) {
139         UChar32 c = *--pos;
140         --num;
141         if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) {
142             --pos;
143         }
144     }
145 }
146 
147 // FCDUTF16CollationIterator ----------------------------------------------- ***
148 
// Copy-constructs from `other` while re-basing the raw-text pointers onto newText:
// each of them keeps the same distance from the raw text start that it has in `other`;
// NULL limits stay NULL. The normalization buffer is copied.
// start/pos/limit are then set up in the body, because they point either into
// the raw text or into this object's own copy of the normalization buffer.
FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other,
                                                     const UChar *newText)
        : UTF16CollationIterator(other),
          rawStart(newText),
          segmentStart(newText + (other.segmentStart - other.rawStart)),
          segmentLimit(other.segmentLimit != NULL ?
                       newText + (other.segmentLimit - other.rawStart) : NULL),
          rawLimit(other.rawLimit != NULL ?
                   newText + (other.rawLimit - other.rawStart) : NULL),
          nfcImpl(other.nfcImpl),
          normalized(other.normalized),
          checkDir(other.checkDir) {
    if(checkDir == 0 && other.start != other.segmentStart) {
        // `other` iterates inside its normalization buffer:
        // point into our own copy of that buffer, at the same relative position.
        start = normalized.getBuffer();
        pos = start + (other.pos - other.start);
        limit = start + normalized.length();
    } else {
        // `other` iterates over the raw text: re-base onto newText.
        start = newText + (other.start - other.rawStart);
        pos = newText + (other.pos - other.rawStart);
        limit = other.limit != NULL ? newText + (other.limit - other.rawStart) : NULL;
    }
}
169 
~FCDUTF16CollationIterator()170 FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {}
171 
172 UBool
operator ==(const CollationIterator & other) const173 FCDUTF16CollationIterator::operator==(const CollationIterator &other) const {
174     // Skip the UTF16CollationIterator and call its parent.
175     if(!CollationIterator::operator==(other)) { return FALSE; }
176     const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIterator &>(other);
177     // Compare the iterator state but not the text: Assume that the caller does that.
178     if(checkDir != o.checkDir) { return FALSE; }
179     if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart)) { return FALSE; }
180     if(checkDir != 0 || start == segmentStart) {
181         return (pos - rawStart) == (o.pos - o.rawStart);
182     } else {
183         return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) &&
184                 (pos - start) == (o.pos - o.start);
185     }
186 }
187 
188 void
resetToOffset(int32_t newOffset)189 FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) {
190     reset();
191     start = segmentStart = pos = rawStart + newOffset;
192     limit = rawLimit;
193     checkDir = 1;
194 }
195 
196 int32_t
getOffset() const197 FCDUTF16CollationIterator::getOffset() const {
198     if(checkDir != 0 || start == segmentStart) {
199         return (int32_t)(pos - rawStart);
200     } else if(pos == start) {
201         return (int32_t)(segmentStart - rawStart);
202     } else {
203         return (int32_t)(segmentLimit - rawStart);
204     }
205 }
206 
207 uint32_t
handleNextCE32(UChar32 & c,UErrorCode & errorCode)208 FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
209     for(;;) {
210         if(checkDir > 0) {
211             if(pos == limit) {
212                 c = U_SENTINEL;
213                 return Collation::FALLBACK_CE32;
214             }
215             c = *pos++;
216             if(CollationFCD::hasTccc(c)) {
217                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
218                         (pos != limit && CollationFCD::hasLccc(*pos))) {
219                     --pos;
220                     if(!nextSegment(errorCode)) {
221                         c = U_SENTINEL;
222                         return Collation::FALLBACK_CE32;
223                     }
224                     c = *pos++;
225                 }
226             }
227             break;
228         } else if(checkDir == 0 && pos != limit) {
229             c = *pos++;
230             break;
231         } else {
232             switchToForward();
233         }
234     }
235     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
236 }
237 
238 UBool
foundNULTerminator()239 FCDUTF16CollationIterator::foundNULTerminator() {
240     if(limit == NULL) {
241         limit = rawLimit = --pos;
242         return TRUE;
243     } else {
244         return FALSE;
245     }
246 }
247 
248 UChar32
nextCodePoint(UErrorCode & errorCode)249 FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) {
250     UChar32 c;
251     for(;;) {
252         if(checkDir > 0) {
253             if(pos == limit) {
254                 return U_SENTINEL;
255             }
256             c = *pos++;
257             if(CollationFCD::hasTccc(c)) {
258                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
259                         (pos != limit && CollationFCD::hasLccc(*pos))) {
260                     --pos;
261                     if(!nextSegment(errorCode)) {
262                         return U_SENTINEL;
263                     }
264                     c = *pos++;
265                 }
266             } else if(c == 0 && limit == NULL) {
267                 limit = rawLimit = --pos;
268                 return U_SENTINEL;
269             }
270             break;
271         } else if(checkDir == 0 && pos != limit) {
272             c = *pos++;
273             break;
274         } else {
275             switchToForward();
276         }
277     }
278     UChar trail;
279     if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
280         ++pos;
281         return U16_GET_SUPPLEMENTARY(c, trail);
282     } else {
283         return c;
284     }
285 }
286 
287 UChar32
previousCodePoint(UErrorCode & errorCode)288 FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) {
289     UChar32 c;
290     for(;;) {
291         if(checkDir < 0) {
292             if(pos == start) {
293                 return U_SENTINEL;
294             }
295             c = *--pos;
296             if(CollationFCD::hasLccc(c)) {
297                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
298                         (pos != start && CollationFCD::hasTccc(*(pos - 1)))) {
299                     ++pos;
300                     if(!previousSegment(errorCode)) {
301                         return U_SENTINEL;
302                     }
303                     c = *--pos;
304                 }
305             }
306             break;
307         } else if(checkDir == 0 && pos != start) {
308             c = *--pos;
309             break;
310         } else {
311             switchToBackward();
312         }
313     }
314     UChar lead;
315     if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
316         --pos;
317         return U16_GET_SUPPLEMENTARY(lead, c);
318     } else {
319         return c;
320     }
321 }
322 
323 void
forwardNumCodePoints(int32_t num,UErrorCode & errorCode)324 FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
325     // Specify the class to avoid a virtual-function indirection.
326     // In Java, we would declare this class final.
327     while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) {
328         --num;
329     }
330 }
331 
332 void
backwardNumCodePoints(int32_t num,UErrorCode & errorCode)333 FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
334     // Specify the class to avoid a virtual-function indirection.
335     // In Java, we would declare this class final.
336     while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >= 0) {
337         --num;
338     }
339 }
340 
341 void
switchToForward()342 FCDUTF16CollationIterator::switchToForward() {
343     U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit));
344     if(checkDir < 0) {
345         // Turn around from backward checking.
346         start = segmentStart = pos;
347         if(pos == segmentLimit) {
348             limit = rawLimit;
349             checkDir = 1;  // Check forward.
350         } else {  // pos < segmentLimit
351             checkDir = 0;  // Stay in FCD segment.
352         }
353     } else {
354         // Reached the end of the FCD segment.
355         if(start == segmentStart) {
356             // The input text segment is FCD, extend it forward.
357         } else {
358             // The input text segment needed to be normalized.
359             // Switch to checking forward from it.
360             pos = start = segmentStart = segmentLimit;
361             // Note: If this segment is at the end of the input text,
362             // then it might help to return FALSE to indicate that, so that
363             // we do not have to re-check and normalize when we turn around and go backwards.
364             // However, that would complicate the call sites for an optimization of an unusual case.
365         }
366         limit = rawLimit;
367         checkDir = 1;
368     }
369 }
370 
371 UBool
nextSegment(UErrorCode & errorCode)372 FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) {
373     if(U_FAILURE(errorCode)) { return FALSE; }
374     U_ASSERT(checkDir > 0 && pos != limit);
375     // The input text [segmentStart..pos[ passes the FCD check.
376     const UChar *p = pos;
377     uint8_t prevCC = 0;
378     for(;;) {
379         // Fetch the next character's fcd16 value.
380         const UChar *q = p;
381         uint16_t fcd16 = nfcImpl.nextFCD16(p, rawLimit);
382         uint8_t leadCC = (uint8_t)(fcd16 >> 8);
383         if(leadCC == 0 && q != pos) {
384             // FCD boundary before the [q, p[ character.
385             limit = segmentLimit = q;
386             break;
387         }
388         if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
389             // Fails FCD check. Find the next FCD boundary and normalize.
390             do {
391                 q = p;
392             } while(p != rawLimit && nfcImpl.nextFCD16(p, rawLimit) > 0xff);
393             if(!normalize(pos, q, errorCode)) { return FALSE; }
394             pos = start;
395             break;
396         }
397         prevCC = (uint8_t)fcd16;
398         if(p == rawLimit || prevCC == 0) {
399             // FCD boundary after the last character.
400             limit = segmentLimit = p;
401             break;
402         }
403     }
404     U_ASSERT(pos != limit);
405     checkDir = 0;
406     return TRUE;
407 }
408 
409 void
switchToBackward()410 FCDUTF16CollationIterator::switchToBackward() {
411     U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start));
412     if(checkDir > 0) {
413         // Turn around from forward checking.
414         limit = segmentLimit = pos;
415         if(pos == segmentStart) {
416             start = rawStart;
417             checkDir = -1;  // Check backward.
418         } else {  // pos > segmentStart
419             checkDir = 0;  // Stay in FCD segment.
420         }
421     } else {
422         // Reached the start of the FCD segment.
423         if(start == segmentStart) {
424             // The input text segment is FCD, extend it backward.
425         } else {
426             // The input text segment needed to be normalized.
427             // Switch to checking backward from it.
428             pos = limit = segmentLimit = segmentStart;
429         }
430         start = rawStart;
431         checkDir = -1;
432     }
433 }
434 
435 UBool
previousSegment(UErrorCode & errorCode)436 FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) {
437     if(U_FAILURE(errorCode)) { return FALSE; }
438     U_ASSERT(checkDir < 0 && pos != start);
439     // The input text [pos..segmentLimit[ passes the FCD check.
440     const UChar *p = pos;
441     uint8_t nextCC = 0;
442     for(;;) {
443         // Fetch the previous character's fcd16 value.
444         const UChar *q = p;
445         uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, p);
446         uint8_t trailCC = (uint8_t)fcd16;
447         if(trailCC == 0 && q != pos) {
448             // FCD boundary after the [p, q[ character.
449             start = segmentStart = q;
450             break;
451         }
452         if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
453                             CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
454             // Fails FCD check. Find the previous FCD boundary and normalize.
455             do {
456                 q = p;
457             } while(fcd16 > 0xff && p != rawStart &&
458                     (fcd16 = nfcImpl.previousFCD16(rawStart, p)) != 0);
459             if(!normalize(q, pos, errorCode)) { return FALSE; }
460             pos = limit;
461             break;
462         }
463         nextCC = (uint8_t)(fcd16 >> 8);
464         if(p == rawStart || nextCC == 0) {
465             // FCD boundary before the following character.
466             start = segmentStart = p;
467             break;
468         }
469     }
470     U_ASSERT(pos != start);
471     checkDir = 0;
472     return TRUE;
473 }
474 
475 UBool
normalize(const UChar * from,const UChar * to,UErrorCode & errorCode)476 FCDUTF16CollationIterator::normalize(const UChar *from, const UChar *to, UErrorCode &errorCode) {
477     // NFD without argument checking.
478     U_ASSERT(U_SUCCESS(errorCode));
479     nfcImpl.decompose(from, to, normalized, (int32_t)(to - from), errorCode);
480     if(U_FAILURE(errorCode)) { return FALSE; }
481     // Switch collation processing into the FCD buffer
482     // with the result of normalizing [segmentStart, segmentLimit[.
483     segmentStart = from;
484     segmentLimit = to;
485     start = normalized.getBuffer();
486     limit = start + normalized.length();
487     return TRUE;
488 }
489 
490 U_NAMESPACE_END
491 
492 #endif  // !UCONFIG_NO_COLLATION
493