1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * uitercollationiterator.cpp
7 *
8 * created on: 2012sep23 (from utf16collationiterator.cpp)
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "unicode/uiter.h"
17 #include "charstr.h"
18 #include "cmemory.h"
19 #include "collation.h"
20 #include "collationdata.h"
21 #include "collationfcd.h"
22 #include "collationiterator.h"
23 #include "normalizer2impl.h"
24 #include "uassert.h"
25 #include "uitercollationiterator.h"
26
27 U_NAMESPACE_BEGIN
28
~UIterCollationIterator()29 UIterCollationIterator::~UIterCollationIterator() {}
30
31 void
resetToOffset(int32_t newOffset)32 UIterCollationIterator::resetToOffset(int32_t newOffset) {
33 reset();
34 iter.move(&iter, newOffset, UITER_START);
35 }
36
37 int32_t
getOffset() const38 UIterCollationIterator::getOffset() const {
39 return iter.getIndex(&iter, UITER_CURRENT);
40 }
41
42 uint32_t
handleNextCE32(UChar32 & c,UErrorCode &)43 UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
44 c = iter.next(&iter);
45 if(c < 0) {
46 return Collation::FALLBACK_CE32;
47 }
48 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
49 }
50
51 UChar
handleGetTrailSurrogate()52 UIterCollationIterator::handleGetTrailSurrogate() {
53 UChar32 trail = iter.next(&iter);
54 if(!U16_IS_TRAIL(trail) && trail >= 0) { iter.previous(&iter); }
55 return (UChar)trail;
56 }
57
58 UChar32
nextCodePoint(UErrorCode &)59 UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
60 return uiter_next32(&iter);
61 }
62
63 UChar32
previousCodePoint(UErrorCode &)64 UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
65 return uiter_previous32(&iter);
66 }
67
68 void
forwardNumCodePoints(int32_t num,UErrorCode &)69 UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
70 while(num > 0 && (uiter_next32(&iter)) >= 0) {
71 --num;
72 }
73 }
74
75 void
backwardNumCodePoints(int32_t num,UErrorCode &)76 UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
77 while(num > 0 && (uiter_previous32(&iter)) >= 0) {
78 --num;
79 }
80 }
81
82 // FCDUIterCollationIterator ----------------------------------------------- ***
83
~FCDUIterCollationIterator()84 FCDUIterCollationIterator::~FCDUIterCollationIterator() {}
85
86 void
resetToOffset(int32_t newOffset)87 FCDUIterCollationIterator::resetToOffset(int32_t newOffset) {
88 UIterCollationIterator::resetToOffset(newOffset);
89 start = newOffset;
90 state = ITER_CHECK_FWD;
91 }
92
93 int32_t
getOffset() const94 FCDUIterCollationIterator::getOffset() const {
95 if(state <= ITER_CHECK_BWD) {
96 return iter.getIndex(&iter, UITER_CURRENT);
97 } else if(state == ITER_IN_FCD_SEGMENT) {
98 return pos;
99 } else if(pos == 0) {
100 return start;
101 } else {
102 return limit;
103 }
104 }
105
106 uint32_t
handleNextCE32(UChar32 & c,UErrorCode & errorCode)107 FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
108 for(;;) {
109 if(state == ITER_CHECK_FWD) {
110 c = iter.next(&iter);
111 if(c < 0) {
112 return Collation::FALLBACK_CE32;
113 }
114 if(CollationFCD::hasTccc(c)) {
115 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
116 CollationFCD::hasLccc(iter.current(&iter))) {
117 iter.previous(&iter);
118 if(!nextSegment(errorCode)) {
119 c = U_SENTINEL;
120 return Collation::FALLBACK_CE32;
121 }
122 continue;
123 }
124 }
125 break;
126 } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
127 c = iter.next(&iter);
128 ++pos;
129 U_ASSERT(c >= 0);
130 break;
131 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
132 c = normalized[pos++];
133 break;
134 } else {
135 switchToForward();
136 }
137 }
138 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
139 }
140
141 UChar
handleGetTrailSurrogate()142 FCDUIterCollationIterator::handleGetTrailSurrogate() {
143 if(state <= ITER_IN_FCD_SEGMENT) {
144 UChar32 trail = iter.next(&iter);
145 if(U16_IS_TRAIL(trail)) {
146 if(state == ITER_IN_FCD_SEGMENT) { ++pos; }
147 } else if(trail >= 0) {
148 iter.previous(&iter);
149 }
150 return (UChar)trail;
151 } else {
152 U_ASSERT(pos < normalized.length());
153 UChar trail;
154 if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
155 return trail;
156 }
157 }
158
159 UChar32
nextCodePoint(UErrorCode & errorCode)160 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
161 UChar32 c;
162 for(;;) {
163 if(state == ITER_CHECK_FWD) {
164 c = iter.next(&iter);
165 if(c < 0) {
166 return c;
167 }
168 if(CollationFCD::hasTccc(c)) {
169 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
170 CollationFCD::hasLccc(iter.current(&iter))) {
171 iter.previous(&iter);
172 if(!nextSegment(errorCode)) {
173 return U_SENTINEL;
174 }
175 continue;
176 }
177 }
178 if(U16_IS_LEAD(c)) {
179 UChar32 trail = iter.next(&iter);
180 if(U16_IS_TRAIL(trail)) {
181 return U16_GET_SUPPLEMENTARY(c, trail);
182 } else if(trail >= 0) {
183 iter.previous(&iter);
184 }
185 }
186 return c;
187 } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
188 c = uiter_next32(&iter);
189 pos += U16_LENGTH(c);
190 U_ASSERT(c >= 0);
191 return c;
192 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
193 c = normalized.char32At(pos);
194 pos += U16_LENGTH(c);
195 return c;
196 } else {
197 switchToForward();
198 }
199 }
200 }
201
202 UChar32
previousCodePoint(UErrorCode & errorCode)203 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
204 UChar32 c;
205 for(;;) {
206 if(state == ITER_CHECK_BWD) {
207 c = iter.previous(&iter);
208 if(c < 0) {
209 start = pos = 0;
210 state = ITER_IN_FCD_SEGMENT;
211 return U_SENTINEL;
212 }
213 if(CollationFCD::hasLccc(c)) {
214 UChar32 prev = U_SENTINEL;
215 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
216 CollationFCD::hasTccc(prev = iter.previous(&iter))) {
217 iter.next(&iter);
218 if(prev >= 0) {
219 iter.next(&iter);
220 }
221 if(!previousSegment(errorCode)) {
222 return U_SENTINEL;
223 }
224 continue;
225 }
226 // hasLccc(trail)=true for all trail surrogates
227 if(U16_IS_TRAIL(c)) {
228 if(prev < 0) {
229 prev = iter.previous(&iter);
230 }
231 if(U16_IS_LEAD(prev)) {
232 return U16_GET_SUPPLEMENTARY(prev, c);
233 }
234 }
235 if(prev >= 0) {
236 iter.next(&iter);
237 }
238 }
239 return c;
240 } else if(state == ITER_IN_FCD_SEGMENT && pos != start) {
241 c = uiter_previous32(&iter);
242 pos -= U16_LENGTH(c);
243 U_ASSERT(c >= 0);
244 return c;
245 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
246 c = normalized.char32At(pos - 1);
247 pos -= U16_LENGTH(c);
248 return c;
249 } else {
250 switchToBackward();
251 }
252 }
253 }
254
255 void
forwardNumCodePoints(int32_t num,UErrorCode & errorCode)256 FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
257 // Specify the class to avoid a virtual-function indirection.
258 // In Java, we would declare this class final.
259 while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) {
260 --num;
261 }
262 }
263
264 void
backwardNumCodePoints(int32_t num,UErrorCode & errorCode)265 FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
266 // Specify the class to avoid a virtual-function indirection.
267 // In Java, we would declare this class final.
268 while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >= 0) {
269 --num;
270 }
271 }
272
273 void
switchToForward()274 FCDUIterCollationIterator::switchToForward() {
275 U_ASSERT(state == ITER_CHECK_BWD ||
276 (state == ITER_IN_FCD_SEGMENT && pos == limit) ||
277 (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length()));
278 if(state == ITER_CHECK_BWD) {
279 // Turn around from backward checking.
280 start = pos = iter.getIndex(&iter, UITER_CURRENT);
281 if(pos == limit) {
282 state = ITER_CHECK_FWD; // Check forward.
283 } else { // pos < limit
284 state = ITER_IN_FCD_SEGMENT; // Stay in FCD segment.
285 }
286 } else {
287 // Reached the end of the FCD segment.
288 if(state == ITER_IN_FCD_SEGMENT) {
289 // The input text segment is FCD, extend it forward.
290 } else {
291 // The input text segment needed to be normalized.
292 // Switch to checking forward from it.
293 if(state == IN_NORM_ITER_AT_START) {
294 iter.move(&iter, limit - start, UITER_CURRENT);
295 }
296 start = limit;
297 }
298 state = ITER_CHECK_FWD;
299 }
300 }
301
302 UBool
nextSegment(UErrorCode & errorCode)303 FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) {
304 if(U_FAILURE(errorCode)) { return FALSE; }
305 U_ASSERT(state == ITER_CHECK_FWD);
306 // The input text [start..(iter index)[ passes the FCD check.
307 pos = iter.getIndex(&iter, UITER_CURRENT);
308 // Collect the characters being checked, in case they need to be normalized.
309 UnicodeString s;
310 uint8_t prevCC = 0;
311 for(;;) {
312 // Fetch the next character and its fcd16 value.
313 UChar32 c = uiter_next32(&iter);
314 if(c < 0) { break; }
315 uint16_t fcd16 = nfcImpl.getFCD16(c);
316 uint8_t leadCC = (uint8_t)(fcd16 >> 8);
317 if(leadCC == 0 && !s.isEmpty()) {
318 // FCD boundary before this character.
319 uiter_previous32(&iter);
320 break;
321 }
322 s.append(c);
323 if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
324 // Fails FCD check. Find the next FCD boundary and normalize.
325 for(;;) {
326 c = uiter_next32(&iter);
327 if(c < 0) { break; }
328 if(nfcImpl.getFCD16(c) <= 0xff) {
329 uiter_previous32(&iter);
330 break;
331 }
332 s.append(c);
333 }
334 if(!normalize(s, errorCode)) { return FALSE; }
335 start = pos;
336 limit = pos + s.length();
337 state = IN_NORM_ITER_AT_LIMIT;
338 pos = 0;
339 return TRUE;
340 }
341 prevCC = (uint8_t)fcd16;
342 if(prevCC == 0) {
343 // FCD boundary after the last character.
344 break;
345 }
346 }
347 limit = pos + s.length();
348 U_ASSERT(pos != limit);
349 iter.move(&iter, -s.length(), UITER_CURRENT);
350 state = ITER_IN_FCD_SEGMENT;
351 return TRUE;
352 }
353
354 void
switchToBackward()355 FCDUIterCollationIterator::switchToBackward() {
356 U_ASSERT(state == ITER_CHECK_FWD ||
357 (state == ITER_IN_FCD_SEGMENT && pos == start) ||
358 (state >= IN_NORM_ITER_AT_LIMIT && pos == 0));
359 if(state == ITER_CHECK_FWD) {
360 // Turn around from forward checking.
361 limit = pos = iter.getIndex(&iter, UITER_CURRENT);
362 if(pos == start) {
363 state = ITER_CHECK_BWD; // Check backward.
364 } else { // pos > start
365 state = ITER_IN_FCD_SEGMENT; // Stay in FCD segment.
366 }
367 } else {
368 // Reached the start of the FCD segment.
369 if(state == ITER_IN_FCD_SEGMENT) {
370 // The input text segment is FCD, extend it backward.
371 } else {
372 // The input text segment needed to be normalized.
373 // Switch to checking backward from it.
374 if(state == IN_NORM_ITER_AT_LIMIT) {
375 iter.move(&iter, start - limit, UITER_CURRENT);
376 }
377 limit = start;
378 }
379 state = ITER_CHECK_BWD;
380 }
381 }
382
383 UBool
previousSegment(UErrorCode & errorCode)384 FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) {
385 if(U_FAILURE(errorCode)) { return FALSE; }
386 U_ASSERT(state == ITER_CHECK_BWD);
387 // The input text [(iter index)..limit[ passes the FCD check.
388 pos = iter.getIndex(&iter, UITER_CURRENT);
389 // Collect the characters being checked, in case they need to be normalized.
390 UnicodeString s;
391 uint8_t nextCC = 0;
392 for(;;) {
393 // Fetch the previous character and its fcd16 value.
394 UChar32 c = uiter_previous32(&iter);
395 if(c < 0) { break; }
396 uint16_t fcd16 = nfcImpl.getFCD16(c);
397 uint8_t trailCC = (uint8_t)fcd16;
398 if(trailCC == 0 && !s.isEmpty()) {
399 // FCD boundary after this character.
400 uiter_next32(&iter);
401 break;
402 }
403 s.append(c);
404 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
405 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
406 // Fails FCD check. Find the previous FCD boundary and normalize.
407 while(fcd16 > 0xff) {
408 c = uiter_previous32(&iter);
409 if(c < 0) { break; }
410 fcd16 = nfcImpl.getFCD16(c);
411 if(fcd16 == 0) {
412 (void)uiter_next32(&iter);
413 break;
414 }
415 s.append(c);
416 }
417 s.reverse();
418 if(!normalize(s, errorCode)) { return FALSE; }
419 limit = pos;
420 start = pos - s.length();
421 state = IN_NORM_ITER_AT_START;
422 pos = normalized.length();
423 return TRUE;
424 }
425 nextCC = (uint8_t)(fcd16 >> 8);
426 if(nextCC == 0) {
427 // FCD boundary before the following character.
428 break;
429 }
430 }
431 start = pos - s.length();
432 U_ASSERT(pos != start);
433 iter.move(&iter, s.length(), UITER_CURRENT);
434 state = ITER_IN_FCD_SEGMENT;
435 return TRUE;
436 }
437
438 UBool
normalize(const UnicodeString & s,UErrorCode & errorCode)439 FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
440 // NFD without argument checking.
441 U_ASSERT(U_SUCCESS(errorCode));
442 nfcImpl.decompose(s, normalized, errorCode);
443 return U_SUCCESS(errorCode);
444 }
445
446 U_NAMESPACE_END
447
448 #endif // !UCONFIG_NO_COLLATION
449