1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2002-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  uiter.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002jan18
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 #include "unicode/ustring.h"
19 #include "unicode/chariter.h"
20 #include "unicode/rep.h"
21 #include "unicode/uiter.h"
22 #include "unicode/utf.h"
23 #include "unicode/utf8.h"
24 #include "unicode/utf16.h"
25 #include "cstring.h"
26 
27 U_NAMESPACE_USE
28 
/* Nonzero if the integer expression n has its lowest bit clear. */
#define IS_EVEN(n) (((n)&1)==0)
/*
 * Nonzero if the address held in pointer expression p is even.
 * The argument is parenthesized so that an expression such as q+1 is
 * converted as a whole rather than only its first operand.
 */
#define IS_POINTER_EVEN(p) IS_EVEN((size_t)(p))
31 
32 U_CDECL_BEGIN
33 
34 /* No-Op UCharIterator implementation for illegal input --------------------- */
35 
36 static int32_t U_CALLCONV
noopGetIndex(UCharIterator *,UCharIteratorOrigin)37 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
38     return 0;
39 }
40 
41 static int32_t U_CALLCONV
noopMove(UCharIterator *,int32_t,UCharIteratorOrigin)42 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
43     return 0;
44 }
45 
46 static UBool U_CALLCONV
noopHasNext(UCharIterator *)47 noopHasNext(UCharIterator * /*iter*/) {
48     return FALSE;
49 }
50 
51 static UChar32 U_CALLCONV
noopCurrent(UCharIterator *)52 noopCurrent(UCharIterator * /*iter*/) {
53     return U_SENTINEL;
54 }
55 
56 static uint32_t U_CALLCONV
noopGetState(const UCharIterator *)57 noopGetState(const UCharIterator * /*iter*/) {
58     return UITER_NO_STATE;
59 }
60 
61 static void U_CALLCONV
noopSetState(UCharIterator *,uint32_t,UErrorCode * pErrorCode)62 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
63     *pErrorCode=U_UNSUPPORTED_ERROR;
64 }
65 
66 static const UCharIterator noopIterator={
67     0, 0, 0, 0, 0, 0,
68     noopGetIndex,
69     noopMove,
70     noopHasNext,
71     noopHasNext,
72     noopCurrent,
73     noopCurrent,
74     noopCurrent,
75     NULL,
76     noopGetState,
77     noopSetState
78 };
79 
80 /* UCharIterator implementation for simple strings -------------------------- */
81 
82 /*
83  * This is an implementation of a code unit (UChar) iterator
84  * for UChar * strings.
85  *
86  * The UCharIterator.context field holds a pointer to the string.
87  */
88 
89 static int32_t U_CALLCONV
stringIteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)90 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
91     switch(origin) {
92     case UITER_ZERO:
93         return 0;
94     case UITER_START:
95         return iter->start;
96     case UITER_CURRENT:
97         return iter->index;
98     case UITER_LIMIT:
99         return iter->limit;
100     case UITER_LENGTH:
101         return iter->length;
102     default:
103         /* not a valid origin */
104         /* Should never get here! */
105         return -1;
106     }
107 }
108 
109 static int32_t U_CALLCONV
stringIteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)110 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
111     int32_t pos;
112 
113     switch(origin) {
114     case UITER_ZERO:
115         pos=delta;
116         break;
117     case UITER_START:
118         pos=iter->start+delta;
119         break;
120     case UITER_CURRENT:
121         pos=iter->index+delta;
122         break;
123     case UITER_LIMIT:
124         pos=iter->limit+delta;
125         break;
126     case UITER_LENGTH:
127         pos=iter->length+delta;
128         break;
129     default:
130         return -1;  /* Error */
131     }
132 
133     if(pos<iter->start) {
134         pos=iter->start;
135     } else if(pos>iter->limit) {
136         pos=iter->limit;
137     }
138 
139     return iter->index=pos;
140 }
141 
142 static UBool U_CALLCONV
stringIteratorHasNext(UCharIterator * iter)143 stringIteratorHasNext(UCharIterator *iter) {
144     return iter->index<iter->limit;
145 }
146 
147 static UBool U_CALLCONV
stringIteratorHasPrevious(UCharIterator * iter)148 stringIteratorHasPrevious(UCharIterator *iter) {
149     return iter->index>iter->start;
150 }
151 
152 static UChar32 U_CALLCONV
stringIteratorCurrent(UCharIterator * iter)153 stringIteratorCurrent(UCharIterator *iter) {
154     if(iter->index<iter->limit) {
155         return ((const UChar *)(iter->context))[iter->index];
156     } else {
157         return U_SENTINEL;
158     }
159 }
160 
161 static UChar32 U_CALLCONV
stringIteratorNext(UCharIterator * iter)162 stringIteratorNext(UCharIterator *iter) {
163     if(iter->index<iter->limit) {
164         return ((const UChar *)(iter->context))[iter->index++];
165     } else {
166         return U_SENTINEL;
167     }
168 }
169 
170 static UChar32 U_CALLCONV
stringIteratorPrevious(UCharIterator * iter)171 stringIteratorPrevious(UCharIterator *iter) {
172     if(iter->index>iter->start) {
173         return ((const UChar *)(iter->context))[--iter->index];
174     } else {
175         return U_SENTINEL;
176     }
177 }
178 
179 static uint32_t U_CALLCONV
stringIteratorGetState(const UCharIterator * iter)180 stringIteratorGetState(const UCharIterator *iter) {
181     return (uint32_t)iter->index;
182 }
183 
184 static void U_CALLCONV
stringIteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)185 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
186     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
187         /* do nothing */
188     } else if(iter==NULL) {
189         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
190     } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
191         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
192     } else {
193         iter->index=(int32_t)state;
194     }
195 }
196 
197 static const UCharIterator stringIterator={
198     0, 0, 0, 0, 0, 0,
199     stringIteratorGetIndex,
200     stringIteratorMove,
201     stringIteratorHasNext,
202     stringIteratorHasPrevious,
203     stringIteratorCurrent,
204     stringIteratorNext,
205     stringIteratorPrevious,
206     NULL,
207     stringIteratorGetState,
208     stringIteratorSetState
209 };
210 
211 U_CAPI void U_EXPORT2
uiter_setString(UCharIterator * iter,const UChar * s,int32_t length)212 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) {
213     if(iter!=0) {
214         if(s!=0 && length>=-1) {
215             *iter=stringIterator;
216             iter->context=s;
217             if(length>=0) {
218                 iter->length=length;
219             } else {
220                 iter->length=u_strlen(s);
221             }
222             iter->limit=iter->length;
223         } else {
224             *iter=noopIterator;
225         }
226     }
227 }
228 
229 /* UCharIterator implementation for UTF-16BE strings ------------------------ */
230 
231 /*
232  * This is an implementation of a code unit (UChar) iterator
233  * for UTF-16BE strings, i.e., strings in byte-vectors where
234  * each UChar is stored as a big-endian pair of bytes.
235  *
236  * The UCharIterator.context field holds a pointer to the string.
237  * Everything works just like with a normal UChar iterator (uiter_setString),
238  * except that UChars are assembled from byte pairs.
239  */
240 
241 /* internal helper function */
242 static inline UChar32
utf16BEIteratorGet(UCharIterator * iter,int32_t index)243 utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
244     const uint8_t *p=(const uint8_t *)iter->context;
245     return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
246 }
247 
248 static UChar32 U_CALLCONV
utf16BEIteratorCurrent(UCharIterator * iter)249 utf16BEIteratorCurrent(UCharIterator *iter) {
250     int32_t index;
251 
252     if((index=iter->index)<iter->limit) {
253         return utf16BEIteratorGet(iter, index);
254     } else {
255         return U_SENTINEL;
256     }
257 }
258 
259 static UChar32 U_CALLCONV
utf16BEIteratorNext(UCharIterator * iter)260 utf16BEIteratorNext(UCharIterator *iter) {
261     int32_t index;
262 
263     if((index=iter->index)<iter->limit) {
264         iter->index=index+1;
265         return utf16BEIteratorGet(iter, index);
266     } else {
267         return U_SENTINEL;
268     }
269 }
270 
271 static UChar32 U_CALLCONV
utf16BEIteratorPrevious(UCharIterator * iter)272 utf16BEIteratorPrevious(UCharIterator *iter) {
273     int32_t index;
274 
275     if((index=iter->index)>iter->start) {
276         iter->index=--index;
277         return utf16BEIteratorGet(iter, index);
278     } else {
279         return U_SENTINEL;
280     }
281 }
282 
283 static const UCharIterator utf16BEIterator={
284     0, 0, 0, 0, 0, 0,
285     stringIteratorGetIndex,
286     stringIteratorMove,
287     stringIteratorHasNext,
288     stringIteratorHasPrevious,
289     utf16BEIteratorCurrent,
290     utf16BEIteratorNext,
291     utf16BEIteratorPrevious,
292     NULL,
293     stringIteratorGetState,
294     stringIteratorSetState
295 };
296 
297 /*
298  * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL,
299  * i.e., before a pair of 0 bytes where the first 0 byte is at an even
300  * offset from s.
301  */
302 static int32_t
utf16BE_strlen(const char * s)303 utf16BE_strlen(const char *s) {
304     if(IS_POINTER_EVEN(s)) {
305         /*
306          * even-aligned, call u_strlen(s)
307          * we are probably on a little-endian machine, but searching for UChar NUL
308          * does not care about endianness
309          */
310         return u_strlen((const UChar *)s);
311     } else {
312         /* odd-aligned, search for pair of 0 bytes */
313         const char *p=s;
314 
315         while(!(*p==0 && p[1]==0)) {
316             p+=2;
317         }
318         return (int32_t)((p-s)/2);
319     }
320 }
321 
322 U_CAPI void U_EXPORT2
uiter_setUTF16BE(UCharIterator * iter,const char * s,int32_t length)323 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
324     if(iter!=NULL) {
325         /* allow only even-length strings (the input length counts bytes) */
326         if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) {
327             /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
328             length>>=1;
329 
330             if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
331                 /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */
332                 uiter_setString(iter, (const UChar *)s, length);
333                 return;
334             }
335 
336             *iter=utf16BEIterator;
337             iter->context=s;
338             if(length>=0) {
339                 iter->length=length;
340             } else {
341                 iter->length=utf16BE_strlen(s);
342             }
343             iter->limit=iter->length;
344         } else {
345             *iter=noopIterator;
346         }
347     }
348 }
349 
350 /* UCharIterator wrapper around CharacterIterator --------------------------- */
351 
352 /*
353  * This is wrapper code around a C++ CharacterIterator to
354  * look like a C UCharIterator.
355  *
356  * The UCharIterator.context field holds a pointer to the CharacterIterator.
357  */
358 
359 static int32_t U_CALLCONV
characterIteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)360 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
361     switch(origin) {
362     case UITER_ZERO:
363         return 0;
364     case UITER_START:
365         return ((CharacterIterator *)(iter->context))->startIndex();
366     case UITER_CURRENT:
367         return ((CharacterIterator *)(iter->context))->getIndex();
368     case UITER_LIMIT:
369         return ((CharacterIterator *)(iter->context))->endIndex();
370     case UITER_LENGTH:
371         return ((CharacterIterator *)(iter->context))->getLength();
372     default:
373         /* not a valid origin */
374         /* Should never get here! */
375         return -1;
376     }
377 }
378 
379 static int32_t U_CALLCONV
characterIteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)380 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
381     switch(origin) {
382     case UITER_ZERO:
383         ((CharacterIterator *)(iter->context))->setIndex(delta);
384         return ((CharacterIterator *)(iter->context))->getIndex();
385     case UITER_START:
386     case UITER_CURRENT:
387     case UITER_LIMIT:
388         return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
389     case UITER_LENGTH:
390         ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
391         return ((CharacterIterator *)(iter->context))->getIndex();
392     default:
393         /* not a valid origin */
394         /* Should never get here! */
395         return -1;
396     }
397 }
398 
399 static UBool U_CALLCONV
characterIteratorHasNext(UCharIterator * iter)400 characterIteratorHasNext(UCharIterator *iter) {
401     return ((CharacterIterator *)(iter->context))->hasNext();
402 }
403 
404 static UBool U_CALLCONV
characterIteratorHasPrevious(UCharIterator * iter)405 characterIteratorHasPrevious(UCharIterator *iter) {
406     return ((CharacterIterator *)(iter->context))->hasPrevious();
407 }
408 
409 static UChar32 U_CALLCONV
characterIteratorCurrent(UCharIterator * iter)410 characterIteratorCurrent(UCharIterator *iter) {
411     UChar32 c;
412 
413     c=((CharacterIterator *)(iter->context))->current();
414     if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
415         return c;
416     } else {
417         return U_SENTINEL;
418     }
419 }
420 
421 static UChar32 U_CALLCONV
characterIteratorNext(UCharIterator * iter)422 characterIteratorNext(UCharIterator *iter) {
423     if(((CharacterIterator *)(iter->context))->hasNext()) {
424         return ((CharacterIterator *)(iter->context))->nextPostInc();
425     } else {
426         return U_SENTINEL;
427     }
428 }
429 
430 static UChar32 U_CALLCONV
characterIteratorPrevious(UCharIterator * iter)431 characterIteratorPrevious(UCharIterator *iter) {
432     if(((CharacterIterator *)(iter->context))->hasPrevious()) {
433         return ((CharacterIterator *)(iter->context))->previous();
434     } else {
435         return U_SENTINEL;
436     }
437 }
438 
439 static uint32_t U_CALLCONV
characterIteratorGetState(const UCharIterator * iter)440 characterIteratorGetState(const UCharIterator *iter) {
441     return ((CharacterIterator *)(iter->context))->getIndex();
442 }
443 
444 static void U_CALLCONV
characterIteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)445 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
446     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
447         /* do nothing */
448     } else if(iter==NULL || iter->context==NULL) {
449         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
450     } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
451         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
452     } else {
453         ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
454     }
455 }
456 
457 static const UCharIterator characterIteratorWrapper={
458     0, 0, 0, 0, 0, 0,
459     characterIteratorGetIndex,
460     characterIteratorMove,
461     characterIteratorHasNext,
462     characterIteratorHasPrevious,
463     characterIteratorCurrent,
464     characterIteratorNext,
465     characterIteratorPrevious,
466     NULL,
467     characterIteratorGetState,
468     characterIteratorSetState
469 };
470 
471 U_CAPI void U_EXPORT2
uiter_setCharacterIterator(UCharIterator * iter,CharacterIterator * charIter)472 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
473     if(iter!=0) {
474         if(charIter!=0) {
475             *iter=characterIteratorWrapper;
476             iter->context=charIter;
477         } else {
478             *iter=noopIterator;
479         }
480     }
481 }
482 
483 /* UCharIterator wrapper around Replaceable --------------------------------- */
484 
485 /*
486  * This is an implementation of a code unit (UChar) iterator
487  * based on a Replaceable object.
488  *
489  * The UCharIterator.context field holds a pointer to the Replaceable.
490  * UCharIterator.length and UCharIterator.index hold Replaceable.length()
491  * and the iteration index.
492  */
493 
494 static UChar32 U_CALLCONV
replaceableIteratorCurrent(UCharIterator * iter)495 replaceableIteratorCurrent(UCharIterator *iter) {
496     if(iter->index<iter->limit) {
497         return ((Replaceable *)(iter->context))->charAt(iter->index);
498     } else {
499         return U_SENTINEL;
500     }
501 }
502 
503 static UChar32 U_CALLCONV
replaceableIteratorNext(UCharIterator * iter)504 replaceableIteratorNext(UCharIterator *iter) {
505     if(iter->index<iter->limit) {
506         return ((Replaceable *)(iter->context))->charAt(iter->index++);
507     } else {
508         return U_SENTINEL;
509     }
510 }
511 
512 static UChar32 U_CALLCONV
replaceableIteratorPrevious(UCharIterator * iter)513 replaceableIteratorPrevious(UCharIterator *iter) {
514     if(iter->index>iter->start) {
515         return ((Replaceable *)(iter->context))->charAt(--iter->index);
516     } else {
517         return U_SENTINEL;
518     }
519 }
520 
521 static const UCharIterator replaceableIterator={
522     0, 0, 0, 0, 0, 0,
523     stringIteratorGetIndex,
524     stringIteratorMove,
525     stringIteratorHasNext,
526     stringIteratorHasPrevious,
527     replaceableIteratorCurrent,
528     replaceableIteratorNext,
529     replaceableIteratorPrevious,
530     NULL,
531     stringIteratorGetState,
532     stringIteratorSetState
533 };
534 
535 U_CAPI void U_EXPORT2
uiter_setReplaceable(UCharIterator * iter,const Replaceable * rep)536 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
537     if(iter!=0) {
538         if(rep!=0) {
539             *iter=replaceableIterator;
540             iter->context=rep;
541             iter->limit=iter->length=rep->length();
542         } else {
543             *iter=noopIterator;
544         }
545     }
546 }
547 
548 /* UCharIterator implementation for UTF-8 strings --------------------------- */
549 
550 /*
551  * Possible, probably necessary only for an implementation for arbitrary
552  * converters:
553  * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
 * This would require turning reservedFn into a close function and
 * introducing a uiter_close(iter).
556  */
557 
558 #define UITER_CNV_CAPACITY 16
559 
560 /*
561  * Minimal implementation:
562  * Maintain a single-UChar buffer for an additional surrogate.
563  * The caller must not modify start and limit because they are used internally.
564  *
565  * Use UCharIterator fields as follows:
566  *   context        pointer to UTF-8 string
567  *   length         UTF-16 length of the string; -1 until lazy evaluation
568  *   start          current UTF-8 index
569  *   index          current UTF-16 index; may be -1="unknown" after setState()
570  *   limit          UTF-8 length of the string
571  *   reservedField  supplementary code point
572  *
573  * Since UCharIterator delivers 16-bit code units, the iteration can be
574  * currently in the middle of the byte sequence for a supplementary code point.
575  * In this case, reservedField will contain that code point and start will
576  * point to after the corresponding byte sequence. The UTF-16 index will be
577  * one less than what it would otherwise be corresponding to the UTF-8 index.
578  * Otherwise, reservedField will be 0.
579  */
580 
581 /*
582  * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
583  * Add implementations that do not call strlen() for iteration but check for NUL.
584  */
585 
586 static int32_t U_CALLCONV
utf8IteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)587 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
588     switch(origin) {
589     case UITER_ZERO:
590     case UITER_START:
591         return 0;
592     case UITER_CURRENT:
593         if(iter->index<0) {
594             /* the current UTF-16 index is unknown after setState(), count from the beginning */
595             const uint8_t *s;
596             UChar32 c;
597             int32_t i, limit, index;
598 
599             s=(const uint8_t *)iter->context;
600             i=index=0;
601             limit=iter->start; /* count up to the UTF-8 index */
602             while(i<limit) {
603                 U8_NEXT_OR_FFFD(s, i, limit, c);
604                 index+=U16_LENGTH(c);
605             }
606 
607             iter->start=i; /* just in case setState() did not get us to a code point boundary */
608             if(i==iter->limit) {
609                 iter->length=index; /* in case it was <0 or wrong */
610             }
611             if(iter->reservedField!=0) {
612                 --index; /* we are in the middle of a supplementary code point */
613             }
614             iter->index=index;
615         }
616         return iter->index;
617     case UITER_LIMIT:
618     case UITER_LENGTH:
619         if(iter->length<0) {
620             const uint8_t *s;
621             UChar32 c;
622             int32_t i, limit, length;
623 
624             s=(const uint8_t *)iter->context;
625             if(iter->index<0) {
626                 /*
627                  * the current UTF-16 index is unknown after setState(),
628                  * we must first count from the beginning to here
629                  */
630                 i=length=0;
631                 limit=iter->start;
632 
633                 /* count from the beginning to the current index */
634                 while(i<limit) {
635                     U8_NEXT_OR_FFFD(s, i, limit, c);
636                     length+=U16_LENGTH(c);
637                 }
638 
639                 /* assume i==limit==iter->start, set the UTF-16 index */
640                 iter->start=i; /* just in case setState() did not get us to a code point boundary */
641                 iter->index= iter->reservedField!=0 ? length-1 : length;
642             } else {
643                 i=iter->start;
644                 length=iter->index;
645                 if(iter->reservedField!=0) {
646                     ++length;
647                 }
648             }
649 
650             /* count from the current index to the end */
651             limit=iter->limit;
652             while(i<limit) {
653                 U8_NEXT_OR_FFFD(s, i, limit, c);
654                 length+=U16_LENGTH(c);
655             }
656             iter->length=length;
657         }
658         return iter->length;
659     default:
660         /* not a valid origin */
661         /* Should never get here! */
662         return -1;
663     }
664 }
665 
666 static int32_t U_CALLCONV
utf8IteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)667 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
668     const uint8_t *s;
669     UChar32 c;
670     int32_t pos; /* requested UTF-16 index */
671     int32_t i; /* UTF-8 index */
672     UBool havePos;
673 
674     /* calculate the requested UTF-16 index */
675     switch(origin) {
676     case UITER_ZERO:
677     case UITER_START:
678         pos=delta;
679         havePos=TRUE;
680         /* iter->index<0 (unknown) is possible */
681         break;
682     case UITER_CURRENT:
683         if(iter->index>=0) {
684             pos=iter->index+delta;
685             havePos=TRUE;
686         } else {
687             /* the current UTF-16 index is unknown after setState(), use only delta */
688             pos=0;
689             havePos=FALSE;
690         }
691         break;
692     case UITER_LIMIT:
693     case UITER_LENGTH:
694         if(iter->length>=0) {
695             pos=iter->length+delta;
696             havePos=TRUE;
697         } else {
698             /* pin to the end, avoid counting the length */
699             iter->index=-1;
700             iter->start=iter->limit;
701             iter->reservedField=0;
702             if(delta>=0) {
703                 return UITER_UNKNOWN_INDEX;
704             } else {
705                 /* the current UTF-16 index is unknown, use only delta */
706                 pos=0;
707                 havePos=FALSE;
708             }
709         }
710         break;
711     default:
712         return -1;  /* Error */
713     }
714 
715     if(havePos) {
716         /* shortcuts: pinning to the edges of the string */
717         if(pos<=0) {
718             iter->index=iter->start=iter->reservedField=0;
719             return 0;
720         } else if(iter->length>=0 && pos>=iter->length) {
721             iter->index=iter->length;
722             iter->start=iter->limit;
723             iter->reservedField=0;
724             return iter->index;
725         }
726 
727         /* minimize the number of U8_NEXT/PREV operations */
728         if(iter->index<0 || pos<iter->index/2) {
729             /* go forward from the start instead of backward from the current index */
730             iter->index=iter->start=iter->reservedField=0;
731         } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
732             /*
733              * if we have the UTF-16 index and length and the new position is
734              * closer to the end than the current index,
735              * then go backward from the end instead of forward from the current index
736              */
737             iter->index=iter->length;
738             iter->start=iter->limit;
739             iter->reservedField=0;
740         }
741 
742         delta=pos-iter->index;
743         if(delta==0) {
744             return iter->index; /* nothing to do */
745         }
746     } else {
747         /* move relative to unknown UTF-16 index */
748         if(delta==0) {
749             return UITER_UNKNOWN_INDEX; /* nothing to do */
750         } else if(-delta>=iter->start) {
751             /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
752             iter->index=iter->start=iter->reservedField=0;
753             return 0;
754         } else if(delta>=(iter->limit-iter->start)) {
755             /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
756             iter->index=iter->length; /* may or may not be <0 (unknown) */
757             iter->start=iter->limit;
758             iter->reservedField=0;
759             return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
760         }
761     }
762 
763     /* delta!=0 */
764 
765     /* move towards the requested position, pin to the edges of the string */
766     s=(const uint8_t *)iter->context;
767     pos=iter->index; /* could be <0 (unknown) */
768     i=iter->start;
769     if(delta>0) {
770         /* go forward */
771         int32_t limit=iter->limit;
772         if(iter->reservedField!=0) {
773             iter->reservedField=0;
774             ++pos;
775             --delta;
776         }
777         while(delta>0 && i<limit) {
778             U8_NEXT_OR_FFFD(s, i, limit, c);
779             if(c<=0xffff) {
780                 ++pos;
781                 --delta;
782             } else if(delta>=2) {
783                 pos+=2;
784                 delta-=2;
785             } else /* delta==1 */ {
786                 /* stop in the middle of a supplementary code point */
787                 iter->reservedField=c;
788                 ++pos;
789                 break; /* delta=0; */
790             }
791         }
792         if(i==limit) {
793             if(iter->length<0 && iter->index>=0) {
794                 iter->length= iter->reservedField==0 ? pos : pos+1;
795             } else if(iter->index<0 && iter->length>=0) {
796                 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
797             }
798         }
799     } else /* delta<0 */ {
800         /* go backward */
801         if(iter->reservedField!=0) {
802             iter->reservedField=0;
803             i-=4; /* we stayed behind the supplementary code point; go before it now */
804             --pos;
805             ++delta;
806         }
807         while(delta<0 && i>0) {
808             U8_PREV_OR_FFFD(s, 0, i, c);
809             if(c<=0xffff) {
810                 --pos;
811                 ++delta;
812             } else if(delta<=-2) {
813                 pos-=2;
814                 delta+=2;
815             } else /* delta==-1 */ {
816                 /* stop in the middle of a supplementary code point */
817                 i+=4; /* back to behind this supplementary code point for consistent state */
818                 iter->reservedField=c;
819                 --pos;
820                 break; /* delta=0; */
821             }
822         }
823     }
824 
825     iter->start=i;
826     if(iter->index>=0) {
827         return iter->index=pos;
828     } else {
829         /* we started with index<0 (unknown) so pos is bogus */
830         if(i<=1) {
831             return iter->index=i; /* reached the beginning */
832         } else {
833             /* we still don't know the UTF-16 index */
834             return UITER_UNKNOWN_INDEX;
835         }
836     }
837 }
838 
839 static UBool U_CALLCONV
utf8IteratorHasNext(UCharIterator * iter)840 utf8IteratorHasNext(UCharIterator *iter) {
841     return iter->start<iter->limit || iter->reservedField!=0;
842 }
843 
844 static UBool U_CALLCONV
utf8IteratorHasPrevious(UCharIterator * iter)845 utf8IteratorHasPrevious(UCharIterator *iter) {
846     return iter->start>0;
847 }
848 
849 static UChar32 U_CALLCONV
utf8IteratorCurrent(UCharIterator * iter)850 utf8IteratorCurrent(UCharIterator *iter) {
851     if(iter->reservedField!=0) {
852         return U16_TRAIL(iter->reservedField);
853     } else if(iter->start<iter->limit) {
854         const uint8_t *s=(const uint8_t *)iter->context;
855         UChar32 c;
856         int32_t i=iter->start;
857 
858         U8_NEXT_OR_FFFD(s, i, iter->limit, c);
859         if(c<=0xffff) {
860             return c;
861         } else {
862             return U16_LEAD(c);
863         }
864     } else {
865         return U_SENTINEL;
866     }
867 }
868 
869 static UChar32 U_CALLCONV
utf8IteratorNext(UCharIterator * iter)870 utf8IteratorNext(UCharIterator *iter) {
871     int32_t index;
872 
873     if(iter->reservedField!=0) {
874         UChar trail=U16_TRAIL(iter->reservedField);
875         iter->reservedField=0;
876         if((index=iter->index)>=0) {
877             iter->index=index+1;
878         }
879         return trail;
880     } else if(iter->start<iter->limit) {
881         const uint8_t *s=(const uint8_t *)iter->context;
882         UChar32 c;
883 
884         U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c);
885         if((index=iter->index)>=0) {
886             iter->index=++index;
887             if(iter->length<0 && iter->start==iter->limit) {
888                 iter->length= c<=0xffff ? index : index+1;
889             }
890         } else if(iter->start==iter->limit && iter->length>=0) {
891             iter->index= c<=0xffff ? iter->length : iter->length-1;
892         }
893         if(c<=0xffff) {
894             return c;
895         } else {
896             iter->reservedField=c;
897             return U16_LEAD(c);
898         }
899     } else {
900         return U_SENTINEL;
901     }
902 }
903 
904 static UChar32 U_CALLCONV
utf8IteratorPrevious(UCharIterator * iter)905 utf8IteratorPrevious(UCharIterator *iter) {
906     int32_t index;
907 
908     if(iter->reservedField!=0) {
909         UChar lead=U16_LEAD(iter->reservedField);
910         iter->reservedField=0;
911         iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
912         if((index=iter->index)>0) {
913             iter->index=index-1;
914         }
915         return lead;
916     } else if(iter->start>0) {
917         const uint8_t *s=(const uint8_t *)iter->context;
918         UChar32 c;
919 
920         U8_PREV_OR_FFFD(s, 0, iter->start, c);
921         if((index=iter->index)>0) {
922             iter->index=index-1;
923         } else if(iter->start<=1) {
924             iter->index= c<=0xffff ? iter->start : iter->start+1;
925         }
926         if(c<=0xffff) {
927             return c;
928         } else {
929             iter->start+=4; /* back to behind this supplementary code point for consistent state */
930             iter->reservedField=c;
931             return U16_TRAIL(c);
932         }
933     } else {
934         return U_SENTINEL;
935     }
936 }
937 
938 static uint32_t U_CALLCONV
utf8IteratorGetState(const UCharIterator * iter)939 utf8IteratorGetState(const UCharIterator *iter) {
940     uint32_t state=(uint32_t)(iter->start<<1);
941     if(iter->reservedField!=0) {
942         state|=1;
943     }
944     return state;
945 }
946 
947 static void U_CALLCONV
utf8IteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)948 utf8IteratorSetState(UCharIterator *iter,
949                      uint32_t state,
950                      UErrorCode *pErrorCode)
951 {
952     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
953         /* do nothing */
954     } else if(iter==NULL) {
955         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
956     } else if(state==utf8IteratorGetState(iter)) {
957         /* setting to the current state: no-op */
958     } else {
959         int32_t index=(int32_t)(state>>1); /* UTF-8 index */
960         state&=1; /* 1 if in surrogate pair, must be index>=4 */
961 
962         if((state==0 ? index<0 : index<4) || iter->limit<index) {
963             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
964         } else {
965             iter->start=index; /* restore UTF-8 byte index */
966             if(index<=1) {
967                 iter->index=index;
968             } else {
969                 iter->index=-1; /* unknown UTF-16 index */
970             }
971             if(state==0) {
972                 iter->reservedField=0;
973             } else {
974                 /* verified index>=4 above */
975                 UChar32 c;
976                 U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c);
977                 if(c<=0xffff) {
978                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
979                 } else {
980                     iter->reservedField=c;
981                 }
982             }
983         }
984     }
985 }
986 
987 static const UCharIterator utf8Iterator={
988     0, 0, 0, 0, 0, 0,
989     utf8IteratorGetIndex,
990     utf8IteratorMove,
991     utf8IteratorHasNext,
992     utf8IteratorHasPrevious,
993     utf8IteratorCurrent,
994     utf8IteratorNext,
995     utf8IteratorPrevious,
996     NULL,
997     utf8IteratorGetState,
998     utf8IteratorSetState
999 };
1000 
1001 U_CAPI void U_EXPORT2
uiter_setUTF8(UCharIterator * iter,const char * s,int32_t length)1002 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
1003     if(iter!=0) {
1004         if(s!=0 && length>=-1) {
1005             *iter=utf8Iterator;
1006             iter->context=s;
1007             if(length>=0) {
1008                 iter->limit=length;
1009             } else {
1010                 iter->limit=(int32_t)uprv_strlen(s);
1011             }
1012             iter->length= iter->limit<=1 ? iter->limit : -1;
1013         } else {
1014             *iter=noopIterator;
1015         }
1016     }
1017 }
1018 
1019 /* Helper functions --------------------------------------------------------- */
1020 
1021 U_CAPI UChar32 U_EXPORT2
uiter_current32(UCharIterator * iter)1022 uiter_current32(UCharIterator *iter) {
1023     UChar32 c, c2;
1024 
1025     c=iter->current(iter);
1026     if(U16_IS_SURROGATE(c)) {
1027         if(U16_IS_SURROGATE_LEAD(c)) {
1028             /*
1029              * go to the next code unit
1030              * we know that we are not at the limit because c!=U_SENTINEL
1031              */
1032             iter->move(iter, 1, UITER_CURRENT);
1033             if(U16_IS_TRAIL(c2=iter->current(iter))) {
1034                 c=U16_GET_SUPPLEMENTARY(c, c2);
1035             }
1036 
1037             /* undo index movement */
1038             iter->move(iter, -1, UITER_CURRENT);
1039         } else {
1040             if(U16_IS_LEAD(c2=iter->previous(iter))) {
1041                 c=U16_GET_SUPPLEMENTARY(c2, c);
1042             }
1043             if(c2>=0) {
1044                 /* undo index movement */
1045                 iter->move(iter, 1, UITER_CURRENT);
1046             }
1047         }
1048     }
1049     return c;
1050 }
1051 
1052 U_CAPI UChar32 U_EXPORT2
uiter_next32(UCharIterator * iter)1053 uiter_next32(UCharIterator *iter) {
1054     UChar32 c, c2;
1055 
1056     c=iter->next(iter);
1057     if(U16_IS_LEAD(c)) {
1058         if(U16_IS_TRAIL(c2=iter->next(iter))) {
1059             c=U16_GET_SUPPLEMENTARY(c, c2);
1060         } else if(c2>=0) {
1061             /* unmatched first surrogate, undo index movement */
1062             iter->move(iter, -1, UITER_CURRENT);
1063         }
1064     }
1065     return c;
1066 }
1067 
1068 U_CAPI UChar32 U_EXPORT2
uiter_previous32(UCharIterator * iter)1069 uiter_previous32(UCharIterator *iter) {
1070     UChar32 c, c2;
1071 
1072     c=iter->previous(iter);
1073     if(U16_IS_TRAIL(c)) {
1074         if(U16_IS_LEAD(c2=iter->previous(iter))) {
1075             c=U16_GET_SUPPLEMENTARY(c2, c);
1076         } else if(c2>=0) {
1077             /* unmatched second surrogate, undo index movement */
1078             iter->move(iter, 1, UITER_CURRENT);
1079         }
1080     }
1081     return c;
1082 }
1083 
1084 U_CAPI uint32_t U_EXPORT2
uiter_getState(const UCharIterator * iter)1085 uiter_getState(const UCharIterator *iter) {
1086     if(iter==NULL || iter->getState==NULL) {
1087         return UITER_NO_STATE;
1088     } else {
1089         return iter->getState(iter);
1090     }
1091 }
1092 
1093 U_CAPI void U_EXPORT2
uiter_setState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)1094 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
1095     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1096         /* do nothing */
1097     } else if(iter==NULL) {
1098         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1099     } else if(iter->setState==NULL) {
1100         *pErrorCode=U_UNSUPPORTED_ERROR;
1101     } else {
1102         iter->setState(iter, state, pErrorCode);
1103     }
1104 }
1105 
1106 U_CDECL_END
1107