1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2003-2006, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  uit_len8.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2003feb10
14 *   created by: Markus W. Scherer
15 *
16 *   This file contains the implementation of the "lenient UTF-8" UCharIterator
17 *   as used in the uciter8 sample code.
18 *   UTF-8-style macros are defined as well as the UCharIterator.
19 *   The macros are incomplete (do not assemble code points from pairs of
20 *   surrogates, see comment below)
21 *   but sufficient for the iterator.
22 */
23 
24 #include <string.h>
25 #include "unicode/utypes.h"
26 #include "unicode/uiter.h"
27 
28 /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
29 
30 /*
31  * This code leniently reads 8-bit Unicode strings,
32  * which could contain a mix of UTF-8 and CESU-8.
33  * More precisely:
34  * - supplementary code points may be encoded with dedicated 4-byte sequences
35  *   (UTF-8 style)
36  * - supplementary code points may be encoded with
37  *   pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
38  *   (CESU-8 style)
39  * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
40  *
41  * Limitation:
42  * Right now, the macros do not attempt to assemble code points from pairs of
43  * separately encoded surrogates.
44  * This would not be sufficient for processing based on these macros,
45  * but it is sufficient for a UCharIterator that returns only UChars anyway.
46  *
47  * The code is copied and modified from utf_impl.c and utf8.h.
48  *
49  * Change 2006feb08: Much of the implementation code is replaced by calling
50  * the utf_impl.c functions which accept a new "strict" parameter value
51  * of -2 implementing exactly this leniency.
52  */
53 
/*
 * L8_NEXT(s, i, length, c)
 * Read one lenient-8 code point forward from the byte string s.
 *
 *   s       byte string (any pointer type convertible to const uint8_t *)
 *   i       byte index lvalue; post-incremented past the sequence that was read
 *   length  byte length of s, passed on to utf8_nextCharSafeBody()
 *   c       UChar32 lvalue receiving the result
 *
 * Bytes below 0x80 are returned as is. A lead byte hands off to
 * utf8_nextCharSafeBody() with strict=-2 (the lenient mode described above).
 * Any other byte >=0x80 yields U_SENTINEL.
 *
 * The arguments are evaluated more than once, so they must not have side effects.
 * The body is wrapped in do { } while(0) so that the macro behaves like a single
 * statement (safe in an unbraced if/else); invoke it with a trailing semicolon.
 */
#define L8_NEXT(s, i, length, c) do { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} while(0)
64 
/*
 * L8_PREV(s, start, i, c)
 * Read one lenient-8 code point backward from the byte string s.
 *
 *   s       byte string (any pointer type convertible to const uint8_t *)
 *   start   lowest byte index that may be read, passed on to utf8_prevCharSafeBody()
 *   i       byte index lvalue; pre-decremented, then moved further back by
 *           utf8_prevCharSafeBody() for multi-byte sequences
 *   c       UChar32 lvalue receiving the result
 *
 * Bytes below 0x80 are returned as is. A byte in 0x80..0xbf hands off to
 * utf8_prevCharSafeBody() with strict=-2 (the lenient mode described above).
 * A byte above 0xbf found at the end of a sequence yields U_SENTINEL.
 *
 * The arguments are evaluated more than once, so they must not have side effects.
 * The body is wrapped in do { } while(0) so that the macro behaves like a single
 * statement (safe in an unbraced if/else); invoke it with a trailing semicolon.
 */
#define L8_PREV(s, start, i, c) do { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} while(0)
75 
76 /* lenient-8 UCharIterator -------------------------------------------------- */
77 
78 /*
79  * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
80  * except that it uses the lenient-8-bit-Unicode macros above.
81  */
82 
83 /*
84  * Minimal implementation:
85  * Maintain a single-UChar buffer for an additional surrogate.
86  * The caller must not modify start and limit because they are used internally.
87  *
88  * Use UCharIterator fields as follows:
89  *   context        pointer to UTF-8 string
90  *   length         UTF-16 length of the string; -1 until lazy evaluation
91  *   start          current UTF-8 index
92  *   index          current UTF-16 index; may be -1="unknown" after setState()
93  *   limit          UTF-8 length of the string
94  *   reservedField  supplementary code point
95  *
96  * Since UCharIterator delivers 16-bit code units, the iteration can be
97  * currently in the middle of the byte sequence for a supplementary code point.
98  * In this case, reservedField will contain that code point and start will
99  * point to after the corresponding byte sequence. The UTF-16 index will be
100  * one less than what it would otherwise be corresponding to the UTF-8 index.
101  * Otherwise, reservedField will be 0.
102  */
103 
104 /*
105  * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
106  * Add implementations that do not call strlen() for iteration but check for NUL.
107  */
108 
109 static int32_t U_CALLCONV
lenient8IteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)110 lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
111     switch(origin) {
112     case UITER_ZERO:
113     case UITER_START:
114         return 0;
115     case UITER_CURRENT:
116         if(iter->index<0) {
117             /* the current UTF-16 index is unknown after setState(), count from the beginning */
118             const uint8_t *s;
119             UChar32 c;
120             int32_t i, limit, index;
121 
122             s=(const uint8_t *)iter->context;
123             i=index=0;
124             limit=iter->start; /* count up to the UTF-8 index */
125             while(i<limit) {
126                 L8_NEXT(s, i, limit, c);
127                 if(c<=0xffff) {
128                     ++index;
129                 } else {
130                     index+=2;
131                 }
132             }
133 
134             iter->start=i; /* just in case setState() did not get us to a code point boundary */
135             if(i==iter->limit) {
136                 iter->length=index; /* in case it was <0 or wrong */
137             }
138             if(iter->reservedField!=0) {
139                 --index; /* we are in the middle of a supplementary code point */
140             }
141             iter->index=index;
142         }
143         return iter->index;
144     case UITER_LIMIT:
145     case UITER_LENGTH:
146         if(iter->length<0) {
147             const uint8_t *s;
148             UChar32 c;
149             int32_t i, limit, length;
150 
151             s=(const uint8_t *)iter->context;
152             if(iter->index<0) {
153                 /*
154                  * the current UTF-16 index is unknown after setState(),
155                  * we must first count from the beginning to here
156                  */
157                 i=length=0;
158                 limit=iter->start;
159 
160                 /* count from the beginning to the current index */
161                 while(i<limit) {
162                     L8_NEXT(s, i, limit, c);
163                     if(c<=0xffff) {
164                         ++length;
165                     } else {
166                         length+=2;
167                     }
168                 }
169 
170                 /* assume i==limit==iter->start, set the UTF-16 index */
171                 iter->start=i; /* just in case setState() did not get us to a code point boundary */
172                 iter->index= iter->reservedField!=0 ? length-1 : length;
173             } else {
174                 i=iter->start;
175                 length=iter->index;
176                 if(iter->reservedField!=0) {
177                     ++length;
178                 }
179             }
180 
181             /* count from the current index to the end */
182             limit=iter->limit;
183             while(i<limit) {
184                 L8_NEXT(s, i, limit, c);
185                 if(c<=0xffff) {
186                     ++length;
187                 } else {
188                     length+=2;
189                 }
190             }
191             iter->length=length;
192         }
193         return iter->length;
194     default:
195         /* not a valid origin */
196         /* Should never get here! */
197         return -1;
198     }
199 }
200 
201 static int32_t U_CALLCONV
lenient8IteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)202 lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
203     const uint8_t *s;
204     UChar32 c;
205     int32_t pos; /* requested UTF-16 index */
206     int32_t i; /* UTF-8 index */
207     UBool havePos;
208 
209     /* calculate the requested UTF-16 index */
210     switch(origin) {
211     case UITER_ZERO:
212     case UITER_START:
213         pos=delta;
214         havePos=TRUE;
215         /* iter->index<0 (unknown) is possible */
216         break;
217     case UITER_CURRENT:
218         if(iter->index>=0) {
219             pos=iter->index+delta;
220             havePos=TRUE;
221         } else {
222             /* the current UTF-16 index is unknown after setState(), use only delta */
223             pos=0;
224             havePos=FALSE;
225         }
226         break;
227     case UITER_LIMIT:
228     case UITER_LENGTH:
229         if(iter->length>=0) {
230             pos=iter->length+delta;
231             havePos=TRUE;
232         } else {
233             /* pin to the end, avoid counting the length */
234             iter->index=-1;
235             iter->start=iter->limit;
236             iter->reservedField=0;
237             if(delta>=0) {
238                 return UITER_UNKNOWN_INDEX;
239             } else {
240                 /* the current UTF-16 index is unknown, use only delta */
241                 pos=0;
242                 havePos=FALSE;
243             }
244         }
245         break;
246     default:
247         return -1;  /* Error */
248     }
249 
250     if(havePos) {
251         /* shortcuts: pinning to the edges of the string */
252         if(pos<=0) {
253             iter->index=iter->start=iter->reservedField=0;
254             return 0;
255         } else if(iter->length>=0 && pos>=iter->length) {
256             iter->index=iter->length;
257             iter->start=iter->limit;
258             iter->reservedField=0;
259             return iter->index;
260         }
261 
262         /* minimize the number of L8_NEXT/PREV operations */
263         if(iter->index<0 || pos<iter->index/2) {
264             /* go forward from the start instead of backward from the current index */
265             iter->index=iter->start=iter->reservedField=0;
266         } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
267             /*
268              * if we have the UTF-16 index and length and the new position is
269              * closer to the end than the current index,
270              * then go backward from the end instead of forward from the current index
271              */
272             iter->index=iter->length;
273             iter->start=iter->limit;
274             iter->reservedField=0;
275         }
276 
277         delta=pos-iter->index;
278         if(delta==0) {
279             return iter->index; /* nothing to do */
280         }
281     } else {
282         /* move relative to unknown UTF-16 index */
283         if(delta==0) {
284             return UITER_UNKNOWN_INDEX; /* nothing to do */
285         } else if(-delta>=iter->start) {
286             /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
287             iter->index=iter->start=iter->reservedField=0;
288             return 0;
289         } else if(delta>=(iter->limit-iter->start)) {
290             /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
291             iter->index=iter->length; /* may or may not be <0 (unknown) */
292             iter->start=iter->limit;
293             iter->reservedField=0;
294             return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX;
295         }
296     }
297 
298     /* delta!=0 */
299 
300     /* move towards the requested position, pin to the edges of the string */
301     s=(const uint8_t *)iter->context;
302     pos=iter->index; /* could be <0 (unknown) */
303     i=iter->start;
304     if(delta>0) {
305         /* go forward */
306         int32_t limit=iter->limit;
307         if(iter->reservedField!=0) {
308             iter->reservedField=0;
309             ++pos;
310             --delta;
311         }
312         while(delta>0 && i<limit) {
313             L8_NEXT(s, i, limit, c);
314             if(c<0xffff) {
315                 ++pos;
316                 --delta;
317             } else if(delta>=2) {
318                 pos+=2;
319                 delta-=2;
320             } else /* delta==1 */ {
321                 /* stop in the middle of a supplementary code point */
322                 iter->reservedField=c;
323                 ++pos;
324                 break; /* delta=0; */
325             }
326         }
327         if(i==limit) {
328             if(iter->length<0 && iter->index>=0) {
329                 iter->length= iter->reservedField==0 ? pos : pos+1;
330             } else if(iter->index<0 && iter->length>=0) {
331                 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
332             }
333         }
334     } else /* delta<0 */ {
335         /* go backward */
336         if(iter->reservedField!=0) {
337             iter->reservedField=0;
338             i-=4; /* we stayed behind the supplementary code point; go before it now */
339             --pos;
340             ++delta;
341         }
342         while(delta<0 && i>0) {
343             L8_PREV(s, 0, i, c);
344             if(c<0xffff) {
345                 --pos;
346                 ++delta;
347             } else if(delta<=-2) {
348                 pos-=2;
349                 delta+=2;
350             } else /* delta==-1 */ {
351                 /* stop in the middle of a supplementary code point */
352                 i+=4; /* back to behind this supplementary code point for consistent state */
353                 iter->reservedField=c;
354                 --pos;
355                 break; /* delta=0; */
356             }
357         }
358     }
359 
360     iter->start=i;
361     if(iter->index>=0) {
362         return iter->index=pos;
363     } else {
364         /* we started with index<0 (unknown) so pos is bogus */
365         if(i<=1) {
366             return iter->index=i; /* reached the beginning */
367         } else {
368             /* we still don't know the UTF-16 index */
369             return UITER_UNKNOWN_INDEX;
370         }
371     }
372 }
373 
374 static UBool U_CALLCONV
lenient8IteratorHasNext(UCharIterator * iter)375 lenient8IteratorHasNext(UCharIterator *iter) {
376     return iter->reservedField!=0 || iter->start<iter->limit;
377 }
378 
379 static UBool U_CALLCONV
lenient8IteratorHasPrevious(UCharIterator * iter)380 lenient8IteratorHasPrevious(UCharIterator *iter) {
381     return iter->start>0;
382 }
383 
384 static UChar32 U_CALLCONV
lenient8IteratorCurrent(UCharIterator * iter)385 lenient8IteratorCurrent(UCharIterator *iter) {
386     if(iter->reservedField!=0) {
387         return U16_TRAIL(iter->reservedField);
388     } else if(iter->start<iter->limit) {
389         const uint8_t *s=(const uint8_t *)iter->context;
390         UChar32 c;
391         int32_t i=iter->start;
392 
393         L8_NEXT(s, i, iter->limit, c);
394         if(c<0) {
395             return 0xfffd;
396         } else if(c<=0xffff) {
397             return c;
398         } else {
399             return U16_LEAD(c);
400         }
401     } else {
402         return U_SENTINEL;
403     }
404 }
405 
406 static UChar32 U_CALLCONV
lenient8IteratorNext(UCharIterator * iter)407 lenient8IteratorNext(UCharIterator *iter) {
408     int32_t index;
409 
410     if(iter->reservedField!=0) {
411         UChar trail=U16_TRAIL(iter->reservedField);
412         iter->reservedField=0;
413         if((index=iter->index)>=0) {
414             iter->index=index+1;
415         }
416         return trail;
417     } else if(iter->start<iter->limit) {
418         const uint8_t *s=(const uint8_t *)iter->context;
419         UChar32 c;
420 
421         L8_NEXT(s, iter->start, iter->limit, c);
422         if((index=iter->index)>=0) {
423             iter->index=++index;
424             if(iter->length<0 && iter->start==iter->limit) {
425                 iter->length= c<=0xffff ? index : index+1;
426             }
427         } else if(iter->start==iter->limit && iter->length>=0) {
428             iter->index= c<=0xffff ? iter->length : iter->length-1;
429         }
430         if(c<0) {
431             return 0xfffd;
432         } else if(c<=0xffff) {
433             return c;
434         } else {
435             iter->reservedField=c;
436             return U16_LEAD(c);
437         }
438     } else {
439         return U_SENTINEL;
440     }
441 }
442 
443 static UChar32 U_CALLCONV
lenient8IteratorPrevious(UCharIterator * iter)444 lenient8IteratorPrevious(UCharIterator *iter) {
445     int32_t index;
446 
447     if(iter->reservedField!=0) {
448         UChar lead=U16_LEAD(iter->reservedField);
449         iter->reservedField=0;
450         iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
451         if((index=iter->index)>0) {
452             iter->index=index-1;
453         }
454         return lead;
455     } else if(iter->start>0) {
456         const uint8_t *s=(const uint8_t *)iter->context;
457         UChar32 c;
458 
459         L8_PREV(s, 0, iter->start, c);
460         if((index=iter->index)>0) {
461             iter->index=index-1;
462         } else if(iter->start<=1) {
463             iter->index= c<=0xffff ? iter->start : iter->start+1;
464         }
465         if(c<0) {
466             return 0xfffd;
467         } else if(c<=0xffff) {
468             return c;
469         } else {
470             iter->start+=4; /* back to behind this supplementary code point for consistent state */
471             iter->reservedField=c;
472             return U16_TRAIL(c);
473         }
474     } else {
475         return U_SENTINEL;
476     }
477 }
478 
479 static uint32_t U_CALLCONV
lenient8IteratorGetState(const UCharIterator * iter)480 lenient8IteratorGetState(const UCharIterator *iter) {
481     uint32_t state=(uint32_t)(iter->start<<1);
482     if(iter->reservedField!=0) {
483         state|=1;
484     }
485     return state;
486 }
487 
488 static void U_CALLCONV
lenient8IteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)489 lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
490     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
491         /* do nothing */
492     } else if(iter==NULL) {
493         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
494     } else if(state==lenient8IteratorGetState(iter)) {
495         /* setting to the current state: no-op */
496     } else {
497         int32_t index=(int32_t)(state>>1); /* UTF-8 index */
498         state&=1; /* 1 if in surrogate pair, must be index>=4 */
499 
500         if((state==0 ? index<0 : index<4) || iter->limit<index) {
501             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
502         } else {
503             iter->start=index; /* restore UTF-8 byte index */
504             if(index<=1) {
505                 iter->index=index;
506             } else {
507                 iter->index=-1; /* unknown UTF-16 index */
508             }
509             if(state==0) {
510                 iter->reservedField=0;
511             } else {
512                 /* verified index>=4 above */
513                 UChar32 c;
514                 L8_PREV((const uint8_t *)iter->context, 0, index, c);
515                 if(c<=0xffff) {
516                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
517                 } else {
518                     iter->reservedField=c;
519                 }
520             }
521         }
522     }
523 }
524 
525 static const UCharIterator lenient8Iterator={
526     0, 0, 0, 0, 0, 0,
527     lenient8IteratorGetIndex,
528     lenient8IteratorMove,
529     lenient8IteratorHasNext,
530     lenient8IteratorHasPrevious,
531     lenient8IteratorCurrent,
532     lenient8IteratorNext,
533     lenient8IteratorPrevious,
534     NULL,
535     lenient8IteratorGetState,
536     lenient8IteratorSetState
537 };
538 
539 U_CAPI void U_EXPORT2
uiter_setLenient8(UCharIterator * iter,const char * s,int32_t length)540 uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
541     if(iter!=0) {
542         if(s!=0 && length>=-1) {
543             *iter=lenient8Iterator;
544             iter->context=s;
545             if(length>=0) {
546                 iter->limit=length;
547             } else {
548                 iter->limit=strlen(s);
549             }
550             iter->length= iter->limit<=1 ? iter->limit : -1;
551         } else {
552             /* set no-op iterator */
553             uiter_setString(iter, NULL, 0);
554         }
555     }
556 }
557