1 /*
2 *******************************************************************************
3 *
4 *   © 2016 and later: Unicode, Inc. and others.
5 *   License & terms of use: http://www.unicode.org/copyright.html#License
6 *
7 *******************************************************************************
8 *******************************************************************************
9 *
10 *   Copyright (C) 2003-2006, International Business Machines
11 *   Corporation and others.  All Rights Reserved.
12 *
13 *******************************************************************************
14 *   file name:  uit_len8.c
15 *   encoding:   UTF-8
16 *   tab size:   8 (not used)
17 *   indentation:4
18 *
19 *   created on: 2003feb10
20 *   created by: Markus W. Scherer
21 *
22 *   This file contains the implementation of the "lenient UTF-8" UCharIterator
23 *   as used in the uciter8 sample code.
24 *   UTF-8-style macros are defined as well as the UCharIterator.
25 *   The macros are incomplete (do not assemble code points from pairs of
26 *   surrogates, see comment below)
27 *   but sufficient for the iterator.
28 */
29 
30 #include <string.h>
31 #include "unicode/utypes.h"
32 #include "unicode/uiter.h"
33 
34 /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
35 
36 /*
37  * This code leniently reads 8-bit Unicode strings,
38  * which could contain a mix of UTF-8 and CESU-8.
39  * More precisely:
40  * - supplementary code points may be encoded with dedicated 4-byte sequences
41  *   (UTF-8 style)
42  * - supplementary code points may be encoded with
43  *   pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
44  *   (CESU-8 style)
45  * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
46  *
47  * Limitation:
48  * Right now, the macros do not attempt to assemble code points from pairs of
49  * separately encoded surrogates.
50  * This would not be sufficient for processing based on these macros,
51  * but it is sufficient for a UCharIterator that returns only UChars anyway.
52  *
53  * The code is copied and modified from utf_impl.c and utf8.h.
54  *
55  * Change 2006feb08: Much of the implementation code is replaced by calling
56  * the utf_impl.c functions which accept a new "strict" parameter value
57  * of -2 implementing exactly this leniency.
58  */
59 
/*
 * L8_NEXT: leniently decode one code point from the 8-bit string s.
 *
 *   s       byte string (any pointer type; elements are read as uint8_t)
 *   i       lvalue byte index; read at entry, advanced past the decoded sequence
 *   length  byte length of s, passed on to utf8_nextCharSafeBody()
 *   c       lvalue receiving the result
 *
 * Bytes below 0x80 are returned as-is. A lead byte is handed to
 * utf8_nextCharSafeBody() with strict=-2 (the lenient mode described above).
 * Any other byte (a stray trail byte or an invalid lead) yields U_SENTINEL.
 *
 * Every argument is parenthesized and the body is wrapped in do/while(0)
 * so that the macro behaves like a single statement, including inside
 * an unbraced if/else. i and c are evaluated more than once.
 */
#define L8_NEXT(s, i, length, c) do { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} while(0)
70 
/*
 * L8_PREV: leniently decode the code point that ends just before byte index i.
 *
 *   s      byte string (any pointer type; elements are read as uint8_t)
 *   start  lowest byte index that may be read, passed on to utf8_prevCharSafeBody()
 *   i      lvalue byte index; decremented to the start of the decoded sequence
 *   c      lvalue receiving the result
 *
 * Bytes below 0x80 are returned as-is. A byte in 0x80..0xbf is handed to
 * utf8_prevCharSafeBody() with strict=-2 (the lenient mode described above).
 * A byte above 0xbf in this position yields U_SENTINEL.
 *
 * Every argument is parenthesized and the body is wrapped in do/while(0)
 * so that the macro behaves like a single statement, including inside
 * an unbraced if/else. i and c are evaluated more than once.
 */
#define L8_PREV(s, start, i, c) do { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} while(0)
81 
82 /* lenient-8 UCharIterator -------------------------------------------------- */
83 
84 /*
85  * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
86  * except that it uses the lenient-8-bit-Unicode macros above.
87  */
88 
89 /*
90  * Minimal implementation:
91  * Maintain a single-UChar buffer for an additional surrogate.
92  * The caller must not modify start and limit because they are used internally.
93  *
94  * Use UCharIterator fields as follows:
95  *   context        pointer to UTF-8 string
96  *   length         UTF-16 length of the string; -1 until lazy evaluation
97  *   start          current UTF-8 index
98  *   index          current UTF-16 index; may be -1="unknown" after setState()
99  *   limit          UTF-8 length of the string
100  *   reservedField  supplementary code point
101  *
102  * Since UCharIterator delivers 16-bit code units, the iteration can be
103  * currently in the middle of the byte sequence for a supplementary code point.
104  * In this case, reservedField will contain that code point and start will
105  * point to after the corresponding byte sequence. The UTF-16 index will be
106  * one less than what it would otherwise be corresponding to the UTF-8 index.
107  * Otherwise, reservedField will be 0.
108  */
109 
110 /*
111  * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
112  * Add implementations that do not call strlen() for iteration but check for NUL.
113  */
114 
115 static int32_t U_CALLCONV
116 lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
117     switch(origin) {
118     case UITER_ZERO:
119     case UITER_START:
120         return 0;
121     case UITER_CURRENT:
122         if(iter->index<0) {
123             /* the current UTF-16 index is unknown after setState(), count from the beginning */
124             const uint8_t *s;
125             UChar32 c;
126             int32_t i, limit, index;
127 
128             s=(const uint8_t *)iter->context;
129             i=index=0;
130             limit=iter->start; /* count up to the UTF-8 index */
131             while(i<limit) {
132                 L8_NEXT(s, i, limit, c);
133                 if(c<=0xffff) {
134                     ++index;
135                 } else {
136                     index+=2;
137                 }
138             }
139 
140             iter->start=i; /* just in case setState() did not get us to a code point boundary */
141             if(i==iter->limit) {
142                 iter->length=index; /* in case it was <0 or wrong */
143             }
144             if(iter->reservedField!=0) {
145                 --index; /* we are in the middle of a supplementary code point */
146             }
147             iter->index=index;
148         }
149         return iter->index;
150     case UITER_LIMIT:
151     case UITER_LENGTH:
152         if(iter->length<0) {
153             const uint8_t *s;
154             UChar32 c;
155             int32_t i, limit, length;
156 
157             s=(const uint8_t *)iter->context;
158             if(iter->index<0) {
159                 /*
160                  * the current UTF-16 index is unknown after setState(),
161                  * we must first count from the beginning to here
162                  */
163                 i=length=0;
164                 limit=iter->start;
165 
166                 /* count from the beginning to the current index */
167                 while(i<limit) {
168                     L8_NEXT(s, i, limit, c);
169                     if(c<=0xffff) {
170                         ++length;
171                     } else {
172                         length+=2;
173                     }
174                 }
175 
176                 /* assume i==limit==iter->start, set the UTF-16 index */
177                 iter->start=i; /* just in case setState() did not get us to a code point boundary */
178                 iter->index= iter->reservedField!=0 ? length-1 : length;
179             } else {
180                 i=iter->start;
181                 length=iter->index;
182                 if(iter->reservedField!=0) {
183                     ++length;
184                 }
185             }
186 
187             /* count from the current index to the end */
188             limit=iter->limit;
189             while(i<limit) {
190                 L8_NEXT(s, i, limit, c);
191                 if(c<=0xffff) {
192                     ++length;
193                 } else {
194                     length+=2;
195                 }
196             }
197             iter->length=length;
198         }
199         return iter->length;
200     default:
201         /* not a valid origin */
202         /* Should never get here! */
203         return -1;
204     }
205 }
206 
207 static int32_t U_CALLCONV
208 lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
209     const uint8_t *s;
210     UChar32 c;
211     int32_t pos; /* requested UTF-16 index */
212     int32_t i; /* UTF-8 index */
213     UBool havePos;
214 
215     /* calculate the requested UTF-16 index */
216     switch(origin) {
217     case UITER_ZERO:
218     case UITER_START:
219         pos=delta;
220         havePos=TRUE;
221         /* iter->index<0 (unknown) is possible */
222         break;
223     case UITER_CURRENT:
224         if(iter->index>=0) {
225             pos=iter->index+delta;
226             havePos=TRUE;
227         } else {
228             /* the current UTF-16 index is unknown after setState(), use only delta */
229             pos=0;
230             havePos=FALSE;
231         }
232         break;
233     case UITER_LIMIT:
234     case UITER_LENGTH:
235         if(iter->length>=0) {
236             pos=iter->length+delta;
237             havePos=TRUE;
238         } else {
239             /* pin to the end, avoid counting the length */
240             iter->index=-1;
241             iter->start=iter->limit;
242             iter->reservedField=0;
243             if(delta>=0) {
244                 return UITER_UNKNOWN_INDEX;
245             } else {
246                 /* the current UTF-16 index is unknown, use only delta */
247                 pos=0;
248                 havePos=FALSE;
249             }
250         }
251         break;
252     default:
253         return -1;  /* Error */
254     }
255 
256     if(havePos) {
257         /* shortcuts: pinning to the edges of the string */
258         if(pos<=0) {
259             iter->index=iter->start=iter->reservedField=0;
260             return 0;
261         } else if(iter->length>=0 && pos>=iter->length) {
262             iter->index=iter->length;
263             iter->start=iter->limit;
264             iter->reservedField=0;
265             return iter->index;
266         }
267 
268         /* minimize the number of L8_NEXT/PREV operations */
269         if(iter->index<0 || pos<iter->index/2) {
270             /* go forward from the start instead of backward from the current index */
271             iter->index=iter->start=iter->reservedField=0;
272         } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
273             /*
274              * if we have the UTF-16 index and length and the new position is
275              * closer to the end than the current index,
276              * then go backward from the end instead of forward from the current index
277              */
278             iter->index=iter->length;
279             iter->start=iter->limit;
280             iter->reservedField=0;
281         }
282 
283         delta=pos-iter->index;
284         if(delta==0) {
285             return iter->index; /* nothing to do */
286         }
287     } else {
288         /* move relative to unknown UTF-16 index */
289         if(delta==0) {
290             return UITER_UNKNOWN_INDEX; /* nothing to do */
291         } else if(-delta>=iter->start) {
292             /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
293             iter->index=iter->start=iter->reservedField=0;
294             return 0;
295         } else if(delta>=(iter->limit-iter->start)) {
296             /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
297             iter->index=iter->length; /* may or may not be <0 (unknown) */
298             iter->start=iter->limit;
299             iter->reservedField=0;
300             return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX;
301         }
302     }
303 
304     /* delta!=0 */
305 
306     /* move towards the requested position, pin to the edges of the string */
307     s=(const uint8_t *)iter->context;
308     pos=iter->index; /* could be <0 (unknown) */
309     i=iter->start;
310     if(delta>0) {
311         /* go forward */
312         int32_t limit=iter->limit;
313         if(iter->reservedField!=0) {
314             iter->reservedField=0;
315             ++pos;
316             --delta;
317         }
318         while(delta>0 && i<limit) {
319             L8_NEXT(s, i, limit, c);
320             if(c<0xffff) {
321                 ++pos;
322                 --delta;
323             } else if(delta>=2) {
324                 pos+=2;
325                 delta-=2;
326             } else /* delta==1 */ {
327                 /* stop in the middle of a supplementary code point */
328                 iter->reservedField=c;
329                 ++pos;
330                 break; /* delta=0; */
331             }
332         }
333         if(i==limit) {
334             if(iter->length<0 && iter->index>=0) {
335                 iter->length= iter->reservedField==0 ? pos : pos+1;
336             } else if(iter->index<0 && iter->length>=0) {
337                 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
338             }
339         }
340     } else /* delta<0 */ {
341         /* go backward */
342         if(iter->reservedField!=0) {
343             iter->reservedField=0;
344             i-=4; /* we stayed behind the supplementary code point; go before it now */
345             --pos;
346             ++delta;
347         }
348         while(delta<0 && i>0) {
349             L8_PREV(s, 0, i, c);
350             if(c<0xffff) {
351                 --pos;
352                 ++delta;
353             } else if(delta<=-2) {
354                 pos-=2;
355                 delta+=2;
356             } else /* delta==-1 */ {
357                 /* stop in the middle of a supplementary code point */
358                 i+=4; /* back to behind this supplementary code point for consistent state */
359                 iter->reservedField=c;
360                 --pos;
361                 break; /* delta=0; */
362             }
363         }
364     }
365 
366     iter->start=i;
367     if(iter->index>=0) {
368         return iter->index=pos;
369     } else {
370         /* we started with index<0 (unknown) so pos is bogus */
371         if(i<=1) {
372             return iter->index=i; /* reached the beginning */
373         } else {
374             /* we still don't know the UTF-16 index */
375             return UITER_UNKNOWN_INDEX;
376         }
377     }
378 }
379 
380 static UBool U_CALLCONV
381 lenient8IteratorHasNext(UCharIterator *iter) {
382     return iter->reservedField!=0 || iter->start<iter->limit;
383 }
384 
385 static UBool U_CALLCONV
386 lenient8IteratorHasPrevious(UCharIterator *iter) {
387     return iter->start>0;
388 }
389 
390 static UChar32 U_CALLCONV
391 lenient8IteratorCurrent(UCharIterator *iter) {
392     if(iter->reservedField!=0) {
393         return U16_TRAIL(iter->reservedField);
394     } else if(iter->start<iter->limit) {
395         const uint8_t *s=(const uint8_t *)iter->context;
396         UChar32 c;
397         int32_t i=iter->start;
398 
399         L8_NEXT(s, i, iter->limit, c);
400         if(c<0) {
401             return 0xfffd;
402         } else if(c<=0xffff) {
403             return c;
404         } else {
405             return U16_LEAD(c);
406         }
407     } else {
408         return U_SENTINEL;
409     }
410 }
411 
412 static UChar32 U_CALLCONV
413 lenient8IteratorNext(UCharIterator *iter) {
414     int32_t index;
415 
416     if(iter->reservedField!=0) {
417         UChar trail=U16_TRAIL(iter->reservedField);
418         iter->reservedField=0;
419         if((index=iter->index)>=0) {
420             iter->index=index+1;
421         }
422         return trail;
423     } else if(iter->start<iter->limit) {
424         const uint8_t *s=(const uint8_t *)iter->context;
425         UChar32 c;
426 
427         L8_NEXT(s, iter->start, iter->limit, c);
428         if((index=iter->index)>=0) {
429             iter->index=++index;
430             if(iter->length<0 && iter->start==iter->limit) {
431                 iter->length= c<=0xffff ? index : index+1;
432             }
433         } else if(iter->start==iter->limit && iter->length>=0) {
434             iter->index= c<=0xffff ? iter->length : iter->length-1;
435         }
436         if(c<0) {
437             return 0xfffd;
438         } else if(c<=0xffff) {
439             return c;
440         } else {
441             iter->reservedField=c;
442             return U16_LEAD(c);
443         }
444     } else {
445         return U_SENTINEL;
446     }
447 }
448 
449 static UChar32 U_CALLCONV
450 lenient8IteratorPrevious(UCharIterator *iter) {
451     int32_t index;
452 
453     if(iter->reservedField!=0) {
454         UChar lead=U16_LEAD(iter->reservedField);
455         iter->reservedField=0;
456         iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
457         if((index=iter->index)>0) {
458             iter->index=index-1;
459         }
460         return lead;
461     } else if(iter->start>0) {
462         const uint8_t *s=(const uint8_t *)iter->context;
463         UChar32 c;
464 
465         L8_PREV(s, 0, iter->start, c);
466         if((index=iter->index)>0) {
467             iter->index=index-1;
468         } else if(iter->start<=1) {
469             iter->index= c<=0xffff ? iter->start : iter->start+1;
470         }
471         if(c<0) {
472             return 0xfffd;
473         } else if(c<=0xffff) {
474             return c;
475         } else {
476             iter->start+=4; /* back to behind this supplementary code point for consistent state */
477             iter->reservedField=c;
478             return U16_TRAIL(c);
479         }
480     } else {
481         return U_SENTINEL;
482     }
483 }
484 
485 static uint32_t U_CALLCONV
486 lenient8IteratorGetState(const UCharIterator *iter) {
487     uint32_t state=(uint32_t)(iter->start<<1);
488     if(iter->reservedField!=0) {
489         state|=1;
490     }
491     return state;
492 }
493 
494 static void U_CALLCONV
495 lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
496     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
497         /* do nothing */
498     } else if(iter==NULL) {
499         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
500     } else if(state==lenient8IteratorGetState(iter)) {
501         /* setting to the current state: no-op */
502     } else {
503         int32_t index=(int32_t)(state>>1); /* UTF-8 index */
504         state&=1; /* 1 if in surrogate pair, must be index>=4 */
505 
506         if((state==0 ? index<0 : index<4) || iter->limit<index) {
507             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
508         } else {
509             iter->start=index; /* restore UTF-8 byte index */
510             if(index<=1) {
511                 iter->index=index;
512             } else {
513                 iter->index=-1; /* unknown UTF-16 index */
514             }
515             if(state==0) {
516                 iter->reservedField=0;
517             } else {
518                 /* verified index>=4 above */
519                 UChar32 c;
520                 L8_PREV((const uint8_t *)iter->context, 0, index, c);
521                 if(c<=0xffff) {
522                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
523                 } else {
524                     iter->reservedField=c;
525                 }
526             }
527         }
528     }
529 }
530 
/*
 * Template for a lenient-8 UCharIterator.
 * uiter_setLenient8() copies this structure into the caller's iterator and
 * then fills in context, limit and length for the given string.
 * The initializer is positional, so the order of the entries must match
 * the declaration of UCharIterator.
 */
static const UCharIterator lenient8Iterator={
    0, 0, 0, 0, 0, 0, /* the data fields start out as zero */
    lenient8IteratorGetIndex,
    lenient8IteratorMove,
    lenient8IteratorHasNext,
    lenient8IteratorHasPrevious,
    lenient8IteratorCurrent,
    lenient8IteratorNext,
    lenient8IteratorPrevious,
    NULL, /* NOTE(review): presumably the reserved function slot of UCharIterator -- confirm against uiter.h */
    lenient8IteratorGetState,
    lenient8IteratorSetState
};
544 
545 U_CAPI void U_EXPORT2
546 uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
547     if(iter!=0) {
548         if(s!=0 && length>=-1) {
549             *iter=lenient8Iterator;
550             iter->context=s;
551             if(length>=0) {
552                 iter->limit=length;
553             } else {
554                 iter->limit=strlen(s);
555             }
556             iter->length= iter->limit<=1 ? iter->limit : -1;
557         } else {
558             /* set no-op iterator */
559             uiter_setString(iter, NULL, 0);
560         }
561     }
562 }
563