1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2016 and later: Unicode, Inc. and others.
5 * License & terms of use: http://www.unicode.org/copyright.html#License
6 *
7 *******************************************************************************
8 *******************************************************************************
9 *
10 * Copyright (C) 2003-2006, International Business Machines
11 * Corporation and others. All Rights Reserved.
12 *
13 *******************************************************************************
14 * file name: uit_len8.c
15 * encoding: US-ASCII
16 * tab size: 8 (not used)
17 * indentation:4
18 *
19 * created on: 2003feb10
20 * created by: Markus W. Scherer
21 *
22 * This file contains the implementation of the "lenient UTF-8" UCharIterator
23 * as used in the uciter8 sample code.
24 * UTF-8-style macros are defined as well as the UCharIterator.
25 * The macros are incomplete (do not assemble code points from pairs of
26 * surrogates, see comment below)
27 * but sufficient for the iterator.
28 */
29
30 #include <string.h>
31 #include "unicode/utypes.h"
32 #include "unicode/uiter.h"
33
34 /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
35
36 /*
37 * This code leniently reads 8-bit Unicode strings,
38 * which could contain a mix of UTF-8 and CESU-8.
39 * More precisely:
40 * - supplementary code points may be encoded with dedicated 4-byte sequences
41 * (UTF-8 style)
42 * - supplementary code points may be encoded with
43 * pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
44 * (CESU-8 style)
45 * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
46 *
47 * Limitation:
48 * Right now, the macros do not attempt to assemble code points from pairs of
49 * separately encoded surrogates.
50 * This would not be sufficient for processing based on these macros,
51 * but it is sufficient for a UCharIterator that returns only UChars anyway.
52 *
53 * The code is copied and modified from utf_impl.c and utf8.h.
54 *
55 * Change 2006feb08: Much of the implementation code is replaced by calling
56 * the utf_impl.c functions which accept a new "strict" parameter value
57 * of -2 implementing exactly this leniency.
58 */
59
/*
 * L8_NEXT: leniently read one code point from the 8-bit string s starting at
 * byte offset i, and advance i behind what was read.
 *
 * s       pointer to the bytes (cast to const uint8_t * where needed)
 * i       integer lvalue, the byte offset; it is modified
 * length  number of bytes in s, passed on as an int32_t
 * c       UChar32 lvalue that receives the result
 *
 * A byte below 0x80 is returned as is.
 * A byte for which U8_IS_LEAD() is true is handed to utf8_nextCharSafeBody()
 * with strict=-2, the lenient mode described in the comment above.
 * Any other byte >=0x80 yields U_SENTINEL.
 *
 * The body is wrapped in do { } while(0) so that "L8_NEXT(...);" is exactly
 * one statement and can be used in an unbraced if/else.
 * s, i and c are evaluated more than once: do not pass expressions with
 * side effects.
 */
#define L8_NEXT(s, i, length, c) do { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} while(0)
70
/*
 * L8_PREV: leniently read the code point that ends just before byte offset i
 * in the 8-bit string s, and move i back to where that read started.
 *
 * s      pointer to the bytes (cast to const uint8_t * where needed)
 * start  lowest byte offset that may be read, passed to utf8_prevCharSafeBody()
 * i      integer lvalue, the byte offset; it is pre-decremented and modified
 * c      UChar32 lvalue that receives the result
 *
 * A byte below 0x80 is returned as is.
 * A byte in 0x80..0xbf is handed to utf8_prevCharSafeBody() with strict=-2,
 * the lenient mode described in the comment above.
 * A byte above 0xbf yields U_SENTINEL.
 *
 * The body is wrapped in do { } while(0) so that "L8_PREV(...);" is exactly
 * one statement and can be used in an unbraced if/else.
 * s, i and c are evaluated more than once: do not pass expressions with
 * side effects.
 */
#define L8_PREV(s, start, i, c) do { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} while(0)
81
82 /* lenient-8 UCharIterator -------------------------------------------------- */
83
84 /*
85 * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
86 * except that it uses the lenient-8-bit-Unicode macros above.
87 */
88
89 /*
90 * Minimal implementation:
91 * Maintain a single-UChar buffer for an additional surrogate.
92 * The caller must not modify start and limit because they are used internally.
93 *
94 * Use UCharIterator fields as follows:
95 * context pointer to UTF-8 string
96 * length UTF-16 length of the string; -1 until lazy evaluation
97 * start current UTF-8 index
98 * index current UTF-16 index; may be -1="unknown" after setState()
99 * limit UTF-8 length of the string
100 * reservedField supplementary code point
101 *
102 * Since UCharIterator delivers 16-bit code units, the iteration can be
103 * currently in the middle of the byte sequence for a supplementary code point.
104 * In this case, reservedField will contain that code point and start will
105 * point to after the corresponding byte sequence. The UTF-16 index will be
106 * one less than what it would otherwise be corresponding to the UTF-8 index.
107 * Otherwise, reservedField will be 0.
108 */
109
110 /*
111 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
112 * Add implementations that do not call strlen() for iteration but check for NUL.
113 */
114
115 static int32_t U_CALLCONV
lenient8IteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)116 lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
117 switch(origin) {
118 case UITER_ZERO:
119 case UITER_START:
120 return 0;
121 case UITER_CURRENT:
122 if(iter->index<0) {
123 /* the current UTF-16 index is unknown after setState(), count from the beginning */
124 const uint8_t *s;
125 UChar32 c;
126 int32_t i, limit, index;
127
128 s=(const uint8_t *)iter->context;
129 i=index=0;
130 limit=iter->start; /* count up to the UTF-8 index */
131 while(i<limit) {
132 L8_NEXT(s, i, limit, c);
133 if(c<=0xffff) {
134 ++index;
135 } else {
136 index+=2;
137 }
138 }
139
140 iter->start=i; /* just in case setState() did not get us to a code point boundary */
141 if(i==iter->limit) {
142 iter->length=index; /* in case it was <0 or wrong */
143 }
144 if(iter->reservedField!=0) {
145 --index; /* we are in the middle of a supplementary code point */
146 }
147 iter->index=index;
148 }
149 return iter->index;
150 case UITER_LIMIT:
151 case UITER_LENGTH:
152 if(iter->length<0) {
153 const uint8_t *s;
154 UChar32 c;
155 int32_t i, limit, length;
156
157 s=(const uint8_t *)iter->context;
158 if(iter->index<0) {
159 /*
160 * the current UTF-16 index is unknown after setState(),
161 * we must first count from the beginning to here
162 */
163 i=length=0;
164 limit=iter->start;
165
166 /* count from the beginning to the current index */
167 while(i<limit) {
168 L8_NEXT(s, i, limit, c);
169 if(c<=0xffff) {
170 ++length;
171 } else {
172 length+=2;
173 }
174 }
175
176 /* assume i==limit==iter->start, set the UTF-16 index */
177 iter->start=i; /* just in case setState() did not get us to a code point boundary */
178 iter->index= iter->reservedField!=0 ? length-1 : length;
179 } else {
180 i=iter->start;
181 length=iter->index;
182 if(iter->reservedField!=0) {
183 ++length;
184 }
185 }
186
187 /* count from the current index to the end */
188 limit=iter->limit;
189 while(i<limit) {
190 L8_NEXT(s, i, limit, c);
191 if(c<=0xffff) {
192 ++length;
193 } else {
194 length+=2;
195 }
196 }
197 iter->length=length;
198 }
199 return iter->length;
200 default:
201 /* not a valid origin */
202 /* Should never get here! */
203 return -1;
204 }
205 }
206
207 static int32_t U_CALLCONV
lenient8IteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)208 lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
209 const uint8_t *s;
210 UChar32 c;
211 int32_t pos; /* requested UTF-16 index */
212 int32_t i; /* UTF-8 index */
213 UBool havePos;
214
215 /* calculate the requested UTF-16 index */
216 switch(origin) {
217 case UITER_ZERO:
218 case UITER_START:
219 pos=delta;
220 havePos=TRUE;
221 /* iter->index<0 (unknown) is possible */
222 break;
223 case UITER_CURRENT:
224 if(iter->index>=0) {
225 pos=iter->index+delta;
226 havePos=TRUE;
227 } else {
228 /* the current UTF-16 index is unknown after setState(), use only delta */
229 pos=0;
230 havePos=FALSE;
231 }
232 break;
233 case UITER_LIMIT:
234 case UITER_LENGTH:
235 if(iter->length>=0) {
236 pos=iter->length+delta;
237 havePos=TRUE;
238 } else {
239 /* pin to the end, avoid counting the length */
240 iter->index=-1;
241 iter->start=iter->limit;
242 iter->reservedField=0;
243 if(delta>=0) {
244 return UITER_UNKNOWN_INDEX;
245 } else {
246 /* the current UTF-16 index is unknown, use only delta */
247 pos=0;
248 havePos=FALSE;
249 }
250 }
251 break;
252 default:
253 return -1; /* Error */
254 }
255
256 if(havePos) {
257 /* shortcuts: pinning to the edges of the string */
258 if(pos<=0) {
259 iter->index=iter->start=iter->reservedField=0;
260 return 0;
261 } else if(iter->length>=0 && pos>=iter->length) {
262 iter->index=iter->length;
263 iter->start=iter->limit;
264 iter->reservedField=0;
265 return iter->index;
266 }
267
268 /* minimize the number of L8_NEXT/PREV operations */
269 if(iter->index<0 || pos<iter->index/2) {
270 /* go forward from the start instead of backward from the current index */
271 iter->index=iter->start=iter->reservedField=0;
272 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
273 /*
274 * if we have the UTF-16 index and length and the new position is
275 * closer to the end than the current index,
276 * then go backward from the end instead of forward from the current index
277 */
278 iter->index=iter->length;
279 iter->start=iter->limit;
280 iter->reservedField=0;
281 }
282
283 delta=pos-iter->index;
284 if(delta==0) {
285 return iter->index; /* nothing to do */
286 }
287 } else {
288 /* move relative to unknown UTF-16 index */
289 if(delta==0) {
290 return UITER_UNKNOWN_INDEX; /* nothing to do */
291 } else if(-delta>=iter->start) {
292 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
293 iter->index=iter->start=iter->reservedField=0;
294 return 0;
295 } else if(delta>=(iter->limit-iter->start)) {
296 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
297 iter->index=iter->length; /* may or may not be <0 (unknown) */
298 iter->start=iter->limit;
299 iter->reservedField=0;
300 return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX;
301 }
302 }
303
304 /* delta!=0 */
305
306 /* move towards the requested position, pin to the edges of the string */
307 s=(const uint8_t *)iter->context;
308 pos=iter->index; /* could be <0 (unknown) */
309 i=iter->start;
310 if(delta>0) {
311 /* go forward */
312 int32_t limit=iter->limit;
313 if(iter->reservedField!=0) {
314 iter->reservedField=0;
315 ++pos;
316 --delta;
317 }
318 while(delta>0 && i<limit) {
319 L8_NEXT(s, i, limit, c);
320 if(c<0xffff) {
321 ++pos;
322 --delta;
323 } else if(delta>=2) {
324 pos+=2;
325 delta-=2;
326 } else /* delta==1 */ {
327 /* stop in the middle of a supplementary code point */
328 iter->reservedField=c;
329 ++pos;
330 break; /* delta=0; */
331 }
332 }
333 if(i==limit) {
334 if(iter->length<0 && iter->index>=0) {
335 iter->length= iter->reservedField==0 ? pos : pos+1;
336 } else if(iter->index<0 && iter->length>=0) {
337 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
338 }
339 }
340 } else /* delta<0 */ {
341 /* go backward */
342 if(iter->reservedField!=0) {
343 iter->reservedField=0;
344 i-=4; /* we stayed behind the supplementary code point; go before it now */
345 --pos;
346 ++delta;
347 }
348 while(delta<0 && i>0) {
349 L8_PREV(s, 0, i, c);
350 if(c<0xffff) {
351 --pos;
352 ++delta;
353 } else if(delta<=-2) {
354 pos-=2;
355 delta+=2;
356 } else /* delta==-1 */ {
357 /* stop in the middle of a supplementary code point */
358 i+=4; /* back to behind this supplementary code point for consistent state */
359 iter->reservedField=c;
360 --pos;
361 break; /* delta=0; */
362 }
363 }
364 }
365
366 iter->start=i;
367 if(iter->index>=0) {
368 return iter->index=pos;
369 } else {
370 /* we started with index<0 (unknown) so pos is bogus */
371 if(i<=1) {
372 return iter->index=i; /* reached the beginning */
373 } else {
374 /* we still don't know the UTF-16 index */
375 return UITER_UNKNOWN_INDEX;
376 }
377 }
378 }
379
380 static UBool U_CALLCONV
lenient8IteratorHasNext(UCharIterator * iter)381 lenient8IteratorHasNext(UCharIterator *iter) {
382 return iter->reservedField!=0 || iter->start<iter->limit;
383 }
384
385 static UBool U_CALLCONV
lenient8IteratorHasPrevious(UCharIterator * iter)386 lenient8IteratorHasPrevious(UCharIterator *iter) {
387 return iter->start>0;
388 }
389
390 static UChar32 U_CALLCONV
lenient8IteratorCurrent(UCharIterator * iter)391 lenient8IteratorCurrent(UCharIterator *iter) {
392 if(iter->reservedField!=0) {
393 return U16_TRAIL(iter->reservedField);
394 } else if(iter->start<iter->limit) {
395 const uint8_t *s=(const uint8_t *)iter->context;
396 UChar32 c;
397 int32_t i=iter->start;
398
399 L8_NEXT(s, i, iter->limit, c);
400 if(c<0) {
401 return 0xfffd;
402 } else if(c<=0xffff) {
403 return c;
404 } else {
405 return U16_LEAD(c);
406 }
407 } else {
408 return U_SENTINEL;
409 }
410 }
411
412 static UChar32 U_CALLCONV
lenient8IteratorNext(UCharIterator * iter)413 lenient8IteratorNext(UCharIterator *iter) {
414 int32_t index;
415
416 if(iter->reservedField!=0) {
417 UChar trail=U16_TRAIL(iter->reservedField);
418 iter->reservedField=0;
419 if((index=iter->index)>=0) {
420 iter->index=index+1;
421 }
422 return trail;
423 } else if(iter->start<iter->limit) {
424 const uint8_t *s=(const uint8_t *)iter->context;
425 UChar32 c;
426
427 L8_NEXT(s, iter->start, iter->limit, c);
428 if((index=iter->index)>=0) {
429 iter->index=++index;
430 if(iter->length<0 && iter->start==iter->limit) {
431 iter->length= c<=0xffff ? index : index+1;
432 }
433 } else if(iter->start==iter->limit && iter->length>=0) {
434 iter->index= c<=0xffff ? iter->length : iter->length-1;
435 }
436 if(c<0) {
437 return 0xfffd;
438 } else if(c<=0xffff) {
439 return c;
440 } else {
441 iter->reservedField=c;
442 return U16_LEAD(c);
443 }
444 } else {
445 return U_SENTINEL;
446 }
447 }
448
449 static UChar32 U_CALLCONV
lenient8IteratorPrevious(UCharIterator * iter)450 lenient8IteratorPrevious(UCharIterator *iter) {
451 int32_t index;
452
453 if(iter->reservedField!=0) {
454 UChar lead=U16_LEAD(iter->reservedField);
455 iter->reservedField=0;
456 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
457 if((index=iter->index)>0) {
458 iter->index=index-1;
459 }
460 return lead;
461 } else if(iter->start>0) {
462 const uint8_t *s=(const uint8_t *)iter->context;
463 UChar32 c;
464
465 L8_PREV(s, 0, iter->start, c);
466 if((index=iter->index)>0) {
467 iter->index=index-1;
468 } else if(iter->start<=1) {
469 iter->index= c<=0xffff ? iter->start : iter->start+1;
470 }
471 if(c<0) {
472 return 0xfffd;
473 } else if(c<=0xffff) {
474 return c;
475 } else {
476 iter->start+=4; /* back to behind this supplementary code point for consistent state */
477 iter->reservedField=c;
478 return U16_TRAIL(c);
479 }
480 } else {
481 return U_SENTINEL;
482 }
483 }
484
485 static uint32_t U_CALLCONV
lenient8IteratorGetState(const UCharIterator * iter)486 lenient8IteratorGetState(const UCharIterator *iter) {
487 uint32_t state=(uint32_t)(iter->start<<1);
488 if(iter->reservedField!=0) {
489 state|=1;
490 }
491 return state;
492 }
493
494 static void U_CALLCONV
lenient8IteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)495 lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
496 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
497 /* do nothing */
498 } else if(iter==NULL) {
499 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
500 } else if(state==lenient8IteratorGetState(iter)) {
501 /* setting to the current state: no-op */
502 } else {
503 int32_t index=(int32_t)(state>>1); /* UTF-8 index */
504 state&=1; /* 1 if in surrogate pair, must be index>=4 */
505
506 if((state==0 ? index<0 : index<4) || iter->limit<index) {
507 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
508 } else {
509 iter->start=index; /* restore UTF-8 byte index */
510 if(index<=1) {
511 iter->index=index;
512 } else {
513 iter->index=-1; /* unknown UTF-16 index */
514 }
515 if(state==0) {
516 iter->reservedField=0;
517 } else {
518 /* verified index>=4 above */
519 UChar32 c;
520 L8_PREV((const uint8_t *)iter->context, 0, index, c);
521 if(c<=0xffff) {
522 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
523 } else {
524 iter->reservedField=c;
525 }
526 }
527 }
528 }
529 }
530
531 static const UCharIterator lenient8Iterator={
532 0, 0, 0, 0, 0, 0,
533 lenient8IteratorGetIndex,
534 lenient8IteratorMove,
535 lenient8IteratorHasNext,
536 lenient8IteratorHasPrevious,
537 lenient8IteratorCurrent,
538 lenient8IteratorNext,
539 lenient8IteratorPrevious,
540 NULL,
541 lenient8IteratorGetState,
542 lenient8IteratorSetState
543 };
544
545 U_CAPI void U_EXPORT2
uiter_setLenient8(UCharIterator * iter,const char * s,int32_t length)546 uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
547 if(iter!=0) {
548 if(s!=0 && length>=-1) {
549 *iter=lenient8Iterator;
550 iter->context=s;
551 if(length>=0) {
552 iter->limit=length;
553 } else {
554 iter->limit=strlen(s);
555 }
556 iter->length= iter->limit<=1 ? iter->limit : -1;
557 } else {
558 /* set no-op iterator */
559 uiter_setString(iter, NULL, 0);
560 }
561 }
562 }
563