1 /* 2 ******************************************************************************* 3 * 4 * © 2016 and later: Unicode, Inc. and others. 5 * License & terms of use: http://www.unicode.org/copyright.html#License 6 * 7 ******************************************************************************* 8 ******************************************************************************* 9 * 10 * Copyright (C) 2003-2006, International Business Machines 11 * Corporation and others. All Rights Reserved. 12 * 13 ******************************************************************************* 14 * file name: uit_len8.c 15 * encoding: UTF-8 16 * tab size: 8 (not used) 17 * indentation:4 18 * 19 * created on: 2003feb10 20 * created by: Markus W. Scherer 21 * 22 * This file contains the implementation of the "lenient UTF-8" UCharIterator 23 * as used in the uciter8 sample code. 24 * UTF-8-style macros are defined as well as the UCharIterator. 25 * The macros are incomplete (do not assemble code points from pairs of 26 * surrogates, see comment below) 27 * but sufficient for the iterator. 28 */ 29 30 #include <string.h> 31 #include "unicode/utypes.h" 32 #include "unicode/uiter.h" 33 34 /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */ 35 36 /* 37 * This code leniently reads 8-bit Unicode strings, 38 * which could contain a mix of UTF-8 and CESU-8. 39 * More precisely: 40 * - supplementary code points may be encoded with dedicated 4-byte sequences 41 * (UTF-8 style) 42 * - supplementary code points may be encoded with 43 * pairs of 3-byte sequences, one for each surrogate of the UTF-16 form 44 * (CESU-8 style) 45 * - single surrogates are allowed, encoded with their "natural" 3-byte sequences 46 * 47 * Limitation: 48 * Right now, the macros do not attempt to assemble code points from pairs of 49 * separately encoded surrogates. 50 * This would not be sufficient for processing based on these macros, 51 * but it is sufficient for a UCharIterator that returns only UChars anyway. 52 * 53 * The code is copied and modified from utf_impl.c and utf8.h. 54 * 55 * Change 2006feb08: Much of the implementation code is replaced by calling 56 * the utf_impl.c functions which accept a new "strict" parameter value 57 * of -2 implementing exactly this leniency. 58 */ 59 60 #define L8_NEXT(s, i, length, c) { \ 61 (c)=(uint8_t)(s)[(i)++]; \ 62 if((c)>=0x80) { \ 63 if(U8_IS_LEAD(c)) { \ 64 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -2); \ 65 } else { \ 66 (c)=U_SENTINEL; \ 67 } \ 68 } \ 69 } 70 71 #define L8_PREV(s, start, i, c) { \ 72 (c)=(uint8_t)(s)[--(i)]; \ 73 if((c)>=0x80) { \ 74 if((c)<=0xbf) { \ 75 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -2); \ 76 } else { \ 77 (c)=U_SENTINEL; \ 78 } \ 79 } \ 80 } 81 82 /* lenient-8 UCharIterator -------------------------------------------------- */ 83 84 /* 85 * This is a copy of the UTF-8 UCharIterator in uiter.cpp, 86 * except that it uses the lenient-8-bit-Unicode macros above. 87 */ 88 89 /* 90 * Minimal implementation: 91 * Maintain a single-UChar buffer for an additional surrogate. 92 * The caller must not modify start and limit because they are used internally. 93 * 94 * Use UCharIterator fields as follows: 95 * context pointer to UTF-8 string 96 * length UTF-16 length of the string; -1 until lazy evaluation 97 * start current UTF-8 index 98 * index current UTF-16 index; may be -1="unknown" after setState() 99 * limit UTF-8 length of the string 100 * reservedField supplementary code point 101 * 102 * Since UCharIterator delivers 16-bit code units, the iteration can be 103 * currently in the middle of the byte sequence for a supplementary code point. 104 * In this case, reservedField will contain that code point and start will 105 * point to after the corresponding byte sequence. The UTF-16 index will be 106 * one less than what it would otherwise be corresponding to the UTF-8 index. 107 * Otherwise, reservedField will be 0. 108 */ 109 110 /* 111 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings: 112 * Add implementations that do not call strlen() for iteration but check for NUL. 113 */ 114 115 static int32_t U_CALLCONV 116 lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 117 switch(origin) { 118 case UITER_ZERO: 119 case UITER_START: 120 return 0; 121 case UITER_CURRENT: 122 if(iter->index<0) { 123 /* the current UTF-16 index is unknown after setState(), count from the beginning */ 124 const uint8_t *s; 125 UChar32 c; 126 int32_t i, limit, index; 127 128 s=(const uint8_t *)iter->context; 129 i=index=0; 130 limit=iter->start; /* count up to the UTF-8 index */ 131 while(i<limit) { 132 L8_NEXT(s, i, limit, c); 133 if(c<=0xffff) { 134 ++index; 135 } else { 136 index+=2; 137 } 138 } 139 140 iter->start=i; /* just in case setState() did not get us to a code point boundary */ 141 if(i==iter->limit) { 142 iter->length=index; /* in case it was <0 or wrong */ 143 } 144 if(iter->reservedField!=0) { 145 --index; /* we are in the middle of a supplementary code point */ 146 } 147 iter->index=index; 148 } 149 return iter->index; 150 case UITER_LIMIT: 151 case UITER_LENGTH: 152 if(iter->length<0) { 153 const uint8_t *s; 154 UChar32 c; 155 int32_t i, limit, length; 156 157 s=(const uint8_t *)iter->context; 158 if(iter->index<0) { 159 /* 160 * the current UTF-16 index is unknown after setState(), 161 * we must first count from the beginning to here 162 */ 163 i=length=0; 164 limit=iter->start; 165 166 /* count from the beginning to the current index */ 167 while(i<limit) { 168 L8_NEXT(s, i, limit, c); 169 if(c<=0xffff) { 170 ++length; 171 } else { 172 length+=2; 173 } 174 } 175 176 /* assume i==limit==iter->start, set the UTF-16 index */ 177 iter->start=i; /* just in case setState() did not get us to a code point boundary */ 178 iter->index= iter->reservedField!=0 ? length-1 : length; 179 } else { 180 i=iter->start; 181 length=iter->index; 182 if(iter->reservedField!=0) { 183 ++length; 184 } 185 } 186 187 /* count from the current index to the end */ 188 limit=iter->limit; 189 while(i<limit) { 190 L8_NEXT(s, i, limit, c); 191 if(c<=0xffff) { 192 ++length; 193 } else { 194 length+=2; 195 } 196 } 197 iter->length=length; 198 } 199 return iter->length; 200 default: 201 /* not a valid origin */ 202 /* Should never get here! */ 203 return -1; 204 } 205 } 206 207 static int32_t U_CALLCONV 208 lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 209 const uint8_t *s; 210 UChar32 c; 211 int32_t pos; /* requested UTF-16 index */ 212 int32_t i; /* UTF-8 index */ 213 UBool havePos; 214 215 /* calculate the requested UTF-16 index */ 216 switch(origin) { 217 case UITER_ZERO: 218 case UITER_START: 219 pos=delta; 220 havePos=TRUE; 221 /* iter->index<0 (unknown) is possible */ 222 break; 223 case UITER_CURRENT: 224 if(iter->index>=0) { 225 pos=iter->index+delta; 226 havePos=TRUE; 227 } else { 228 /* the current UTF-16 index is unknown after setState(), use only delta */ 229 pos=0; 230 havePos=FALSE; 231 } 232 break; 233 case UITER_LIMIT: 234 case UITER_LENGTH: 235 if(iter->length>=0) { 236 pos=iter->length+delta; 237 havePos=TRUE; 238 } else { 239 /* pin to the end, avoid counting the length */ 240 iter->index=-1; 241 iter->start=iter->limit; 242 iter->reservedField=0; 243 if(delta>=0) { 244 return UITER_UNKNOWN_INDEX; 245 } else { 246 /* the current UTF-16 index is unknown, use only delta */ 247 pos=0; 248 havePos=FALSE; 249 } 250 } 251 break; 252 default: 253 return -1; /* Error */ 254 } 255 256 if(havePos) { 257 /* shortcuts: pinning to the edges of the string */ 258 if(pos<=0) { 259 iter->index=iter->start=iter->reservedField=0; 260 return 0; 261 } else if(iter->length>=0 && pos>=iter->length) { 262 iter->index=iter->length; 263 iter->start=iter->limit; 264 iter->reservedField=0; 265 return iter->index; 266 } 267 268 /* minimize the number of L8_NEXT/PREV operations */ 269 if(iter->index<0 || pos<iter->index/2) { 270 /* go forward from the start instead of backward from the current index */ 271 iter->index=iter->start=iter->reservedField=0; 272 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) { 273 /* 274 * if we have the UTF-16 index and length and the new position is 275 * closer to the end than the current index, 276 * then go backward from the end instead of forward from the current index 277 */ 278 iter->index=iter->length; 279 iter->start=iter->limit; 280 iter->reservedField=0; 281 } 282 283 delta=pos-iter->index; 284 if(delta==0) { 285 return iter->index; /* nothing to do */ 286 } 287 } else { 288 /* move relative to unknown UTF-16 index */ 289 if(delta==0) { 290 return UITER_UNKNOWN_INDEX; /* nothing to do */ 291 } else if(-delta>=iter->start) { 292 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */ 293 iter->index=iter->start=iter->reservedField=0; 294 return 0; 295 } else if(delta>=(iter->limit-iter->start)) { 296 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */ 297 iter->index=iter->length; /* may or may not be <0 (unknown) */ 298 iter->start=iter->limit; 299 iter->reservedField=0; 300 return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX; 301 } 302 } 303 304 /* delta!=0 */ 305 306 /* move towards the requested position, pin to the edges of the string */ 307 s=(const uint8_t *)iter->context; 308 pos=iter->index; /* could be <0 (unknown) */ 309 i=iter->start; 310 if(delta>0) { 311 /* go forward */ 312 int32_t limit=iter->limit; 313 if(iter->reservedField!=0) { 314 iter->reservedField=0; 315 ++pos; 316 --delta; 317 } 318 while(delta>0 && i<limit) { 319 L8_NEXT(s, i, limit, c); 320 if(c<0xffff) { 321 ++pos; 322 --delta; 323 } else if(delta>=2) { 324 pos+=2; 325 delta-=2; 326 } else /* delta==1 */ { 327 /* stop in the middle of a supplementary code point */ 328 iter->reservedField=c; 329 ++pos; 330 break; /* delta=0; */ 331 } 332 } 333 if(i==limit) { 334 if(iter->length<0 && iter->index>=0) { 335 iter->length= iter->reservedField==0 ? pos : pos+1; 336 } else if(iter->index<0 && iter->length>=0) { 337 iter->index= iter->reservedField==0 ? iter->length : iter->length-1; 338 } 339 } 340 } else /* delta<0 */ { 341 /* go backward */ 342 if(iter->reservedField!=0) { 343 iter->reservedField=0; 344 i-=4; /* we stayed behind the supplementary code point; go before it now */ 345 --pos; 346 ++delta; 347 } 348 while(delta<0 && i>0) { 349 L8_PREV(s, 0, i, c); 350 if(c<0xffff) { 351 --pos; 352 ++delta; 353 } else if(delta<=-2) { 354 pos-=2; 355 delta+=2; 356 } else /* delta==-1 */ { 357 /* stop in the middle of a supplementary code point */ 358 i+=4; /* back to behind this supplementary code point for consistent state */ 359 iter->reservedField=c; 360 --pos; 361 break; /* delta=0; */ 362 } 363 } 364 } 365 366 iter->start=i; 367 if(iter->index>=0) { 368 return iter->index=pos; 369 } else { 370 /* we started with index<0 (unknown) so pos is bogus */ 371 if(i<=1) { 372 return iter->index=i; /* reached the beginning */ 373 } else { 374 /* we still don't know the UTF-16 index */ 375 return UITER_UNKNOWN_INDEX; 376 } 377 } 378 } 379 380 static UBool U_CALLCONV 381 lenient8IteratorHasNext(UCharIterator *iter) { 382 return iter->reservedField!=0 || iter->start<iter->limit; 383 } 384 385 static UBool U_CALLCONV 386 lenient8IteratorHasPrevious(UCharIterator *iter) { 387 return iter->start>0; 388 } 389 390 static UChar32 U_CALLCONV 391 lenient8IteratorCurrent(UCharIterator *iter) { 392 if(iter->reservedField!=0) { 393 return U16_TRAIL(iter->reservedField); 394 } else if(iter->start<iter->limit) { 395 const uint8_t *s=(const uint8_t *)iter->context; 396 UChar32 c; 397 int32_t i=iter->start; 398 399 L8_NEXT(s, i, iter->limit, c); 400 if(c<0) { 401 return 0xfffd; 402 } else if(c<=0xffff) { 403 return c; 404 } else { 405 return U16_LEAD(c); 406 } 407 } else { 408 return U_SENTINEL; 409 } 410 } 411 412 static UChar32 U_CALLCONV 413 lenient8IteratorNext(UCharIterator *iter) { 414 int32_t index; 415 416 if(iter->reservedField!=0) { 417 UChar trail=U16_TRAIL(iter->reservedField); 418 iter->reservedField=0; 419 if((index=iter->index)>=0) { 420 iter->index=index+1; 421 } 422 return trail; 423 } else if(iter->start<iter->limit) { 424 const uint8_t *s=(const uint8_t *)iter->context; 425 UChar32 c; 426 427 L8_NEXT(s, iter->start, iter->limit, c); 428 if((index=iter->index)>=0) { 429 iter->index=++index; 430 if(iter->length<0 && iter->start==iter->limit) { 431 iter->length= c<=0xffff ? index : index+1; 432 } 433 } else if(iter->start==iter->limit && iter->length>=0) { 434 iter->index= c<=0xffff ? iter->length : iter->length-1; 435 } 436 if(c<0) { 437 return 0xfffd; 438 } else if(c<=0xffff) { 439 return c; 440 } else { 441 iter->reservedField=c; 442 return U16_LEAD(c); 443 } 444 } else { 445 return U_SENTINEL; 446 } 447 } 448 449 static UChar32 U_CALLCONV 450 lenient8IteratorPrevious(UCharIterator *iter) { 451 int32_t index; 452 453 if(iter->reservedField!=0) { 454 UChar lead=U16_LEAD(iter->reservedField); 455 iter->reservedField=0; 456 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */ 457 if((index=iter->index)>0) { 458 iter->index=index-1; 459 } 460 return lead; 461 } else if(iter->start>0) { 462 const uint8_t *s=(const uint8_t *)iter->context; 463 UChar32 c; 464 465 L8_PREV(s, 0, iter->start, c); 466 if((index=iter->index)>0) { 467 iter->index=index-1; 468 } else if(iter->start<=1) { 469 iter->index= c<=0xffff ? iter->start : iter->start+1; 470 } 471 if(c<0) { 472 return 0xfffd; 473 } else if(c<=0xffff) { 474 return c; 475 } else { 476 iter->start+=4; /* back to behind this supplementary code point for consistent state */ 477 iter->reservedField=c; 478 return U16_TRAIL(c); 479 } 480 } else { 481 return U_SENTINEL; 482 } 483 } 484 485 static uint32_t U_CALLCONV 486 lenient8IteratorGetState(const UCharIterator *iter) { 487 uint32_t state=(uint32_t)(iter->start<<1); 488 if(iter->reservedField!=0) { 489 state|=1; 490 } 491 return state; 492 } 493 494 static void U_CALLCONV 495 lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 496 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 497 /* do nothing */ 498 } else if(iter==NULL) { 499 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 500 } else if(state==lenient8IteratorGetState(iter)) { 501 /* setting to the current state: no-op */ 502 } else { 503 int32_t index=(int32_t)(state>>1); /* UTF-8 index */ 504 state&=1; /* 1 if in surrogate pair, must be index>=4 */ 505 506 if((state==0 ? index<0 : index<4) || iter->limit<index) { 507 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 508 } else { 509 iter->start=index; /* restore UTF-8 byte index */ 510 if(index<=1) { 511 iter->index=index; 512 } else { 513 iter->index=-1; /* unknown UTF-16 index */ 514 } 515 if(state==0) { 516 iter->reservedField=0; 517 } else { 518 /* verified index>=4 above */ 519 UChar32 c; 520 L8_PREV((const uint8_t *)iter->context, 0, index, c); 521 if(c<=0xffff) { 522 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 523 } else { 524 iter->reservedField=c; 525 } 526 } 527 } 528 } 529 } 530 531 static const UCharIterator lenient8Iterator={ 532 0, 0, 0, 0, 0, 0, 533 lenient8IteratorGetIndex, 534 lenient8IteratorMove, 535 lenient8IteratorHasNext, 536 lenient8IteratorHasPrevious, 537 lenient8IteratorCurrent, 538 lenient8IteratorNext, 539 lenient8IteratorPrevious, 540 NULL, 541 lenient8IteratorGetState, 542 lenient8IteratorSetState 543 }; 544 545 U_CAPI void U_EXPORT2 546 uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) { 547 if(iter!=0) { 548 if(s!=0 && length>=-1) { 549 *iter=lenient8Iterator; 550 iter->context=s; 551 if(length>=0) { 552 iter->limit=length; 553 } else { 554 iter->limit=strlen(s); 555 } 556 iter->length= iter->limit<=1 ? iter->limit : -1; 557 } else { 558 /* set no-op iterator */ 559 uiter_setString(iter, NULL, 0); 560 } 561 } 562 } 563