// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2002-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u8.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-8 converter implementation. Used to be in ucnv_utf.c.
*
*   Also, CESU-8 implementation, see UTR 26.
*   The CESU-8 converter uses all the same functions as the
*   UTF-8 converter, with a branch for converting supplementary code points.
*/
22
23 #include "unicode/utypes.h"
24
25 #if !UCONFIG_NO_CONVERSION
26
27 #include "unicode/ucnv.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf8.h"
30 #include "unicode/utf16.h"
31 #include "ucnv_bld.h"
32 #include "ucnv_cnv.h"
33 #include "cmemory.h"
34
35 /* Prototypes --------------------------------------------------------------- */
36
37 /* Keep these here to make finicky compilers happy */
38
39 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
40 UErrorCode *err);
41 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
42 UErrorCode *err);
43
44
/* UTF-8 -------------------------------------------------------------------- */

/* UTF-8 Conversion DATA
 *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
 */
#define MAXIMUM_UCS2            0x0000FFFF  /* largest value written as a single UTF-16 code unit */
#define MAXIMUM_UTF             0x0010FFFF  /* largest code point the decoders below accept */
#define MAXIMUM_UCS4            0x7FFFFFFF
#define HALF_SHIFT              10          /* payload bits carried by each surrogate */
#define HALF_BASE               0x0010000   /* subtracted before splitting into surrogates */
#define HALF_MASK               0x3FF       /* the low HALF_SHIFT bits */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* -SURROGATE_LOW_START + HALF_BASE, i.e. 9216 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

/*
 * offsetsFromUTF8[n] is what the fixed marker bits of an n-byte sequence add
 * up to when the bytes are accumulated as ch = (ch << 6) + byte in uint32_t
 * arithmetic (lead-byte prefix plus the 0x80 of every trail byte, with
 * wrap-around for n = 5 and 6). The decoders subtract it so that only the
 * code point remains. Entries 0 and 1 subtract nothing.
 */
static const uint32_t offsetsFromUTF8[7] = {0,
  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
};

/* END OF UTF-8 Conversion DATA */

/*
 * bytesFromUTF8[b] is the total sequence length announced by byte b when it
 * appears in lead position, or 0 when b cannot start a sequence.
 * Each row below covers 32 byte values.
 */
static const int8_t bytesFromUTF8[256] = {
  /* 0x00..0x7F: single bytes */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80..0xBF: not valid in lead position */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0..0xDF: two-byte leads */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0..0xEF: 3, 0xF0..0xF7: 4, 0xF8..0xFB: 5, 0xFC..0xFD: 6, 0xFE..0xFF: none */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};

/*
 * Starting with Unicode 3.0.1:
 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
 * byte sequences with more than 4 bytes are illegal in UTF-8,
 * which is tested with impossible values for them
 * (0xffffffff can never be <= the accepted maximum, so lengths 5 and 6 always fail).
 */
static const uint32_t
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };

hasCESU8Data(const UConverter * cnv)92 static UBool hasCESU8Data(const UConverter *cnv)
93 {
94 #if UCONFIG_ONLY_HTML_CONVERSION
95 return FALSE;
96 #else
97 return (UBool)(cnv->sharedData == &_CESU8Data);
98 #endif
99 }
100
/*
 * Convert UTF-8 (or CESU-8) bytes to UTF-16 code units, without offsets.
 *
 * args  in/out: source, sourceLimit, target, targetLimit and converter are
 *       read; source and target are written back with the positions reached.
 * err   set to U_ILLEGAL_CHAR_FOUND for a malformed sequence (the offending
 *       bytes are in cnv->toUBytes, their count in cnv->toULength) or to
 *       U_BUFFER_OVERFLOW_ERROR when the target fills up first.
 *
 * When the input ends inside a multi-byte sequence, the progress is parked in
 * the converter (toUnicodeStatus = accumulated value, mode = expected length,
 * toULength = bytes consumed) and picked up again on the next call.
 */
static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
                                  UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const unsigned char *mySource = (const unsigned char *) args->source;
    UChar *myTarget = args->target;
    const unsigned char *sourceLimit = (const unsigned char *) args->sourceLimit;
    const UChar *targetLimit = args->targetLimit;
    unsigned char *toUBytes = cnv->toUBytes;
    UBool isCESU8 = hasCESU8Data(cnv);
    uint32_t ch, ch2 = 0;
    int32_t i, inBytes;

    /* Resume a sequence that was cut off at the end of the previous buffer */
    if (cnv->toUnicodeStatus && myTarget < targetLimit)
    {
        inBytes = cnv->mode;            /* restore # of bytes to consume */
        i = cnv->toULength;             /* restore # of bytes consumed */
        cnv->toULength = 0;

        ch = cnv->toUnicodeStatus;      /* value accumulated by the previous call */
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }


    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);
        if (ch < 0x80)        /* Simple case */
        {
            *(myTarget++) = (UChar) ch;
        }
        else
        {
            /* keep the lead byte so that an error can report it */
            toUBytes[0] = (unsigned char) ch;
            inBytes = bytesFromUTF8[ch]; /* announced sequence length; 0 for a non-lead byte */
            i = 1;

morebytes:
            while (i < inBytes)
            {
                if (mySource < sourceLimit)
                {
                    ch2 = *mySource;
                    toUBytes[i] = (unsigned char) ch2;
                    if (!U8_IS_TRAIL(ch2))
                    {
                        break; /* i < inBytes, rejected by the check below */
                    }
                    ch = (ch << 6) + ch2;
                    ++mySource;
                    i++;
                }
                else
                {
                    /* out of input: park the partial result in the converter */
                    cnv->toUnicodeStatus = ch;
                    cnv->mode = inBytes;
                    cnv->toULength = (int8_t) i;
                    goto donefornow;
                }
            }

            /* Remove the accumulated high bits */
            ch -= offsetsFromUTF8[inBytes];

            /*
             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
             * - use only trail bytes after a lead byte (checked above)
             * - use the right number of trail bytes for a given lead byte
             * - encode a code point <= U+10ffff
             * - use the fewest possible number of bytes for their code points
             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
             *
             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
             * There are no irregular sequences any more.
             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
             */
            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
            {
                /* Complete, well-formed sequence */
                if (ch <= MAXIMUM_UCS2)
                {
                    /* fits in 16 bits */
                    *(myTarget++) = (UChar) ch;
                }
                else
                {
                    /* write out the surrogates */
                    ch -= HALF_BASE;
                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
                    if (myTarget < targetLimit)
                    {
                        *(myTarget++) = (UChar)ch;
                    }
                    else
                    {
                        /* no room for the trail surrogate: hand it over via the overflow buffer */
                        cnv->UCharErrorBuffer[0] = (UChar) ch;
                        cnv->UCharErrorBufferLength = 1;
                        *err = U_BUFFER_OVERFLOW_ERROR;
                        break;
                    }
                }
            }
            else
            {
                /* malformed: toUBytes[0..i-1] hold the bytes consumed so far */
                cnv->toULength = (int8_t)i;
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
        }
    }

donefornow:
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {
        /* End of target buffer */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = myTarget;
    args->source = (const char *) mySource;
}

/*
 * Convert UTF-8 (or CESU-8) bytes to UTF-16 code units and record, for every
 * code unit written, the index of the source byte sequence it came from.
 *
 * args  in/out: as for ucnv_toUnicode_UTF8, plus args->offsets, which gets
 *       one entry per output code unit and is written back advanced.
 * err   set to U_ILLEGAL_CHAR_FOUND for a malformed sequence (bytes in
 *       cnv->toUBytes, count in cnv->toULength) or to
 *       U_BUFFER_OVERFLOW_ERROR when the target fills up first.
 *
 * Offsets are counted from 0 at the start of this call's source buffer.
 * NOTE(review): after resuming a sequence begun in an earlier call, offsetNum
 * is advanced by the full sequence length, including the bytes consumed by
 * that earlier call. Confirm whether callers compensate for this.
 */
static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
                                                UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const unsigned char *mySource = (const unsigned char *) args->source;
    UChar *myTarget = args->target;
    int32_t *myOffsets = args->offsets;
    int32_t offsetNum = 0;
    const unsigned char *sourceLimit = (const unsigned char *) args->sourceLimit;
    const UChar *targetLimit = args->targetLimit;
    unsigned char *toUBytes = cnv->toUBytes;
    UBool isCESU8 = hasCESU8Data(cnv);
    uint32_t ch, ch2 = 0;
    int32_t i, inBytes;

    /* Resume a sequence that was cut off at the end of the previous buffer */
    if (cnv->toUnicodeStatus && myTarget < targetLimit)
    {
        inBytes = cnv->mode;            /* restore # of bytes to consume */
        i = cnv->toULength;             /* restore # of bytes consumed */
        cnv->toULength = 0;

        ch = cnv->toUnicodeStatus;      /* value accumulated by the previous call */
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }

    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);
        if (ch < 0x80)        /* Simple case */
        {
            *(myTarget++) = (UChar) ch;
            *(myOffsets++) = offsetNum++;
        }
        else
        {
            /* keep the lead byte so that an error can report it */
            toUBytes[0] = (unsigned char) ch;
            inBytes = bytesFromUTF8[ch]; /* announced sequence length; 0 for a non-lead byte */
            i = 1;

morebytes:
            while (i < inBytes)
            {
                if (mySource < sourceLimit)
                {
                    ch2 = *mySource;
                    toUBytes[i] = (unsigned char) ch2;
                    if (!U8_IS_TRAIL(ch2))
                    {
                        break; /* i < inBytes, rejected by the check below */
                    }
                    ch = (ch << 6) + ch2;
                    ++mySource;
                    i++;
                }
                else
                {
                    /* out of input: park the partial result in the converter */
                    cnv->toUnicodeStatus = ch;
                    cnv->mode = inBytes;
                    cnv->toULength = (int8_t)i;
                    goto donefornow;
                }
            }

            /* Remove the accumulated high bits */
            ch -= offsetsFromUTF8[inBytes];

            /*
             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
             * - use only trail bytes after a lead byte (checked above)
             * - use the right number of trail bytes for a given lead byte
             * - encode a code point <= U+10ffff
             * - use the fewest possible number of bytes for their code points
             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
             *
             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
             * There are no irregular sequences any more.
             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
             */
            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
            {
                /* Complete, well-formed sequence */
                if (ch <= MAXIMUM_UCS2)
                {
                    /* fits in 16 bits */
                    *(myTarget++) = (UChar) ch;
                    *(myOffsets++) = offsetNum;
                }
                else
                {
                    /* write out the surrogates; both map to the same source offset */
                    ch -= HALF_BASE;
                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
                    *(myOffsets++) = offsetNum;
                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
                    if (myTarget < targetLimit)
                    {
                        *(myTarget++) = (UChar)ch;
                        *(myOffsets++) = offsetNum;
                    }
                    else
                    {
                        /*
                         * No room for the trail surrogate: hand it over via the
                         * overflow buffer. The loop condition ends the loop
                         * because the target is now full.
                         */
                        cnv->UCharErrorBuffer[0] = (UChar) ch;
                        cnv->UCharErrorBufferLength = 1;
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }
                offsetNum += i;
            }
            else
            {
                /* malformed: toUBytes[0..i-1] hold the bytes consumed so far */
                cnv->toULength = (int8_t)i;
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
        }
    }

donefornow:
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {   /* End of target buffer */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = myTarget;
    args->source = (const char *) mySource;
    args->offsets = myOffsets;
}

ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,UErrorCode * err)359 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
360 UErrorCode * err)
361 {
362 UConverter *cnv = args->converter;
363 const UChar *mySource = args->source;
364 const UChar *sourceLimit = args->sourceLimit;
365 uint8_t *myTarget = (uint8_t *) args->target;
366 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
367 uint8_t *tempPtr;
368 UChar32 ch;
369 uint8_t tempBuf[4];
370 int32_t indexToWrite;
371 UBool isNotCESU8 = !hasCESU8Data(cnv);
372
373 if (cnv->fromUChar32 && myTarget < targetLimit)
374 {
375 ch = cnv->fromUChar32;
376 cnv->fromUChar32 = 0;
377 goto lowsurrogate;
378 }
379
380 while (mySource < sourceLimit && myTarget < targetLimit)
381 {
382 ch = *(mySource++);
383
384 if (ch < 0x80) /* Single byte */
385 {
386 *(myTarget++) = (uint8_t) ch;
387 }
388 else if (ch < 0x800) /* Double byte */
389 {
390 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
391 if (myTarget < targetLimit)
392 {
393 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
394 }
395 else
396 {
397 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
398 cnv->charErrorBufferLength = 1;
399 *err = U_BUFFER_OVERFLOW_ERROR;
400 }
401 }
402 else {
403 /* Check for surrogates */
404 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
405 lowsurrogate:
406 if (mySource < sourceLimit) {
407 /* test both code units */
408 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
409 /* convert and consume this supplementary code point */
410 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
411 ++mySource;
412 /* exit this condition tree */
413 }
414 else {
415 /* this is an unpaired trail or lead code unit */
416 /* callback(illegal) */
417 cnv->fromUChar32 = ch;
418 *err = U_ILLEGAL_CHAR_FOUND;
419 break;
420 }
421 }
422 else {
423 /* no more input */
424 cnv->fromUChar32 = ch;
425 break;
426 }
427 }
428
429 /* Do we write the buffer directly for speed,
430 or do we have to be careful about target buffer space? */
431 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
432
433 if (ch <= MAXIMUM_UCS2) {
434 indexToWrite = 2;
435 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
436 }
437 else {
438 indexToWrite = 3;
439 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
440 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
441 }
442 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
443 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
444
445 if (tempPtr == myTarget) {
446 /* There was enough space to write the codepoint directly. */
447 myTarget += (indexToWrite + 1);
448 }
449 else {
450 /* We might run out of room soon. Write it slowly. */
451 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
452 if (myTarget < targetLimit) {
453 *(myTarget++) = *tempPtr;
454 }
455 else {
456 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
457 *err = U_BUFFER_OVERFLOW_ERROR;
458 }
459 }
460 }
461 }
462 }
463
464 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
465 {
466 *err = U_BUFFER_OVERFLOW_ERROR;
467 }
468
469 args->target = (char *) myTarget;
470 args->source = mySource;
471 }
472
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)473 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
474 UErrorCode * err)
475 {
476 UConverter *cnv = args->converter;
477 const UChar *mySource = args->source;
478 int32_t *myOffsets = args->offsets;
479 const UChar *sourceLimit = args->sourceLimit;
480 uint8_t *myTarget = (uint8_t *) args->target;
481 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
482 uint8_t *tempPtr;
483 UChar32 ch;
484 int32_t offsetNum, nextSourceIndex;
485 int32_t indexToWrite;
486 uint8_t tempBuf[4];
487 UBool isNotCESU8 = !hasCESU8Data(cnv);
488
489 if (cnv->fromUChar32 && myTarget < targetLimit)
490 {
491 ch = cnv->fromUChar32;
492 cnv->fromUChar32 = 0;
493 offsetNum = -1;
494 nextSourceIndex = 0;
495 goto lowsurrogate;
496 } else {
497 offsetNum = 0;
498 }
499
500 while (mySource < sourceLimit && myTarget < targetLimit)
501 {
502 ch = *(mySource++);
503
504 if (ch < 0x80) /* Single byte */
505 {
506 *(myOffsets++) = offsetNum++;
507 *(myTarget++) = (char) ch;
508 }
509 else if (ch < 0x800) /* Double byte */
510 {
511 *(myOffsets++) = offsetNum;
512 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
513 if (myTarget < targetLimit)
514 {
515 *(myOffsets++) = offsetNum++;
516 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
517 }
518 else
519 {
520 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
521 cnv->charErrorBufferLength = 1;
522 *err = U_BUFFER_OVERFLOW_ERROR;
523 }
524 }
525 else
526 /* Check for surrogates */
527 {
528 nextSourceIndex = offsetNum + 1;
529
530 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
531 lowsurrogate:
532 if (mySource < sourceLimit) {
533 /* test both code units */
534 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
535 /* convert and consume this supplementary code point */
536 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
537 ++mySource;
538 ++nextSourceIndex;
539 /* exit this condition tree */
540 }
541 else {
542 /* this is an unpaired trail or lead code unit */
543 /* callback(illegal) */
544 cnv->fromUChar32 = ch;
545 *err = U_ILLEGAL_CHAR_FOUND;
546 break;
547 }
548 }
549 else {
550 /* no more input */
551 cnv->fromUChar32 = ch;
552 break;
553 }
554 }
555
556 /* Do we write the buffer directly for speed,
557 or do we have to be careful about target buffer space? */
558 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
559
560 if (ch <= MAXIMUM_UCS2) {
561 indexToWrite = 2;
562 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
563 }
564 else {
565 indexToWrite = 3;
566 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
567 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
568 }
569 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
570 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
571
572 if (tempPtr == myTarget) {
573 /* There was enough space to write the codepoint directly. */
574 myTarget += (indexToWrite + 1);
575 myOffsets[0] = offsetNum;
576 myOffsets[1] = offsetNum;
577 myOffsets[2] = offsetNum;
578 if (indexToWrite >= 3) {
579 myOffsets[3] = offsetNum;
580 }
581 myOffsets += (indexToWrite + 1);
582 }
583 else {
584 /* We might run out of room soon. Write it slowly. */
585 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
586 if (myTarget < targetLimit)
587 {
588 *(myOffsets++) = offsetNum;
589 *(myTarget++) = *tempPtr;
590 }
591 else
592 {
593 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
594 *err = U_BUFFER_OVERFLOW_ERROR;
595 }
596 }
597 }
598 offsetNum = nextSourceIndex;
599 }
600 }
601
602 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
603 {
604 *err = U_BUFFER_OVERFLOW_ERROR;
605 }
606
607 args->target = (char *) myTarget;
608 args->source = mySource;
609 args->offsets = myOffsets;
610 }
611
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs * args,UErrorCode * err)612 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
613 UErrorCode *err) {
614 UConverter *cnv;
615 const uint8_t *sourceInitial;
616 const uint8_t *source;
617 uint16_t extraBytesToWrite;
618 uint8_t myByte;
619 UChar32 ch;
620 int8_t i, isLegalSequence;
621
622 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
623
624 cnv = args->converter;
625 sourceInitial = source = (const uint8_t *)args->source;
626 if (source >= (const uint8_t *)args->sourceLimit)
627 {
628 /* no input */
629 *err = U_INDEX_OUTOFBOUNDS_ERROR;
630 return 0xffff;
631 }
632
633 myByte = (uint8_t)*(source++);
634 if (myByte < 0x80)
635 {
636 args->source = (const char *)source;
637 return (UChar32)myByte;
638 }
639
640 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
641 if (extraBytesToWrite == 0) {
642 cnv->toUBytes[0] = myByte;
643 cnv->toULength = 1;
644 *err = U_ILLEGAL_CHAR_FOUND;
645 args->source = (const char *)source;
646 return 0xffff;
647 }
648
649 /*The byte sequence is longer than the buffer area passed*/
650 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
651 {
652 /* check if all of the remaining bytes are trail bytes */
653 cnv->toUBytes[0] = myByte;
654 i = 1;
655 *err = U_TRUNCATED_CHAR_FOUND;
656 while(source < (const uint8_t *)args->sourceLimit) {
657 if(U8_IS_TRAIL(myByte = *source)) {
658 cnv->toUBytes[i++] = myByte;
659 ++source;
660 } else {
661 /* error even before we run out of input */
662 *err = U_ILLEGAL_CHAR_FOUND;
663 break;
664 }
665 }
666 cnv->toULength = i;
667 args->source = (const char *)source;
668 return 0xffff;
669 }
670
671 isLegalSequence = 1;
672 ch = myByte << 6;
673 switch(extraBytesToWrite)
674 {
675 /* note: code falls through cases! (sic)*/
676 case 6:
677 ch += (myByte = *source);
678 ch <<= 6;
679 if (!U8_IS_TRAIL(myByte))
680 {
681 isLegalSequence = 0;
682 break;
683 }
684 ++source;
685 U_FALLTHROUGH;
686 case 5:
687 ch += (myByte = *source);
688 ch <<= 6;
689 if (!U8_IS_TRAIL(myByte))
690 {
691 isLegalSequence = 0;
692 break;
693 }
694 ++source;
695 U_FALLTHROUGH;
696 case 4:
697 ch += (myByte = *source);
698 ch <<= 6;
699 if (!U8_IS_TRAIL(myByte))
700 {
701 isLegalSequence = 0;
702 break;
703 }
704 ++source;
705 U_FALLTHROUGH;
706 case 3:
707 ch += (myByte = *source);
708 ch <<= 6;
709 if (!U8_IS_TRAIL(myByte))
710 {
711 isLegalSequence = 0;
712 break;
713 }
714 ++source;
715 U_FALLTHROUGH;
716 case 2:
717 ch += (myByte = *source);
718 if (!U8_IS_TRAIL(myByte))
719 {
720 isLegalSequence = 0;
721 break;
722 }
723 ++source;
724 };
725 ch -= offsetsFromUTF8[extraBytesToWrite];
726 args->source = (const char *)source;
727
728 /*
729 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
730 * - use only trail bytes after a lead byte (checked above)
731 * - use the right number of trail bytes for a given lead byte
732 * - encode a code point <= U+10ffff
733 * - use the fewest possible number of bytes for their code points
734 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
735 *
736 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
737 * There are no irregular sequences any more.
738 */
739 if (isLegalSequence &&
740 (uint32_t)ch <= MAXIMUM_UTF &&
741 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
742 !U_IS_SURROGATE(ch)
743 ) {
744 return ch; /* return the code point */
745 }
746
747 for(i = 0; sourceInitial < source; ++i) {
748 cnv->toUBytes[i] = *sourceInitial++;
749 }
750 cnv->toULength = i;
751 *err = U_ILLEGAL_CHAR_FOUND;
752 return 0xffff;
753 }
754
755 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
756
757 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
758 static const UChar32
759 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
760
761 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
762 static const UChar32
763 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
764
765 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
766 static void
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs * pFromUArgs,UConverterToUnicodeArgs * pToUArgs,UErrorCode * pErrorCode)767 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
768 UConverterToUnicodeArgs *pToUArgs,
769 UErrorCode *pErrorCode) {
770 UConverter *utf8;
771 const uint8_t *source, *sourceLimit;
772 uint8_t *target;
773 int32_t targetCapacity;
774 int32_t count;
775
776 int8_t oldToULength, toULength, toULimit;
777
778 UChar32 c;
779 uint8_t b, t1, t2;
780
781 /* set up the local pointers */
782 utf8=pToUArgs->converter;
783 source=(uint8_t *)pToUArgs->source;
784 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
785 target=(uint8_t *)pFromUArgs->target;
786 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
787
788 /* get the converter state from the UTF-8 UConverter */
789 c=(UChar32)utf8->toUnicodeStatus;
790 if(c!=0) {
791 toULength=oldToULength=utf8->toULength;
792 toULimit=(int8_t)utf8->mode;
793 } else {
794 toULength=oldToULength=toULimit=0;
795 }
796
797 count=(int32_t)(sourceLimit-source)+oldToULength;
798 if(count<toULimit) {
799 /*
800 * Not enough input to complete the partial character.
801 * Jump to moreBytes below - it will not output to target.
802 */
803 } else if(targetCapacity<toULimit) {
804 /*
805 * Not enough target capacity to output the partial character.
806 * Let the standard converter handle this.
807 */
808 *pErrorCode=U_USING_DEFAULT_WARNING;
809 return;
810 } else {
811 /*
812 * Use a single counter for source and target, counting the minimum of
813 * the source length and the target capacity.
814 * As a result, the source length is checked only once per multi-byte
815 * character instead of twice.
816 *
817 * Make sure that the last byte sequence is complete, or else
818 * stop just before it.
819 * (The longest legal byte sequence has 3 trail bytes.)
820 * Count oldToULength (number of source bytes from a previous buffer)
821 * into the source length but reduce the source index by toULimit
822 * while going back over trail bytes in order to not go back into
823 * the bytes that will be read for finishing a partial
824 * sequence from the previous buffer.
825 * Let the standard converter handle edge cases.
826 */
827 int32_t i;
828
829 if(count>targetCapacity) {
830 count=targetCapacity;
831 }
832
833 i=0;
834 while(i<3 && i<(count-toULimit)) {
835 b=source[count-oldToULength-i-1];
836 if(U8_IS_TRAIL(b)) {
837 ++i;
838 } else {
839 if(i<U8_COUNT_TRAIL_BYTES(b)) {
840 /* stop converting before the lead byte if there are not enough trail bytes for it */
841 count-=i+1;
842 }
843 break;
844 }
845 }
846 }
847
848 if(c!=0) {
849 utf8->toUnicodeStatus=0;
850 utf8->toULength=0;
851 goto moreBytes;
852 /* See note in ucnv_SBCSFromUTF8() about this goto. */
853 }
854
855 /* conversion loop */
856 while(count>0) {
857 b=*source++;
858 if((int8_t)b>=0) {
859 /* convert ASCII */
860 *target++=b;
861 --count;
862 continue;
863 } else {
864 if(b>0xe0) {
865 if( /* handle U+1000..U+D7FF inline */
866 (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
867 (b==0xed && (t1 <= 0x9f))) &&
868 (t2=source[1]) >= 0x80 && t2 <= 0xbf
869 ) {
870 source+=2;
871 *target++=b;
872 *target++=t1;
873 *target++=t2;
874 count-=3;
875 continue;
876 }
877 } else if(b<0xe0) {
878 if( /* handle U+0080..U+07FF inline */
879 b>=0xc2 &&
880 (t1=*source) >= 0x80 && t1 <= 0xbf
881 ) {
882 ++source;
883 *target++=b;
884 *target++=t1;
885 count-=2;
886 continue;
887 }
888 } else if(b==0xe0) {
889 if( /* handle U+0800..U+0FFF inline */
890 (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
891 (t2=source[1]) >= 0x80 && t2 <= 0xbf
892 ) {
893 source+=2;
894 *target++=b;
895 *target++=t1;
896 *target++=t2;
897 count-=3;
898 continue;
899 }
900 }
901
902 /* handle "complicated" and error cases, and continuing partial characters */
903 oldToULength=0;
904 toULength=1;
905 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
906 c=b;
907 moreBytes:
908 while(toULength<toULimit) {
909 if(source<sourceLimit) {
910 b=*source;
911 if(U8_IS_TRAIL(b)) {
912 ++source;
913 ++toULength;
914 c=(c<<6)+b;
915 } else {
916 break; /* sequence too short, stop with toULength<toULimit */
917 }
918 } else {
919 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
920 source-=(toULength-oldToULength);
921 while(oldToULength<toULength) {
922 utf8->toUBytes[oldToULength++]=*source++;
923 }
924 utf8->toUnicodeStatus=c;
925 utf8->toULength=toULength;
926 utf8->mode=toULimit;
927 pToUArgs->source=(char *)source;
928 pFromUArgs->target=(char *)target;
929 return;
930 }
931 }
932
933 if( toULength==toULimit && /* consumed all trail bytes */
934 (toULength==3 || toULength==2) && /* BMP */
935 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
936 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
937 ) {
938 /* legal byte sequence for BMP code point */
939 } else if(
940 toULength==toULimit && toULength==4 &&
941 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
942 ) {
943 /* legal byte sequence for supplementary code point */
944 } else {
945 /* error handling: illegal UTF-8 byte sequence */
946 source-=(toULength-oldToULength);
947 while(oldToULength<toULength) {
948 utf8->toUBytes[oldToULength++]=*source++;
949 }
950 utf8->toULength=toULength;
951 pToUArgs->source=(char *)source;
952 pFromUArgs->target=(char *)target;
953 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
954 return;
955 }
956
957 /* copy the legal byte sequence to the target */
958 {
959 int8_t i;
960
961 for(i=0; i<oldToULength; ++i) {
962 *target++=utf8->toUBytes[i];
963 }
964 source-=(toULength-oldToULength);
965 for(; i<toULength; ++i) {
966 *target++=*source++;
967 }
968 count-=toULength;
969 }
970 }
971 }
972
973 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
974 if(target==(const uint8_t *)pFromUArgs->targetLimit) {
975 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
976 } else {
977 b=*source;
978 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
979 if(toULimit>(sourceLimit-source)) {
980 /* collect a truncated byte sequence */
981 toULength=0;
982 c=b;
983 for(;;) {
984 utf8->toUBytes[toULength++]=b;
985 if(++source==sourceLimit) {
986 /* partial byte sequence at end of source */
987 utf8->toUnicodeStatus=c;
988 utf8->toULength=toULength;
989 utf8->mode=toULimit;
990 break;
991 } else if(!U8_IS_TRAIL(b=*source)) {
992 /* lead byte in trail byte position */
993 utf8->toULength=toULength;
994 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
995 break;
996 }
997 c=(c<<6)+b;
998 }
999 } else {
1000 /* partial-sequence target overflow: fall back to the pivoting implementation */
1001 *pErrorCode=U_USING_DEFAULT_WARNING;
1002 }
1003 }
1004 }
1005
1006 /* write back the updated pointers */
1007 pToUArgs->source=(char *)source;
1008 pFromUArgs->target=(char *)target;
1009 }
1010
1011 /* UTF-8 converter data ----------------------------------------------------- */
1012
1013 static const UConverterImpl _UTF8Impl={
1014 UCNV_UTF8,
1015
1016 NULL,
1017 NULL,
1018
1019 NULL,
1020 NULL,
1021 NULL,
1022
1023 ucnv_toUnicode_UTF8,
1024 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1025 ucnv_fromUnicode_UTF8,
1026 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1027 ucnv_getNextUChar_UTF8,
1028
1029 NULL,
1030 NULL,
1031 NULL,
1032 NULL,
1033 ucnv_getNonSurrogateUnicodeSet,
1034
1035 ucnv_UTF8FromUTF8,
1036 ucnv_UTF8FromUTF8
1037 };
1038
1039 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
1040 static const UConverterStaticData _UTF8StaticData={
1041 sizeof(UConverterStaticData),
1042 "UTF-8",
1043 1208, UCNV_IBM, UCNV_UTF8,
1044 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1045 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1046 0,
1047 0,
1048 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1049 };
1050
1051
1052 const UConverterSharedData _UTF8Data=
1053 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
1054
1055 /* CESU-8 converter data ---------------------------------------------------- */
1056
1057 static const UConverterImpl _CESU8Impl={
1058 UCNV_CESU8,
1059
1060 NULL,
1061 NULL,
1062
1063 NULL,
1064 NULL,
1065 NULL,
1066
1067 ucnv_toUnicode_UTF8,
1068 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1069 ucnv_fromUnicode_UTF8,
1070 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1071 NULL,
1072
1073 NULL,
1074 NULL,
1075 NULL,
1076 NULL,
1077 ucnv_getCompleteUnicodeSet,
1078
1079 NULL,
1080 NULL
1081 };
1082
1083 static const UConverterStaticData _CESU8StaticData={
1084 sizeof(UConverterStaticData),
1085 "CESU-8",
1086 9400, /* CCSID for CESU-8 */
1087 UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
1088 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1089 0,
1090 0,
1091 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1092 };
1093
1094
1095 const UConverterSharedData _CESU8Data=
1096 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
1097
1098 #endif
1099