1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2002-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   file name:  ucnv_u8.c
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002jul01
14 *   created by: Markus W. Scherer
15 *
16 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
17 *
18 *   Also, CESU-8 implementation, see UTR 26.
19 *   The CESU-8 converter uses all the same functions as the
20 *   UTF-8 converter, with a branch for converting supplementary code points.
21 */
22 
23 #include "unicode/utypes.h"
24 
25 #if !UCONFIG_NO_CONVERSION
26 
27 #include "unicode/ucnv.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf8.h"
30 #include "unicode/utf16.h"
31 #include "uassert.h"
32 #include "ucnv_bld.h"
33 #include "ucnv_cnv.h"
34 #include "cmemory.h"
35 #include "ustr_imp.h"
36 
37 /* Prototypes --------------------------------------------------------------- */
38 
39 /* Keep these here to make finicky compilers happy */
40 
41 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
42                                            UErrorCode *err);
43 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
44                                                         UErrorCode *err);
45 
46 
47 /* UTF-8 -------------------------------------------------------------------- */
48 
/* Largest code point that fits into a single 16-bit code unit. */
#define MAXIMUM_UCS2            0x0000FFFF

/*
 * Indexed by the total byte length (0..4) of a UTF-8 sequence.
 * The decoders in this file accumulate raw bytes with (ch << 6) + byte,
 * so the lead/trail marker bits end up in the sum; subtracting the entry
 * for the sequence length removes them again, e.g. for two bytes
 * (0xC0 << 6) + 0x80 == 0x3080.
 */
static const uint32_t offsetsFromUTF8[5] = {
    0x00000000u,    /* length 0: nothing to remove */
    0x00000000u,    /* length 1: nothing to remove */
    0x00003080u,    /* length 2 */
    0x000E2080u,    /* length 3 */
    0x03C82080u     /* length 4 */
};
55 
hasCESU8Data(const UConverter * cnv)56 static UBool hasCESU8Data(const UConverter *cnv)
57 {
58 #if UCONFIG_ONLY_HTML_CONVERSION
59     return FALSE;
60 #else
61     return (UBool)(cnv->sharedData == &_CESU8Data);
62 #endif
63 }
64 U_CDECL_BEGIN
ucnv_toUnicode_UTF8(UConverterToUnicodeArgs * args,UErrorCode * err)65 static void  U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
66                                   UErrorCode * err)
67 {
68     UConverter *cnv = args->converter;
69     const unsigned char *mySource = (unsigned char *) args->source;
70     UChar *myTarget = args->target;
71     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
72     const UChar *targetLimit = args->targetLimit;
73     unsigned char *toUBytes = cnv->toUBytes;
74     UBool isCESU8 = hasCESU8Data(cnv);
75     uint32_t ch, ch2 = 0;
76     int32_t i, inBytes;
77 
78     /* Restore size of current sequence */
79     if (cnv->toULength > 0 && myTarget < targetLimit)
80     {
81         inBytes = cnv->mode;            /* restore # of bytes to consume */
82         i = cnv->toULength;             /* restore # of bytes consumed */
83         cnv->toULength = 0;
84 
85         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
86         cnv->toUnicodeStatus = 0;
87         goto morebytes;
88     }
89 
90 
91     while (mySource < sourceLimit && myTarget < targetLimit)
92     {
93         ch = *(mySource++);
94         if (U8_IS_SINGLE(ch))        /* Simple case */
95         {
96             *(myTarget++) = (UChar) ch;
97         }
98         else
99         {
100             /* store the first char */
101             toUBytes[0] = (char)ch;
102             inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
103             i = 1;
104 
105 morebytes:
106             while (i < inBytes)
107             {
108                 if (mySource < sourceLimit)
109                 {
110                     toUBytes[i] = (char) (ch2 = *mySource);
111                     if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
112                             !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
113                     {
114                         break; /* i < inBytes */
115                     }
116                     ch = (ch << 6) + ch2;
117                     ++mySource;
118                     i++;
119                 }
120                 else
121                 {
122                     /* stores a partially calculated target*/
123                     cnv->toUnicodeStatus = ch;
124                     cnv->mode = inBytes;
125                     cnv->toULength = (int8_t) i;
126                     goto donefornow;
127                 }
128             }
129 
130             // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
131             if (i == inBytes && (!isCESU8 || i <= 3))
132             {
133                 /* Remove the accumulated high bits */
134                 ch -= offsetsFromUTF8[inBytes];
135 
136                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
137                 if (ch <= MAXIMUM_UCS2)
138                 {
139                     /* fits in 16 bits */
140                     *(myTarget++) = (UChar) ch;
141                 }
142                 else
143                 {
144                     /* write out the surrogates */
145                     *(myTarget++) = U16_LEAD(ch);
146                     ch = U16_TRAIL(ch);
147                     if (myTarget < targetLimit)
148                     {
149                         *(myTarget++) = (UChar)ch;
150                     }
151                     else
152                     {
153                         /* Put in overflow buffer (not handled here) */
154                         cnv->UCharErrorBuffer[0] = (UChar) ch;
155                         cnv->UCharErrorBufferLength = 1;
156                         *err = U_BUFFER_OVERFLOW_ERROR;
157                         break;
158                     }
159                 }
160             }
161             else
162             {
163                 cnv->toULength = (int8_t)i;
164                 *err = U_ILLEGAL_CHAR_FOUND;
165                 break;
166             }
167         }
168     }
169 
170 donefornow:
171     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
172     {
173         /* End of target buffer */
174         *err = U_BUFFER_OVERFLOW_ERROR;
175     }
176 
177     args->target = myTarget;
178     args->source = (const char *) mySource;
179 }
180 
ucnv_toUnicode_UTF8_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)181 static void  U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
182                                                 UErrorCode * err)
183 {
184     UConverter *cnv = args->converter;
185     const unsigned char *mySource = (unsigned char *) args->source;
186     UChar *myTarget = args->target;
187     int32_t *myOffsets = args->offsets;
188     int32_t offsetNum = 0;
189     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
190     const UChar *targetLimit = args->targetLimit;
191     unsigned char *toUBytes = cnv->toUBytes;
192     UBool isCESU8 = hasCESU8Data(cnv);
193     uint32_t ch, ch2 = 0;
194     int32_t i, inBytes;
195 
196     /* Restore size of current sequence */
197     if (cnv->toULength > 0 && myTarget < targetLimit)
198     {
199         inBytes = cnv->mode;            /* restore # of bytes to consume */
200         i = cnv->toULength;             /* restore # of bytes consumed */
201         cnv->toULength = 0;
202 
203         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
204         cnv->toUnicodeStatus = 0;
205         goto morebytes;
206     }
207 
208     while (mySource < sourceLimit && myTarget < targetLimit)
209     {
210         ch = *(mySource++);
211         if (U8_IS_SINGLE(ch))        /* Simple case */
212         {
213             *(myTarget++) = (UChar) ch;
214             *(myOffsets++) = offsetNum++;
215         }
216         else
217         {
218             toUBytes[0] = (char)ch;
219             inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
220             i = 1;
221 
222 morebytes:
223             while (i < inBytes)
224             {
225                 if (mySource < sourceLimit)
226                 {
227                     toUBytes[i] = (char) (ch2 = *mySource);
228                     if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
229                             !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
230                     {
231                         break; /* i < inBytes */
232                     }
233                     ch = (ch << 6) + ch2;
234                     ++mySource;
235                     i++;
236                 }
237                 else
238                 {
239                     cnv->toUnicodeStatus = ch;
240                     cnv->mode = inBytes;
241                     cnv->toULength = (int8_t)i;
242                     goto donefornow;
243                 }
244             }
245 
246             // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
247             if (i == inBytes && (!isCESU8 || i <= 3))
248             {
249                 /* Remove the accumulated high bits */
250                 ch -= offsetsFromUTF8[inBytes];
251 
252                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
253                 if (ch <= MAXIMUM_UCS2)
254                 {
255                     /* fits in 16 bits */
256                     *(myTarget++) = (UChar) ch;
257                     *(myOffsets++) = offsetNum;
258                 }
259                 else
260                 {
261                     /* write out the surrogates */
262                     *(myTarget++) = U16_LEAD(ch);
263                     *(myOffsets++) = offsetNum;
264                     ch = U16_TRAIL(ch);
265                     if (myTarget < targetLimit)
266                     {
267                         *(myTarget++) = (UChar)ch;
268                         *(myOffsets++) = offsetNum;
269                     }
270                     else
271                     {
272                         cnv->UCharErrorBuffer[0] = (UChar) ch;
273                         cnv->UCharErrorBufferLength = 1;
274                         *err = U_BUFFER_OVERFLOW_ERROR;
275                     }
276                 }
277                 offsetNum += i;
278             }
279             else
280             {
281                 cnv->toULength = (int8_t)i;
282                 *err = U_ILLEGAL_CHAR_FOUND;
283                 break;
284             }
285         }
286     }
287 
288 donefornow:
289     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
290     {   /* End of target buffer */
291         *err = U_BUFFER_OVERFLOW_ERROR;
292     }
293 
294     args->target = myTarget;
295     args->source = (const char *) mySource;
296     args->offsets = myOffsets;
297 }
298 U_CDECL_END
299 
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,UErrorCode * err)300 U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
301                                     UErrorCode * err)
302 {
303     UConverter *cnv = args->converter;
304     const UChar *mySource = args->source;
305     const UChar *sourceLimit = args->sourceLimit;
306     uint8_t *myTarget = (uint8_t *) args->target;
307     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
308     uint8_t *tempPtr;
309     UChar32 ch;
310     uint8_t tempBuf[4];
311     int32_t indexToWrite;
312     UBool isNotCESU8 = !hasCESU8Data(cnv);
313 
314     if (cnv->fromUChar32 && myTarget < targetLimit)
315     {
316         ch = cnv->fromUChar32;
317         cnv->fromUChar32 = 0;
318         goto lowsurrogate;
319     }
320 
321     while (mySource < sourceLimit && myTarget < targetLimit)
322     {
323         ch = *(mySource++);
324 
325         if (ch < 0x80)        /* Single byte */
326         {
327             *(myTarget++) = (uint8_t) ch;
328         }
329         else if (ch < 0x800)  /* Double byte */
330         {
331             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
332             if (myTarget < targetLimit)
333             {
334                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
335             }
336             else
337             {
338                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
339                 cnv->charErrorBufferLength = 1;
340                 *err = U_BUFFER_OVERFLOW_ERROR;
341             }
342         }
343         else {
344             /* Check for surrogates */
345             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
346 lowsurrogate:
347                 if (mySource < sourceLimit) {
348                     /* test both code units */
349                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
350                         /* convert and consume this supplementary code point */
351                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
352                         ++mySource;
353                         /* exit this condition tree */
354                     }
355                     else {
356                         /* this is an unpaired trail or lead code unit */
357                         /* callback(illegal) */
358                         cnv->fromUChar32 = ch;
359                         *err = U_ILLEGAL_CHAR_FOUND;
360                         break;
361                     }
362                 }
363                 else {
364                     /* no more input */
365                     cnv->fromUChar32 = ch;
366                     break;
367                 }
368             }
369 
370             /* Do we write the buffer directly for speed,
371             or do we have to be careful about target buffer space? */
372             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
373 
374             if (ch <= MAXIMUM_UCS2) {
375                 indexToWrite = 2;
376                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
377             }
378             else {
379                 indexToWrite = 3;
380                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
381                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
382             }
383             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
384             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
385 
386             if (tempPtr == myTarget) {
387                 /* There was enough space to write the codepoint directly. */
388                 myTarget += (indexToWrite + 1);
389             }
390             else {
391                 /* We might run out of room soon. Write it slowly. */
392                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
393                     if (myTarget < targetLimit) {
394                         *(myTarget++) = *tempPtr;
395                     }
396                     else {
397                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
398                         *err = U_BUFFER_OVERFLOW_ERROR;
399                     }
400                 }
401             }
402         }
403     }
404 
405     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
406     {
407         *err = U_BUFFER_OVERFLOW_ERROR;
408     }
409 
410     args->target = (char *) myTarget;
411     args->source = mySource;
412 }
413 
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)414 U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
415                                                   UErrorCode * err)
416 {
417     UConverter *cnv = args->converter;
418     const UChar *mySource = args->source;
419     int32_t *myOffsets = args->offsets;
420     const UChar *sourceLimit = args->sourceLimit;
421     uint8_t *myTarget = (uint8_t *) args->target;
422     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
423     uint8_t *tempPtr;
424     UChar32 ch;
425     int32_t offsetNum, nextSourceIndex;
426     int32_t indexToWrite;
427     uint8_t tempBuf[4];
428     UBool isNotCESU8 = !hasCESU8Data(cnv);
429 
430     if (cnv->fromUChar32 && myTarget < targetLimit)
431     {
432         ch = cnv->fromUChar32;
433         cnv->fromUChar32 = 0;
434         offsetNum = -1;
435         nextSourceIndex = 0;
436         goto lowsurrogate;
437     } else {
438         offsetNum = 0;
439     }
440 
441     while (mySource < sourceLimit && myTarget < targetLimit)
442     {
443         ch = *(mySource++);
444 
445         if (ch < 0x80)        /* Single byte */
446         {
447             *(myOffsets++) = offsetNum++;
448             *(myTarget++) = (char) ch;
449         }
450         else if (ch < 0x800)  /* Double byte */
451         {
452             *(myOffsets++) = offsetNum;
453             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
454             if (myTarget < targetLimit)
455             {
456                 *(myOffsets++) = offsetNum++;
457                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
458             }
459             else
460             {
461                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
462                 cnv->charErrorBufferLength = 1;
463                 *err = U_BUFFER_OVERFLOW_ERROR;
464             }
465         }
466         else
467         /* Check for surrogates */
468         {
469             nextSourceIndex = offsetNum + 1;
470 
471             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
472 lowsurrogate:
473                 if (mySource < sourceLimit) {
474                     /* test both code units */
475                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
476                         /* convert and consume this supplementary code point */
477                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
478                         ++mySource;
479                         ++nextSourceIndex;
480                         /* exit this condition tree */
481                     }
482                     else {
483                         /* this is an unpaired trail or lead code unit */
484                         /* callback(illegal) */
485                         cnv->fromUChar32 = ch;
486                         *err = U_ILLEGAL_CHAR_FOUND;
487                         break;
488                     }
489                 }
490                 else {
491                     /* no more input */
492                     cnv->fromUChar32 = ch;
493                     break;
494                 }
495             }
496 
497             /* Do we write the buffer directly for speed,
498             or do we have to be careful about target buffer space? */
499             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
500 
501             if (ch <= MAXIMUM_UCS2) {
502                 indexToWrite = 2;
503                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
504             }
505             else {
506                 indexToWrite = 3;
507                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
508                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
509             }
510             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
511             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
512 
513             if (tempPtr == myTarget) {
514                 /* There was enough space to write the codepoint directly. */
515                 myTarget += (indexToWrite + 1);
516                 myOffsets[0] = offsetNum;
517                 myOffsets[1] = offsetNum;
518                 myOffsets[2] = offsetNum;
519                 if (indexToWrite >= 3) {
520                     myOffsets[3] = offsetNum;
521                 }
522                 myOffsets += (indexToWrite + 1);
523             }
524             else {
525                 /* We might run out of room soon. Write it slowly. */
526                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
527                     if (myTarget < targetLimit)
528                     {
529                         *(myOffsets++) = offsetNum;
530                         *(myTarget++) = *tempPtr;
531                     }
532                     else
533                     {
534                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
535                         *err = U_BUFFER_OVERFLOW_ERROR;
536                     }
537                 }
538             }
539             offsetNum = nextSourceIndex;
540         }
541     }
542 
543     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
544     {
545         *err = U_BUFFER_OVERFLOW_ERROR;
546     }
547 
548     args->target = (char *) myTarget;
549     args->source = mySource;
550     args->offsets = myOffsets;
551 }
552 
553 U_CDECL_BEGIN
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs * args,UErrorCode * err)554 static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
555                                                UErrorCode *err) {
556     UConverter *cnv;
557     const uint8_t *sourceInitial;
558     const uint8_t *source;
559     uint8_t myByte;
560     UChar32 ch;
561     int8_t i;
562 
563     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
564 
565     cnv = args->converter;
566     sourceInitial = source = (const uint8_t *)args->source;
567     if (source >= (const uint8_t *)args->sourceLimit)
568     {
569         /* no input */
570         *err = U_INDEX_OUTOFBOUNDS_ERROR;
571         return 0xffff;
572     }
573 
574     myByte = (uint8_t)*(source++);
575     if (U8_IS_SINGLE(myByte))
576     {
577         args->source = (const char *)source;
578         return (UChar32)myByte;
579     }
580 
581     uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
582     if (countTrailBytes == 0) {
583         cnv->toUBytes[0] = myByte;
584         cnv->toULength = 1;
585         *err = U_ILLEGAL_CHAR_FOUND;
586         args->source = (const char *)source;
587         return 0xffff;
588     }
589 
590     /*The byte sequence is longer than the buffer area passed*/
591     if (((const char *)source + countTrailBytes) > args->sourceLimit)
592     {
593         /* check if all of the remaining bytes are trail bytes */
594         uint16_t extraBytesToWrite = countTrailBytes + 1;
595         cnv->toUBytes[0] = myByte;
596         i = 1;
597         *err = U_TRUNCATED_CHAR_FOUND;
598         while(source < (const uint8_t *)args->sourceLimit) {
599             uint8_t b = *source;
600             if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
601                 cnv->toUBytes[i++] = b;
602                 ++source;
603             } else {
604                 /* error even before we run out of input */
605                 *err = U_ILLEGAL_CHAR_FOUND;
606                 break;
607             }
608         }
609         cnv->toULength = i;
610         args->source = (const char *)source;
611         return 0xffff;
612     }
613 
614     ch = myByte << 6;
615     if(countTrailBytes == 2) {
616         uint8_t t1 = *source, t2;
617         if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
618             args->source = (const char *)(source + 1);
619             return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
620         }
621     } else if(countTrailBytes == 1) {
622         uint8_t t1 = *source;
623         if(U8_IS_TRAIL(t1)) {
624             args->source = (const char *)(source + 1);
625             return (ch + t1) - offsetsFromUTF8[2];
626         }
627     } else {  // countTrailBytes == 3
628         uint8_t t1 = *source, t2, t3;
629         if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
630                 U8_IS_TRAIL(t3 = *++source)) {
631             args->source = (const char *)(source + 1);
632             return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
633         }
634     }
635     args->source = (const char *)source;
636 
637     for(i = 0; sourceInitial < source; ++i) {
638         cnv->toUBytes[i] = *sourceInitial++;
639     }
640     cnv->toULength = i;
641     *err = U_ILLEGAL_CHAR_FOUND;
642     return 0xffff;
643 }
644 U_CDECL_END
645 
646 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
647 
648 U_CDECL_BEGIN
649 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
650 static void U_CALLCONV
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs * pFromUArgs,UConverterToUnicodeArgs * pToUArgs,UErrorCode * pErrorCode)651 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
652                   UConverterToUnicodeArgs *pToUArgs,
653                   UErrorCode *pErrorCode) {
654     UConverter *utf8;
655     const uint8_t *source, *sourceLimit;
656     uint8_t *target;
657     int32_t targetCapacity;
658     int32_t count;
659 
660     int8_t oldToULength, toULength, toULimit;
661 
662     UChar32 c;
663     uint8_t b, t1, t2;
664 
665     /* set up the local pointers */
666     utf8=pToUArgs->converter;
667     source=(uint8_t *)pToUArgs->source;
668     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
669     target=(uint8_t *)pFromUArgs->target;
670     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
671 
672     /* get the converter state from the UTF-8 UConverter */
673     if(utf8->toULength > 0) {
674         toULength=oldToULength=utf8->toULength;
675         toULimit=(int8_t)utf8->mode;
676         c=(UChar32)utf8->toUnicodeStatus;
677     } else {
678         toULength=oldToULength=toULimit=0;
679         c = 0;
680     }
681 
682     count=(int32_t)(sourceLimit-source)+oldToULength;
683     if(count<toULimit) {
684         /*
685          * Not enough input to complete the partial character.
686          * Jump to moreBytes below - it will not output to target.
687          */
688     } else if(targetCapacity<toULimit) {
689         /*
690          * Not enough target capacity to output the partial character.
691          * Let the standard converter handle this.
692          */
693         *pErrorCode=U_USING_DEFAULT_WARNING;
694         return;
695     } else {
696         // Use a single counter for source and target, counting the minimum of
697         // the source length and the target capacity.
698         // Let the standard converter handle edge cases.
699         if(count>targetCapacity) {
700             count=targetCapacity;
701         }
702 
703         // The conversion loop checks count>0 only once per character.
704         // If the buffer ends with a truncated sequence,
705         // then we reduce the count to stop before that,
706         // and collect the remaining bytes after the conversion loop.
707 
708         // Do not go back into the bytes that will be read for finishing a partial
709         // sequence from the previous buffer.
710         int32_t length=count-toULimit;
711         U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
712         count=toULimit+length;
713     }
714 
715     if(c!=0) {
716         utf8->toUnicodeStatus=0;
717         utf8->toULength=0;
718         goto moreBytes;
719         /* See note in ucnv_SBCSFromUTF8() about this goto. */
720     }
721 
722     /* conversion loop */
723     while(count>0) {
724         b=*source++;
725         if(U8_IS_SINGLE(b)) {
726             /* convert ASCII */
727             *target++=b;
728             --count;
729             continue;
730         } else {
731             if(b>=0xe0) {
732                 if( /* handle U+0800..U+FFFF inline */
733                     b<0xf0 &&
734                     U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
735                     U8_IS_TRAIL(t2=source[1])
736                 ) {
737                     source+=2;
738                     *target++=b;
739                     *target++=t1;
740                     *target++=t2;
741                     count-=3;
742                     continue;
743                 }
744             } else {
745                 if( /* handle U+0080..U+07FF inline */
746                     b>=0xc2 &&
747                     U8_IS_TRAIL(t1=*source)
748                 ) {
749                     ++source;
750                     *target++=b;
751                     *target++=t1;
752                     count-=2;
753                     continue;
754                 }
755             }
756 
757             /* handle "complicated" and error cases, and continuing partial characters */
758             oldToULength=0;
759             toULength=1;
760             toULimit=U8_COUNT_BYTES_NON_ASCII(b);
761             c=b;
762 moreBytes:
763             while(toULength<toULimit) {
764                 if(source<sourceLimit) {
765                     b=*source;
766                     if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
767                         ++source;
768                         ++toULength;
769                         c=(c<<6)+b;
770                     } else {
771                         break; /* sequence too short, stop with toULength<toULimit */
772                     }
773                 } else {
774                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
775                     source-=(toULength-oldToULength);
776                     while(oldToULength<toULength) {
777                         utf8->toUBytes[oldToULength++]=*source++;
778                     }
779                     utf8->toUnicodeStatus=c;
780                     utf8->toULength=toULength;
781                     utf8->mode=toULimit;
782                     pToUArgs->source=(char *)source;
783                     pFromUArgs->target=(char *)target;
784                     return;
785                 }
786             }
787 
788             if(toULength!=toULimit) {
789                 /* error handling: illegal UTF-8 byte sequence */
790                 source-=(toULength-oldToULength);
791                 while(oldToULength<toULength) {
792                     utf8->toUBytes[oldToULength++]=*source++;
793                 }
794                 utf8->toULength=toULength;
795                 pToUArgs->source=(char *)source;
796                 pFromUArgs->target=(char *)target;
797                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
798                 return;
799             }
800 
801             /* copy the legal byte sequence to the target */
802             {
803                 int8_t i;
804 
805                 for(i=0; i<oldToULength; ++i) {
806                     *target++=utf8->toUBytes[i];
807                 }
808                 source-=(toULength-oldToULength);
809                 for(; i<toULength; ++i) {
810                     *target++=*source++;
811                 }
812                 count-=toULength;
813             }
814         }
815     }
816     U_ASSERT(count>=0);
817 
818     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
819         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
820             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
821         } else {
822             b=*source;
823             toULimit=U8_COUNT_BYTES(b);
824             if(toULimit>(sourceLimit-source)) {
825                 /* collect a truncated byte sequence */
826                 toULength=0;
827                 c=b;
828                 for(;;) {
829                     utf8->toUBytes[toULength++]=b;
830                     if(++source==sourceLimit) {
831                         /* partial byte sequence at end of source */
832                         utf8->toUnicodeStatus=c;
833                         utf8->toULength=toULength;
834                         utf8->mode=toULimit;
835                         break;
836                     } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) {
837                         utf8->toULength=toULength;
838                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
839                         break;
840                     }
841                     c=(c<<6)+b;
842                 }
843             } else {
844                 /* partial-sequence target overflow: fall back to the pivoting implementation */
845                 *pErrorCode=U_USING_DEFAULT_WARNING;
846             }
847         }
848     }
849 
850     /* write back the updated pointers */
851     pToUArgs->source=(char *)source;
852     pFromUArgs->target=(char *)target;
853 }
854 
855 U_CDECL_END
856 
857 /* UTF-8 converter data ----------------------------------------------------- */
858 
859 static const UConverterImpl _UTF8Impl={
860     UCNV_UTF8,
861 
862     NULL,
863     NULL,
864 
865     NULL,
866     NULL,
867     NULL,
868 
869     ucnv_toUnicode_UTF8,
870     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
871     ucnv_fromUnicode_UTF8,
872     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
873     ucnv_getNextUChar_UTF8,
874 
875     NULL,
876     NULL,
877     NULL,
878     NULL,
879     ucnv_getNonSurrogateUnicodeSet,
880 
881     ucnv_UTF8FromUTF8,
882     ucnv_UTF8FromUTF8
883 };
884 
885 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
886 static const UConverterStaticData _UTF8StaticData={
887     sizeof(UConverterStaticData),
888     "UTF-8",
889     1208, UCNV_IBM, UCNV_UTF8,
890     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
891     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
892     0,
893     0,
894     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
895 };
896 
897 
898 const UConverterSharedData _UTF8Data=
899         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
900 
901 /* CESU-8 converter data ---------------------------------------------------- */
902 
903 static const UConverterImpl _CESU8Impl={
904     UCNV_CESU8,
905 
906     NULL,
907     NULL,
908 
909     NULL,
910     NULL,
911     NULL,
912 
913     ucnv_toUnicode_UTF8,
914     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
915     ucnv_fromUnicode_UTF8,
916     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
917     NULL,
918 
919     NULL,
920     NULL,
921     NULL,
922     NULL,
923     ucnv_getCompleteUnicodeSet,
924 
925     NULL,
926     NULL
927 };
928 
929 static const UConverterStaticData _CESU8StaticData={
930     sizeof(UConverterStaticData),
931     "CESU-8",
932     9400, /* CCSID for CESU-8 */
933     UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
934     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
935     0,
936     0,
937     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
938 };
939 
940 
941 const UConverterSharedData _CESU8Data=
942         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
943 
944 #endif
945