1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2002-2015, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  ucnvbocu.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002mar27
14 *   created by: Markus W. Scherer
15 *
16 *   This is an implementation of the Binary Ordered Compression for Unicode,
17 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
18 */
19 
20 #include "unicode/utypes.h"
21 
22 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
23 
24 #include "unicode/ucnv.h"
25 #include "unicode/ucnv_cb.h"
26 #include "unicode/utf16.h"
27 #include "putilimp.h"
28 #include "ucnv_bld.h"
29 #include "ucnv_cnv.h"
30 #include "uassert.h"
31 
32 /* BOCU-1 constants and macros ---------------------------------------------- */
33 
34 /*
35  * BOCU-1 encodes the code points of a Unicode string as
36  * a sequence of byte-encoded differences (slope detection),
37  * preserving lexical order.
38  *
39  * Optimize the difference-taking for runs of Unicode text within
40  * small scripts:
41  *
42  * Most small scripts are allocated within aligned 128-blocks of Unicode
43  * code points. Lexical order is preserved if the "previous code point" state
44  * is always moved into the middle of such a block.
45  *
46  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
47  * areas into the middle of those areas.
48  *
49  * C0 control codes and space are encoded with their US-ASCII bytes.
50  * "prev" is reset for C0 controls but not for space.
51  */
52 
/* Start value of the "prev" state: the middle of the ASCII range. */
#define BOCU1_ASCII_PREV        0x40

/* Byte values that delimit the difference encoding. */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* How many lead byte values exist: 0x21..0xfe, i.e. 222. */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/*
 * 20 C0 control byte values double as trail bytes.
 * Trail values at or above that count map to external bytes by adding
 * BOCU1_TRAIL_BYTE_OFFSET (0x21-20 = 13).
 */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* How many trail byte values exist: (0xff-0x21+1)+20 = 243. */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * How many single-byte codes exist on each side of BOCU1_MIDDLE
 * (the zero difference, ==BOCU1_MIDDLE, counts as a positive one).
 */
#define BOCU1_SINGLE            64

/* How many lead bytes start a 2-, 3- or 4-byte sequence, per sign. */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* Differences reachable with one byte: -64..63. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* Differences reachable with two bytes: -10513..10512. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* Differences reachable with three bytes: -187660..187659. */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* First lead byte of each positive multi-byte range: 0xd0, 0xfb, 0xfe. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

/*
 * Lower bound of the single-byte range (0x50) and of the negative
 * 2- and 3-byte lead ranges (0x25, 0x22).
 */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/*
 * Sequence length implied by a lead byte (!=BOCU1_RESET).
 * The lead ranges are nested, so test from the outermost (longest) inwards.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    (((lead)<BOCU1_START_NEG_4 || BOCU1_START_POS_4<=(lead)) ? 4 : \
     ((lead)<BOCU1_START_NEG_3 || BOCU1_START_POS_3<=(lead)) ? 3 : \
     ((lead)<BOCU1_START_NEG_2 || BOCU1_START_POS_2<=(lead)) ? 2 : 1)

/*
 * Sequence length of a packed value: packed forms of 1..3 bytes carry the
 * length in the top byte; anything else is a 4-byte sequence whose top
 * byte is its lead byte.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)>=0x04000000 ? 4 : (packed)>>24)
/*
 * Twelve commonly used C0 control codes, plus space, only ever stand for
 * themselves, which keeps BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * These controls are
 *  0   NUL
 *
 *  7   BEL
 *  8   BS
 *
 *  9   TAB
 *  a   LF
 *  b   VT
 *  c   FF
 *  d   CR
 *
 *  e   SO
 *  f   SI
 *
 * 1a   SUB
 * 1b   ESC
 *
 * The remaining 20 C0 controls are likewise encoded directly (to preserve
 * order) but additionally serve as trail bytes in the difference encoding
 * (for better compression).
 *
 * BOCU1_TRAIL_TO_BYTE maps a trail value t from the difference calculation
 * to its external byte: values below BOCU1_TRAIL_CONTROLS_COUNT are looked
 * up in bocu1TrailToByte[], all others are shifted by
 * BOCU1_TRAIL_BYTE_OFFSET. The argument is evaluated more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) \
    ((t)<BOCU1_TRAIL_CONTROLS_COUNT ? bocu1TrailToByte[t] : (t)+BOCU1_TRAIL_BYTE_OFFSET)
148 
149 /*
150  * Byte value map for control codes,
151  * from external byte values 0x00..0x20
152  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
153  * External byte values that are illegal as trail bytes are mapped to -1.
154  */
155 static const int8_t
156 bocu1ByteToTrail[BOCU1_MIN]={
157 /*  0     1     2     3     4     5     6     7    */
158     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
159 
160 /*  8     9     a     b     c     d     e     f    */
161     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
162 
163 /*  10    11    12    13    14    15    16    17   */
164     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
165 
166 /*  18    19    1a    1b    1c    1d    1e    1f   */
167     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
168 
169 /*  20   */
170     -1
171 };
172 
173 /*
174  * Byte value map for control codes,
175  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
176  * to external byte values 0x00..0x20.
177  */
178 static const int8_t
179 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
180 /*  0     1     2     3     4     5     6     7    */
181     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
182 
183 /*  8     9     a     b     c     d     e     f    */
184     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
185 
186 /*  10    11    12    13   */
187     0x1c, 0x1d, 0x1e, 0x1f
188 };
189 
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0
 * (for a positive divisor d).
 *
 * For non-negative n, the if() condition is always false.
 *
 * The body is wrapped in do { ... } while(0) so that the macro behaves as a
 * single statement, including in an unbraced if/else; invoke it with a
 * trailing semicolon. n and m are evaluated more than once, so pass plain
 * variables.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
211 
/* Shortcuts that let callers handle short encodings without packDiff(). */

/** True if diff lies within the single-byte reach. */
#define DIFF_IS_SINGLE(diff) ((diff)>=BOCU1_REACH_NEG_1 && (diff)<=BOCU1_REACH_POS_1)

/** The one byte that encodes a single-byte diff: an offset from BOCU1_MIDDLE. */
#define PACK_SINGLE_DIFF(diff) ((diff)+BOCU1_MIDDLE)

/** True if diff lies within the two-byte reach (which includes the single-byte reach). */
#define DIFF_IS_DOUBLE(diff) ((diff)>=BOCU1_REACH_NEG_2 && (diff)<=BOCU1_REACH_POS_2)
222 
223 /* BOCU-1 implementation functions ------------------------------------------ */
224 
/* Middle (offset BOCU1_ASCII_PREV) of the aligned 128-block that contains c. */
#define BOCU1_SIMPLE_PREV(c) (BOCU1_ASCII_PREV+((c)&~0x7f))
226 
227 /**
228  * Compute the next "previous" value for differencing
229  * from the current code point.
230  *
231  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
232  * @return "previous code point" state value
233  */
234 static inline int32_t
bocu1Prev(int32_t c)235 bocu1Prev(int32_t c) {
236     /* compute new prev */
237     if(/* 0x3040<=c && */ c<=0x309f) {
238         /* Hiragana is not 128-aligned */
239         return 0x3070;
240     } else if(0x4e00<=c && c<=0x9fa5) {
241         /* CJK Unihan */
242         return 0x4e00-BOCU1_REACH_NEG_2;
243     } else if(0xac00<=c /* && c<=0xd7a3 */) {
244         /* Korean Hangul */
245         return (0xd7a3+0xac00)/2;
246     } else {
247         /* mostly small scripts */
248         return BOCU1_SIMPLE_PREV(c);
249     }
250 }
251 
/**
 * Next "prev" for code point c: bocu1Prev() is only needed inside
 * 0x3040..0xd7a3; every other code point takes the simple 128-block form.
 */
#define BOCU1_PREV(c) ((c)>=0x3040 && (c)<=0xd7a3 ? bocu1Prev(c) : BOCU1_SIMPLE_PREV(c))
254 
255 /*
256  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
257  * The UConverter fields are used as follows:
258  *
259  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
260  *
261  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
262  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
263  */
264 
265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
266 
267 /**
268  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
269  * and return a packed integer with them.
270  *
271  * The encoding favors small absolute differences with short encodings
272  * to compress runs of same-script characters.
273  *
274  * Optimized version with unrolled loops and fewer floating-point operations
275  * than the standard packDiff().
276  *
277  * @param diff difference value -0x10ffff..0x10ffff
278  * @return
279  *      0x010000zz for 1-byte sequence zz
280  *      0x0200yyzz for 2-byte sequence yy zz
281  *      0x03xxyyzz for 3-byte sequence xx yy zz
282  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
283  */
284 static int32_t
packDiff(int32_t diff)285 packDiff(int32_t diff) {
286     int32_t result, m;
287 
288     U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
289     if(diff>=BOCU1_REACH_NEG_1) {
290         /* mostly positive differences, and single-byte negative ones */
291 #if 0   /* single-byte case handled in macros, see below */
292         if(diff<=BOCU1_REACH_POS_1) {
293             /* single byte */
294             return 0x01000000|(BOCU1_MIDDLE+diff);
295         } else
296 #endif
297         if(diff<=BOCU1_REACH_POS_2) {
298             /* two bytes */
299             diff-=BOCU1_REACH_POS_1+1;
300             result=0x02000000;
301 
302             m=diff%BOCU1_TRAIL_COUNT;
303             diff/=BOCU1_TRAIL_COUNT;
304             result|=BOCU1_TRAIL_TO_BYTE(m);
305 
306             result|=(BOCU1_START_POS_2+diff)<<8;
307         } else if(diff<=BOCU1_REACH_POS_3) {
308             /* three bytes */
309             diff-=BOCU1_REACH_POS_2+1;
310             result=0x03000000;
311 
312             m=diff%BOCU1_TRAIL_COUNT;
313             diff/=BOCU1_TRAIL_COUNT;
314             result|=BOCU1_TRAIL_TO_BYTE(m);
315 
316             m=diff%BOCU1_TRAIL_COUNT;
317             diff/=BOCU1_TRAIL_COUNT;
318             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
319 
320             result|=(BOCU1_START_POS_3+diff)<<16;
321         } else {
322             /* four bytes */
323             diff-=BOCU1_REACH_POS_3+1;
324 
325             m=diff%BOCU1_TRAIL_COUNT;
326             diff/=BOCU1_TRAIL_COUNT;
327             result=BOCU1_TRAIL_TO_BYTE(m);
328 
329             m=diff%BOCU1_TRAIL_COUNT;
330             diff/=BOCU1_TRAIL_COUNT;
331             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
332 
333             /*
334              * We know that / and % would deliver quotient 0 and rest=diff.
335              * Avoid division and modulo for performance.
336              */
337             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
338 
339             result|=((uint32_t)BOCU1_START_POS_4)<<24;
340         }
341     } else {
342         /* two- to four-byte negative differences */
343         if(diff>=BOCU1_REACH_NEG_2) {
344             /* two bytes */
345             diff-=BOCU1_REACH_NEG_1;
346             result=0x02000000;
347 
348             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
349             result|=BOCU1_TRAIL_TO_BYTE(m);
350 
351             result|=(BOCU1_START_NEG_2+diff)<<8;
352         } else if(diff>=BOCU1_REACH_NEG_3) {
353             /* three bytes */
354             diff-=BOCU1_REACH_NEG_2;
355             result=0x03000000;
356 
357             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
358             result|=BOCU1_TRAIL_TO_BYTE(m);
359 
360             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
361             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
362 
363             result|=(BOCU1_START_NEG_3+diff)<<16;
364         } else {
365             /* four bytes */
366             diff-=BOCU1_REACH_NEG_3;
367 
368             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
369             result=BOCU1_TRAIL_TO_BYTE(m);
370 
371             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
372             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
373 
374             /*
375              * We know that NEGDIVMOD would deliver
376              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
377              * Avoid division and modulo for performance.
378              */
379             m=diff+BOCU1_TRAIL_COUNT;
380             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
381 
382             result|=BOCU1_MIN<<24;
383         }
384     }
385     return result;
386 }
387 
388 
389 static void
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
391                              UErrorCode *pErrorCode) {
392     UConverter *cnv;
393     const UChar *source, *sourceLimit;
394     uint8_t *target;
395     int32_t targetCapacity;
396     int32_t *offsets;
397 
398     int32_t prev, c, diff;
399 
400     int32_t sourceIndex, nextSourceIndex;
401 
402     /* set up the local pointers */
403     cnv=pArgs->converter;
404     source=pArgs->source;
405     sourceLimit=pArgs->sourceLimit;
406     target=(uint8_t *)pArgs->target;
407     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
408     offsets=pArgs->offsets;
409 
410     /* get the converter state from UConverter */
411     c=cnv->fromUChar32;
412     prev=(int32_t)cnv->fromUnicodeStatus;
413     if(prev==0) {
414         prev=BOCU1_ASCII_PREV;
415     }
416 
417     /* sourceIndex=-1 if the current character began in the previous buffer */
418     sourceIndex= c==0 ? 0 : -1;
419     nextSourceIndex=0;
420 
421     /* conversion loop */
422     if(c!=0 && targetCapacity>0) {
423         goto getTrail;
424     }
425 
426 fastSingle:
427     /* fast loop for single-byte differences */
428     /* use only one loop counter variable, targetCapacity, not also source */
429     diff=(int32_t)(sourceLimit-source);
430     if(targetCapacity>diff) {
431         targetCapacity=diff;
432     }
433     while(targetCapacity>0 && (c=*source)<0x3000) {
434         if(c<=0x20) {
435             if(c!=0x20) {
436                 prev=BOCU1_ASCII_PREV;
437             }
438             *target++=(uint8_t)c;
439             *offsets++=nextSourceIndex++;
440             ++source;
441             --targetCapacity;
442         } else {
443             diff=c-prev;
444             if(DIFF_IS_SINGLE(diff)) {
445                 prev=BOCU1_SIMPLE_PREV(c);
446                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
447                 *offsets++=nextSourceIndex++;
448                 ++source;
449                 --targetCapacity;
450             } else {
451                 break;
452             }
453         }
454     }
455     /* restore real values */
456     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
457     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
458 
459     /* regular loop for all cases */
460     while(source<sourceLimit) {
461         if(targetCapacity>0) {
462             c=*source++;
463             ++nextSourceIndex;
464 
465             if(c<=0x20) {
466                 /*
467                  * ISO C0 control & space:
468                  * Encode directly for MIME compatibility,
469                  * and reset state except for space, to not disrupt compression.
470                  */
471                 if(c!=0x20) {
472                     prev=BOCU1_ASCII_PREV;
473                 }
474                 *target++=(uint8_t)c;
475                 *offsets++=sourceIndex;
476                 --targetCapacity;
477 
478                 sourceIndex=nextSourceIndex;
479                 continue;
480             }
481 
482             if(U16_IS_LEAD(c)) {
483 getTrail:
484                 if(source<sourceLimit) {
485                     /* test the following code unit */
486                     UChar trail=*source;
487                     if(U16_IS_TRAIL(trail)) {
488                         ++source;
489                         ++nextSourceIndex;
490                         c=U16_GET_SUPPLEMENTARY(c, trail);
491                     }
492                 } else {
493                     /* no more input */
494                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
495                     break;
496                 }
497             }
498 
499             /*
500              * all other Unicode code points c==U+0021..U+10ffff
501              * are encoded with the difference c-prev
502              *
503              * a new prev is computed from c,
504              * placed in the middle of a 0x80-block (for most small scripts) or
505              * in the middle of the Unihan and Hangul blocks
506              * to statistically minimize the following difference
507              */
508             diff=c-prev;
509             prev=BOCU1_PREV(c);
510             if(DIFF_IS_SINGLE(diff)) {
511                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
512                 *offsets++=sourceIndex;
513                 --targetCapacity;
514                 sourceIndex=nextSourceIndex;
515                 if(c<0x3000) {
516                     goto fastSingle;
517                 }
518             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
519                 /* optimize 2-byte case */
520                 int32_t m;
521 
522                 if(diff>=0) {
523                     diff-=BOCU1_REACH_POS_1+1;
524                     m=diff%BOCU1_TRAIL_COUNT;
525                     diff/=BOCU1_TRAIL_COUNT;
526                     diff+=BOCU1_START_POS_2;
527                 } else {
528                     diff-=BOCU1_REACH_NEG_1;
529                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
530                     diff+=BOCU1_START_NEG_2;
531                 }
532                 *target++=(uint8_t)diff;
533                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
534                 *offsets++=sourceIndex;
535                 *offsets++=sourceIndex;
536                 targetCapacity-=2;
537                 sourceIndex=nextSourceIndex;
538             } else {
539                 int32_t length; /* will be 2..4 */
540 
541                 diff=packDiff(diff);
542                 length=BOCU1_LENGTH_FROM_PACKED(diff);
543 
544                 /* write the output character bytes from diff and length */
545                 /* from the first if in the loop we know that targetCapacity>0 */
546                 if(length<=targetCapacity) {
547                     switch(length) {
548                         /* each branch falls through to the next one */
549                     case 4:
550                         *target++=(uint8_t)(diff>>24);
551                         *offsets++=sourceIndex;
552                     case 3: /*fall through*/
553                         *target++=(uint8_t)(diff>>16);
554                         *offsets++=sourceIndex;
555                     case 2: /*fall through*/
556                         *target++=(uint8_t)(diff>>8);
557                         *offsets++=sourceIndex;
558                     /* case 1: handled above */
559                         *target++=(uint8_t)diff;
560                         *offsets++=sourceIndex;
561                     default:
562                         /* will never occur */
563                         break;
564                     }
565                     targetCapacity-=length;
566                     sourceIndex=nextSourceIndex;
567                 } else {
568                     uint8_t *charErrorBuffer;
569 
570                     /*
571                      * We actually do this backwards here:
572                      * In order to save an intermediate variable, we output
573                      * first to the overflow buffer what does not fit into the
574                      * regular target.
575                      */
576                     /* we know that 1<=targetCapacity<length<=4 */
577                     length-=targetCapacity;
578                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
579                     switch(length) {
580                         /* each branch falls through to the next one */
581                     case 3:
582                         *charErrorBuffer++=(uint8_t)(diff>>16);
583                     case 2: /*fall through*/
584                         *charErrorBuffer++=(uint8_t)(diff>>8);
585                     case 1: /*fall through*/
586                         *charErrorBuffer=(uint8_t)diff;
587                     default:
588                         /* will never occur */
589                         break;
590                     }
591                     cnv->charErrorBufferLength=(int8_t)length;
592 
593                     /* now output what fits into the regular target */
594                     diff>>=8*length; /* length was reduced by targetCapacity */
595                     switch(targetCapacity) {
596                         /* each branch falls through to the next one */
597                     case 3:
598                         *target++=(uint8_t)(diff>>16);
599                         *offsets++=sourceIndex;
600                     case 2: /*fall through*/
601                         *target++=(uint8_t)(diff>>8);
602                         *offsets++=sourceIndex;
603                     case 1: /*fall through*/
604                         *target++=(uint8_t)diff;
605                         *offsets++=sourceIndex;
606                     default:
607                         /* will never occur */
608                         break;
609                     }
610 
611                     /* target overflow */
612                     targetCapacity=0;
613                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
614                     break;
615                 }
616             }
617         } else {
618             /* target is full */
619             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
620             break;
621         }
622     }
623 
624     /* set the converter state back into UConverter */
625     cnv->fromUChar32= c<0 ? -c : 0;
626     cnv->fromUnicodeStatus=(uint32_t)prev;
627 
628     /* write back the updated pointers */
629     pArgs->source=source;
630     pArgs->target=(char *)target;
631     pArgs->offsets=offsets;
632 }
633 
634 /*
635  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
636  * If a change is made in the original function, then either
637  * change this function the same way or
638  * re-copy the original function and remove the variables
639  * offsets, sourceIndex, and nextSourceIndex.
640  */
641 static void
_Bocu1FromUnicode(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)642 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
643                   UErrorCode *pErrorCode) {
644     UConverter *cnv;
645     const UChar *source, *sourceLimit;
646     uint8_t *target;
647     int32_t targetCapacity;
648 
649     int32_t prev, c, diff;
650 
651     /* set up the local pointers */
652     cnv=pArgs->converter;
653     source=pArgs->source;
654     sourceLimit=pArgs->sourceLimit;
655     target=(uint8_t *)pArgs->target;
656     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
657 
658     /* get the converter state from UConverter */
659     c=cnv->fromUChar32;
660     prev=(int32_t)cnv->fromUnicodeStatus;
661     if(prev==0) {
662         prev=BOCU1_ASCII_PREV;
663     }
664 
665     /* conversion loop */
666     if(c!=0 && targetCapacity>0) {
667         goto getTrail;
668     }
669 
670 fastSingle:
671     /* fast loop for single-byte differences */
672     /* use only one loop counter variable, targetCapacity, not also source */
673     diff=(int32_t)(sourceLimit-source);
674     if(targetCapacity>diff) {
675         targetCapacity=diff;
676     }
677     while(targetCapacity>0 && (c=*source)<0x3000) {
678         if(c<=0x20) {
679             if(c!=0x20) {
680                 prev=BOCU1_ASCII_PREV;
681             }
682             *target++=(uint8_t)c;
683         } else {
684             diff=c-prev;
685             if(DIFF_IS_SINGLE(diff)) {
686                 prev=BOCU1_SIMPLE_PREV(c);
687                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
688             } else {
689                 break;
690             }
691         }
692         ++source;
693         --targetCapacity;
694     }
695     /* restore real values */
696     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
697 
698     /* regular loop for all cases */
699     while(source<sourceLimit) {
700         if(targetCapacity>0) {
701             c=*source++;
702 
703             if(c<=0x20) {
704                 /*
705                  * ISO C0 control & space:
706                  * Encode directly for MIME compatibility,
707                  * and reset state except for space, to not disrupt compression.
708                  */
709                 if(c!=0x20) {
710                     prev=BOCU1_ASCII_PREV;
711                 }
712                 *target++=(uint8_t)c;
713                 --targetCapacity;
714                 continue;
715             }
716 
717             if(U16_IS_LEAD(c)) {
718 getTrail:
719                 if(source<sourceLimit) {
720                     /* test the following code unit */
721                     UChar trail=*source;
722                     if(U16_IS_TRAIL(trail)) {
723                         ++source;
724                         c=U16_GET_SUPPLEMENTARY(c, trail);
725                     }
726                 } else {
727                     /* no more input */
728                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
729                     break;
730                 }
731             }
732 
733             /*
734              * all other Unicode code points c==U+0021..U+10ffff
735              * are encoded with the difference c-prev
736              *
737              * a new prev is computed from c,
738              * placed in the middle of a 0x80-block (for most small scripts) or
739              * in the middle of the Unihan and Hangul blocks
740              * to statistically minimize the following difference
741              */
742             diff=c-prev;
743             prev=BOCU1_PREV(c);
744             if(DIFF_IS_SINGLE(diff)) {
745                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
746                 --targetCapacity;
747                 if(c<0x3000) {
748                     goto fastSingle;
749                 }
750             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
751                 /* optimize 2-byte case */
752                 int32_t m;
753 
754                 if(diff>=0) {
755                     diff-=BOCU1_REACH_POS_1+1;
756                     m=diff%BOCU1_TRAIL_COUNT;
757                     diff/=BOCU1_TRAIL_COUNT;
758                     diff+=BOCU1_START_POS_2;
759                 } else {
760                     diff-=BOCU1_REACH_NEG_1;
761                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
762                     diff+=BOCU1_START_NEG_2;
763                 }
764                 *target++=(uint8_t)diff;
765                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
766                 targetCapacity-=2;
767             } else {
768                 int32_t length; /* will be 2..4 */
769 
770                 diff=packDiff(diff);
771                 length=BOCU1_LENGTH_FROM_PACKED(diff);
772 
773                 /* write the output character bytes from diff and length */
774                 /* from the first if in the loop we know that targetCapacity>0 */
775                 if(length<=targetCapacity) {
776                     switch(length) {
777                         /* each branch falls through to the next one */
778                     case 4:
779                         *target++=(uint8_t)(diff>>24);
780                     case 3: /*fall through*/
781                         *target++=(uint8_t)(diff>>16);
782                     /* case 2: handled above */
783                         *target++=(uint8_t)(diff>>8);
784                     /* case 1: handled above */
785                         *target++=(uint8_t)diff;
786                     default:
787                         /* will never occur */
788                         break;
789                     }
790                     targetCapacity-=length;
791                 } else {
792                     uint8_t *charErrorBuffer;
793 
794                     /*
795                      * We actually do this backwards here:
796                      * In order to save an intermediate variable, we output
797                      * first to the overflow buffer what does not fit into the
798                      * regular target.
799                      */
800                     /* we know that 1<=targetCapacity<length<=4 */
801                     length-=targetCapacity;
802                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
803                     switch(length) {
804                         /* each branch falls through to the next one */
805                     case 3:
806                         *charErrorBuffer++=(uint8_t)(diff>>16);
807                     case 2: /*fall through*/
808                         *charErrorBuffer++=(uint8_t)(diff>>8);
809                     case 1: /*fall through*/
810                         *charErrorBuffer=(uint8_t)diff;
811                     default:
812                         /* will never occur */
813                         break;
814                     }
815                     cnv->charErrorBufferLength=(int8_t)length;
816 
817                     /* now output what fits into the regular target */
818                     diff>>=8*length; /* length was reduced by targetCapacity */
819                     switch(targetCapacity) {
820                         /* each branch falls through to the next one */
821                     case 3:
822                         *target++=(uint8_t)(diff>>16);
823                     case 2: /*fall through*/
824                         *target++=(uint8_t)(diff>>8);
825                     case 1: /*fall through*/
826                         *target++=(uint8_t)diff;
827                     default:
828                         /* will never occur */
829                         break;
830                     }
831 
832                     /* target overflow */
833                     targetCapacity=0;
834                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
835                     break;
836                 }
837             }
838         } else {
839             /* target is full */
840             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
841             break;
842         }
843     }
844 
845     /* set the converter state back into UConverter */
846     cnv->fromUChar32= c<0 ? -c : 0;
847     cnv->fromUnicodeStatus=(uint32_t)prev;
848 
849     /* write back the updated pointers */
850     pArgs->source=source;
851     pArgs->target=(char *)target;
852 }
853 
854 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
855 
856 /**
857  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
858  *
859  * @param b lead byte;
860  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
861  * @return (diff<<2)|count
862  */
863 static inline int32_t
decodeBocu1LeadByte(int32_t b)864 decodeBocu1LeadByte(int32_t b) {
865     int32_t diff, count;
866 
867     if(b>=BOCU1_START_NEG_2) {
868         /* positive difference */
869         if(b<BOCU1_START_POS_3) {
870             /* two bytes */
871             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
872             count=1;
873         } else if(b<BOCU1_START_POS_4) {
874             /* three bytes */
875             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
876             count=2;
877         } else {
878             /* four bytes */
879             diff=BOCU1_REACH_POS_3+1;
880             count=3;
881         }
882     } else {
883         /* negative difference */
884         if(b>=BOCU1_START_NEG_3) {
885             /* two bytes */
886             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
887             count=1;
888         } else if(b>BOCU1_MIN) {
889             /* three bytes */
890             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
891             count=2;
892         } else {
893             /* four bytes */
894             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
895             count=3;
896         }
897     }
898 
899     /* return the state for decoding the trail byte(s) */
900     return (diff<<2)|count;
901 }
902 
903 /**
904  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
905  *
906  * @param count number of remaining trail bytes including this one
907  * @param b trail byte
908  * @return new delta for diff including b - <0 indicates an error
909  *
910  * @see decodeBocu1
911  */
912 static inline int32_t
decodeBocu1TrailByte(int32_t count,int32_t b)913 decodeBocu1TrailByte(int32_t count, int32_t b) {
914     if(b<=0x20) {
915         /* skip some C0 controls and make the trail byte range contiguous */
916         b=bocu1ByteToTrail[b];
917         /* b<0 for an illegal trail byte value will result in return<0 below */
918 #if BOCU1_MAX_TRAIL<0xff
919     } else if(b>BOCU1_MAX_TRAIL) {
920         return -99;
921 #endif
922     } else {
923         b-=BOCU1_TRAIL_BYTE_OFFSET;
924     }
925 
926     /* add trail byte into difference and decrement count */
927     if(count==1) {
928         return b;
929     } else if(count==2) {
930         return b*BOCU1_TRAIL_COUNT;
931     } else /* count==3 */ {
932         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
933     }
934 }
935 
936 static void
_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)937 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
938                            UErrorCode *pErrorCode) {
939     UConverter *cnv;
940     const uint8_t *source, *sourceLimit;
941     UChar *target;
942     const UChar *targetLimit;
943     int32_t *offsets;
944 
945     int32_t prev, count, diff, c;
946 
947     int8_t byteIndex;
948     uint8_t *bytes;
949 
950     int32_t sourceIndex, nextSourceIndex;
951 
952     /* set up the local pointers */
953     cnv=pArgs->converter;
954     source=(const uint8_t *)pArgs->source;
955     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
956     target=pArgs->target;
957     targetLimit=pArgs->targetLimit;
958     offsets=pArgs->offsets;
959 
960     /* get the converter state from UConverter */
961     prev=(int32_t)cnv->toUnicodeStatus;
962     if(prev==0) {
963         prev=BOCU1_ASCII_PREV;
964     }
965     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
966     count=diff&3;
967     diff>>=2;
968 
969     byteIndex=cnv->toULength;
970     bytes=cnv->toUBytes;
971 
972     /* sourceIndex=-1 if the current character began in the previous buffer */
973     sourceIndex=byteIndex==0 ? 0 : -1;
974     nextSourceIndex=0;
975 
976     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
977     if(count>0 && byteIndex>0 && target<targetLimit) {
978         goto getTrail;
979     }
980 
981 fastSingle:
982     /* fast loop for single-byte differences */
983     /* use count as the only loop counter variable */
984     diff=(int32_t)(sourceLimit-source);
985     count=(int32_t)(pArgs->targetLimit-target);
986     if(count>diff) {
987         count=diff;
988     }
989     while(count>0) {
990         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
991             c=prev+(c-BOCU1_MIDDLE);
992             if(c<0x3000) {
993                 *target++=(UChar)c;
994                 *offsets++=nextSourceIndex++;
995                 prev=BOCU1_SIMPLE_PREV(c);
996             } else {
997                 break;
998             }
999         } else if(c<=0x20) {
1000             if(c!=0x20) {
1001                 prev=BOCU1_ASCII_PREV;
1002             }
1003             *target++=(UChar)c;
1004             *offsets++=nextSourceIndex++;
1005         } else {
1006             break;
1007         }
1008         ++source;
1009         --count;
1010     }
1011     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1012 
1013     /* decode a sequence of single and lead bytes */
1014     while(source<sourceLimit) {
1015         if(target>=targetLimit) {
1016             /* target is full */
1017             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1018             break;
1019         }
1020 
1021         ++nextSourceIndex;
1022         c=*source++;
1023         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1024             /* Write a code point directly from a single-byte difference. */
1025             c=prev+(c-BOCU1_MIDDLE);
1026             if(c<0x3000) {
1027                 *target++=(UChar)c;
1028                 *offsets++=sourceIndex;
1029                 prev=BOCU1_SIMPLE_PREV(c);
1030                 sourceIndex=nextSourceIndex;
1031                 goto fastSingle;
1032             }
1033         } else if(c<=0x20) {
1034             /*
1035              * Direct-encoded C0 control code or space.
1036              * Reset prev for C0 control codes but not for space.
1037              */
1038             if(c!=0x20) {
1039                 prev=BOCU1_ASCII_PREV;
1040             }
1041             *target++=(UChar)c;
1042             *offsets++=sourceIndex;
1043             sourceIndex=nextSourceIndex;
1044             continue;
1045         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1046             /* Optimize two-byte case. */
1047             if(c>=BOCU1_MIDDLE) {
1048                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1049             } else {
1050                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1051             }
1052 
1053             /* trail byte */
1054             ++nextSourceIndex;
1055             c=decodeBocu1TrailByte(1, *source++);
1056             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1057                 bytes[0]=source[-2];
1058                 bytes[1]=source[-1];
1059                 byteIndex=2;
1060                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1061                 break;
1062             }
1063         } else if(c==BOCU1_RESET) {
1064             /* only reset the state, no code point */
1065             prev=BOCU1_ASCII_PREV;
1066             sourceIndex=nextSourceIndex;
1067             continue;
1068         } else {
1069             /*
1070              * For multi-byte difference lead bytes, set the decoder state
1071              * with the partial difference value from the lead byte and
1072              * with the number of trail bytes.
1073              */
1074             bytes[0]=(uint8_t)c;
1075             byteIndex=1;
1076 
1077             diff=decodeBocu1LeadByte(c);
1078             count=diff&3;
1079             diff>>=2;
1080 getTrail:
1081             for(;;) {
1082                 if(source>=sourceLimit) {
1083                     goto endloop;
1084                 }
1085                 ++nextSourceIndex;
1086                 c=bytes[byteIndex++]=*source++;
1087 
1088                 /* trail byte in any position */
1089                 c=decodeBocu1TrailByte(count, c);
1090                 if(c<0) {
1091                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1092                     goto endloop;
1093                 }
1094 
1095                 diff+=c;
1096                 if(--count==0) {
1097                     /* final trail byte, deliver a code point */
1098                     byteIndex=0;
1099                     c=prev+diff;
1100                     if((uint32_t)c>0x10ffff) {
1101                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1102                         goto endloop;
1103                     }
1104                     break;
1105                 }
1106             }
1107         }
1108 
1109         /* calculate the next prev and output c */
1110         prev=BOCU1_PREV(c);
1111         if(c<=0xffff) {
1112             *target++=(UChar)c;
1113             *offsets++=sourceIndex;
1114         } else {
1115             /* output surrogate pair */
1116             *target++=U16_LEAD(c);
1117             if(target<targetLimit) {
1118                 *target++=U16_TRAIL(c);
1119                 *offsets++=sourceIndex;
1120                 *offsets++=sourceIndex;
1121             } else {
1122                 /* target overflow */
1123                 *offsets++=sourceIndex;
1124                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1125                 cnv->UCharErrorBufferLength=1;
1126                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1127                 break;
1128             }
1129         }
1130         sourceIndex=nextSourceIndex;
1131     }
1132 endloop:
1133 
1134     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1135         /* set the converter state in UConverter to deal with the next character */
1136         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1137         cnv->mode=0;
1138     } else {
1139         /* set the converter state back into UConverter */
1140         cnv->toUnicodeStatus=(uint32_t)prev;
1141         cnv->mode=(diff<<2)|count;
1142     }
1143     cnv->toULength=byteIndex;
1144 
1145     /* write back the updated pointers */
1146     pArgs->source=(const char *)source;
1147     pArgs->target=target;
1148     pArgs->offsets=offsets;
1149     return;
1150 }
1151 
1152 /*
1153  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1154  * If a change is made in the original function, then either
1155  * change this function the same way or
1156  * re-copy the original function and remove the variables
1157  * offsets, sourceIndex, and nextSourceIndex.
1158  */
1159 static void
_Bocu1ToUnicode(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1160 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1161                 UErrorCode *pErrorCode) {
1162     UConverter *cnv;
1163     const uint8_t *source, *sourceLimit;
1164     UChar *target;
1165     const UChar *targetLimit;
1166 
1167     int32_t prev, count, diff, c;
1168 
1169     int8_t byteIndex;
1170     uint8_t *bytes;
1171 
1172     /* set up the local pointers */
1173     cnv=pArgs->converter;
1174     source=(const uint8_t *)pArgs->source;
1175     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1176     target=pArgs->target;
1177     targetLimit=pArgs->targetLimit;
1178 
1179     /* get the converter state from UConverter */
1180     prev=(int32_t)cnv->toUnicodeStatus;
1181     if(prev==0) {
1182         prev=BOCU1_ASCII_PREV;
1183     }
1184     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1185     count=diff&3;
1186     diff>>=2;
1187 
1188     byteIndex=cnv->toULength;
1189     bytes=cnv->toUBytes;
1190 
1191     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1192     if(count>0 && byteIndex>0 && target<targetLimit) {
1193         goto getTrail;
1194     }
1195 
1196 fastSingle:
1197     /* fast loop for single-byte differences */
1198     /* use count as the only loop counter variable */
1199     diff=(int32_t)(sourceLimit-source);
1200     count=(int32_t)(pArgs->targetLimit-target);
1201     if(count>diff) {
1202         count=diff;
1203     }
1204     while(count>0) {
1205         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1206             c=prev+(c-BOCU1_MIDDLE);
1207             if(c<0x3000) {
1208                 *target++=(UChar)c;
1209                 prev=BOCU1_SIMPLE_PREV(c);
1210             } else {
1211                 break;
1212             }
1213         } else if(c<=0x20) {
1214             if(c!=0x20) {
1215                 prev=BOCU1_ASCII_PREV;
1216             }
1217             *target++=(UChar)c;
1218         } else {
1219             break;
1220         }
1221         ++source;
1222         --count;
1223     }
1224 
1225     /* decode a sequence of single and lead bytes */
1226     while(source<sourceLimit) {
1227         if(target>=targetLimit) {
1228             /* target is full */
1229             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1230             break;
1231         }
1232 
1233         c=*source++;
1234         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1235             /* Write a code point directly from a single-byte difference. */
1236             c=prev+(c-BOCU1_MIDDLE);
1237             if(c<0x3000) {
1238                 *target++=(UChar)c;
1239                 prev=BOCU1_SIMPLE_PREV(c);
1240                 goto fastSingle;
1241             }
1242         } else if(c<=0x20) {
1243             /*
1244              * Direct-encoded C0 control code or space.
1245              * Reset prev for C0 control codes but not for space.
1246              */
1247             if(c!=0x20) {
1248                 prev=BOCU1_ASCII_PREV;
1249             }
1250             *target++=(UChar)c;
1251             continue;
1252         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1253             /* Optimize two-byte case. */
1254             if(c>=BOCU1_MIDDLE) {
1255                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1256             } else {
1257                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1258             }
1259 
1260             /* trail byte */
1261             c=decodeBocu1TrailByte(1, *source++);
1262             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1263                 bytes[0]=source[-2];
1264                 bytes[1]=source[-1];
1265                 byteIndex=2;
1266                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1267                 break;
1268             }
1269         } else if(c==BOCU1_RESET) {
1270             /* only reset the state, no code point */
1271             prev=BOCU1_ASCII_PREV;
1272             continue;
1273         } else {
1274             /*
1275              * For multi-byte difference lead bytes, set the decoder state
1276              * with the partial difference value from the lead byte and
1277              * with the number of trail bytes.
1278              */
1279             bytes[0]=(uint8_t)c;
1280             byteIndex=1;
1281 
1282             diff=decodeBocu1LeadByte(c);
1283             count=diff&3;
1284             diff>>=2;
1285 getTrail:
1286             for(;;) {
1287                 if(source>=sourceLimit) {
1288                     goto endloop;
1289                 }
1290                 c=bytes[byteIndex++]=*source++;
1291 
1292                 /* trail byte in any position */
1293                 c=decodeBocu1TrailByte(count, c);
1294                 if(c<0) {
1295                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1296                     goto endloop;
1297                 }
1298 
1299                 diff+=c;
1300                 if(--count==0) {
1301                     /* final trail byte, deliver a code point */
1302                     byteIndex=0;
1303                     c=prev+diff;
1304                     if((uint32_t)c>0x10ffff) {
1305                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1306                         goto endloop;
1307                     }
1308                     break;
1309                 }
1310             }
1311         }
1312 
1313         /* calculate the next prev and output c */
1314         prev=BOCU1_PREV(c);
1315         if(c<=0xffff) {
1316             *target++=(UChar)c;
1317         } else {
1318             /* output surrogate pair */
1319             *target++=U16_LEAD(c);
1320             if(target<targetLimit) {
1321                 *target++=U16_TRAIL(c);
1322             } else {
1323                 /* target overflow */
1324                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1325                 cnv->UCharErrorBufferLength=1;
1326                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1327                 break;
1328             }
1329         }
1330     }
1331 endloop:
1332 
1333     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1334         /* set the converter state in UConverter to deal with the next character */
1335         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1336         cnv->mode=0;
1337     } else {
1338         /* set the converter state back into UConverter */
1339         cnv->toUnicodeStatus=(uint32_t)prev;
1340         cnv->mode=(diff<<2)|count;
1341     }
1342     cnv->toULength=byteIndex;
1343 
1344     /* write back the updated pointers */
1345     pArgs->source=(const char *)source;
1346     pArgs->target=target;
1347     return;
1348 }
1349 
1350 /* miscellaneous ------------------------------------------------------------ */
1351 
/*
 * Implementation table for the BOCU-1 converter.
 * Besides the converter type, only the two to-Unicode functions, the two
 * from-Unicode functions and ucnv_getCompleteUnicodeSet are installed;
 * every other slot is NULL.
 * NOTE(review): the initializer is positional, so the meaning of each slot
 * is defined by the UConverterImpl declaration, which is not visible here --
 * confirm against that declaration before adding or reordering entries.
 */
static const UConverterImpl _Bocu1Impl={
    UCNV_BOCU1,

    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    /* conversion functions, each in a plain and a with-offsets variant */
    _Bocu1ToUnicode,
    _Bocu1ToUnicodeWithOffsets,
    _Bocu1FromUnicode,
    _Bocu1FromUnicodeWithOffsets,
    NULL,

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getCompleteUnicodeSet,

    NULL,
    NULL
};
1377 
/*
 * Static description of the BOCU-1 converter: structure size, converter
 * name, CCSID, platform and converter type, bytes-per-UChar bounds and the
 * substitution character.
 * NOTE(review): the trailing FALSE/0 entries are positional and unlabeled
 * here -- check the UConverterStaticData declaration for their meaning.
 */
static const UConverterStaticData _Bocu1StaticData={
    sizeof(UConverterStaticData),
    "BOCU-1",
    1214, /* CCSID for BOCU-1 */
    UCNV_IBM, UCNV_BOCU1,
    1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
    { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1390 
/*
 * Shared-data object for the BOCU-1 converter; unlike the two tables it
 * refers to (_Bocu1StaticData and _Bocu1Impl), it is not declared static.
 */
const UConverterSharedData _Bocu1Data=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
1393 
1394 #endif
1395