1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2002-2015, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: ucnvbocu.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002mar27
14 * created by: Markus W. Scherer
15 *
16 * This is an implementation of the Binary Ordered Compression for Unicode,
17 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
23
24 #include "unicode/ucnv.h"
25 #include "unicode/ucnv_cb.h"
26 #include "unicode/utf16.h"
27 #include "putilimp.h"
28 #include "ucnv_bld.h"
29 #include "ucnv_cnv.h"
30 #include "uassert.h"
31
32 /* BOCU-1 constants and macros ---------------------------------------------- */
33
34 /*
35 * BOCU-1 encodes the code points of a Unicode string as
36 * a sequence of byte-encoded differences (slope detection),
37 * preserving lexical order.
38 *
39 * Optimize the difference-taking for runs of Unicode text within
40 * small scripts:
41 *
42 * Most small scripts are allocated within aligned 128-blocks of Unicode
43 * code points. Lexical order is preserved if the "previous code point" state
44 * is always moved into the middle of such a block.
45 *
46 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
47 * areas into the middle of those areas.
48 *
49 * C0 control codes and space are encoded with their US-ASCII bytes.
50 * "prev" is reset for C0 controls but not for space.
51 */
52
/* Starting value of the "prev" state: the middle of the ASCII range. */
#define BOCU1_ASCII_PREV            0x40

/* Byte values that delimit the encoded differences. */
#define BOCU1_MIN                   0x21
#define BOCU1_MIDDLE                0x90
#define BOCU1_MAX_LEAD              0xfe
#define BOCU1_MAX_TRAIL             0xff
#define BOCU1_RESET                 0xff

/* How many lead byte values exist (BOCU1_MIN..BOCU1_MAX_LEAD inclusive). */
#define BOCU1_COUNT                 (1+BOCU1_MAX_LEAD-BOCU1_MIN)

/* Some C0 control byte values double as trail bytes; account for them. */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* How many trail byte values exist. */
#define BOCU1_TRAIL_COUNT           (BOCU1_TRAIL_CONTROLS_COUNT+(BOCU1_MAX_TRAIL-BOCU1_MIN+1))

/*
 * How many single-byte codes there are on the positive side and on the
 * negative side; the zero difference (byte BOCU1_MIDDLE) counts as positive.
 */
#define BOCU1_SINGLE                64

/* How many lead bytes start a 2-, 3- or 4-byte sequence, per sign. */
#define BOCU1_LEAD_2                43
#define BOCU1_LEAD_3                3
#define BOCU1_LEAD_4                1

/* Largest and smallest difference that fits into one byte. */
#define BOCU1_REACH_POS_1           (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1           (-BOCU1_SINGLE)

/* Largest and smallest difference that fits into two bytes. */
#define BOCU1_REACH_POS_2           (BOCU1_LEAD_2*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1)
#define BOCU1_REACH_NEG_2           (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* Largest and smallest difference that fits into three bytes. */
#define BOCU1_REACH_POS_3 \
    (BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2)

#define BOCU1_REACH_NEG_3 \
    (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* First lead byte of each positive multi-byte length class. */
#define BOCU1_START_POS_2           (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3           (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4           (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* BOCU1_START_POS_4==BOCU1_MAX_LEAD */

/* Lower lead byte boundary of each negative length class. */
#define BOCU1_START_NEG_2           (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3           (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4           (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* BOCU1_START_NEG_4==BOCU1_MIN+1 */

/* Sequence length (1..4) implied by a lead byte; lead must not be BOCU1_RESET. */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    (((lead)>=BOCU1_START_NEG_2 && (lead)<BOCU1_START_POS_2) ? 1 : \
     ((lead)>=BOCU1_START_NEG_3 && (lead)<BOCU1_START_POS_3) ? 2 : \
     ((lead)>=BOCU1_START_NEG_4 && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * Sequence length implied by a packed value: values below 0x04000000 carry
 * the length in their top byte, everything else is a 4-byte sequence.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)>=0x04000000 ? 4 : (packed)>>24)
118
/*
 * Twelve frequently used C0 control codes, and the space character, never
 * appear inside a multi-byte sequence: each of them only ever encodes itself.
 * This keeps BOCU-1 MIME-usable and reasonably safe for ASCII-oriented
 * software.
 *
 * The twelve controls are:
 *    0   NUL
 *
 *    7   BEL
 *    8   BS
 *
 *    9   TAB
 *    a   LF
 *    b   VT
 *    c   FF
 *    d   CR
 *
 *    e   SO
 *    f   SI
 *
 *   1a   SUB
 *   1b   ESC
 *
 * The remaining 20 C0 controls are likewise encoded directly (which preserves
 * the order), but their byte values are additionally reused as trail bytes
 * of difference encodings to improve compression.
 *
 * BOCU1_TRAIL_TO_BYTE maps a trail value t (0..BOCU1_TRAIL_COUNT-1) to its
 * external byte: the first BOCU1_TRAIL_CONTROLS_COUNT values go through the
 * bocu1TrailToByte[] table, all others are shifted by BOCU1_TRAIL_BYTE_OFFSET.
 */
#define BOCU1_TRAIL_TO_BYTE(t) \
    ((t)<BOCU1_TRAIL_CONTROLS_COUNT ? bocu1TrailToByte[t] : (t)+BOCU1_TRAIL_BYTE_OFFSET)
148
149 /*
150 * Byte value map for control codes,
151 * from external byte values 0x00..0x20
152 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
153 * External byte values that are illegal as trail bytes are mapped to -1.
154 */
155 static const int8_t
156 bocu1ByteToTrail[BOCU1_MIN]={
157 /* 0 1 2 3 4 5 6 7 */
158 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
159
160 /* 8 9 a b c d e f */
161 -1, -1, -1, -1, -1, -1, -1, -1,
162
163 /* 10 11 12 13 14 15 16 17 */
164 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
165
166 /* 18 19 1a 1b 1c 1d 1e 1f */
167 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
168
169 /* 20 */
170 -1
171 };
172
173 /*
174 * Byte value map for control codes,
175 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
176 * to external byte values 0x00..0x20.
177 */
178 static const int8_t
179 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
180 /* 0 1 2 3 4 5 6 7 */
181 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
182
183 /* 8 9 a b c d e f */
184 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
185
186 /* 10 11 12 13 */
187 0x1c, 0x1d, 0x1e, 0x1f
188 };
189
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { ... } while(0) so that an invocation followed
 * by a semicolon forms exactly one statement and can be used safely in an
 * unbraced if/else.
 * Note that n, d and m are each evaluated more than once.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
211
/* Shortcuts that bypass packDiff() for the short encodings. */

/** True if the difference value fits into a single byte. */
#define DIFF_IS_SINGLE(diff) ((diff)>=BOCU1_REACH_NEG_1 && (diff)<=BOCU1_REACH_POS_1)

/** The single byte that encodes a difference for which DIFF_IS_SINGLE() holds. */
#define PACK_SINGLE_DIFF(diff) ((diff)+BOCU1_MIDDLE)

/** True if the difference value fits into at most two bytes. */
#define DIFF_IS_DOUBLE(diff) ((diff)>=BOCU1_REACH_NEG_2 && (diff)<=BOCU1_REACH_POS_2)
222
223 /* BOCU-1 implementation functions ------------------------------------------ */
224
/* "prev" for small scripts: offset BOCU1_ASCII_PREV into c's aligned 128-block. */
#define BOCU1_SIMPLE_PREV(c) (BOCU1_ASCII_PREV+((c)&~0x7f))
226
227 /**
228 * Compute the next "previous" value for differencing
229 * from the current code point.
230 *
231 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
232 * @return "previous code point" state value
233 */
234 static inline int32_t
bocu1Prev(int32_t c)235 bocu1Prev(int32_t c) {
236 /* compute new prev */
237 if(/* 0x3040<=c && */ c<=0x309f) {
238 /* Hiragana is not 128-aligned */
239 return 0x3070;
240 } else if(0x4e00<=c && c<=0x9fa5) {
241 /* CJK Unihan */
242 return 0x4e00-BOCU1_REACH_NEG_2;
243 } else if(0xac00<=c /* && c<=0xd7a3 */) {
244 /* Korean Hangul */
245 return (0xd7a3+0xac00)/2;
246 } else {
247 /* mostly small scripts */
248 return BOCU1_SIMPLE_PREV(c);
249 }
250 }
251
/**
 * Next "prev" for any code point: only 0x3040..0xd7a3 needs bocu1Prev(),
 * everything outside that range uses the cheap BOCU1_SIMPLE_PREV().
 */
#define BOCU1_PREV(c) ((c)>=0x3040 && (c)<=0xd7a3 ? bocu1Prev(c) : BOCU1_SIMPLE_PREV(c))
254
255 /*
256 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
257 * The UConverter fields are used as follows:
258 *
259 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
260 *
261 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
262 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
263 */
264
265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
266
267 /**
268 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
269 * and return a packed integer with them.
270 *
271 * The encoding favors small absolute differences with short encodings
272 * to compress runs of same-script characters.
273 *
274 * Optimized version with unrolled loops and fewer floating-point operations
275 * than the standard packDiff().
276 *
277 * @param diff difference value -0x10ffff..0x10ffff
278 * @return
279 * 0x010000zz for 1-byte sequence zz
280 * 0x0200yyzz for 2-byte sequence yy zz
281 * 0x03xxyyzz for 3-byte sequence xx yy zz
282 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
283 */
284 static int32_t
packDiff(int32_t diff)285 packDiff(int32_t diff) {
286 int32_t result, m;
287
288 U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
289 if(diff>=BOCU1_REACH_NEG_1) {
290 /* mostly positive differences, and single-byte negative ones */
291 #if 0 /* single-byte case handled in macros, see below */
292 if(diff<=BOCU1_REACH_POS_1) {
293 /* single byte */
294 return 0x01000000|(BOCU1_MIDDLE+diff);
295 } else
296 #endif
297 if(diff<=BOCU1_REACH_POS_2) {
298 /* two bytes */
299 diff-=BOCU1_REACH_POS_1+1;
300 result=0x02000000;
301
302 m=diff%BOCU1_TRAIL_COUNT;
303 diff/=BOCU1_TRAIL_COUNT;
304 result|=BOCU1_TRAIL_TO_BYTE(m);
305
306 result|=(BOCU1_START_POS_2+diff)<<8;
307 } else if(diff<=BOCU1_REACH_POS_3) {
308 /* three bytes */
309 diff-=BOCU1_REACH_POS_2+1;
310 result=0x03000000;
311
312 m=diff%BOCU1_TRAIL_COUNT;
313 diff/=BOCU1_TRAIL_COUNT;
314 result|=BOCU1_TRAIL_TO_BYTE(m);
315
316 m=diff%BOCU1_TRAIL_COUNT;
317 diff/=BOCU1_TRAIL_COUNT;
318 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
319
320 result|=(BOCU1_START_POS_3+diff)<<16;
321 } else {
322 /* four bytes */
323 diff-=BOCU1_REACH_POS_3+1;
324
325 m=diff%BOCU1_TRAIL_COUNT;
326 diff/=BOCU1_TRAIL_COUNT;
327 result=BOCU1_TRAIL_TO_BYTE(m);
328
329 m=diff%BOCU1_TRAIL_COUNT;
330 diff/=BOCU1_TRAIL_COUNT;
331 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
332
333 /*
334 * We know that / and % would deliver quotient 0 and rest=diff.
335 * Avoid division and modulo for performance.
336 */
337 result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
338
339 result|=((uint32_t)BOCU1_START_POS_4)<<24;
340 }
341 } else {
342 /* two- to four-byte negative differences */
343 if(diff>=BOCU1_REACH_NEG_2) {
344 /* two bytes */
345 diff-=BOCU1_REACH_NEG_1;
346 result=0x02000000;
347
348 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
349 result|=BOCU1_TRAIL_TO_BYTE(m);
350
351 result|=(BOCU1_START_NEG_2+diff)<<8;
352 } else if(diff>=BOCU1_REACH_NEG_3) {
353 /* three bytes */
354 diff-=BOCU1_REACH_NEG_2;
355 result=0x03000000;
356
357 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
358 result|=BOCU1_TRAIL_TO_BYTE(m);
359
360 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
361 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
362
363 result|=(BOCU1_START_NEG_3+diff)<<16;
364 } else {
365 /* four bytes */
366 diff-=BOCU1_REACH_NEG_3;
367
368 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
369 result=BOCU1_TRAIL_TO_BYTE(m);
370
371 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
372 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
373
374 /*
375 * We know that NEGDIVMOD would deliver
376 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
377 * Avoid division and modulo for performance.
378 */
379 m=diff+BOCU1_TRAIL_COUNT;
380 result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
381
382 result|=BOCU1_MIN<<24;
383 }
384 }
385 return result;
386 }
387
388
389 static void
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
391 UErrorCode *pErrorCode) {
392 UConverter *cnv;
393 const UChar *source, *sourceLimit;
394 uint8_t *target;
395 int32_t targetCapacity;
396 int32_t *offsets;
397
398 int32_t prev, c, diff;
399
400 int32_t sourceIndex, nextSourceIndex;
401
402 /* set up the local pointers */
403 cnv=pArgs->converter;
404 source=pArgs->source;
405 sourceLimit=pArgs->sourceLimit;
406 target=(uint8_t *)pArgs->target;
407 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
408 offsets=pArgs->offsets;
409
410 /* get the converter state from UConverter */
411 c=cnv->fromUChar32;
412 prev=(int32_t)cnv->fromUnicodeStatus;
413 if(prev==0) {
414 prev=BOCU1_ASCII_PREV;
415 }
416
417 /* sourceIndex=-1 if the current character began in the previous buffer */
418 sourceIndex= c==0 ? 0 : -1;
419 nextSourceIndex=0;
420
421 /* conversion loop */
422 if(c!=0 && targetCapacity>0) {
423 goto getTrail;
424 }
425
426 fastSingle:
427 /* fast loop for single-byte differences */
428 /* use only one loop counter variable, targetCapacity, not also source */
429 diff=(int32_t)(sourceLimit-source);
430 if(targetCapacity>diff) {
431 targetCapacity=diff;
432 }
433 while(targetCapacity>0 && (c=*source)<0x3000) {
434 if(c<=0x20) {
435 if(c!=0x20) {
436 prev=BOCU1_ASCII_PREV;
437 }
438 *target++=(uint8_t)c;
439 *offsets++=nextSourceIndex++;
440 ++source;
441 --targetCapacity;
442 } else {
443 diff=c-prev;
444 if(DIFF_IS_SINGLE(diff)) {
445 prev=BOCU1_SIMPLE_PREV(c);
446 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
447 *offsets++=nextSourceIndex++;
448 ++source;
449 --targetCapacity;
450 } else {
451 break;
452 }
453 }
454 }
455 /* restore real values */
456 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
457 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
458
459 /* regular loop for all cases */
460 while(source<sourceLimit) {
461 if(targetCapacity>0) {
462 c=*source++;
463 ++nextSourceIndex;
464
465 if(c<=0x20) {
466 /*
467 * ISO C0 control & space:
468 * Encode directly for MIME compatibility,
469 * and reset state except for space, to not disrupt compression.
470 */
471 if(c!=0x20) {
472 prev=BOCU1_ASCII_PREV;
473 }
474 *target++=(uint8_t)c;
475 *offsets++=sourceIndex;
476 --targetCapacity;
477
478 sourceIndex=nextSourceIndex;
479 continue;
480 }
481
482 if(U16_IS_LEAD(c)) {
483 getTrail:
484 if(source<sourceLimit) {
485 /* test the following code unit */
486 UChar trail=*source;
487 if(U16_IS_TRAIL(trail)) {
488 ++source;
489 ++nextSourceIndex;
490 c=U16_GET_SUPPLEMENTARY(c, trail);
491 }
492 } else {
493 /* no more input */
494 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
495 break;
496 }
497 }
498
499 /*
500 * all other Unicode code points c==U+0021..U+10ffff
501 * are encoded with the difference c-prev
502 *
503 * a new prev is computed from c,
504 * placed in the middle of a 0x80-block (for most small scripts) or
505 * in the middle of the Unihan and Hangul blocks
506 * to statistically minimize the following difference
507 */
508 diff=c-prev;
509 prev=BOCU1_PREV(c);
510 if(DIFF_IS_SINGLE(diff)) {
511 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
512 *offsets++=sourceIndex;
513 --targetCapacity;
514 sourceIndex=nextSourceIndex;
515 if(c<0x3000) {
516 goto fastSingle;
517 }
518 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
519 /* optimize 2-byte case */
520 int32_t m;
521
522 if(diff>=0) {
523 diff-=BOCU1_REACH_POS_1+1;
524 m=diff%BOCU1_TRAIL_COUNT;
525 diff/=BOCU1_TRAIL_COUNT;
526 diff+=BOCU1_START_POS_2;
527 } else {
528 diff-=BOCU1_REACH_NEG_1;
529 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
530 diff+=BOCU1_START_NEG_2;
531 }
532 *target++=(uint8_t)diff;
533 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
534 *offsets++=sourceIndex;
535 *offsets++=sourceIndex;
536 targetCapacity-=2;
537 sourceIndex=nextSourceIndex;
538 } else {
539 int32_t length; /* will be 2..4 */
540
541 diff=packDiff(diff);
542 length=BOCU1_LENGTH_FROM_PACKED(diff);
543
544 /* write the output character bytes from diff and length */
545 /* from the first if in the loop we know that targetCapacity>0 */
546 if(length<=targetCapacity) {
547 switch(length) {
548 /* each branch falls through to the next one */
549 case 4:
550 *target++=(uint8_t)(diff>>24);
551 *offsets++=sourceIndex;
552 case 3: /*fall through*/
553 *target++=(uint8_t)(diff>>16);
554 *offsets++=sourceIndex;
555 case 2: /*fall through*/
556 *target++=(uint8_t)(diff>>8);
557 *offsets++=sourceIndex;
558 /* case 1: handled above */
559 *target++=(uint8_t)diff;
560 *offsets++=sourceIndex;
561 default:
562 /* will never occur */
563 break;
564 }
565 targetCapacity-=length;
566 sourceIndex=nextSourceIndex;
567 } else {
568 uint8_t *charErrorBuffer;
569
570 /*
571 * We actually do this backwards here:
572 * In order to save an intermediate variable, we output
573 * first to the overflow buffer what does not fit into the
574 * regular target.
575 */
576 /* we know that 1<=targetCapacity<length<=4 */
577 length-=targetCapacity;
578 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
579 switch(length) {
580 /* each branch falls through to the next one */
581 case 3:
582 *charErrorBuffer++=(uint8_t)(diff>>16);
583 case 2: /*fall through*/
584 *charErrorBuffer++=(uint8_t)(diff>>8);
585 case 1: /*fall through*/
586 *charErrorBuffer=(uint8_t)diff;
587 default:
588 /* will never occur */
589 break;
590 }
591 cnv->charErrorBufferLength=(int8_t)length;
592
593 /* now output what fits into the regular target */
594 diff>>=8*length; /* length was reduced by targetCapacity */
595 switch(targetCapacity) {
596 /* each branch falls through to the next one */
597 case 3:
598 *target++=(uint8_t)(diff>>16);
599 *offsets++=sourceIndex;
600 case 2: /*fall through*/
601 *target++=(uint8_t)(diff>>8);
602 *offsets++=sourceIndex;
603 case 1: /*fall through*/
604 *target++=(uint8_t)diff;
605 *offsets++=sourceIndex;
606 default:
607 /* will never occur */
608 break;
609 }
610
611 /* target overflow */
612 targetCapacity=0;
613 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
614 break;
615 }
616 }
617 } else {
618 /* target is full */
619 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
620 break;
621 }
622 }
623
624 /* set the converter state back into UConverter */
625 cnv->fromUChar32= c<0 ? -c : 0;
626 cnv->fromUnicodeStatus=(uint32_t)prev;
627
628 /* write back the updated pointers */
629 pArgs->source=source;
630 pArgs->target=(char *)target;
631 pArgs->offsets=offsets;
632 }
633
634 /*
635 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
636 * If a change is made in the original function, then either
637 * change this function the same way or
638 * re-copy the original function and remove the variables
639 * offsets, sourceIndex, and nextSourceIndex.
640 */
641 static void
_Bocu1FromUnicode(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)642 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
643 UErrorCode *pErrorCode) {
644 UConverter *cnv;
645 const UChar *source, *sourceLimit;
646 uint8_t *target;
647 int32_t targetCapacity;
648
649 int32_t prev, c, diff;
650
651 /* set up the local pointers */
652 cnv=pArgs->converter;
653 source=pArgs->source;
654 sourceLimit=pArgs->sourceLimit;
655 target=(uint8_t *)pArgs->target;
656 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
657
658 /* get the converter state from UConverter */
659 c=cnv->fromUChar32;
660 prev=(int32_t)cnv->fromUnicodeStatus;
661 if(prev==0) {
662 prev=BOCU1_ASCII_PREV;
663 }
664
665 /* conversion loop */
666 if(c!=0 && targetCapacity>0) {
667 goto getTrail;
668 }
669
670 fastSingle:
671 /* fast loop for single-byte differences */
672 /* use only one loop counter variable, targetCapacity, not also source */
673 diff=(int32_t)(sourceLimit-source);
674 if(targetCapacity>diff) {
675 targetCapacity=diff;
676 }
677 while(targetCapacity>0 && (c=*source)<0x3000) {
678 if(c<=0x20) {
679 if(c!=0x20) {
680 prev=BOCU1_ASCII_PREV;
681 }
682 *target++=(uint8_t)c;
683 } else {
684 diff=c-prev;
685 if(DIFF_IS_SINGLE(diff)) {
686 prev=BOCU1_SIMPLE_PREV(c);
687 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
688 } else {
689 break;
690 }
691 }
692 ++source;
693 --targetCapacity;
694 }
695 /* restore real values */
696 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
697
698 /* regular loop for all cases */
699 while(source<sourceLimit) {
700 if(targetCapacity>0) {
701 c=*source++;
702
703 if(c<=0x20) {
704 /*
705 * ISO C0 control & space:
706 * Encode directly for MIME compatibility,
707 * and reset state except for space, to not disrupt compression.
708 */
709 if(c!=0x20) {
710 prev=BOCU1_ASCII_PREV;
711 }
712 *target++=(uint8_t)c;
713 --targetCapacity;
714 continue;
715 }
716
717 if(U16_IS_LEAD(c)) {
718 getTrail:
719 if(source<sourceLimit) {
720 /* test the following code unit */
721 UChar trail=*source;
722 if(U16_IS_TRAIL(trail)) {
723 ++source;
724 c=U16_GET_SUPPLEMENTARY(c, trail);
725 }
726 } else {
727 /* no more input */
728 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
729 break;
730 }
731 }
732
733 /*
734 * all other Unicode code points c==U+0021..U+10ffff
735 * are encoded with the difference c-prev
736 *
737 * a new prev is computed from c,
738 * placed in the middle of a 0x80-block (for most small scripts) or
739 * in the middle of the Unihan and Hangul blocks
740 * to statistically minimize the following difference
741 */
742 diff=c-prev;
743 prev=BOCU1_PREV(c);
744 if(DIFF_IS_SINGLE(diff)) {
745 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
746 --targetCapacity;
747 if(c<0x3000) {
748 goto fastSingle;
749 }
750 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
751 /* optimize 2-byte case */
752 int32_t m;
753
754 if(diff>=0) {
755 diff-=BOCU1_REACH_POS_1+1;
756 m=diff%BOCU1_TRAIL_COUNT;
757 diff/=BOCU1_TRAIL_COUNT;
758 diff+=BOCU1_START_POS_2;
759 } else {
760 diff-=BOCU1_REACH_NEG_1;
761 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
762 diff+=BOCU1_START_NEG_2;
763 }
764 *target++=(uint8_t)diff;
765 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
766 targetCapacity-=2;
767 } else {
768 int32_t length; /* will be 2..4 */
769
770 diff=packDiff(diff);
771 length=BOCU1_LENGTH_FROM_PACKED(diff);
772
773 /* write the output character bytes from diff and length */
774 /* from the first if in the loop we know that targetCapacity>0 */
775 if(length<=targetCapacity) {
776 switch(length) {
777 /* each branch falls through to the next one */
778 case 4:
779 *target++=(uint8_t)(diff>>24);
780 case 3: /*fall through*/
781 *target++=(uint8_t)(diff>>16);
782 /* case 2: handled above */
783 *target++=(uint8_t)(diff>>8);
784 /* case 1: handled above */
785 *target++=(uint8_t)diff;
786 default:
787 /* will never occur */
788 break;
789 }
790 targetCapacity-=length;
791 } else {
792 uint8_t *charErrorBuffer;
793
794 /*
795 * We actually do this backwards here:
796 * In order to save an intermediate variable, we output
797 * first to the overflow buffer what does not fit into the
798 * regular target.
799 */
800 /* we know that 1<=targetCapacity<length<=4 */
801 length-=targetCapacity;
802 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
803 switch(length) {
804 /* each branch falls through to the next one */
805 case 3:
806 *charErrorBuffer++=(uint8_t)(diff>>16);
807 case 2: /*fall through*/
808 *charErrorBuffer++=(uint8_t)(diff>>8);
809 case 1: /*fall through*/
810 *charErrorBuffer=(uint8_t)diff;
811 default:
812 /* will never occur */
813 break;
814 }
815 cnv->charErrorBufferLength=(int8_t)length;
816
817 /* now output what fits into the regular target */
818 diff>>=8*length; /* length was reduced by targetCapacity */
819 switch(targetCapacity) {
820 /* each branch falls through to the next one */
821 case 3:
822 *target++=(uint8_t)(diff>>16);
823 case 2: /*fall through*/
824 *target++=(uint8_t)(diff>>8);
825 case 1: /*fall through*/
826 *target++=(uint8_t)diff;
827 default:
828 /* will never occur */
829 break;
830 }
831
832 /* target overflow */
833 targetCapacity=0;
834 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
835 break;
836 }
837 }
838 } else {
839 /* target is full */
840 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
841 break;
842 }
843 }
844
845 /* set the converter state back into UConverter */
846 cnv->fromUChar32= c<0 ? -c : 0;
847 cnv->fromUnicodeStatus=(uint32_t)prev;
848
849 /* write back the updated pointers */
850 pArgs->source=source;
851 pArgs->target=(char *)target;
852 }
853
854 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
855
856 /**
857 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
858 *
859 * @param b lead byte;
860 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
861 * @return (diff<<2)|count
862 */
863 static inline int32_t
decodeBocu1LeadByte(int32_t b)864 decodeBocu1LeadByte(int32_t b) {
865 int32_t diff, count;
866
867 if(b>=BOCU1_START_NEG_2) {
868 /* positive difference */
869 if(b<BOCU1_START_POS_3) {
870 /* two bytes */
871 diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
872 count=1;
873 } else if(b<BOCU1_START_POS_4) {
874 /* three bytes */
875 diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
876 count=2;
877 } else {
878 /* four bytes */
879 diff=BOCU1_REACH_POS_3+1;
880 count=3;
881 }
882 } else {
883 /* negative difference */
884 if(b>=BOCU1_START_NEG_3) {
885 /* two bytes */
886 diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
887 count=1;
888 } else if(b>BOCU1_MIN) {
889 /* three bytes */
890 diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
891 count=2;
892 } else {
893 /* four bytes */
894 diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
895 count=3;
896 }
897 }
898
899 /* return the state for decoding the trail byte(s) */
900 return (diff<<2)|count;
901 }
902
903 /**
904 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
905 *
906 * @param count number of remaining trail bytes including this one
907 * @param b trail byte
908 * @return new delta for diff including b - <0 indicates an error
909 *
910 * @see decodeBocu1
911 */
912 static inline int32_t
decodeBocu1TrailByte(int32_t count,int32_t b)913 decodeBocu1TrailByte(int32_t count, int32_t b) {
914 if(b<=0x20) {
915 /* skip some C0 controls and make the trail byte range contiguous */
916 b=bocu1ByteToTrail[b];
917 /* b<0 for an illegal trail byte value will result in return<0 below */
918 #if BOCU1_MAX_TRAIL<0xff
919 } else if(b>BOCU1_MAX_TRAIL) {
920 return -99;
921 #endif
922 } else {
923 b-=BOCU1_TRAIL_BYTE_OFFSET;
924 }
925
926 /* add trail byte into difference and decrement count */
927 if(count==1) {
928 return b;
929 } else if(count==2) {
930 return b*BOCU1_TRAIL_COUNT;
931 } else /* count==3 */ {
932 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
933 }
934 }
935
936 static void
_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)937 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
938 UErrorCode *pErrorCode) {
939 UConverter *cnv;
940 const uint8_t *source, *sourceLimit;
941 UChar *target;
942 const UChar *targetLimit;
943 int32_t *offsets;
944
945 int32_t prev, count, diff, c;
946
947 int8_t byteIndex;
948 uint8_t *bytes;
949
950 int32_t sourceIndex, nextSourceIndex;
951
952 /* set up the local pointers */
953 cnv=pArgs->converter;
954 source=(const uint8_t *)pArgs->source;
955 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
956 target=pArgs->target;
957 targetLimit=pArgs->targetLimit;
958 offsets=pArgs->offsets;
959
960 /* get the converter state from UConverter */
961 prev=(int32_t)cnv->toUnicodeStatus;
962 if(prev==0) {
963 prev=BOCU1_ASCII_PREV;
964 }
965 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
966 count=diff&3;
967 diff>>=2;
968
969 byteIndex=cnv->toULength;
970 bytes=cnv->toUBytes;
971
972 /* sourceIndex=-1 if the current character began in the previous buffer */
973 sourceIndex=byteIndex==0 ? 0 : -1;
974 nextSourceIndex=0;
975
976 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
977 if(count>0 && byteIndex>0 && target<targetLimit) {
978 goto getTrail;
979 }
980
981 fastSingle:
982 /* fast loop for single-byte differences */
983 /* use count as the only loop counter variable */
984 diff=(int32_t)(sourceLimit-source);
985 count=(int32_t)(pArgs->targetLimit-target);
986 if(count>diff) {
987 count=diff;
988 }
989 while(count>0) {
990 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
991 c=prev+(c-BOCU1_MIDDLE);
992 if(c<0x3000) {
993 *target++=(UChar)c;
994 *offsets++=nextSourceIndex++;
995 prev=BOCU1_SIMPLE_PREV(c);
996 } else {
997 break;
998 }
999 } else if(c<=0x20) {
1000 if(c!=0x20) {
1001 prev=BOCU1_ASCII_PREV;
1002 }
1003 *target++=(UChar)c;
1004 *offsets++=nextSourceIndex++;
1005 } else {
1006 break;
1007 }
1008 ++source;
1009 --count;
1010 }
1011 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1012
1013 /* decode a sequence of single and lead bytes */
1014 while(source<sourceLimit) {
1015 if(target>=targetLimit) {
1016 /* target is full */
1017 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1018 break;
1019 }
1020
1021 ++nextSourceIndex;
1022 c=*source++;
1023 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1024 /* Write a code point directly from a single-byte difference. */
1025 c=prev+(c-BOCU1_MIDDLE);
1026 if(c<0x3000) {
1027 *target++=(UChar)c;
1028 *offsets++=sourceIndex;
1029 prev=BOCU1_SIMPLE_PREV(c);
1030 sourceIndex=nextSourceIndex;
1031 goto fastSingle;
1032 }
1033 } else if(c<=0x20) {
1034 /*
1035 * Direct-encoded C0 control code or space.
1036 * Reset prev for C0 control codes but not for space.
1037 */
1038 if(c!=0x20) {
1039 prev=BOCU1_ASCII_PREV;
1040 }
1041 *target++=(UChar)c;
1042 *offsets++=sourceIndex;
1043 sourceIndex=nextSourceIndex;
1044 continue;
1045 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1046 /* Optimize two-byte case. */
1047 if(c>=BOCU1_MIDDLE) {
1048 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1049 } else {
1050 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1051 }
1052
1053 /* trail byte */
1054 ++nextSourceIndex;
1055 c=decodeBocu1TrailByte(1, *source++);
1056 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1057 bytes[0]=source[-2];
1058 bytes[1]=source[-1];
1059 byteIndex=2;
1060 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1061 break;
1062 }
1063 } else if(c==BOCU1_RESET) {
1064 /* only reset the state, no code point */
1065 prev=BOCU1_ASCII_PREV;
1066 sourceIndex=nextSourceIndex;
1067 continue;
1068 } else {
1069 /*
1070 * For multi-byte difference lead bytes, set the decoder state
1071 * with the partial difference value from the lead byte and
1072 * with the number of trail bytes.
1073 */
1074 bytes[0]=(uint8_t)c;
1075 byteIndex=1;
1076
1077 diff=decodeBocu1LeadByte(c);
1078 count=diff&3;
1079 diff>>=2;
1080 getTrail:
1081 for(;;) {
1082 if(source>=sourceLimit) {
1083 goto endloop;
1084 }
1085 ++nextSourceIndex;
1086 c=bytes[byteIndex++]=*source++;
1087
1088 /* trail byte in any position */
1089 c=decodeBocu1TrailByte(count, c);
1090 if(c<0) {
1091 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1092 goto endloop;
1093 }
1094
1095 diff+=c;
1096 if(--count==0) {
1097 /* final trail byte, deliver a code point */
1098 byteIndex=0;
1099 c=prev+diff;
1100 if((uint32_t)c>0x10ffff) {
1101 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1102 goto endloop;
1103 }
1104 break;
1105 }
1106 }
1107 }
1108
1109 /* calculate the next prev and output c */
1110 prev=BOCU1_PREV(c);
1111 if(c<=0xffff) {
1112 *target++=(UChar)c;
1113 *offsets++=sourceIndex;
1114 } else {
1115 /* output surrogate pair */
1116 *target++=U16_LEAD(c);
1117 if(target<targetLimit) {
1118 *target++=U16_TRAIL(c);
1119 *offsets++=sourceIndex;
1120 *offsets++=sourceIndex;
1121 } else {
1122 /* target overflow */
1123 *offsets++=sourceIndex;
1124 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1125 cnv->UCharErrorBufferLength=1;
1126 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1127 break;
1128 }
1129 }
1130 sourceIndex=nextSourceIndex;
1131 }
1132 endloop:
1133
1134 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1135 /* set the converter state in UConverter to deal with the next character */
1136 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1137 cnv->mode=0;
1138 } else {
1139 /* set the converter state back into UConverter */
1140 cnv->toUnicodeStatus=(uint32_t)prev;
1141 cnv->mode=(diff<<2)|count;
1142 }
1143 cnv->toULength=byteIndex;
1144
1145 /* write back the updated pointers */
1146 pArgs->source=(const char *)source;
1147 pArgs->target=target;
1148 pArgs->offsets=offsets;
1149 return;
1150 }
1151
1152 /*
1153 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1154 * If a change is made in the original function, then either
1155 * change this function the same way or
1156 * re-copy the original function and remove the variables
1157 * offsets, sourceIndex, and nextSourceIndex.
1158 */
1159 static void
_Bocu1ToUnicode(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1160 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1161 UErrorCode *pErrorCode) {
1162 UConverter *cnv;
1163 const uint8_t *source, *sourceLimit;
1164 UChar *target;
1165 const UChar *targetLimit;
1166
1167 int32_t prev, count, diff, c;
1168
1169 int8_t byteIndex;
1170 uint8_t *bytes;
1171
1172 /* set up the local pointers */
1173 cnv=pArgs->converter;
1174 source=(const uint8_t *)pArgs->source;
1175 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1176 target=pArgs->target;
1177 targetLimit=pArgs->targetLimit;
1178
1179 /* get the converter state from UConverter */
1180 prev=(int32_t)cnv->toUnicodeStatus;
1181 if(prev==0) {
1182 prev=BOCU1_ASCII_PREV;
1183 }
1184 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1185 count=diff&3;
1186 diff>>=2;
1187
1188 byteIndex=cnv->toULength;
1189 bytes=cnv->toUBytes;
1190
1191 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1192 if(count>0 && byteIndex>0 && target<targetLimit) {
1193 goto getTrail;
1194 }
1195
1196 fastSingle:
1197 /* fast loop for single-byte differences */
1198 /* use count as the only loop counter variable */
1199 diff=(int32_t)(sourceLimit-source);
1200 count=(int32_t)(pArgs->targetLimit-target);
1201 if(count>diff) {
1202 count=diff;
1203 }
1204 while(count>0) {
1205 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1206 c=prev+(c-BOCU1_MIDDLE);
1207 if(c<0x3000) {
1208 *target++=(UChar)c;
1209 prev=BOCU1_SIMPLE_PREV(c);
1210 } else {
1211 break;
1212 }
1213 } else if(c<=0x20) {
1214 if(c!=0x20) {
1215 prev=BOCU1_ASCII_PREV;
1216 }
1217 *target++=(UChar)c;
1218 } else {
1219 break;
1220 }
1221 ++source;
1222 --count;
1223 }
1224
1225 /* decode a sequence of single and lead bytes */
1226 while(source<sourceLimit) {
1227 if(target>=targetLimit) {
1228 /* target is full */
1229 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1230 break;
1231 }
1232
1233 c=*source++;
1234 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1235 /* Write a code point directly from a single-byte difference. */
1236 c=prev+(c-BOCU1_MIDDLE);
1237 if(c<0x3000) {
1238 *target++=(UChar)c;
1239 prev=BOCU1_SIMPLE_PREV(c);
1240 goto fastSingle;
1241 }
1242 } else if(c<=0x20) {
1243 /*
1244 * Direct-encoded C0 control code or space.
1245 * Reset prev for C0 control codes but not for space.
1246 */
1247 if(c!=0x20) {
1248 prev=BOCU1_ASCII_PREV;
1249 }
1250 *target++=(UChar)c;
1251 continue;
1252 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1253 /* Optimize two-byte case. */
1254 if(c>=BOCU1_MIDDLE) {
1255 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1256 } else {
1257 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1258 }
1259
1260 /* trail byte */
1261 c=decodeBocu1TrailByte(1, *source++);
1262 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1263 bytes[0]=source[-2];
1264 bytes[1]=source[-1];
1265 byteIndex=2;
1266 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1267 break;
1268 }
1269 } else if(c==BOCU1_RESET) {
1270 /* only reset the state, no code point */
1271 prev=BOCU1_ASCII_PREV;
1272 continue;
1273 } else {
1274 /*
1275 * For multi-byte difference lead bytes, set the decoder state
1276 * with the partial difference value from the lead byte and
1277 * with the number of trail bytes.
1278 */
1279 bytes[0]=(uint8_t)c;
1280 byteIndex=1;
1281
1282 diff=decodeBocu1LeadByte(c);
1283 count=diff&3;
1284 diff>>=2;
1285 getTrail:
1286 for(;;) {
1287 if(source>=sourceLimit) {
1288 goto endloop;
1289 }
1290 c=bytes[byteIndex++]=*source++;
1291
1292 /* trail byte in any position */
1293 c=decodeBocu1TrailByte(count, c);
1294 if(c<0) {
1295 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1296 goto endloop;
1297 }
1298
1299 diff+=c;
1300 if(--count==0) {
1301 /* final trail byte, deliver a code point */
1302 byteIndex=0;
1303 c=prev+diff;
1304 if((uint32_t)c>0x10ffff) {
1305 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1306 goto endloop;
1307 }
1308 break;
1309 }
1310 }
1311 }
1312
1313 /* calculate the next prev and output c */
1314 prev=BOCU1_PREV(c);
1315 if(c<=0xffff) {
1316 *target++=(UChar)c;
1317 } else {
1318 /* output surrogate pair */
1319 *target++=U16_LEAD(c);
1320 if(target<targetLimit) {
1321 *target++=U16_TRAIL(c);
1322 } else {
1323 /* target overflow */
1324 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1325 cnv->UCharErrorBufferLength=1;
1326 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1327 break;
1328 }
1329 }
1330 }
1331 endloop:
1332
1333 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1334 /* set the converter state in UConverter to deal with the next character */
1335 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1336 cnv->mode=0;
1337 } else {
1338 /* set the converter state back into UConverter */
1339 cnv->toUnicodeStatus=(uint32_t)prev;
1340 cnv->mode=(diff<<2)|count;
1341 }
1342 cnv->toULength=byteIndex;
1343
1344 /* write back the updated pointers */
1345 pArgs->source=(const char *)source;
1346 pArgs->target=target;
1347 return;
1348 }
1349
1350 /* miscellaneous ------------------------------------------------------------ */
1351
1352 static const UConverterImpl _Bocu1Impl={
1353 UCNV_BOCU1,
1354
1355 NULL,
1356 NULL,
1357
1358 NULL,
1359 NULL,
1360 NULL,
1361
1362 _Bocu1ToUnicode,
1363 _Bocu1ToUnicodeWithOffsets,
1364 _Bocu1FromUnicode,
1365 _Bocu1FromUnicodeWithOffsets,
1366 NULL,
1367
1368 NULL,
1369 NULL,
1370 NULL,
1371 NULL,
1372 ucnv_getCompleteUnicodeSet,
1373
1374 NULL,
1375 NULL
1376 };
1377
1378 static const UConverterStaticData _Bocu1StaticData={
1379 sizeof(UConverterStaticData),
1380 "BOCU-1",
1381 1214, /* CCSID for BOCU-1 */
1382 UCNV_IBM, UCNV_BOCU1,
1383 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1384 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1385 FALSE, FALSE,
1386 0,
1387 0,
1388 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1389 };
1390
1391 const UConverterSharedData _Bocu1Data=
1392 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
1393
1394 #endif
1395