1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2000-2015, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  ucnvscsu.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2000nov18
14 *   created by: Markus W. Scherer
15 *
16 *   This is an implementation of the Standard Compression Scheme for Unicode
17 *   as defined in http://www.unicode.org/unicode/reports/tr6/ .
18 *   Reserved commands and window settings are treated as illegal sequences and
19 *   will result in callback calls.
20 */
21 
22 #include "unicode/utypes.h"
23 
24 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
25 
26 #include "unicode/ucnv.h"
27 #include "unicode/ucnv_cb.h"
28 #include "unicode/utf16.h"
29 #include "ucnv_bld.h"
30 #include "ucnv_cnv.h"
31 #include "cmemory.h"
32 
33 /* SCSU definitions --------------------------------------------------------- */
34 
/*
 * SCSU tag (command) byte values.
 * The S* tags are interpreted by the single-byte-mode decoder,
 * the U* tags by the Unicode-mode decoder.
 * Only the first and last member of each contiguous range is named.
 */
enum {
    /* single-byte mode */
    SQ0 = 0x01,  /* first "quote from window pair n" tag (pair 0) */
    SQ7 = 0x08,  /* last  "quote from window pair n" tag (pair 7) */
    SDX = 0x0B,  /* define an extended window */
    Srs = 0x0C,  /* reserved */
    SQU = 0x0E,  /* quote one Unicode character */
    SCU = 0x0F,  /* switch to Unicode mode */
    SC0 = 0x10,  /* first "select window n" tag (window 0) */
    SC7 = 0x17,  /* last  "select window n" tag (window 7) */
    SD0 = 0x18,  /* first "define and select window n" tag (window 0) */
    SD7 = 0x1F,  /* last  "define and select window n" tag (window 7) */

    /* Unicode mode */
    UC0 = 0xE0,  /* first "select window n" tag (window 0) */
    UC7 = 0xE7,  /* last  "select window n" tag (window 7) */
    UD0 = 0xE8,  /* first "define and select window n" tag (window 0) */
    UD7 = 0xEF,  /* last  "define and select window n" tag (window 7) */
    UQU = 0xF0,  /* quote one Unicode character */
    UDX = 0xF1,  /* define an extended window */
    Urs = 0xF2   /* reserved */
};
56 
/* Boundaries used when interpreting the one-byte window offset argument of a define-window tag */
enum {
    /*
     * Code points from U+3400 up to U+E000 are not addressable by a
     * dynamic window because no short-run alphabets live there.
     * Offset bytes starting at gapThreshold therefore get gapOffset
     * added to the computed window offset.
     */
    gapThreshold   = 0x68,
    gapOffset      = 0xAC00,

    /* offset bytes from reservedStart up to (not including) fixedThreshold are reserved */
    reservedStart  = 0xA8,

    /* offset bytes from fixedThreshold on index the table of predefined fixed offsets */
    fixedThreshold = 0xF9
};
72 
/* Start offsets of the 8 static windows (fixed; all of them lie in the BMP) */
static const uint32_t staticOffsets[8] = {
    /* 0 */ 0x0000,  /* ASCII, for quoted tags */
    /* 1 */ 0x0080,  /* Latin-1 Supplement (gives access to punctuation) */
    /* 2 */ 0x0100,  /* Latin Extended-A */
    /* 3 */ 0x0300,  /* Combining Diacritical Marks */
    /* 4 */ 0x2000,  /* General Punctuation */
    /* 5 */ 0x2080,  /* Currency Symbols */
    /* 6 */ 0x2100,  /* Letterlike Symbols and Number Forms */
    /* 7 */ 0x3000   /* CJK Symbols and Punctuation */
};
84 
/* Start offsets that the 8 dynamic (sliding) windows have before any window is redefined */
static const uint32_t initialDynamicOffsets[8] = {
    /* 0 */ 0x0080,  /* Latin-1 */
    /* 1 */ 0x00C0,  /* Latin Extended A */
    /* 2 */ 0x0400,  /* Cyrillic */
    /* 3 */ 0x0600,  /* Arabic */
    /* 4 */ 0x0900,  /* Devanagari */
    /* 5 */ 0x3040,  /* Hiragana */
    /* 6 */ 0x30A0,  /* Katakana */
    /* 7 */ 0xFF00   /* Fullwidth ASCII */
};
96 
/*
 * Predefined window offsets, selected by window offset bytes 0xF9..0xFF;
 * index into this table with (offset byte - 0xF9).
 */
static const uint32_t fixedOffsets[] = {
    0x00C0,  /* 0xF9: Latin-1 letters + half of Latin Extended A */
    0x0250,  /* 0xFA: IPA extensions */
    0x0370,  /* 0xFB: Greek */
    0x0530,  /* 0xFC: Armenian */
    0x3040,  /* 0xFD: Hiragana */
    0x30A0,  /* 0xFE: Katakana */
    0xFF60   /* 0xFF: Halfwidth Katakana */
};
107 
/* States of the toUnicode state machine: what the next input byte is expected to be */
enum {
    readCommand   = 0,  /* a tag byte or the start of a character */
    quotePairOne  = 1,  /* high byte of a quoted 16-bit unit */
    quotePairTwo  = 2,  /* low byte of a 16-bit unit */
    quoteOne      = 3,  /* the single byte quoted from a window */
    definePairOne = 4,  /* first argument byte of an extended window definition */
    definePairTwo = 5,  /* second argument byte of an extended window definition */
    defineOne     = 6   /* the offset byte of a window definition */
};
118 
119 typedef struct SCSUData {
120     /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */
121     uint32_t toUDynamicOffsets[8];
122     uint32_t fromUDynamicOffsets[8];
123 
124     /* state machine state - toUnicode */
125     UBool toUIsSingleByteMode;
126     uint8_t toUState;
127     int8_t toUQuoteWindow, toUDynamicWindow;
128     uint8_t toUByteOne;
129     uint8_t toUPadding[3];
130 
131     /* state machine state - fromUnicode */
132     UBool fromUIsSingleByteMode;
133     int8_t fromUDynamicWindow;
134 
135     /*
136      * windowUse[] keeps track of the use of the dynamic windows:
137      * At nextWindowUseIndex there is the least recently used window,
138      * and the following windows (in a wrapping manner) are more and more
139      * recently used.
140      * At nextWindowUseIndex-1 there is the most recently used window.
141      */
142     uint8_t locale;
143     int8_t nextWindowUseIndex;
144     int8_t windowUse[8];
145 } SCSUData;
146 
/* Initial contents of SCSUData.windowUse[]: generic order, and the order used for locale l_ja */
static const int8_t initialWindowUse[8] = {
    7, 0, 3, 2, 4, 5, 6, 1
};
static const int8_t initialWindowUse_ja[8] = {
    3, 2, 4, 1, 0, 7, 5, 6
};
149 
/* Values stored in SCSUData.locale */
enum {
    lGeneric = 0,  /* any locale without special handling */
    l_ja     = 1   /* Japanese ("ja") */
};
153 
154 /* SCSU setup functions ----------------------------------------------------- */
155 
156 static void
_SCSUReset(UConverter * cnv,UConverterResetChoice choice)157 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
158     SCSUData *scsu=(SCSUData *)cnv->extraInfo;
159 
160     if(choice<=UCNV_RESET_TO_UNICODE) {
161         /* reset toUnicode */
162         uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
163 
164         scsu->toUIsSingleByteMode=TRUE;
165         scsu->toUState=readCommand;
166         scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
167         scsu->toUByteOne=0;
168 
169         cnv->toULength=0;
170     }
171     if(choice!=UCNV_RESET_TO_UNICODE) {
172         /* reset fromUnicode */
173         uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
174 
175         scsu->fromUIsSingleByteMode=TRUE;
176         scsu->fromUDynamicWindow=0;
177 
178         scsu->nextWindowUseIndex=0;
179         switch(scsu->locale) {
180         case l_ja:
181             uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
182             break;
183         default:
184             uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
185             break;
186         }
187 
188         cnv->fromUChar32=0;
189     }
190 }
191 
192 static void
_SCSUOpen(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)193 _SCSUOpen(UConverter *cnv,
194           UConverterLoadArgs *pArgs,
195           UErrorCode *pErrorCode) {
196     const char *locale=pArgs->locale;
197     if(pArgs->onlyTestIsLoadable) {
198         return;
199     }
200     cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
201     if(cnv->extraInfo!=NULL) {
202         if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
203             ((SCSUData *)cnv->extraInfo)->locale=l_ja;
204         } else {
205             ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
206         }
207         _SCSUReset(cnv, UCNV_RESET_BOTH);
208     } else {
209         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
210     }
211 
212     /* Set the substitution character U+fffd as a Unicode string. */
213     cnv->subUChars[0]=0xfffd;
214     cnv->subCharLen=-1;
215 }
216 
217 static void
_SCSUClose(UConverter * cnv)218 _SCSUClose(UConverter *cnv) {
219     if(cnv->extraInfo!=NULL) {
220         if(!cnv->isExtraLocal) {
221             uprv_free(cnv->extraInfo);
222         }
223         cnv->extraInfo=NULL;
224     }
225 }
226 
227 /* SCSU-to-Unicode conversion functions ------------------------------------- */
228 
229 static void
_SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)230 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
231                           UErrorCode *pErrorCode) {
232     UConverter *cnv;
233     SCSUData *scsu;
234     const uint8_t *source, *sourceLimit;
235     UChar *target;
236     const UChar *targetLimit;
237     int32_t *offsets;
238     UBool isSingleByteMode;
239     uint8_t state, byteOne;
240     int8_t quoteWindow, dynamicWindow;
241 
242     int32_t sourceIndex, nextSourceIndex;
243 
244     uint8_t b;
245 
246     /* set up the local pointers */
247     cnv=pArgs->converter;
248     scsu=(SCSUData *)cnv->extraInfo;
249 
250     source=(const uint8_t *)pArgs->source;
251     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
252     target=pArgs->target;
253     targetLimit=pArgs->targetLimit;
254     offsets=pArgs->offsets;
255 
256     /* get the state machine state */
257     isSingleByteMode=scsu->toUIsSingleByteMode;
258     state=scsu->toUState;
259     quoteWindow=scsu->toUQuoteWindow;
260     dynamicWindow=scsu->toUDynamicWindow;
261     byteOne=scsu->toUByteOne;
262 
263     /* sourceIndex=-1 if the current character began in the previous buffer */
264     sourceIndex=state==readCommand ? 0 : -1;
265     nextSourceIndex=0;
266 
267     /*
268      * conversion "loop"
269      *
270      * For performance, this is not a normal C loop.
271      * Instead, there are two code blocks for the two SCSU modes.
272      * The function branches to either one, and a change of the mode is done with a goto to
273      * the other branch.
274      *
275      * Each branch has two conventional loops:
276      * - a fast-path loop for the most common codes in the mode
277      * - a loop for all other codes in the mode
278      * When the fast-path runs into a code that it cannot handle, its loop ends and it
279      * runs into the following loop to handle the other codes.
280      * The end of the input or output buffer is also handled by the slower loop.
281      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
282      *
283      * The callback handling is done by returning with an error code.
284      * The conversion framework actually calls the callback function.
285      */
286     if(isSingleByteMode) {
287         /* fast path for single-byte mode */
288         if(state==readCommand) {
289 fastSingle:
290             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
291                 ++source;
292                 ++nextSourceIndex;
293                 if(b<=0x7f) {
294                     /* write US-ASCII graphic character or DEL */
295                     *target++=(UChar)b;
296                     if(offsets!=NULL) {
297                         *offsets++=sourceIndex;
298                     }
299                 } else {
300                     /* write from dynamic window */
301                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
302                     if(c<=0xffff) {
303                         *target++=(UChar)c;
304                         if(offsets!=NULL) {
305                             *offsets++=sourceIndex;
306                         }
307                     } else {
308                         /* output surrogate pair */
309                         *target++=(UChar)(0xd7c0+(c>>10));
310                         if(target<targetLimit) {
311                             *target++=(UChar)(0xdc00|(c&0x3ff));
312                             if(offsets!=NULL) {
313                                 *offsets++=sourceIndex;
314                                 *offsets++=sourceIndex;
315                             }
316                         } else {
317                             /* target overflow */
318                             if(offsets!=NULL) {
319                                 *offsets++=sourceIndex;
320                             }
321                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
322                             cnv->UCharErrorBufferLength=1;
323                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
324                             goto endloop;
325                         }
326                     }
327                 }
328                 sourceIndex=nextSourceIndex;
329             }
330         }
331 
332         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
333 singleByteMode:
334         while(source<sourceLimit) {
335             if(target>=targetLimit) {
336                 /* target is full */
337                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
338                 break;
339             }
340             b=*source++;
341             ++nextSourceIndex;
342             switch(state) {
343             case readCommand:
344                 /* redundant conditions are commented out */
345                 /* here: b<0x20 because otherwise we would be in fastSingle */
346                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
347                     /* CR/LF/TAB/NUL */
348                     *target++=(UChar)b;
349                     if(offsets!=NULL) {
350                         *offsets++=sourceIndex;
351                     }
352                     sourceIndex=nextSourceIndex;
353                     goto fastSingle;
354                 } else if(SC0<=b) {
355                     if(b<=SC7) {
356                         dynamicWindow=(int8_t)(b-SC0);
357                         sourceIndex=nextSourceIndex;
358                         goto fastSingle;
359                     } else /* if(SD0<=b && b<=SD7) */ {
360                         dynamicWindow=(int8_t)(b-SD0);
361                         state=defineOne;
362                     }
363                 } else if(/* SQ0<=b && */ b<=SQ7) {
364                     quoteWindow=(int8_t)(b-SQ0);
365                     state=quoteOne;
366                 } else if(b==SDX) {
367                     state=definePairOne;
368                 } else if(b==SQU) {
369                     state=quotePairOne;
370                 } else if(b==SCU) {
371                     sourceIndex=nextSourceIndex;
372                     isSingleByteMode=FALSE;
373                     goto fastUnicode;
374                 } else /* Srs */ {
375                     /* callback(illegal) */
376                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
377                     cnv->toUBytes[0]=b;
378                     cnv->toULength=1;
379                     goto endloop;
380                 }
381 
382                 /* store the first byte of a multibyte sequence in toUBytes[] */
383                 cnv->toUBytes[0]=b;
384                 cnv->toULength=1;
385                 break;
386             case quotePairOne:
387                 byteOne=b;
388                 cnv->toUBytes[1]=b;
389                 cnv->toULength=2;
390                 state=quotePairTwo;
391                 break;
392             case quotePairTwo:
393                 *target++=(UChar)((byteOne<<8)|b);
394                 if(offsets!=NULL) {
395                     *offsets++=sourceIndex;
396                 }
397                 sourceIndex=nextSourceIndex;
398                 state=readCommand;
399                 goto fastSingle;
400             case quoteOne:
401                 if(b<0x80) {
402                     /* all static offsets are in the BMP */
403                     *target++=(UChar)(staticOffsets[quoteWindow]+b);
404                     if(offsets!=NULL) {
405                         *offsets++=sourceIndex;
406                     }
407                 } else {
408                     /* write from dynamic window */
409                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
410                     if(c<=0xffff) {
411                         *target++=(UChar)c;
412                         if(offsets!=NULL) {
413                             *offsets++=sourceIndex;
414                         }
415                     } else {
416                         /* output surrogate pair */
417                         *target++=(UChar)(0xd7c0+(c>>10));
418                         if(target<targetLimit) {
419                             *target++=(UChar)(0xdc00|(c&0x3ff));
420                             if(offsets!=NULL) {
421                                 *offsets++=sourceIndex;
422                                 *offsets++=sourceIndex;
423                             }
424                         } else {
425                             /* target overflow */
426                             if(offsets!=NULL) {
427                                 *offsets++=sourceIndex;
428                             }
429                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
430                             cnv->UCharErrorBufferLength=1;
431                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
432                             goto endloop;
433                         }
434                     }
435                 }
436                 sourceIndex=nextSourceIndex;
437                 state=readCommand;
438                 goto fastSingle;
439             case definePairOne:
440                 dynamicWindow=(int8_t)((b>>5)&7);
441                 byteOne=(uint8_t)(b&0x1f);
442                 cnv->toUBytes[1]=b;
443                 cnv->toULength=2;
444                 state=definePairTwo;
445                 break;
446             case definePairTwo:
447                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
448                 sourceIndex=nextSourceIndex;
449                 state=readCommand;
450                 goto fastSingle;
451             case defineOne:
452                 if(b==0) {
453                     /* callback(illegal): Reserved window offset value 0 */
454                     cnv->toUBytes[1]=b;
455                     cnv->toULength=2;
456                     goto endloop;
457                 } else if(b<gapThreshold) {
458                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
459                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
460                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
461                 } else if(b>=fixedThreshold) {
462                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
463                 } else {
464                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
465                     cnv->toUBytes[1]=b;
466                     cnv->toULength=2;
467                     goto endloop;
468                 }
469                 sourceIndex=nextSourceIndex;
470                 state=readCommand;
471                 goto fastSingle;
472             }
473         }
474     } else {
475         /* fast path for Unicode mode */
476         if(state==readCommand) {
477 fastUnicode:
478             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
479                 *target++=(UChar)((b<<8)|source[1]);
480                 if(offsets!=NULL) {
481                     *offsets++=sourceIndex;
482                 }
483                 sourceIndex=nextSourceIndex;
484                 nextSourceIndex+=2;
485                 source+=2;
486             }
487         }
488 
489         /* normal state machine for Unicode mode */
490 /* unicodeByteMode: */
491         while(source<sourceLimit) {
492             if(target>=targetLimit) {
493                 /* target is full */
494                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
495                 break;
496             }
497             b=*source++;
498             ++nextSourceIndex;
499             switch(state) {
500             case readCommand:
501                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
502                     byteOne=b;
503                     cnv->toUBytes[0]=b;
504                     cnv->toULength=1;
505                     state=quotePairTwo;
506                 } else if(/* UC0<=b && */ b<=UC7) {
507                     dynamicWindow=(int8_t)(b-UC0);
508                     sourceIndex=nextSourceIndex;
509                     isSingleByteMode=TRUE;
510                     goto fastSingle;
511                 } else if(/* UD0<=b && */ b<=UD7) {
512                     dynamicWindow=(int8_t)(b-UD0);
513                     isSingleByteMode=TRUE;
514                     cnv->toUBytes[0]=b;
515                     cnv->toULength=1;
516                     state=defineOne;
517                     goto singleByteMode;
518                 } else if(b==UDX) {
519                     isSingleByteMode=TRUE;
520                     cnv->toUBytes[0]=b;
521                     cnv->toULength=1;
522                     state=definePairOne;
523                     goto singleByteMode;
524                 } else if(b==UQU) {
525                     cnv->toUBytes[0]=b;
526                     cnv->toULength=1;
527                     state=quotePairOne;
528                 } else /* Urs */ {
529                     /* callback(illegal) */
530                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
531                     cnv->toUBytes[0]=b;
532                     cnv->toULength=1;
533                     goto endloop;
534                 }
535                 break;
536             case quotePairOne:
537                 byteOne=b;
538                 cnv->toUBytes[1]=b;
539                 cnv->toULength=2;
540                 state=quotePairTwo;
541                 break;
542             case quotePairTwo:
543                 *target++=(UChar)((byteOne<<8)|b);
544                 if(offsets!=NULL) {
545                     *offsets++=sourceIndex;
546                 }
547                 sourceIndex=nextSourceIndex;
548                 state=readCommand;
549                 goto fastUnicode;
550             }
551         }
552     }
553 endloop:
554 
555     /* set the converter state back into UConverter */
556     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
557         /* reset to deal with the next character */
558         state=readCommand;
559     } else if(state==readCommand) {
560         /* not in a multi-byte sequence, reset toULength */
561         cnv->toULength=0;
562     }
563     scsu->toUIsSingleByteMode=isSingleByteMode;
564     scsu->toUState=state;
565     scsu->toUQuoteWindow=quoteWindow;
566     scsu->toUDynamicWindow=dynamicWindow;
567     scsu->toUByteOne=byteOne;
568 
569     /* write back the updated pointers */
570     pArgs->source=(const char *)source;
571     pArgs->target=target;
572     pArgs->offsets=offsets;
573     return;
574 }
575 
576 /*
577  * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
578  * If a change is made in the original function, then either
579  * change this function the same way or
580  * re-copy the original function and remove the variables
581  * offsets, sourceIndex, and nextSourceIndex.
582  */
583 static void
_SCSUToUnicode(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)584 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
585                UErrorCode *pErrorCode) {
586     UConverter *cnv;
587     SCSUData *scsu;
588     const uint8_t *source, *sourceLimit;
589     UChar *target;
590     const UChar *targetLimit;
591     UBool isSingleByteMode;
592     uint8_t state, byteOne;
593     int8_t quoteWindow, dynamicWindow;
594 
595     uint8_t b;
596 
597     /* set up the local pointers */
598     cnv=pArgs->converter;
599     scsu=(SCSUData *)cnv->extraInfo;
600 
601     source=(const uint8_t *)pArgs->source;
602     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
603     target=pArgs->target;
604     targetLimit=pArgs->targetLimit;
605 
606     /* get the state machine state */
607     isSingleByteMode=scsu->toUIsSingleByteMode;
608     state=scsu->toUState;
609     quoteWindow=scsu->toUQuoteWindow;
610     dynamicWindow=scsu->toUDynamicWindow;
611     byteOne=scsu->toUByteOne;
612 
613     /*
614      * conversion "loop"
615      *
616      * For performance, this is not a normal C loop.
617      * Instead, there are two code blocks for the two SCSU modes.
618      * The function branches to either one, and a change of the mode is done with a goto to
619      * the other branch.
620      *
621      * Each branch has two conventional loops:
622      * - a fast-path loop for the most common codes in the mode
623      * - a loop for all other codes in the mode
624      * When the fast-path runs into a code that it cannot handle, its loop ends and it
625      * runs into the following loop to handle the other codes.
626      * The end of the input or output buffer is also handled by the slower loop.
627      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
628      *
629      * The callback handling is done by returning with an error code.
630      * The conversion framework actually calls the callback function.
631      */
632     if(isSingleByteMode) {
633         /* fast path for single-byte mode */
634         if(state==readCommand) {
635 fastSingle:
636             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
637                 ++source;
638                 if(b<=0x7f) {
639                     /* write US-ASCII graphic character or DEL */
640                     *target++=(UChar)b;
641                 } else {
642                     /* write from dynamic window */
643                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
644                     if(c<=0xffff) {
645                         *target++=(UChar)c;
646                     } else {
647                         /* output surrogate pair */
648                         *target++=(UChar)(0xd7c0+(c>>10));
649                         if(target<targetLimit) {
650                             *target++=(UChar)(0xdc00|(c&0x3ff));
651                         } else {
652                             /* target overflow */
653                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
654                             cnv->UCharErrorBufferLength=1;
655                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
656                             goto endloop;
657                         }
658                     }
659                 }
660             }
661         }
662 
663         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
664 singleByteMode:
665         while(source<sourceLimit) {
666             if(target>=targetLimit) {
667                 /* target is full */
668                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
669                 break;
670             }
671             b=*source++;
672             switch(state) {
673             case readCommand:
674                 /* redundant conditions are commented out */
675                 /* here: b<0x20 because otherwise we would be in fastSingle */
676                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
677                     /* CR/LF/TAB/NUL */
678                     *target++=(UChar)b;
679                     goto fastSingle;
680                 } else if(SC0<=b) {
681                     if(b<=SC7) {
682                         dynamicWindow=(int8_t)(b-SC0);
683                         goto fastSingle;
684                     } else /* if(SD0<=b && b<=SD7) */ {
685                         dynamicWindow=(int8_t)(b-SD0);
686                         state=defineOne;
687                     }
688                 } else if(/* SQ0<=b && */ b<=SQ7) {
689                     quoteWindow=(int8_t)(b-SQ0);
690                     state=quoteOne;
691                 } else if(b==SDX) {
692                     state=definePairOne;
693                 } else if(b==SQU) {
694                     state=quotePairOne;
695                 } else if(b==SCU) {
696                     isSingleByteMode=FALSE;
697                     goto fastUnicode;
698                 } else /* Srs */ {
699                     /* callback(illegal) */
700                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
701                     cnv->toUBytes[0]=b;
702                     cnv->toULength=1;
703                     goto endloop;
704                 }
705 
706                 /* store the first byte of a multibyte sequence in toUBytes[] */
707                 cnv->toUBytes[0]=b;
708                 cnv->toULength=1;
709                 break;
710             case quotePairOne:
711                 byteOne=b;
712                 cnv->toUBytes[1]=b;
713                 cnv->toULength=2;
714                 state=quotePairTwo;
715                 break;
716             case quotePairTwo:
717                 *target++=(UChar)((byteOne<<8)|b);
718                 state=readCommand;
719                 goto fastSingle;
720             case quoteOne:
721                 if(b<0x80) {
722                     /* all static offsets are in the BMP */
723                     *target++=(UChar)(staticOffsets[quoteWindow]+b);
724                 } else {
725                     /* write from dynamic window */
726                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
727                     if(c<=0xffff) {
728                         *target++=(UChar)c;
729                     } else {
730                         /* output surrogate pair */
731                         *target++=(UChar)(0xd7c0+(c>>10));
732                         if(target<targetLimit) {
733                             *target++=(UChar)(0xdc00|(c&0x3ff));
734                         } else {
735                             /* target overflow */
736                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
737                             cnv->UCharErrorBufferLength=1;
738                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
739                             goto endloop;
740                         }
741                     }
742                 }
743                 state=readCommand;
744                 goto fastSingle;
745             case definePairOne:
746                 dynamicWindow=(int8_t)((b>>5)&7);
747                 byteOne=(uint8_t)(b&0x1f);
748                 cnv->toUBytes[1]=b;
749                 cnv->toULength=2;
750                 state=definePairTwo;
751                 break;
752             case definePairTwo:
753                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
754                 state=readCommand;
755                 goto fastSingle;
756             case defineOne:
757                 if(b==0) {
758                     /* callback(illegal): Reserved window offset value 0 */
759                     cnv->toUBytes[1]=b;
760                     cnv->toULength=2;
761                     goto endloop;
762                 } else if(b<gapThreshold) {
763                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
764                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
765                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
766                 } else if(b>=fixedThreshold) {
767                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
768                 } else {
769                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
770                     cnv->toUBytes[1]=b;
771                     cnv->toULength=2;
772                     goto endloop;
773                 }
774                 state=readCommand;
775                 goto fastSingle;
776             }
777         }
778     } else {
779         /* fast path for Unicode mode */
780         if(state==readCommand) {
781 fastUnicode:
782             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
783                 *target++=(UChar)((b<<8)|source[1]);
784                 source+=2;
785             }
786         }
787 
788         /* normal state machine for Unicode mode */
789 /* unicodeByteMode: */
790         while(source<sourceLimit) {
791             if(target>=targetLimit) {
792                 /* target is full */
793                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
794                 break;
795             }
796             b=*source++;
797             switch(state) {
798             case readCommand:
799                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
800                     byteOne=b;
801                     cnv->toUBytes[0]=b;
802                     cnv->toULength=1;
803                     state=quotePairTwo;
804                 } else if(/* UC0<=b && */ b<=UC7) {
805                     dynamicWindow=(int8_t)(b-UC0);
806                     isSingleByteMode=TRUE;
807                     goto fastSingle;
808                 } else if(/* UD0<=b && */ b<=UD7) {
809                     dynamicWindow=(int8_t)(b-UD0);
810                     isSingleByteMode=TRUE;
811                     cnv->toUBytes[0]=b;
812                     cnv->toULength=1;
813                     state=defineOne;
814                     goto singleByteMode;
815                 } else if(b==UDX) {
816                     isSingleByteMode=TRUE;
817                     cnv->toUBytes[0]=b;
818                     cnv->toULength=1;
819                     state=definePairOne;
820                     goto singleByteMode;
821                 } else if(b==UQU) {
822                     cnv->toUBytes[0]=b;
823                     cnv->toULength=1;
824                     state=quotePairOne;
825                 } else /* Urs */ {
826                     /* callback(illegal) */
827                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
828                     cnv->toUBytes[0]=b;
829                     cnv->toULength=1;
830                     goto endloop;
831                 }
832                 break;
833             case quotePairOne:
834                 byteOne=b;
835                 cnv->toUBytes[1]=b;
836                 cnv->toULength=2;
837                 state=quotePairTwo;
838                 break;
839             case quotePairTwo:
840                 *target++=(UChar)((byteOne<<8)|b);
841                 state=readCommand;
842                 goto fastUnicode;
843             }
844         }
845     }
846 endloop:
847 
848     /* set the converter state back into UConverter */
849     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
850         /* reset to deal with the next character */
851         state=readCommand;
852     } else if(state==readCommand) {
853         /* not in a multi-byte sequence, reset toULength */
854         cnv->toULength=0;
855     }
856     scsu->toUIsSingleByteMode=isSingleByteMode;
857     scsu->toUState=state;
858     scsu->toUQuoteWindow=quoteWindow;
859     scsu->toUDynamicWindow=dynamicWindow;
860     scsu->toUByteOne=byteOne;
861 
862     /* write back the updated pointers */
863     pArgs->source=(const char *)source;
864     pArgs->target=target;
865     return;
866 }
867 
/* SCSU-from-Unicode conversion functions ----------------------------------- */

/*
 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
 * reasonable results. The lookahead is minimal.
 * Many cases are simple:
 * A character fits directly into the current mode, a dynamic or static window,
 * or is not compressible. These cases are tested first.
 * Real compression heuristics are applied to the rest, in code branches for
 * single/Unicode mode and BMP/supplementary code points.
 * The heuristics used here are extremely simple.
 */
880 
/*
 * Get the number of the window that this character is in.
 *
 * offsets: start offsets of 8 windows; window i covers offsets[i]..offsets[i]+0x7f
 * c:       the code point to look up
 *
 * Returns the index (0..7) of the first window that contains c, or -1 if
 * none of the 8 windows contains it.
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int i;
    for(i=0; i<8; ++i) {
        /*
         * unsigned subtraction: if c<offsets[i] the difference wraps around to
         * a large value, so this single comparison checks both window bounds
         */
        if((uint32_t)(c-offsets[i])<=0x7f) {
            return (int8_t)(i);
        }
    }
    return -1;
}
892 
893 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
894 static UBool
isInOffsetWindowOrDirect(uint32_t offset,uint32_t c)895 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
896     return (UBool)(c<=offset+0x7f &&
897           (c>=offset || (c<=0x7f &&
898                         (c>=0x20 || (1UL<<c)&0x2601))));
899                                 /* binary 0010 0110 0000 0001,
900                                    check for b==0xd || b==0xa || b==9 || b==0 */
901 }
902 
903 /*
904  * getNextDynamicWindow returns the next dynamic window to be redefined
905  */
906 static int8_t
getNextDynamicWindow(SCSUData * scsu)907 getNextDynamicWindow(SCSUData *scsu) {
908     int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
909     if(++scsu->nextWindowUseIndex==8) {
910         scsu->nextWindowUseIndex=0;
911     }
912     return window;
913 }
914 
915 /*
916  * useDynamicWindow() adjusts
917  * windowUse[] and nextWindowUseIndex for the algorithm to choose
918  * the next dynamic window to be defined;
919  * a subclass may override it and provide its own algorithm.
920  */
921 static void
useDynamicWindow(SCSUData * scsu,int8_t window)922 useDynamicWindow(SCSUData *scsu, int8_t window) {
923     /*
924      * move the existing window, which just became the most recently used one,
925      * up in windowUse[] to nextWindowUseIndex-1
926      */
927 
928     /* first, find the index of the window - backwards to favor the more recently used windows */
929     int i, j;
930 
931     i=scsu->nextWindowUseIndex;
932     do {
933         if(--i<0) {
934             i=7;
935         }
936     } while(scsu->windowUse[i]!=window);
937 
938     /* now copy each windowUse[i+1] to [i] */
939     j=i+1;
940     if(j==8) {
941         j=0;
942     }
943     while(j!=scsu->nextWindowUseIndex) {
944         scsu->windowUse[i]=scsu->windowUse[j];
945         i=j;
946         if(++j==8) { j=0; }
947     }
948 
949     /* finally, set the window into the most recently used index */
950     scsu->windowUse[i]=window;
951 }
952 
953 /*
954  * calculate the offset and the code for a dynamic window that contains the character
955  * takes fixed offsets into account
956  * the offset of the window is stored in the offset variable,
957  * the code is returned
958  *
959  * return offset code: -1 none  <=0xff code for SDn/UDn  else code for SDX/UDX, subtract 0x200 to get the true code
960  */
961 static int
getDynamicOffset(uint32_t c,uint32_t * pOffset)962 getDynamicOffset(uint32_t c, uint32_t *pOffset) {
963     int i;
964 
965     for(i=0; i<7; ++i) {
966         if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
967             *pOffset=fixedOffsets[i];
968             return 0xf9+i;
969         }
970     }
971 
972     if(c<0x80) {
973         /* No dynamic window for US-ASCII. */
974         return -1;
975     } else if(c<0x3400 ||
976               (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
977               (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
978     ) {
979         /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
980         *pOffset=c&0x7fffff80;
981         return (int)(c>>7);
982     } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
983         /* For these characters we need to take the gapOffset into account. */
984         *pOffset=c&0x7fffff80;
985         return (int)((c-gapOffset)>>7);
986     } else {
987         return -1;
988     }
989 }
990 
/*
 * Idea for compression:
 *  - save SCSUData and other state before really starting work
 *  - at endloop, see if compression could be better with just unicode mode
 *  - don't do this if a callback has been called
 *  - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
 *  - different buffer handling!
 *
 * Drawback or need for corrective handling:
 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
 *
 * How to achieve both?
 *  - Only replace the result after an SDX or SCU?
 */
1007 
1008 static void
_SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1009 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1010                             UErrorCode *pErrorCode) {
1011     UConverter *cnv;
1012     SCSUData *scsu;
1013     const UChar *source, *sourceLimit;
1014     uint8_t *target;
1015     int32_t targetCapacity;
1016     int32_t *offsets;
1017 
1018     UBool isSingleByteMode;
1019     uint8_t dynamicWindow;
1020     uint32_t currentOffset;
1021 
1022     uint32_t c, delta;
1023 
1024     int32_t sourceIndex, nextSourceIndex;
1025 
1026     int32_t length;
1027 
1028     /* variables for compression heuristics */
1029     uint32_t offset;
1030     UChar lead, trail;
1031     int code;
1032     int8_t window;
1033 
1034     /* set up the local pointers */
1035     cnv=pArgs->converter;
1036     scsu=(SCSUData *)cnv->extraInfo;
1037 
1038     /* set up the local pointers */
1039     source=pArgs->source;
1040     sourceLimit=pArgs->sourceLimit;
1041     target=(uint8_t *)pArgs->target;
1042     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1043     offsets=pArgs->offsets;
1044 
1045     /* get the state machine state */
1046     isSingleByteMode=scsu->fromUIsSingleByteMode;
1047     dynamicWindow=scsu->fromUDynamicWindow;
1048     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1049 
1050     c=cnv->fromUChar32;
1051 
1052     /* sourceIndex=-1 if the current character began in the previous buffer */
1053     sourceIndex= c==0 ? 0 : -1;
1054     nextSourceIndex=0;
1055 
1056     /* similar conversion "loop" as in toUnicode */
1057 loop:
1058     if(isSingleByteMode) {
1059         if(c!=0 && targetCapacity>0) {
1060             goto getTrailSingle;
1061         }
1062 
1063         /* state machine for single-byte mode */
1064 /* singleByteMode: */
1065         while(source<sourceLimit) {
1066             if(targetCapacity<=0) {
1067                 /* target is full */
1068                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1069                 break;
1070             }
1071             c=*source++;
1072             ++nextSourceIndex;
1073 
1074             if((c-0x20)<=0x5f) {
1075                 /* pass US-ASCII graphic character through */
1076                 *target++=(uint8_t)c;
1077                 if(offsets!=NULL) {
1078                     *offsets++=sourceIndex;
1079                 }
1080                 --targetCapacity;
1081             } else if(c<0x20) {
1082                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1083                     /* CR/LF/TAB/NUL */
1084                     *target++=(uint8_t)c;
1085                     if(offsets!=NULL) {
1086                         *offsets++=sourceIndex;
1087                     }
1088                     --targetCapacity;
1089                 } else {
1090                     /* quote C0 control character */
1091                     c|=SQ0<<8;
1092                     length=2;
1093                     goto outputBytes;
1094                 }
1095             } else if((delta=c-currentOffset)<=0x7f) {
1096                 /* use the current dynamic window */
1097                 *target++=(uint8_t)(delta|0x80);
1098                 if(offsets!=NULL) {
1099                     *offsets++=sourceIndex;
1100                 }
1101                 --targetCapacity;
1102             } else if(U16_IS_SURROGATE(c)) {
1103                 if(U16_IS_SURROGATE_LEAD(c)) {
1104 getTrailSingle:
1105                     lead=(UChar)c;
1106                     if(source<sourceLimit) {
1107                         /* test the following code unit */
1108                         trail=*source;
1109                         if(U16_IS_TRAIL(trail)) {
1110                             ++source;
1111                             ++nextSourceIndex;
1112                             c=U16_GET_SUPPLEMENTARY(c, trail);
1113                             /* convert this surrogate code point */
1114                             /* exit this condition tree */
1115                         } else {
1116                             /* this is an unmatched lead code unit (1st surrogate) */
1117                             /* callback(illegal) */
1118                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1119                             goto endloop;
1120                         }
1121                     } else {
1122                         /* no more input */
1123                         break;
1124                     }
1125                 } else {
1126                     /* this is an unmatched trail code unit (2nd surrogate) */
1127                     /* callback(illegal) */
1128                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1129                     goto endloop;
1130                 }
1131 
1132                 /* compress supplementary character U+10000..U+10ffff */
1133                 if((delta=c-currentOffset)<=0x7f) {
1134                     /* use the current dynamic window */
1135                     *target++=(uint8_t)(delta|0x80);
1136                     if(offsets!=NULL) {
1137                         *offsets++=sourceIndex;
1138                     }
1139                     --targetCapacity;
1140                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1141                     /* there is a dynamic window that contains this character, change to it */
1142                     dynamicWindow=window;
1143                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1144                     useDynamicWindow(scsu, dynamicWindow);
1145                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1146                     length=2;
1147                     goto outputBytes;
1148                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1149                     /* might check if there are more characters in this window to come */
1150                     /* define an extended window with this character */
1151                     code-=0x200;
1152                     dynamicWindow=getNextDynamicWindow(scsu);
1153                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1154                     useDynamicWindow(scsu, dynamicWindow);
1155                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1156                     length=4;
1157                     goto outputBytes;
1158                 } else {
1159                     /* change to Unicode mode and output this (lead, trail) pair */
1160                     isSingleByteMode=FALSE;
1161                     *target++=(uint8_t)SCU;
1162                     if(offsets!=NULL) {
1163                         *offsets++=sourceIndex;
1164                     }
1165                     --targetCapacity;
1166                     c=((uint32_t)lead<<16)|trail;
1167                     length=4;
1168                     goto outputBytes;
1169                 }
1170             } else if(c<0xa0) {
1171                 /* quote C1 control character */
1172                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1173                 length=2;
1174                 goto outputBytes;
1175             } else if(c==0xfeff || c>=0xfff0) {
1176                 /* quote signature character=byte order mark and specials */
1177                 c|=SQU<<16;
1178                 length=3;
1179                 goto outputBytes;
1180             } else {
1181                 /* compress all other BMP characters */
1182                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1183                     /* there is a window defined that contains this character - switch to it or quote from it? */
1184                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1185                         /* change to dynamic window */
1186                         dynamicWindow=window;
1187                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1188                         useDynamicWindow(scsu, dynamicWindow);
1189                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1190                         length=2;
1191                         goto outputBytes;
1192                     } else {
1193                         /* quote from dynamic window */
1194                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1195                         length=2;
1196                         goto outputBytes;
1197                     }
1198                 } else if((window=getWindow(staticOffsets, c))>=0) {
1199                     /* quote from static window */
1200                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1201                     length=2;
1202                     goto outputBytes;
1203                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1204                     /* define a dynamic window with this character */
1205                     dynamicWindow=getNextDynamicWindow(scsu);
1206                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1207                     useDynamicWindow(scsu, dynamicWindow);
1208                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1209                     length=3;
1210                     goto outputBytes;
1211                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1212                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1213                 ) {
1214                     /*
1215                      * this character is not compressible (a BMP ideograph or similar);
1216                      * switch to Unicode mode if this is the last character in the block
1217                      * or there is at least one more ideograph following immediately
1218                      */
1219                     isSingleByteMode=FALSE;
1220                     c|=SCU<<16;
1221                     length=3;
1222                     goto outputBytes;
1223                 } else {
1224                     /* quote Unicode */
1225                     c|=SQU<<16;
1226                     length=3;
1227                     goto outputBytes;
1228                 }
1229             }
1230 
1231             /* normal end of conversion: prepare for a new character */
1232             c=0;
1233             sourceIndex=nextSourceIndex;
1234         }
1235     } else {
1236         if(c!=0 && targetCapacity>0) {
1237             goto getTrailUnicode;
1238         }
1239 
1240         /* state machine for Unicode mode */
1241 /* unicodeByteMode: */
1242         while(source<sourceLimit) {
1243             if(targetCapacity<=0) {
1244                 /* target is full */
1245                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1246                 break;
1247             }
1248             c=*source++;
1249             ++nextSourceIndex;
1250 
1251             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1252                 /* not compressible, write character directly */
1253                 if(targetCapacity>=2) {
1254                     *target++=(uint8_t)(c>>8);
1255                     *target++=(uint8_t)c;
1256                     if(offsets!=NULL) {
1257                         *offsets++=sourceIndex;
1258                         *offsets++=sourceIndex;
1259                     }
1260                     targetCapacity-=2;
1261                 } else {
1262                     length=2;
1263                     goto outputBytes;
1264                 }
1265             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1266                 /* compress BMP character if the following one is not an uncompressible ideograph */
1267                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1268                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1269                         /* ASCII digit or letter */
1270                         isSingleByteMode=TRUE;
1271                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1272                         length=2;
1273                         goto outputBytes;
1274                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1275                         /* there is a dynamic window that contains this character, change to it */
1276                         isSingleByteMode=TRUE;
1277                         dynamicWindow=window;
1278                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1279                         useDynamicWindow(scsu, dynamicWindow);
1280                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1281                         length=2;
1282                         goto outputBytes;
1283                     } else if((code=getDynamicOffset(c, &offset))>=0) {
1284                         /* define a dynamic window with this character */
1285                         isSingleByteMode=TRUE;
1286                         dynamicWindow=getNextDynamicWindow(scsu);
1287                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1288                         useDynamicWindow(scsu, dynamicWindow);
1289                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1290                         length=3;
1291                         goto outputBytes;
1292                     }
1293                 }
1294 
1295                 /* don't know how to compress this character, just write it directly */
1296                 length=2;
1297                 goto outputBytes;
1298             } else if(c<0xe000) {
1299                 /* c is a surrogate */
1300                 if(U16_IS_SURROGATE_LEAD(c)) {
1301 getTrailUnicode:
1302                     lead=(UChar)c;
1303                     if(source<sourceLimit) {
1304                         /* test the following code unit */
1305                         trail=*source;
1306                         if(U16_IS_TRAIL(trail)) {
1307                             ++source;
1308                             ++nextSourceIndex;
1309                             c=U16_GET_SUPPLEMENTARY(c, trail);
1310                             /* convert this surrogate code point */
1311                             /* exit this condition tree */
1312                         } else {
1313                             /* this is an unmatched lead code unit (1st surrogate) */
1314                             /* callback(illegal) */
1315                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1316                             goto endloop;
1317                         }
1318                     } else {
1319                         /* no more input */
1320                         break;
1321                     }
1322                 } else {
1323                     /* this is an unmatched trail code unit (2nd surrogate) */
1324                     /* callback(illegal) */
1325                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1326                     goto endloop;
1327                 }
1328 
1329                 /* compress supplementary character */
1330                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1331                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1332                 ) {
1333                     /*
1334                      * there is a dynamic window that contains this character and
1335                      * the following character is not uncompressible,
1336                      * change to the window
1337                      */
1338                     isSingleByteMode=TRUE;
1339                     dynamicWindow=window;
1340                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1341                     useDynamicWindow(scsu, dynamicWindow);
1342                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1343                     length=2;
1344                     goto outputBytes;
1345                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1346                           (code=getDynamicOffset(c, &offset))>=0
1347                 ) {
1348                     /* two supplementary characters in (probably) the same window - define an extended one */
1349                     isSingleByteMode=TRUE;
1350                     code-=0x200;
1351                     dynamicWindow=getNextDynamicWindow(scsu);
1352                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1353                     useDynamicWindow(scsu, dynamicWindow);
1354                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1355                     length=4;
1356                     goto outputBytes;
1357                 } else {
1358                     /* don't know how to compress this character, just write it directly */
1359                     c=((uint32_t)lead<<16)|trail;
1360                     length=4;
1361                     goto outputBytes;
1362                 }
1363             } else /* 0xe000<=c<0xf300 */ {
1364                 /* quote to avoid SCSU tags */
1365                 c|=UQU<<16;
1366                 length=3;
1367                 goto outputBytes;
1368             }
1369 
1370             /* normal end of conversion: prepare for a new character */
1371             c=0;
1372             sourceIndex=nextSourceIndex;
1373         }
1374     }
1375 endloop:
1376 
1377     /* set the converter state back into UConverter */
1378     scsu->fromUIsSingleByteMode=isSingleByteMode;
1379     scsu->fromUDynamicWindow=dynamicWindow;
1380 
1381     cnv->fromUChar32=c;
1382 
1383     /* write back the updated pointers */
1384     pArgs->source=source;
1385     pArgs->target=(char *)target;
1386     pArgs->offsets=offsets;
1387     return;
1388 
1389 outputBytes:
1390     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1391     /* from the first if in the loop we know that targetCapacity>0 */
1392     if(length<=targetCapacity) {
1393         if(offsets==NULL) {
1394             switch(length) {
1395                 /* each branch falls through to the next one */
1396             case 4:
1397                 *target++=(uint8_t)(c>>24);
1398             case 3: /*fall through*/
1399                 *target++=(uint8_t)(c>>16);
1400             case 2: /*fall through*/
1401                 *target++=(uint8_t)(c>>8);
1402             case 1: /*fall through*/
1403                 *target++=(uint8_t)c;
1404             default:
1405                 /* will never occur */
1406                 break;
1407             }
1408         } else {
1409             switch(length) {
1410                 /* each branch falls through to the next one */
1411             case 4:
1412                 *target++=(uint8_t)(c>>24);
1413                 *offsets++=sourceIndex;
1414             case 3: /*fall through*/
1415                 *target++=(uint8_t)(c>>16);
1416                 *offsets++=sourceIndex;
1417             case 2: /*fall through*/
1418                 *target++=(uint8_t)(c>>8);
1419                 *offsets++=sourceIndex;
1420             case 1: /*fall through*/
1421                 *target++=(uint8_t)c;
1422                 *offsets++=sourceIndex;
1423             default:
1424                 /* will never occur */
1425                 break;
1426             }
1427         }
1428         targetCapacity-=length;
1429 
1430         /* normal end of conversion: prepare for a new character */
1431         c=0;
1432         sourceIndex=nextSourceIndex;
1433         goto loop;
1434     } else {
1435         uint8_t *p;
1436 
1437         /*
1438          * We actually do this backwards here:
1439          * In order to save an intermediate variable, we output
1440          * first to the overflow buffer what does not fit into the
1441          * regular target.
1442          */
1443         /* we know that 0<=targetCapacity<length<=4 */
1444         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1445         length-=targetCapacity;
1446         p=(uint8_t *)cnv->charErrorBuffer;
1447         switch(length) {
1448             /* each branch falls through to the next one */
1449         case 4:
1450             *p++=(uint8_t)(c>>24);
1451         case 3: /*fall through*/
1452             *p++=(uint8_t)(c>>16);
1453         case 2: /*fall through*/
1454             *p++=(uint8_t)(c>>8);
1455         case 1: /*fall through*/
1456             *p=(uint8_t)c;
1457         default:
1458             /* will never occur */
1459             break;
1460         }
1461         cnv->charErrorBufferLength=(int8_t)length;
1462 
1463         /* now output what fits into the regular target */
1464         c>>=8*length; /* length was reduced by targetCapacity */
1465         switch(targetCapacity) {
1466             /* each branch falls through to the next one */
1467         case 3:
1468             *target++=(uint8_t)(c>>16);
1469             if(offsets!=NULL) {
1470                 *offsets++=sourceIndex;
1471             }
1472         case 2: /*fall through*/
1473             *target++=(uint8_t)(c>>8);
1474             if(offsets!=NULL) {
1475                 *offsets++=sourceIndex;
1476             }
1477         case 1: /*fall through*/
1478             *target++=(uint8_t)c;
1479             if(offsets!=NULL) {
1480                 *offsets++=sourceIndex;
1481             }
1482         default:
1483             break;
1484         }
1485 
1486         /* target overflow */
1487         targetCapacity=0;
1488         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1489         c=0;
1490         goto endloop;
1491     }
1492 }
1493 
/*
 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
 * If a change is made in the original function, then either
 * change this function the same way or
 * re-copy the original function and remove the variables
 * offsets, sourceIndex, and nextSourceIndex.
 */
1501 static void
_SCSUFromUnicode(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1502 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
1503                  UErrorCode *pErrorCode) {
1504     UConverter *cnv;
1505     SCSUData *scsu;
1506     const UChar *source, *sourceLimit;
1507     uint8_t *target;
1508     int32_t targetCapacity;
1509 
1510     UBool isSingleByteMode;
1511     uint8_t dynamicWindow;
1512     uint32_t currentOffset;
1513 
1514     uint32_t c, delta;
1515 
1516     int32_t length;
1517 
1518     /* variables for compression heuristics */
1519     uint32_t offset;
1520     UChar lead, trail;
1521     int code;
1522     int8_t window;
1523 
1524     /* set up the local pointers */
1525     cnv=pArgs->converter;
1526     scsu=(SCSUData *)cnv->extraInfo;
1527 
1528     /* set up the local pointers */
1529     source=pArgs->source;
1530     sourceLimit=pArgs->sourceLimit;
1531     target=(uint8_t *)pArgs->target;
1532     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1533 
1534     /* get the state machine state */
1535     isSingleByteMode=scsu->fromUIsSingleByteMode;
1536     dynamicWindow=scsu->fromUDynamicWindow;
1537     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1538 
1539     c=cnv->fromUChar32;
1540 
1541     /* similar conversion "loop" as in toUnicode */
1542 loop:
1543     if(isSingleByteMode) {
1544         if(c!=0 && targetCapacity>0) {
1545             goto getTrailSingle;
1546         }
1547 
1548         /* state machine for single-byte mode */
1549 /* singleByteMode: */
1550         while(source<sourceLimit) {
1551             if(targetCapacity<=0) {
1552                 /* target is full */
1553                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1554                 break;
1555             }
1556             c=*source++;
1557 
1558             if((c-0x20)<=0x5f) {
1559                 /* pass US-ASCII graphic character through */
1560                 *target++=(uint8_t)c;
1561                 --targetCapacity;
1562             } else if(c<0x20) {
1563                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1564                     /* CR/LF/TAB/NUL */
1565                     *target++=(uint8_t)c;
1566                     --targetCapacity;
1567                 } else {
1568                     /* quote C0 control character */
1569                     c|=SQ0<<8;
1570                     length=2;
1571                     goto outputBytes;
1572                 }
1573             } else if((delta=c-currentOffset)<=0x7f) {
1574                 /* use the current dynamic window */
1575                 *target++=(uint8_t)(delta|0x80);
1576                 --targetCapacity;
1577             } else if(U16_IS_SURROGATE(c)) {
1578                 if(U16_IS_SURROGATE_LEAD(c)) {
1579 getTrailSingle:
1580                     lead=(UChar)c;
1581                     if(source<sourceLimit) {
1582                         /* test the following code unit */
1583                         trail=*source;
1584                         if(U16_IS_TRAIL(trail)) {
1585                             ++source;
1586                             c=U16_GET_SUPPLEMENTARY(c, trail);
1587                             /* convert this surrogate code point */
1588                             /* exit this condition tree */
1589                         } else {
1590                             /* this is an unmatched lead code unit (1st surrogate) */
1591                             /* callback(illegal) */
1592                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1593                             goto endloop;
1594                         }
1595                     } else {
1596                         /* no more input */
1597                         break;
1598                     }
1599                 } else {
1600                     /* this is an unmatched trail code unit (2nd surrogate) */
1601                     /* callback(illegal) */
1602                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1603                     goto endloop;
1604                 }
1605 
1606                 /* compress supplementary character U+10000..U+10ffff */
1607                 if((delta=c-currentOffset)<=0x7f) {
1608                     /* use the current dynamic window */
1609                     *target++=(uint8_t)(delta|0x80);
1610                     --targetCapacity;
1611                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1612                     /* there is a dynamic window that contains this character, change to it */
1613                     dynamicWindow=window;
1614                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1615                     useDynamicWindow(scsu, dynamicWindow);
1616                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1617                     length=2;
1618                     goto outputBytes;
1619                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1620                     /* might check if there are more characters in this window to come */
1621                     /* define an extended window with this character */
1622                     code-=0x200;
1623                     dynamicWindow=getNextDynamicWindow(scsu);
1624                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1625                     useDynamicWindow(scsu, dynamicWindow);
1626                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1627                     length=4;
1628                     goto outputBytes;
1629                 } else {
1630                     /* change to Unicode mode and output this (lead, trail) pair */
1631                     isSingleByteMode=FALSE;
1632                     *target++=(uint8_t)SCU;
1633                     --targetCapacity;
1634                     c=((uint32_t)lead<<16)|trail;
1635                     length=4;
1636                     goto outputBytes;
1637                 }
1638             } else if(c<0xa0) {
1639                 /* quote C1 control character */
1640                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1641                 length=2;
1642                 goto outputBytes;
1643             } else if(c==0xfeff || c>=0xfff0) {
1644                 /* quote signature character=byte order mark and specials */
1645                 c|=SQU<<16;
1646                 length=3;
1647                 goto outputBytes;
1648             } else {
1649                 /* compress all other BMP characters */
1650                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1651                     /* there is a window defined that contains this character - switch to it or quote from it? */
1652                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1653                         /* change to dynamic window */
1654                         dynamicWindow=window;
1655                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1656                         useDynamicWindow(scsu, dynamicWindow);
1657                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1658                         length=2;
1659                         goto outputBytes;
1660                     } else {
1661                         /* quote from dynamic window */
1662                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1663                         length=2;
1664                         goto outputBytes;
1665                     }
1666                 } else if((window=getWindow(staticOffsets, c))>=0) {
1667                     /* quote from static window */
1668                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1669                     length=2;
1670                     goto outputBytes;
1671                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1672                     /* define a dynamic window with this character */
1673                     dynamicWindow=getNextDynamicWindow(scsu);
1674                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1675                     useDynamicWindow(scsu, dynamicWindow);
1676                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1677                     length=3;
1678                     goto outputBytes;
1679                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1680                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1681                 ) {
1682                     /*
1683                      * this character is not compressible (a BMP ideograph or similar);
1684                      * switch to Unicode mode if this is the last character in the block
1685                      * or there is at least one more ideograph following immediately
1686                      */
1687                     isSingleByteMode=FALSE;
1688                     c|=SCU<<16;
1689                     length=3;
1690                     goto outputBytes;
1691                 } else {
1692                     /* quote Unicode */
1693                     c|=SQU<<16;
1694                     length=3;
1695                     goto outputBytes;
1696                 }
1697             }
1698 
1699             /* normal end of conversion: prepare for a new character */
1700             c=0;
1701         }
1702     } else {
1703         if(c!=0 && targetCapacity>0) {
1704             goto getTrailUnicode;
1705         }
1706 
1707         /* state machine for Unicode mode */
1708 /* unicodeByteMode: */
1709         while(source<sourceLimit) {
1710             if(targetCapacity<=0) {
1711                 /* target is full */
1712                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1713                 break;
1714             }
1715             c=*source++;
1716 
1717             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1718                 /* not compressible, write character directly */
1719                 if(targetCapacity>=2) {
1720                     *target++=(uint8_t)(c>>8);
1721                     *target++=(uint8_t)c;
1722                     targetCapacity-=2;
1723                 } else {
1724                     length=2;
1725                     goto outputBytes;
1726                 }
1727             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1728                 /* compress BMP character if the following one is not an uncompressible ideograph */
1729                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1730                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1731                         /* ASCII digit or letter */
1732                         isSingleByteMode=TRUE;
1733                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1734                         length=2;
1735                         goto outputBytes;
1736                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1737                         /* there is a dynamic window that contains this character, change to it */
1738                         isSingleByteMode=TRUE;
1739                         dynamicWindow=window;
1740                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1741                         useDynamicWindow(scsu, dynamicWindow);
1742                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1743                         length=2;
1744                         goto outputBytes;
1745                     } else if((code=getDynamicOffset(c, &offset))>=0) {
1746                         /* define a dynamic window with this character */
1747                         isSingleByteMode=TRUE;
1748                         dynamicWindow=getNextDynamicWindow(scsu);
1749                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1750                         useDynamicWindow(scsu, dynamicWindow);
1751                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1752                         length=3;
1753                         goto outputBytes;
1754                     }
1755                 }
1756 
1757                 /* don't know how to compress this character, just write it directly */
1758                 length=2;
1759                 goto outputBytes;
1760             } else if(c<0xe000) {
1761                 /* c is a surrogate */
1762                 if(U16_IS_SURROGATE_LEAD(c)) {
1763 getTrailUnicode:
1764                     lead=(UChar)c;
1765                     if(source<sourceLimit) {
1766                         /* test the following code unit */
1767                         trail=*source;
1768                         if(U16_IS_TRAIL(trail)) {
1769                             ++source;
1770                             c=U16_GET_SUPPLEMENTARY(c, trail);
1771                             /* convert this surrogate code point */
1772                             /* exit this condition tree */
1773                         } else {
1774                             /* this is an unmatched lead code unit (1st surrogate) */
1775                             /* callback(illegal) */
1776                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1777                             goto endloop;
1778                         }
1779                     } else {
1780                         /* no more input */
1781                         break;
1782                     }
1783                 } else {
1784                     /* this is an unmatched trail code unit (2nd surrogate) */
1785                     /* callback(illegal) */
1786                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1787                     goto endloop;
1788                 }
1789 
1790                 /* compress supplementary character */
1791                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1792                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1793                 ) {
1794                     /*
1795                      * there is a dynamic window that contains this character and
1796                      * the following character is not uncompressible,
1797                      * change to the window
1798                      */
1799                     isSingleByteMode=TRUE;
1800                     dynamicWindow=window;
1801                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1802                     useDynamicWindow(scsu, dynamicWindow);
1803                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1804                     length=2;
1805                     goto outputBytes;
1806                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1807                           (code=getDynamicOffset(c, &offset))>=0
1808                 ) {
1809                     /* two supplementary characters in (probably) the same window - define an extended one */
1810                     isSingleByteMode=TRUE;
1811                     code-=0x200;
1812                     dynamicWindow=getNextDynamicWindow(scsu);
1813                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1814                     useDynamicWindow(scsu, dynamicWindow);
1815                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1816                     length=4;
1817                     goto outputBytes;
1818                 } else {
1819                     /* don't know how to compress this character, just write it directly */
1820                     c=((uint32_t)lead<<16)|trail;
1821                     length=4;
1822                     goto outputBytes;
1823                 }
1824             } else /* 0xe000<=c<0xf300 */ {
1825                 /* quote to avoid SCSU tags */
1826                 c|=UQU<<16;
1827                 length=3;
1828                 goto outputBytes;
1829             }
1830 
1831             /* normal end of conversion: prepare for a new character */
1832             c=0;
1833         }
1834     }
1835 endloop:
1836 
1837     /* set the converter state back into UConverter */
1838     scsu->fromUIsSingleByteMode=isSingleByteMode;
1839     scsu->fromUDynamicWindow=dynamicWindow;
1840 
1841     cnv->fromUChar32=c;
1842 
1843     /* write back the updated pointers */
1844     pArgs->source=source;
1845     pArgs->target=(char *)target;
1846     return;
1847 
1848 outputBytes:
1849     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1850     /* from the first if in the loop we know that targetCapacity>0 */
1851     if(length<=targetCapacity) {
1852         switch(length) {
1853             /* each branch falls through to the next one */
1854         case 4:
1855             *target++=(uint8_t)(c>>24);
1856         case 3: /*fall through*/
1857             *target++=(uint8_t)(c>>16);
1858         case 2: /*fall through*/
1859             *target++=(uint8_t)(c>>8);
1860         case 1: /*fall through*/
1861             *target++=(uint8_t)c;
1862         default:
1863             /* will never occur */
1864             break;
1865         }
1866         targetCapacity-=length;
1867 
1868         /* normal end of conversion: prepare for a new character */
1869         c=0;
1870         goto loop;
1871     } else {
1872         uint8_t *p;
1873 
1874         /*
1875          * We actually do this backwards here:
1876          * In order to save an intermediate variable, we output
1877          * first to the overflow buffer what does not fit into the
1878          * regular target.
1879          */
1880         /* we know that 0<=targetCapacity<length<=4 */
1881         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1882         length-=targetCapacity;
1883         p=(uint8_t *)cnv->charErrorBuffer;
1884         switch(length) {
1885             /* each branch falls through to the next one */
1886         case 4:
1887             *p++=(uint8_t)(c>>24);
1888         case 3: /*fall through*/
1889             *p++=(uint8_t)(c>>16);
1890         case 2: /*fall through*/
1891             *p++=(uint8_t)(c>>8);
1892         case 1: /*fall through*/
1893             *p=(uint8_t)c;
1894         default:
1895             /* will never occur */
1896             break;
1897         }
1898         cnv->charErrorBufferLength=(int8_t)length;
1899 
1900         /* now output what fits into the regular target */
1901         c>>=8*length; /* length was reduced by targetCapacity */
1902         switch(targetCapacity) {
1903             /* each branch falls through to the next one */
1904         case 3:
1905             *target++=(uint8_t)(c>>16);
1906         case 2: /*fall through*/
1907             *target++=(uint8_t)(c>>8);
1908         case 1: /*fall through*/
1909             *target++=(uint8_t)c;
1910         default:
1911             break;
1912         }
1913 
1914         /* target overflow */
1915         targetCapacity=0;
1916         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1917         c=0;
1918         goto endloop;
1919     }
1920 }
1921 
1922 /* miscellaneous ------------------------------------------------------------ */
1923 
1924 static const char *
_SCSUGetName(const UConverter * cnv)1925 _SCSUGetName(const UConverter *cnv) {
1926     SCSUData *scsu=(SCSUData *)cnv->extraInfo;
1927 
1928     switch(scsu->locale) {
1929     case l_ja:
1930         return "SCSU,locale=ja";
1931     default:
1932         return "SCSU";
1933     }
1934 }
1935 
/*
 * structure for SafeClone calculations
 *
 * _SCSUSafeClone() casts the caller's stack buffer to this layout:
 * the converter itself followed by its SCSU state, so that both live
 * in the same buffer. sizeof(struct cloneSCSUStruct) is the buffer size
 * reported for preflighting.
 */
struct cloneSCSUStruct
{
    UConverter cnv;   /* the cloned converter; its extraInfo is pointed at mydata */
    SCSUData mydata;  /* copy of the original converter's SCSU state */
};
1942 
1943 static UConverter *
_SCSUSafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)1944 _SCSUSafeClone(const UConverter *cnv,
1945                void *stackBuffer,
1946                int32_t *pBufferSize,
1947                UErrorCode *status)
1948 {
1949     struct cloneSCSUStruct * localClone;
1950     int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);
1951 
1952     if (U_FAILURE(*status)){
1953         return 0;
1954     }
1955 
1956     if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1957         *pBufferSize = bufferSizeNeeded;
1958         return 0;
1959     }
1960 
1961     localClone = (struct cloneSCSUStruct *)stackBuffer;
1962     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
1963 
1964     uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
1965     localClone->cnv.extraInfo = &localClone->mydata;
1966     localClone->cnv.isExtraLocal = TRUE;
1967 
1968     return &localClone->cnv;
1969 }
1970 
1971 
/*
 * Implementation table for the SCSU converter type.
 * It registers the open/close/reset functions, the four conversion
 * functions (to/from Unicode, each with and without offsets), the name
 * getter, the safe-clone function, and ucnv_getCompleteUnicodeSet.
 *
 * NOTE(review): the initializer is positional; the meaning of each slot,
 * including the NULL ones, is fixed by the UConverterImpl declaration,
 * which is not visible here - confirm before reordering or adding entries.
 */
static const UConverterImpl _SCSUImpl={
    UCNV_SCSU,

    NULL,
    NULL,

    _SCSUOpen,
    _SCSUClose,
    _SCSUReset,

    _SCSUToUnicode,
    _SCSUToUnicodeWithOffsets,
    _SCSUFromUnicode,
    _SCSUFromUnicodeWithOffsets,
    NULL,

    NULL,
    _SCSUGetName,
    NULL,
    _SCSUSafeClone,
    ucnv_getCompleteUnicodeSet
};
1994 
/*
 * Static (immutable) description of the SCSU converter: name, CCSID,
 * platform/type, byte-length limits and substitution bytes.
 *
 * NOTE(review): the initializer is positional; field meanings beyond the
 * inline comments come from the UConverterStaticData declaration, which is
 * not visible here - confirm before editing.
 */
static const UConverterStaticData _SCSUStaticData={
    sizeof(UConverterStaticData),
    "SCSU",
    1212, /* CCSID for SCSU */
    UCNV_IBM, UCNV_SCSU,
    1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
    /*
     * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
     * substitution string.
     * The bytes are SQU (0x0e) followed by ff fd.
     */
    { 0x0e, 0xff, 0xfd, 0 }, 3,
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
2011 
/*
 * Shared data object for SCSU (the only non-static definition in this part
 * of the file); it ties the static description to the implementation table.
 */
const UConverterSharedData _SCSUData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData, &_SCSUImpl);
2014 
2015 #endif
2016