1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2000-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  genmbcs.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2000jul06
16 *   created by: Markus W. Scherer
17 */
18 
19 #include <stdio.h>
20 #include "unicode/utypes.h"
21 #include "cstring.h"
22 #include "cmemory.h"
23 #include "unewdata.h"
24 #include "ucnv_cnv.h"
25 #include "ucnvmbcs.h"
26 #include "ucm.h"
27 #include "makeconv.h"
28 #include "genmbcs.h"
29 
30 /*
31  * TODO: Split this file into toUnicode, SBCSFromUnicode and MBCSFromUnicode files.
32  * Reduce tests for maxCharLength.
33  */
34 
35 struct MBCSData {
36     NewConverter newConverter;
37 
38     UCMFile *ucm;
39 
40     /* toUnicode (state table in ucm->states) */
41     _MBCSToUFallback toUFallbacks[MBCS_MAX_FALLBACK_COUNT];
42     int32_t countToUFallbacks;
43     uint16_t *unicodeCodeUnits;
44 
45     /* fromUnicode */
46     uint16_t stage1[MBCS_STAGE_1_SIZE];
47     uint16_t stage2Single[MBCS_STAGE_2_SIZE]; /* stage 2 for single-byte codepages */
48     uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */
49     uint8_t *fromUBytes;
50     uint32_t stage2Top, stage3Top;
51 
52     /* fromUTF8 */
53     uint16_t stageUTF8[0x10000>>MBCS_UTF8_STAGE_SHIFT];  /* allow for utf8Max=0xffff */
54 
55     /*
56      * Maximum UTF-8-friendly code point.
57      * 0 if !utf8Friendly, otherwise 0x01ff..0xffff in steps of 0x100.
58      * If utf8Friendly, utf8Max is normally either MBCS_UTF8_MAX or 0xffff.
59      */
60     uint16_t utf8Max;
61 
62     UBool utf8Friendly;
63     UBool omitFromU;
64 };
65 
66 /* prototypes */
67 U_CDECL_BEGIN
68 static void
69 MBCSClose(NewConverter *cnvData);
70 
71 static UBool
72 MBCSStartMappings(MBCSData *mbcsData);
73 
74 static UBool
75 MBCSAddToUnicode(MBCSData *mbcsData,
76                  const uint8_t *bytes, int32_t length,
77                  UChar32 c,
78                  int8_t flag);
79 
80 static UBool
81 MBCSIsValid(NewConverter *cnvData,
82             const uint8_t *bytes, int32_t length);
83 
84 static UBool
85 MBCSSingleAddFromUnicode(MBCSData *mbcsData,
86                          const uint8_t *bytes, int32_t length,
87                          UChar32 c,
88                          int8_t flag);
89 
90 static UBool
91 MBCSAddFromUnicode(MBCSData *mbcsData,
92                    const uint8_t *bytes, int32_t length,
93                    UChar32 c,
94                    int8_t flag);
95 
96 static void
97 MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData);
98 
99 static UBool
100 MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);
101 
102 static uint32_t
103 MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
104           UNewDataMemory *pData, int32_t tableType);
105 U_CDECL_END
106 
107 /* helper ------------------------------------------------------------------- */
108 
/*
 * Convert a nibble value to its lowercase hexadecimal character:
 * 0..9 yield '0'..'9', larger values are offset from 'a' (10 yields 'a').
 */
static inline char
hexDigit(uint8_t digit) {
    if(digit>9) {
        return (char)('a'-10+digit);
    }
    return (char)('0'+digit);
}
113 
114 static inline char *
printBytes(char * buffer,const uint8_t * bytes,int32_t length)115 printBytes(char *buffer, const uint8_t *bytes, int32_t length) {
116     char *s=buffer;
117     while(length>0) {
118         *s++=hexDigit((uint8_t)(*bytes>>4));
119         *s++=hexDigit((uint8_t)(*bytes&0xf));
120         ++bytes;
121         --length;
122     }
123 
124     *s=0;
125     return buffer;
126 }
127 
128 /* implementation ----------------------------------------------------------- */
129 
130 static MBCSData gDummy;
131 
132 
133 U_CFUNC const MBCSData *
MBCSGetDummy()134 MBCSGetDummy() {
135     uprv_memset(&gDummy, 0, sizeof(MBCSData));
136 
137     /*
138      * Set "pessimistic" values which may sometimes move too many
139      * mappings to the extension table (but never too few).
140      * These values cause MBCSOkForBaseFromUnicode() to return FALSE for the
141      * largest set of mappings.
142      * Assume maxCharLength>1.
143      */
144     gDummy.utf8Friendly=TRUE;
145     if(SMALL) {
146         gDummy.utf8Max=0xffff;
147         gDummy.omitFromU=TRUE;
148     } else {
149         gDummy.utf8Max=MBCS_UTF8_MAX;
150     }
151     return &gDummy;
152 }
153 
154 static void
MBCSInit(MBCSData * mbcsData,UCMFile * ucm)155 MBCSInit(MBCSData *mbcsData, UCMFile *ucm) {
156     uprv_memset(mbcsData, 0, sizeof(MBCSData));
157 
158     mbcsData->ucm=ucm; /* aliased, not owned */
159 
160     mbcsData->newConverter.close=MBCSClose;
161     mbcsData->newConverter.isValid=MBCSIsValid;
162     mbcsData->newConverter.addTable=MBCSAddTable;
163     mbcsData->newConverter.write=MBCSWrite;
164 }
165 
166 U_CFUNC NewConverter *
MBCSOpen(UCMFile * ucm)167 MBCSOpen(UCMFile *ucm) {
168     MBCSData *mbcsData=(MBCSData *)uprv_malloc(sizeof(MBCSData));
169     if(mbcsData==NULL) {
170         printf("out of memory\n");
171         exit(U_MEMORY_ALLOCATION_ERROR);
172     }
173 
174     MBCSInit(mbcsData, ucm);
175     return &mbcsData->newConverter;
176 }
177 
178 static void
MBCSDestruct(MBCSData * mbcsData)179 MBCSDestruct(MBCSData *mbcsData) {
180     uprv_free(mbcsData->unicodeCodeUnits);
181     uprv_free(mbcsData->fromUBytes);
182 }
183 
184 U_CDECL_BEGIN
185 static void
MBCSClose(NewConverter * cnvData)186 MBCSClose(NewConverter *cnvData) {
187     MBCSData *mbcsData=(MBCSData *)cnvData;
188     if(mbcsData!=NULL) {
189         MBCSDestruct(mbcsData);
190         uprv_free(mbcsData);
191     }
192 }
193 U_CDECL_END
194 
195 static UBool
MBCSStartMappings(MBCSData * mbcsData)196 MBCSStartMappings(MBCSData *mbcsData) {
197     int32_t i, sum, maxCharLength,
198             stage2NullLength, stage2AllocLength,
199             stage3NullLength, stage3AllocLength;
200 
201     /* toUnicode */
202 
203     /* allocate the code unit array and prefill it with "unassigned" values */
204     sum=mbcsData->ucm->states.countToUCodeUnits;
205     if(VERBOSE) {
206         printf("the total number of offsets is 0x%lx=%ld\n", (long)sum, (long)sum);
207     }
208 
209     if(sum>0) {
210         mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
211         if(mbcsData->unicodeCodeUnits==NULL) {
212             fprintf(stderr, "error: out of memory allocating %ld 16-bit code units\n",
213                 (long)sum);
214             return FALSE;
215         }
216         for(i=0; i<sum; ++i) {
217             mbcsData->unicodeCodeUnits[i]=0xfffe;
218         }
219     }
220 
221     /* fromUnicode */
222     maxCharLength=mbcsData->ucm->states.maxCharLength;
223 
224     /* allocate the codepage mappings and preset the first 16 characters to 0 */
225     if(maxCharLength==1) {
226         /* allocate 64k 16-bit results for single-byte codepages */
227         sum=0x20000;
228     } else {
229         /* allocate 1M * maxCharLength bytes for at most 1M mappings */
230         sum=0x100000*maxCharLength;
231     }
232     mbcsData->fromUBytes=(uint8_t *)uprv_malloc(sum);
233     if(mbcsData->fromUBytes==NULL) {
234         fprintf(stderr, "error: out of memory allocating %ld B for target mappings\n", (long)sum);
235         return FALSE;
236     }
237     uprv_memset(mbcsData->fromUBytes, 0, sum);
238 
239     /*
240      * UTF-8-friendly fromUnicode tries: allocate multiple blocks at a time.
241      * See ucnvmbcs.h for details.
242      *
243      * There is code, for example in ucnv_MBCSGetUnicodeSetForUnicode(), which
244      * assumes that the initial stage 2/3 blocks are the all-unassigned ones.
245      * Therefore, we refine the data structure while maintaining this placement
246      * even though it would be convenient to allocate the ASCII block at the
247      * beginning of stage 3, for example.
248      *
249      * UTF-8-friendly fromUnicode tries work from sorted tables and are built
250      * pre-compacted, overlapping adjacent stage 2/3 blocks.
251      * This is necessary because the block allocation and compaction changes
252      * at SBCS_UTF8_MAX or MBCS_UTF8_MAX, and for MBCS tables the additional
253      * stage table uses direct indexes into stage 3, without a multiplier and
254      * thus with a smaller reach.
255      *
256      * Non-UTF-8-friendly fromUnicode tries work from unsorted tables
257      * (because implicit precision is used), and are compacted
258      * in post-processing.
259      *
260      * Preallocation for UTF-8-friendly fromUnicode tries:
261      *
262      * Stage 3:
263      * 64-entry all-unassigned first block followed by ASCII (128 entries).
264      *
265      * Stage 2:
266      * 64-entry all-unassigned first block followed by preallocated
267      * 64-block for ASCII.
268      */
269 
270     /* Preallocate ASCII as a linear 128-entry stage 3 block. */
271     stage2NullLength=MBCS_STAGE_2_BLOCK_SIZE;
272     stage2AllocLength=MBCS_STAGE_2_BLOCK_SIZE;
273 
274     stage3NullLength=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
275     stage3AllocLength=128; /* ASCII U+0000..U+007f */
276 
277     /* Initialize stage 1 for the preallocated blocks. */
278     sum=stage2NullLength;
279     for(i=0; i<(stage2AllocLength>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT); ++i) {
280         mbcsData->stage1[i]=sum;
281         sum+=MBCS_STAGE_2_BLOCK_SIZE;
282     }
283     mbcsData->stage2Top=stage2NullLength+stage2AllocLength; /* ==sum */
284 
285     /*
286      * Stage 2 indexes count 16-blocks in stage 3 as follows:
287      * SBCS: directly, indexes increment by 16
288      * MBCS: indexes need to be multiplied by 16*maxCharLength, indexes increment by 1
289      * MBCS UTF-8: directly, indexes increment by 16
290      */
291     if(maxCharLength==1) {
292         sum=stage3NullLength;
293         for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) {
294             mbcsData->stage2Single[mbcsData->stage1[0]+i]=sum;
295             sum+=MBCS_STAGE_3_BLOCK_SIZE;
296         }
297     } else {
298         sum=stage3NullLength/MBCS_STAGE_3_GRANULARITY;
299         for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) {
300             mbcsData->stage2[mbcsData->stage1[0]+i]=sum;
301             sum+=MBCS_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_GRANULARITY;
302         }
303     }
304 
305     sum=stage3NullLength;
306     for(i=0; i<(stage3AllocLength/MBCS_UTF8_STAGE_3_BLOCK_SIZE); ++i) {
307         mbcsData->stageUTF8[i]=sum;
308         sum+=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
309     }
310 
311     /*
312      * Allocate a 64-entry all-unassigned first stage 3 block,
313      * for UTF-8-friendly lookup with a trail byte,
314      * plus 128 entries for ASCII.
315      */
316     mbcsData->stage3Top=(stage3NullLength+stage3AllocLength)*maxCharLength; /* ==sum*maxCharLength */
317 
318     return TRUE;
319 }
320 
321 /* return TRUE for success */
322 static UBool
setFallback(MBCSData * mbcsData,uint32_t offset,UChar32 c)323 setFallback(MBCSData *mbcsData, uint32_t offset, UChar32 c) {
324     int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset);
325     if(i>=0) {
326         /* if there is already a fallback for this offset, then overwrite it */
327         mbcsData->toUFallbacks[i].codePoint=c;
328         return TRUE;
329     } else {
330         /* if there is no fallback for this offset, then add one */
331         i=mbcsData->countToUFallbacks;
332         if(i>=MBCS_MAX_FALLBACK_COUNT) {
333             fprintf(stderr, "error: too many toUnicode fallbacks, currently at: U+%x\n", (int)c);
334             return FALSE;
335         } else {
336             mbcsData->toUFallbacks[i].offset=offset;
337             mbcsData->toUFallbacks[i].codePoint=c;
338             mbcsData->countToUFallbacks=i+1;
339             return TRUE;
340         }
341     }
342 }
343 
344 /* remove fallback if there is one with this offset; return the code point if there was such a fallback, otherwise -1 */
345 static int32_t
removeFallback(MBCSData * mbcsData,uint32_t offset)346 removeFallback(MBCSData *mbcsData, uint32_t offset) {
347     int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset);
348     if(i>=0) {
349         _MBCSToUFallback *toUFallbacks;
350         int32_t limit, old;
351 
352         toUFallbacks=mbcsData->toUFallbacks;
353         limit=mbcsData->countToUFallbacks;
354         old=(int32_t)toUFallbacks[i].codePoint;
355 
356         /* copy the last fallback entry here to keep the list contiguous */
357         toUFallbacks[i].offset=toUFallbacks[limit-1].offset;
358         toUFallbacks[i].codePoint=toUFallbacks[limit-1].codePoint;
359         mbcsData->countToUFallbacks=limit-1;
360         return old;
361     } else {
362         return -1;
363     }
364 }
365 
366 /*
367  * isFallback is almost a boolean:
368  * 1 (TRUE)  this is a fallback mapping
369  * 0 (FALSE) this is a precise mapping
370  * -1        the precision of this mapping is not specified
371  */
372 static UBool
MBCSAddToUnicode(MBCSData * mbcsData,const uint8_t * bytes,int32_t length,UChar32 c,int8_t flag)373 MBCSAddToUnicode(MBCSData *mbcsData,
374                  const uint8_t *bytes, int32_t length,
375                  UChar32 c,
376                  int8_t flag) {
377     char buffer[10];
378     uint32_t offset=0;
379     int32_t i=0, entry, old;
380     uint8_t state=0;
381 
382     if(mbcsData->ucm->states.countStates==0) {
383         fprintf(stderr, "error: there is no state information!\n");
384         return FALSE;
385     }
386 
387     /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
388     if(length==2 && mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO) {
389         state=1;
390     }
391 
392     /*
393      * Walk down the state table like in conversion,
394      * much like getNextUChar().
395      * We assume that c<=0x10ffff.
396      */
397     for(i=0;;) {
398         entry=mbcsData->ucm->states.stateTable[state][bytes[i++]];
399         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
400             if(i==length) {
401                 fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%s (U+%x)\n",
402                     (short)state, printBytes(buffer, bytes, length), (int)c);
403                 return FALSE;
404             }
405             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
406             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
407         } else {
408             if(i<length) {
409                 fprintf(stderr, "error: byte sequence too long by %d bytes, final state %u: 0x%s (U+%x)\n",
410                     (int)(length-i), state, printBytes(buffer, bytes, length), (int)c);
411                 return FALSE;
412             }
413             switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
414             case MBCS_STATE_ILLEGAL:
415                 fprintf(stderr, "error: byte sequence ends in illegal state at U+%04x<->0x%s\n",
416                     (int)c, printBytes(buffer, bytes, length));
417                 return FALSE;
418             case MBCS_STATE_CHANGE_ONLY:
419                 fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%s\n",
420                     (int)c, printBytes(buffer, bytes, length));
421                 return FALSE;
422             case MBCS_STATE_UNASSIGNED:
423                 fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%s\n",
424                     (int)c, printBytes(buffer, bytes, length));
425                 return FALSE;
426             case MBCS_STATE_FALLBACK_DIRECT_16:
427             case MBCS_STATE_VALID_DIRECT_16:
428             case MBCS_STATE_FALLBACK_DIRECT_20:
429             case MBCS_STATE_VALID_DIRECT_20:
430                 if(MBCS_ENTRY_SET_STATE(entry, 0)!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
431                     /* the "direct" action's value is not "valid-direct-16-unassigned" any more */
432                     if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_DIRECT_16 || MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_FALLBACK_DIRECT_16) {
433                         old=MBCS_ENTRY_FINAL_VALUE(entry);
434                     } else {
435                         old=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
436                     }
437                     if(flag>=0) {
438                         fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
439                             (int)c, printBytes(buffer, bytes, length), (int)old);
440                         return FALSE;
441                     } else if(VERBOSE) {
442                         fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
443                             (int)c, printBytes(buffer, bytes, length), (int)old);
444                     }
445                     /*
446                      * Continue after the above warning
447                      * if the precision of the mapping is unspecified.
448                      */
449                 }
450                 /* reassign the correct action code */
451                 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(flag==3 ? 2 : 0)+(c>=0x10000 ? 1 : 0)));
452 
453                 /* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */
454                 if(c<=0xffff) {
455                     entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c);
456                 } else {
457                     entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c-0x10000);
458                 }
459                 mbcsData->ucm->states.stateTable[state][bytes[i-1]]=entry;
460                 break;
461             case MBCS_STATE_VALID_16:
462                 /* bits 26..16 are not used, 0 */
463                 /* bits 15..7 contain the final offset delta to one 16-bit code unit */
464                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
465                 /* check that this byte sequence is still unassigned */
466                 if((old=mbcsData->unicodeCodeUnits[offset])!=0xfffe || (old=removeFallback(mbcsData, offset))!=-1) {
467                     if(flag>=0) {
468                         fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
469                             (int)c, printBytes(buffer, bytes, length), (int)old);
470                         return FALSE;
471                     } else if(VERBOSE) {
472                         fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
473                             (int)c, printBytes(buffer, bytes, length), (int)old);
474                     }
475                 }
476                 if(c>=0x10000) {
477                     fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%s\n",
478                         (int)c, printBytes(buffer, bytes, length));
479                     return FALSE;
480                 }
481                 if(flag>0) {
482                     /* assign only if there is no precise mapping */
483                     if(mbcsData->unicodeCodeUnits[offset]==0xfffe) {
484                         return setFallback(mbcsData, offset, c);
485                     }
486                 } else {
487                     mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
488                 }
489                 break;
490             case MBCS_STATE_VALID_16_PAIR:
491                 /* bits 26..16 are not used, 0 */
492                 /* bits 15..7 contain the final offset delta to two 16-bit code units */
493                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
494                 /* check that this byte sequence is still unassigned */
495                 old=mbcsData->unicodeCodeUnits[offset];
496                 if(old<0xfffe) {
497                     int32_t real;
498                     if(old<0xd800) {
499                         real=old;
500                     } else if(old<=0xdfff) {
501                         real=0x10000+((old&0x3ff)<<10)+((mbcsData->unicodeCodeUnits[offset+1])&0x3ff);
502                     } else /* old<=0xe001 */ {
503                         real=mbcsData->unicodeCodeUnits[offset+1];
504                     }
505                     if(flag>=0) {
506                         fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
507                             (int)c, printBytes(buffer, bytes, length), (int)real);
508                         return FALSE;
509                     } else if(VERBOSE) {
510                         fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
511                             (int)c, printBytes(buffer, bytes, length), (int)real);
512                     }
513                 }
514                 if(flag>0) {
515                     /* assign only if there is no precise mapping */
516                     if(old<=0xdbff || old==0xe000) {
517                         /* do nothing */
518                     } else if(c<=0xffff) {
519                         /* set a BMP fallback code point as a pair with 0xe001 */
520                         mbcsData->unicodeCodeUnits[offset++]=0xe001;
521                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
522                     } else {
523                         /* set a fallback surrogate pair with two second surrogates */
524                         mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xdbc0+(c>>10));
525                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff));
526                     }
527                 } else {
528                     if(c<0xd800) {
529                         /* set a BMP code point */
530                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
531                     } else if(c<=0xffff) {
532                         /* set a BMP code point above 0xd800 as a pair with 0xe000 */
533                         mbcsData->unicodeCodeUnits[offset++]=0xe000;
534                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
535                     } else {
536                         /* set a surrogate pair */
537                         mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xd7c0+(c>>10));
538                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff));
539                     }
540                 }
541                 break;
542             default:
543                 /* reserved, must never occur */
544                 fprintf(stderr, "internal error: byte sequence reached reserved action code, entry 0x%02x: 0x%s (U+%x)\n",
545                     (int)entry, printBytes(buffer, bytes, length), (int)c);
546                 return FALSE;
547             }
548 
549             return TRUE;
550         }
551     }
552 }
553 
554 U_CDECL_BEGIN
555 /* is this byte sequence valid? (this is almost the same as MBCSAddToUnicode()) */
556 static UBool
MBCSIsValid(NewConverter * cnvData,const uint8_t * bytes,int32_t length)557 MBCSIsValid(NewConverter *cnvData,
558             const uint8_t *bytes, int32_t length) {
559     MBCSData *mbcsData=(MBCSData *)cnvData;
560 
561     return (UBool)(1==ucm_countChars(&mbcsData->ucm->states, bytes, length));
562 }
563 U_CDECL_END
564 static UBool
MBCSSingleAddFromUnicode(MBCSData * mbcsData,const uint8_t * bytes,int32_t,UChar32 c,int8_t flag)565 MBCSSingleAddFromUnicode(MBCSData *mbcsData,
566                          const uint8_t *bytes, int32_t /*length*/,
567                          UChar32 c,
568                          int8_t flag) {
569     uint16_t *stage3, *p;
570     uint32_t idx;
571     uint16_t old;
572     uint8_t b;
573 
574     uint32_t blockSize, newTop, i, nextOffset, newBlock, min;
575 
576     /* ignore |2 SUB mappings */
577     if(flag==2) {
578         return TRUE;
579     }
580 
581     /*
582      * Walk down the triple-stage compact array ("trie") and
583      * allocate parts as necessary.
584      * Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings.
585      * We assume that length<=maxCharLength and that c<=0x10ffff.
586      */
587     stage3=(uint16_t *)mbcsData->fromUBytes;
588     b=*bytes;
589 
590     /* inspect stage 1 */
591     idx=c>>MBCS_STAGE_1_SHIFT;
592     if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) {
593         nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
594     } else {
595         nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
596     }
597     if(mbcsData->stage1[idx]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
598         /* allocate another block in stage 2 */
599         newBlock=mbcsData->stage2Top;
600         if(mbcsData->utf8Friendly) {
601             min=newBlock-nextOffset; /* minimum block start with overlap */
602             while(min<newBlock && mbcsData->stage2Single[newBlock-1]==0) {
603                 --newBlock;
604             }
605         }
606         newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
607 
608         if(newTop>MBCS_MAX_STAGE_2_TOP) {
609             fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02x\n", (int)c, b);
610             return FALSE;
611         }
612 
613         /*
614          * each stage 2 block contains 64 16-bit words:
615          * 6 code point bits 9..4 with 1 stage 3 index
616          */
617         mbcsData->stage1[idx]=(uint16_t)newBlock;
618         mbcsData->stage2Top=newTop;
619     }
620 
621     /* inspect stage 2 */
622     idx=mbcsData->stage1[idx]+nextOffset;
623     if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) {
624         /* allocate 64-entry blocks for UTF-8-friendly lookup */
625         blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
626         nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
627     } else {
628         blockSize=MBCS_STAGE_3_BLOCK_SIZE;
629         nextOffset=c&MBCS_STAGE_3_BLOCK_MASK;
630     }
631     if(mbcsData->stage2Single[idx]==0) {
632         /* allocate another block in stage 3 */
633         newBlock=mbcsData->stage3Top;
634         if(mbcsData->utf8Friendly) {
635             min=newBlock-nextOffset; /* minimum block start with overlap */
636             while(min<newBlock && stage3[newBlock-1]==0) {
637                 --newBlock;
638             }
639         }
640         newTop=newBlock+blockSize;
641 
642         if(newTop>MBCS_STAGE_3_SBCS_SIZE) {
643             fprintf(stderr, "error: too many code points at U+%04x<->0x%02x\n", (int)c, b);
644             return FALSE;
645         }
646         /* each block has 16 uint16_t entries */
647         i=idx;
648         while(newBlock<newTop) {
649             mbcsData->stage2Single[i++]=(uint16_t)newBlock;
650             newBlock+=MBCS_STAGE_3_BLOCK_SIZE;
651         }
652         mbcsData->stage3Top=newTop; /* ==newBlock */
653     }
654 
655     /* write the codepage entry into stage 3 and get the previous entry */
656     p=stage3+mbcsData->stage2Single[idx]+nextOffset;
657     old=*p;
658     if(flag<=0) {
659         *p=(uint16_t)(0xf00|b);
660     } else if(IS_PRIVATE_USE(c)) {
661         *p=(uint16_t)(0xc00|b);
662     } else {
663         *p=(uint16_t)(0x800|b);
664     }
665 
666     /* check that this Unicode code point was still unassigned */
667     if(old>=0x100) {
668         if(flag>=0) {
669             fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
670                 (int)c, b, old&0xff);
671             return FALSE;
672         } else if(VERBOSE) {
673             fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
674                 (int)c, b, old&0xff);
675         }
676         /* continue after the above warning if the precision of the mapping is unspecified */
677     }
678 
679     return TRUE;
680 }
681 
682 static UBool
MBCSAddFromUnicode(MBCSData * mbcsData,const uint8_t * bytes,int32_t length,UChar32 c,int8_t flag)683 MBCSAddFromUnicode(MBCSData *mbcsData,
684                    const uint8_t *bytes, int32_t length,
685                    UChar32 c,
686                    int8_t flag) {
687     char buffer[10];
688     const uint8_t *pb;
689     uint8_t *stage3, *p;
690     uint32_t idx, b, old, stage3Index;
691     int32_t maxCharLength;
692 
693     uint32_t blockSize, newTop, i, nextOffset, newBlock, min, overlap, maxOverlap;
694 
695     maxCharLength=mbcsData->ucm->states.maxCharLength;
696 
697     if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO &&
698         (!IGNORE_SISO_CHECK && (*bytes==0xe || *bytes==0xf))
699     ) {
700         fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n",
701             (int)c, printBytes(buffer, bytes, length));
702         return FALSE;
703     }
704 
705     if(flag==1 && length==1 && *bytes==0) {
706         fprintf(stderr, "error: unable to encode a |1 fallback from U+%04x to 0x%02x\n",
707             (int)c, *bytes);
708         return FALSE;
709     }
710 
711     /*
712      * Walk down the triple-stage compact array ("trie") and
713      * allocate parts as necessary.
714      * Note that the first stage 2 and 3 blocks are reserved for
715      * all-unassigned mappings.
716      * We assume that length<=maxCharLength and that c<=0x10ffff.
717      */
718     stage3=mbcsData->fromUBytes;
719 
720     /* inspect stage 1 */
721     idx=c>>MBCS_STAGE_1_SHIFT;
722     if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
723         nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
724     } else {
725         nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
726     }
727     if(mbcsData->stage1[idx]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
728         /* allocate another block in stage 2 */
729         newBlock=mbcsData->stage2Top;
730         if(mbcsData->utf8Friendly) {
731             min=newBlock-nextOffset; /* minimum block start with overlap */
732             while(min<newBlock && mbcsData->stage2[newBlock-1]==0) {
733                 --newBlock;
734             }
735         }
736         newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
737 
738         if(newTop>MBCS_MAX_STAGE_2_TOP) {
739             fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%s\n",
740                 (int)c, printBytes(buffer, bytes, length));
741             return FALSE;
742         }
743 
744         /*
745          * each stage 2 block contains 64 32-bit words:
746          * 6 code point bits 9..4 with value with bits 31..16 "assigned" flags and bits 15..0 stage 3 index
747          */
748         i=idx;
749         while(newBlock<newTop) {
750             mbcsData->stage1[i++]=(uint16_t)newBlock;
751             newBlock+=MBCS_STAGE_2_BLOCK_SIZE;
752         }
753         mbcsData->stage2Top=newTop; /* ==newBlock */
754     }
755 
756     /* inspect stage 2 */
757     idx=mbcsData->stage1[idx]+nextOffset;
758     if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
759         /* allocate 64-entry blocks for UTF-8-friendly lookup */
760         blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE*maxCharLength;
761         nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
762     } else {
763         blockSize=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength;
764         nextOffset=c&MBCS_STAGE_3_BLOCK_MASK;
765     }
766     if(mbcsData->stage2[idx]==0) {
767         /* allocate another block in stage 3 */
768         newBlock=mbcsData->stage3Top;
769         if(mbcsData->utf8Friendly && nextOffset>=MBCS_STAGE_3_GRANULARITY) {
770             /*
771              * Overlap stage 3 blocks only in multiples of 16-entry blocks
772              * because of the indexing granularity in stage 2.
773              */
774             maxOverlap=(nextOffset&~(MBCS_STAGE_3_GRANULARITY-1))*maxCharLength;
775             for(overlap=0;
776                 overlap<maxOverlap && stage3[newBlock-overlap-1]==0;
777                 ++overlap) {}
778 
779             overlap=(overlap/MBCS_STAGE_3_GRANULARITY)/maxCharLength;
780             overlap=(overlap*MBCS_STAGE_3_GRANULARITY)*maxCharLength;
781 
782             newBlock-=overlap;
783         }
784         newTop=newBlock+blockSize;
785 
786         if(newTop>MBCS_STAGE_3_MBCS_SIZE*(uint32_t)maxCharLength) {
787             fprintf(stderr, "error: too many code points at U+%04x<->0x%s\n",
788                 (int)c, printBytes(buffer, bytes, length));
789             return FALSE;
790         }
791         /* each block has 16*maxCharLength bytes */
792         i=idx;
793         while(newBlock<newTop) {
794             mbcsData->stage2[i++]=(newBlock/MBCS_STAGE_3_GRANULARITY)/maxCharLength;
795             newBlock+=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength;
796         }
797         mbcsData->stage3Top=newTop; /* ==newBlock */
798     }
799 
800     stage3Index=MBCS_STAGE_3_GRANULARITY*(uint32_t)(uint16_t)mbcsData->stage2[idx];
801 
802     /* Build an alternate, UTF-8-friendly stage table as well. */
803     if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
804         /* Overflow for uint16_t entries in stageUTF8? */
805         if(stage3Index>0xffff) {
806             /*
807              * This can occur only if the mapping table is nearly perfectly filled and if
808              * utf8Max==0xffff.
809              * (There is no known charset like this. GB 18030 does not map
810              * surrogate code points and LMBCS does not map 256 PUA code points.)
811              *
812              * Otherwise, stage3Index<=MBCS_UTF8_LIMIT<0xffff
813              * (stage3Index can at most reach exactly MBCS_UTF8_LIMIT)
814              * because we have a sorted table and there are at most MBCS_UTF8_LIMIT
815              * mappings with 0<=c<MBCS_UTF8_LIMIT, and there is only also
816              * the initial all-unassigned block in stage3.
817              *
818              * Solution for the overflow: Reduce utf8Max to the next lower value, 0xfeff.
819              *
820              * (See svn revision 20866 of the markus/ucnvutf8 feature branch for
821              * code that causes MBCSAddTable() to rebuild the table not utf8Friendly
822              * in case of overflow. That code was not tested.)
823              */
824             mbcsData->utf8Max=0xfeff;
825         } else {
826             /*
827              * The stage 3 block has been assigned for the regular trie.
828              * Just copy its index into stageUTF8[], without the granularity.
829              */
830             mbcsData->stageUTF8[c>>MBCS_UTF8_STAGE_SHIFT]=(uint16_t)stage3Index;
831         }
832     }
833 
834     /* write the codepage bytes into stage 3 and get the previous bytes */
835 
836     /* assemble the bytes into a single integer */
837     pb=bytes;
838     b=0;
839     switch(length) {
840     case 4:
841         b=*pb++;
842         U_FALLTHROUGH;
843     case 3:
844         b=(b<<8)|*pb++;
845         U_FALLTHROUGH;
846     case 2:
847         b=(b<<8)|*pb++;
848         U_FALLTHROUGH;
849     case 1:
850     default:
851         b=(b<<8)|*pb++;
852         break;
853     }
854 
855     old=0;
856     p=stage3+(stage3Index+nextOffset)*maxCharLength;
857     switch(maxCharLength) {
858     case 2:
859         old=*(uint16_t *)p;
860         *(uint16_t *)p=(uint16_t)b;
861         break;
862     case 3:
863         old=(uint32_t)*p<<16;
864         *p++=(uint8_t)(b>>16);
865         old|=(uint32_t)*p<<8;
866         *p++=(uint8_t)(b>>8);
867         old|=*p;
868         *p=(uint8_t)b;
869         break;
870     case 4:
871         old=*(uint32_t *)p;
872         *(uint32_t *)p=b;
873         break;
874     default:
875         /* will never occur */
876         break;
877     }
878 
879     /* check that this Unicode code point was still unassigned */
880     if((mbcsData->stage2[idx+(nextOffset>>MBCS_STAGE_2_SHIFT)]&(1UL<<(16+(c&0xf))))!=0 || old!=0) {
881         if(flag>=0) {
882             fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
883                 (int)c, printBytes(buffer, bytes, length), (int)old);
884             return FALSE;
885         } else if(VERBOSE) {
886             fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
887                 (int)c, printBytes(buffer, bytes, length), (int)old);
888         }
889         /* continue after the above warning if the precision of the mapping is
890            unspecified */
891     }
892     if(flag<=0) {
893         /* set the roundtrip flag */
894         mbcsData->stage2[idx+(nextOffset>>4)]|=(1UL<<(16+(c&0xf)));
895     }
896 
897     return TRUE;
898 }
899 
900 U_CFUNC UBool
MBCSOkForBaseFromUnicode(const MBCSData * mbcsData,const uint8_t * bytes,int32_t length,UChar32 c,int8_t flag)901 MBCSOkForBaseFromUnicode(const MBCSData *mbcsData,
902                          const uint8_t *bytes, int32_t length,
903                          UChar32 c, int8_t flag) {
904     /*
905      * A 1:1 mapping does not fit into the MBCS base table's fromUnicode table under
906      * the following conditions:
907      *
908      * - a |2 SUB mapping for <subchar1> (no base table data structure for them)
909      * - a |1 fallback to 0x00 (result value 0, indistinguishable from unmappable entry)
910      * - a multi-byte mapping with leading 0x00 bytes (no explicit length field)
911      *
912      * Some of these tests are redundant with ucm_mappingType().
913      */
914     if( (flag==2 && length==1) ||
915         (flag==1 && bytes[0]==0) || /* testing length==1 would be redundant with the next test */
916         (flag<=1 && length>1 && bytes[0]==0)
917     ) {
918         return FALSE;
919     }
920 
921     /*
922      * Additional restrictions for UTF-8-friendly fromUnicode tables,
923      * for code points up to the maximum optimized one:
924      *
925      * - any mapping to 0x00 (result value 0, indistinguishable from unmappable entry)
926      * - any |1 fallback (no roundtrip flags in the optimized table)
927      */
928     if(mbcsData->utf8Friendly && flag<=1 && c<=mbcsData->utf8Max && (bytes[0]==0 || flag==1)) {
929         return FALSE;
930     }
931 
932     /*
933      * If we omit the fromUnicode data, we can only store roundtrips there
934      * because only they are recoverable from the toUnicode data.
935      * Fallbacks must go into the extension table.
936      */
937     if(mbcsData->omitFromU && flag!=0) {
938         return FALSE;
939     }
940 
941     /* All other mappings do fit into the base table. */
942     return TRUE;
943 }
944 
945 U_CDECL_BEGIN
946 /* we can assume that the table only contains 1:1 mappings with <=4 bytes each */
947 static UBool
MBCSAddTable(NewConverter * cnvData,UCMTable * table,UConverterStaticData * staticData)948 MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) {
949     MBCSData *mbcsData;
950     UCMapping *m;
951     UChar32 c;
952     int32_t i, maxCharLength;
953     int8_t f;
954     UBool isOK, utf8Friendly;
955 
956     staticData->unicodeMask=table->unicodeMask;
957     if(staticData->unicodeMask==3) {
958         fprintf(stderr, "error: contains mappings for both supplementary and surrogate code points\n");
959         return FALSE;
960     }
961 
962     staticData->conversionType=UCNV_MBCS;
963 
964     mbcsData=(MBCSData *)cnvData;
965     maxCharLength=mbcsData->ucm->states.maxCharLength;
966 
967     /*
968      * Generation of UTF-8-friendly data requires
969      * a sorted table, which makeconv generates when explicit precision
970      * indicators are used.
971      */
972     mbcsData->utf8Friendly=utf8Friendly=(UBool)((table->flagsType&UCM_FLAGS_EXPLICIT)!=0);
973     if(utf8Friendly) {
974         mbcsData->utf8Max=MBCS_UTF8_MAX;
975         if(SMALL && maxCharLength>1) {
976             mbcsData->omitFromU=TRUE;
977         }
978     } else {
979         mbcsData->utf8Max=0;
980         if(SMALL && maxCharLength>1) {
981             fprintf(stderr,
982                 "makeconv warning: --small not available for .ucm files without |0 etc.\n");
983         }
984     }
985 
986     if(!MBCSStartMappings(mbcsData)) {
987         return FALSE;
988     }
989 
990     staticData->hasFromUnicodeFallback=FALSE;
991     staticData->hasToUnicodeFallback=FALSE;
992 
993     isOK=TRUE;
994 
995     m=table->mappings;
996     for(i=0; i<table->mappingsLength; ++m, ++i) {
997         c=m->u;
998         f=m->f;
999 
1000         /*
1001          * Small optimization for --small .cnv files:
1002          *
1003          * If there are fromUnicode mappings above MBCS_UTF8_MAX,
1004          * then the file size will be smaller if we make utf8Max larger
1005          * because the size increase in stageUTF8 will be more than balanced by
1006          * how much less of stage2 needs to be stored.
1007          *
1008          * There is no point in doing this incrementally because stageUTF8
1009          * uses so much less space per block than stage2,
1010          * so we immediately increase utf8Max to 0xffff.
1011          *
1012          * Do not increase utf8Max if it is already at 0xfeff because MBCSAddFromUnicode()
1013          * sets it to that value when stageUTF8 overflows.
1014          */
1015         if( mbcsData->omitFromU && f<=1 &&
1016             mbcsData->utf8Max<c && c<=0xffff &&
1017             mbcsData->utf8Max<0xfeff
1018         ) {
1019             mbcsData->utf8Max=0xffff;
1020         }
1021 
1022         switch(f) {
1023         case -1:
1024             /* there was no precision/fallback indicator */
1025             /* fall through to set the mappings */
1026             U_FALLTHROUGH;
1027         case 0:
1028             /* set roundtrip mappings */
1029             isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
1030 
1031             if(maxCharLength==1) {
1032                 isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
1033             } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) {
1034                 isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
1035             } else {
1036                 m->f|=MBCS_FROM_U_EXT_FLAG;
1037                 m->moveFlag=UCM_MOVE_TO_EXT;
1038             }
1039             break;
1040         case 1:
1041             /* set only a fallback mapping from Unicode to codepage */
1042             if(maxCharLength==1) {
1043                 staticData->hasFromUnicodeFallback=TRUE;
1044                 isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
1045             } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) {
1046                 staticData->hasFromUnicodeFallback=TRUE;
1047                 isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
1048             } else {
1049                 m->f|=MBCS_FROM_U_EXT_FLAG;
1050                 m->moveFlag=UCM_MOVE_TO_EXT;
1051             }
1052             break;
1053         case 2:
1054             /* ignore |2 SUB mappings, except to move <subchar1> mappings to the extension table */
1055             if(maxCharLength>1 && m->bLen==1) {
1056                 m->f|=MBCS_FROM_U_EXT_FLAG;
1057                 m->moveFlag=UCM_MOVE_TO_EXT;
1058             }
1059             break;
1060         case 3:
1061             /* set only a fallback mapping from codepage to Unicode */
1062             staticData->hasToUnicodeFallback=TRUE;
1063             isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
1064             break;
1065         case 4:
1066             /* move "good one-way" mappings to the extension table */
1067             m->f|=MBCS_FROM_U_EXT_FLAG;
1068             m->moveFlag=UCM_MOVE_TO_EXT;
1069             break;
1070         default:
1071             /* will not occur because the parser checked it already */
1072             fprintf(stderr, "error: illegal fallback indicator %d\n", f);
1073             return FALSE;
1074         }
1075     }
1076 
1077     MBCSPostprocess(mbcsData, staticData);
1078 
1079     return isOK;
1080 }
1081 U_CDECL_END
1082 static UBool
transformEUC(MBCSData * mbcsData)1083 transformEUC(MBCSData *mbcsData) {
1084     uint8_t *p8;
1085     uint32_t i, value, oldLength, old3Top;
1086     uint8_t b;
1087 
1088     oldLength=mbcsData->ucm->states.maxCharLength;
1089     if(oldLength<3) {
1090         return FALSE;
1091     }
1092 
1093     old3Top=mbcsData->stage3Top;
1094 
1095     /* careful: 2-byte and 4-byte codes are stored in platform endianness! */
1096 
1097     /* test if all first bytes are in {0, 0x8e, 0x8f} */
1098     p8=mbcsData->fromUBytes;
1099 
1100 #if !U_IS_BIG_ENDIAN
1101     if(oldLength==4) {
1102         p8+=3;
1103     }
1104 #endif
1105 
1106     for(i=0; i<old3Top; i+=oldLength) {
1107         b=p8[i];
1108         if(b!=0 && b!=0x8e && b!=0x8f) {
1109             /* some first byte does not fit the EUC pattern, nothing to be done */
1110             return FALSE;
1111         }
1112     }
1113     /* restore p if it was modified above */
1114     p8=mbcsData->fromUBytes;
1115 
1116     /* modify outputType and adjust stage3Top */
1117     mbcsData->ucm->states.outputType=(int8_t)(MBCS_OUTPUT_3_EUC+oldLength-3);
1118     mbcsData->stage3Top=(old3Top*(oldLength-1))/oldLength;
1119 
1120     /*
1121      * EUC-encode all byte sequences;
1122      * see "CJKV Information Processing" (1st ed. 1999) from Ken Lunde, O'Reilly,
1123      * p. 161 in chapter 4 "Encoding Methods"
1124      *
1125      * This also must reverse the byte order if the platform is little-endian!
1126      */
1127     if(oldLength==3) {
1128         uint16_t *q=(uint16_t *)p8;
1129         for(i=0; i<old3Top; i+=oldLength) {
1130             b=*p8;
1131             if(b==0) {
1132                 /* short sequences are stored directly */
1133                 /* code set 0 or 1 */
1134                 (*q++)=(uint16_t)((p8[1]<<8)|p8[2]);
1135             } else if(b==0x8e) {
1136                 /* code set 2 */
1137                 (*q++)=(uint16_t)(((p8[1]&0x7f)<<8)|p8[2]);
1138             } else /* b==0x8f */ {
1139                 /* code set 3 */
1140                 (*q++)=(uint16_t)((p8[1]<<8)|(p8[2]&0x7f));
1141             }
1142             p8+=3;
1143         }
1144     } else /* oldLength==4 */ {
1145         uint8_t *q=p8;
1146         uint32_t *p32=(uint32_t *)p8;
1147         for(i=0; i<old3Top; i+=4) {
1148             value=(*p32++);
1149             if(value<=0xffffff) {
1150                 /* short sequences are stored directly */
1151                 /* code set 0 or 1 */
1152                 (*q++)=(uint8_t)(value>>16);
1153                 (*q++)=(uint8_t)(value>>8);
1154                 (*q++)=(uint8_t)value;
1155             } else if(value<=0x8effffff) {
1156                 /* code set 2 */
1157                 (*q++)=(uint8_t)((value>>16)&0x7f);
1158                 (*q++)=(uint8_t)(value>>8);
1159                 (*q++)=(uint8_t)value;
1160             } else /* first byte is 0x8f */ {
1161                 /* code set 3 */
1162                 (*q++)=(uint8_t)(value>>16);
1163                 (*q++)=(uint8_t)((value>>8)&0x7f);
1164                 (*q++)=(uint8_t)value;
1165             }
1166         }
1167     }
1168 
1169     return TRUE;
1170 }
1171 
1172 /*
1173  * Compact stage 2 for SBCS by overlapping adjacent stage 2 blocks as far
1174  * as possible. Overlapping is done on unassigned head and tail
1175  * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
1176  * Stage 1 indexes need to be adjusted accordingly.
1177  * This function is very similar to genprops/store.c/compactStage().
1178  */
1179 static void
singleCompactStage2(MBCSData * mbcsData)1180 singleCompactStage2(MBCSData *mbcsData) {
1181     /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
1182     uint16_t map[MBCS_STAGE_2_MAX_BLOCKS];
1183     uint16_t i, start, prevEnd, newStart;
1184 
1185     /* enter the all-unassigned first stage 2 block into the map */
1186     map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
1187 
1188     /* begin with the first block after the all-unassigned one */
1189     start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED;
1190     while(start<mbcsData->stage2Top) {
1191         prevEnd=(uint16_t)(newStart-1);
1192 
1193         /* find the size of the overlap */
1194         for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2Single[start+i]==0 && mbcsData->stage2Single[prevEnd-i]==0; ++i) {}
1195 
1196         if(i>0) {
1197             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i);
1198 
1199             /* move the non-overlapping indexes to their new positions */
1200             start+=i;
1201             for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) {
1202                 mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++];
1203             }
1204         } else if(newStart<start) {
1205             /* move the indexes to their new positions */
1206             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart;
1207             for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) {
1208                 mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++];
1209             }
1210         } else /* no overlap && newStart==start */ {
1211             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start;
1212             start=newStart+=MBCS_STAGE_2_BLOCK_SIZE;
1213         }
1214     }
1215 
1216     /* adjust stage2Top */
1217     if(VERBOSE && newStart<mbcsData->stage2Top) {
1218         printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
1219                 (unsigned long)mbcsData->stage2Top, (unsigned long)newStart,
1220                 (long)(mbcsData->stage2Top-newStart)*2);
1221     }
1222     mbcsData->stage2Top=newStart;
1223 
1224     /* now adjust stage 1 */
1225     for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
1226         mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT];
1227     }
1228 }
1229 
1230 /* Compact stage 3 for SBCS - same algorithm as above. */
1231 static void
singleCompactStage3(MBCSData * mbcsData)1232 singleCompactStage3(MBCSData *mbcsData) {
1233     uint16_t *stage3=(uint16_t *)mbcsData->fromUBytes;
1234 
1235     /* this array maps the ordinal number of a stage 3 block to its new stage 2 index */
1236     uint16_t map[0x1000];
1237     uint16_t i, start, prevEnd, newStart;
1238 
1239     /* enter the all-unassigned first stage 3 block into the map */
1240     map[0]=0;
1241 
1242     /* begin with the first block after the all-unassigned one */
1243     start=newStart=16;
1244     while(start<mbcsData->stage3Top) {
1245         prevEnd=(uint16_t)(newStart-1);
1246 
1247         /* find the size of the overlap */
1248         for(i=0; i<16 && stage3[start+i]==0 && stage3[prevEnd-i]==0; ++i) {}
1249 
1250         if(i>0) {
1251             map[start>>4]=(uint16_t)(newStart-i);
1252 
1253             /* move the non-overlapping indexes to their new positions */
1254             start+=i;
1255             for(i=(uint16_t)(16-i); i>0; --i) {
1256                 stage3[newStart++]=stage3[start++];
1257             }
1258         } else if(newStart<start) {
1259             /* move the indexes to their new positions */
1260             map[start>>4]=newStart;
1261             for(i=16; i>0; --i) {
1262                 stage3[newStart++]=stage3[start++];
1263             }
1264         } else /* no overlap && newStart==start */ {
1265             map[start>>4]=start;
1266             start=newStart+=16;
1267         }
1268     }
1269 
1270     /* adjust stage3Top */
1271     if(VERBOSE && newStart<mbcsData->stage3Top) {
1272         printf("compacting stage 3 from stage3Top=0x%lx to 0x%lx, saving %ld bytes\n",
1273                 (unsigned long)mbcsData->stage3Top, (unsigned long)newStart,
1274                 (long)(mbcsData->stage3Top-newStart)*2);
1275     }
1276     mbcsData->stage3Top=newStart;
1277 
1278     /* now adjust stage 2 */
1279     for(i=0; i<mbcsData->stage2Top; ++i) {
1280         mbcsData->stage2Single[i]=map[mbcsData->stage2Single[i]>>4];
1281     }
1282 }
1283 
1284 /*
1285  * Compact stage 2 by overlapping adjacent stage 2 blocks as far
1286  * as possible. Overlapping is done on unassigned head and tail
1287  * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
1288  * Stage 1 indexes need to be adjusted accordingly.
1289  * This function is very similar to genprops/store.c/compactStage().
1290  */
1291 static void
compactStage2(MBCSData * mbcsData)1292 compactStage2(MBCSData *mbcsData) {
1293     /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
1294     uint16_t map[MBCS_STAGE_2_MAX_BLOCKS];
1295     uint16_t i, start, prevEnd, newStart;
1296 
1297     /* enter the all-unassigned first stage 2 block into the map */
1298     map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
1299 
1300     /* begin with the first block after the all-unassigned one */
1301     start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED;
1302     while(start<mbcsData->stage2Top) {
1303         prevEnd=(uint16_t)(newStart-1);
1304 
1305         /* find the size of the overlap */
1306         for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2[start+i]==0 && mbcsData->stage2[prevEnd-i]==0; ++i) {}
1307 
1308         if(i>0) {
1309             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i);
1310 
1311             /* move the non-overlapping indexes to their new positions */
1312             start+=i;
1313             for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) {
1314                 mbcsData->stage2[newStart++]=mbcsData->stage2[start++];
1315             }
1316         } else if(newStart<start) {
1317             /* move the indexes to their new positions */
1318             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart;
1319             for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) {
1320                 mbcsData->stage2[newStart++]=mbcsData->stage2[start++];
1321             }
1322         } else /* no overlap && newStart==start */ {
1323             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start;
1324             start=newStart+=MBCS_STAGE_2_BLOCK_SIZE;
1325         }
1326     }
1327 
1328     /* adjust stage2Top */
1329     if(VERBOSE && newStart<mbcsData->stage2Top) {
1330         printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
1331                 (unsigned long)mbcsData->stage2Top, (unsigned long)newStart,
1332                 (long)(mbcsData->stage2Top-newStart)*4);
1333     }
1334     mbcsData->stage2Top=newStart;
1335 
1336     /* now adjust stage 1 */
1337     for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
1338         mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT];
1339     }
1340 }
1341 
1342 static void
MBCSPostprocess(MBCSData * mbcsData,const UConverterStaticData *)1343 MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData * /*staticData*/) {
1344     UCMStates *states;
1345     int32_t maxCharLength, stage3Width;
1346 
1347     states=&mbcsData->ucm->states;
1348     stage3Width=maxCharLength=states->maxCharLength;
1349 
1350     ucm_optimizeStates(states,
1351                        &mbcsData->unicodeCodeUnits,
1352                        mbcsData->toUFallbacks, mbcsData->countToUFallbacks,
1353                        VERBOSE);
1354 
1355     /* try to compact the fromUnicode tables */
1356     if(transformEUC(mbcsData)) {
1357         --stage3Width;
1358     }
1359 
1360     /*
1361      * UTF-8-friendly tries are built precompacted, to cope with variable
1362      * stage 3 allocation block sizes.
1363      *
1364      * Tables without precision indicators cannot be built that way,
1365      * because if a block was overlapped with a previous one, then a smaller
1366      * code point for the same block would not fit.
1367      * Therefore, such tables are not marked UTF-8-friendly and must be
1368      * compacted after all mappings are entered.
1369      */
1370     if(!mbcsData->utf8Friendly) {
1371         if(maxCharLength==1) {
1372             singleCompactStage3(mbcsData);
1373             singleCompactStage2(mbcsData);
1374         } else {
1375             compactStage2(mbcsData);
1376         }
1377     }
1378 
1379     if(VERBOSE) {
1380         /*uint32_t c, i1, i2, i2Limit, i3;*/
1381 
1382         printf("fromUnicode number of uint%s_t in stage 2: 0x%lx=%lu\n",
1383                maxCharLength==1 ? "16" : "32",
1384                (unsigned long)mbcsData->stage2Top,
1385                (unsigned long)mbcsData->stage2Top);
1386         printf("fromUnicode number of %d-byte stage 3 mapping entries: 0x%lx=%lu\n",
1387                (int)stage3Width,
1388                (unsigned long)mbcsData->stage3Top/stage3Width,
1389                (unsigned long)mbcsData->stage3Top/stage3Width);
1390 #if 0
1391         c=0;
1392         for(i1=0; i1<MBCS_STAGE_1_SIZE; ++i1) {
1393             i2=mbcsData->stage1[i1];
1394             if(i2==0) {
1395                 c+=MBCS_STAGE_2_BLOCK_SIZE*MBCS_STAGE_3_BLOCK_SIZE;
1396                 continue;
1397             }
1398             for(i2Limit=i2+MBCS_STAGE_2_BLOCK_SIZE; i2<i2Limit; ++i2) {
1399                 if(maxCharLength==1) {
1400                     i3=mbcsData->stage2Single[i2];
1401                 } else {
1402                     i3=(uint16_t)mbcsData->stage2[i2];
1403                 }
1404                 if(i3==0) {
1405                     c+=MBCS_STAGE_3_BLOCK_SIZE;
1406                     continue;
1407                 }
1408                 printf("U+%04lx i1=0x%02lx i2=0x%04lx i3=0x%04lx\n",
1409                        (unsigned long)c,
1410                        (unsigned long)i1,
1411                        (unsigned long)i2,
1412                        (unsigned long)i3);
1413                 c+=MBCS_STAGE_3_BLOCK_SIZE;
1414             }
1415         }
1416 #endif
1417     }
1418 }
1419 
1420 U_CDECL_BEGIN
1421 static uint32_t
MBCSWrite(NewConverter * cnvData,const UConverterStaticData * staticData,UNewDataMemory * pData,int32_t tableType)1422 MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
1423           UNewDataMemory *pData, int32_t tableType) {
1424     MBCSData *mbcsData=(MBCSData *)cnvData;
1425     uint32_t stage2Start, stage2Length;
1426     uint32_t top, stageUTF8Length=0;
1427     int32_t i, stage1Top;
1428     uint32_t headerLength;
1429 
1430     _MBCSHeader header=UCNV_MBCS_HEADER_INITIALIZER;
1431 
1432     stage2Length=mbcsData->stage2Top;
1433     if(mbcsData->omitFromU) {
1434         /* find how much of stage2 can be omitted */
1435         int32_t utf8Limit=(int32_t)mbcsData->utf8Max+1;
1436         uint32_t st2=0; /*initialized it to avoid compiler warnings */
1437 
1438         i=utf8Limit>>MBCS_STAGE_1_SHIFT;
1439         if((utf8Limit&((1<<MBCS_STAGE_1_SHIFT)-1))!=0 && (st2=mbcsData->stage1[i])!=0) {
1440             /* utf8Limit is in the middle of an existing stage 2 block */
1441             stage2Start=st2+((utf8Limit>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK);
1442         } else {
1443             /* find the last stage2 block with mappings before utf8Limit */
1444             while(i>0 && (st2=mbcsData->stage1[--i])==0) {}
1445             /* stage2 up to the end of this block corresponds to stageUTF8 */
1446             stage2Start=st2+MBCS_STAGE_2_BLOCK_SIZE;
1447         }
1448         header.options|=MBCS_OPT_NO_FROM_U;
1449         header.fullStage2Length=stage2Length;
1450         stage2Length-=stage2Start;
1451         if(VERBOSE) {
1452             printf("+ omitting %lu out of %lu stage2 entries and %lu fromUBytes\n",
1453                     (unsigned long)stage2Start,
1454                     (unsigned long)mbcsData->stage2Top,
1455                     (unsigned long)mbcsData->stage3Top);
1456             printf("+ total size savings: %lu bytes\n", (unsigned long)stage2Start*4+mbcsData->stage3Top);
1457         }
1458     } else {
1459         stage2Start=0;
1460     }
1461 
1462     if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
1463         stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
1464     } else {
1465         stage1Top=0x40; /* 0x40==64 */
1466     }
1467 
1468     /* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */
1469     if(mbcsData->ucm->states.maxCharLength==1) {
1470         for(i=0; i<stage1Top; ++i) {
1471             mbcsData->stage1[i]+=(uint16_t)stage1Top;
1472         }
1473 
1474         /* stage2Top/Length have counted 16-bit results, now we need to count bytes */
1475         /* also round up to a multiple of 4 bytes */
1476         stage2Length=(stage2Length*2+1)&~1;
1477 
1478         /* stage3Top has counted 16-bit results, now we need to count bytes */
1479         mbcsData->stage3Top*=2;
1480 
1481         if(mbcsData->utf8Friendly) {
1482             header.version[2]=(uint8_t)(SBCS_UTF8_MAX>>8); /* store 0x1f for max==0x1fff */
1483         }
1484     } else {
1485         for(i=0; i<stage1Top; ++i) {
1486             mbcsData->stage1[i]+=(uint16_t)stage1Top/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */
1487         }
1488 
1489         /* stage2Top/Length have counted 32-bit results, now we need to count bytes */
1490         stage2Length*=4;
1491         /* leave stage2Start counting 32-bit units */
1492 
1493         if(mbcsData->utf8Friendly) {
1494             stageUTF8Length=(mbcsData->utf8Max+1)>>MBCS_UTF8_STAGE_SHIFT;
1495             header.version[2]=(uint8_t)(mbcsData->utf8Max>>8); /* store 0xd7 for max==0xd7ff */
1496         }
1497 
1498         /* stage3Top has already counted bytes */
1499     }
1500 
1501     /* round up stage3Top so that the sizes of all data blocks are multiples of 4 */
1502     mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3;
1503 
1504     /* fill the header */
1505     if(header.options&MBCS_OPT_INCOMPATIBLE_MASK) {
1506         header.version[0]=5;
1507         if(header.options&MBCS_OPT_NO_FROM_U) {
1508             headerLength=10;  /* include fullStage2Length */
1509         } else {
1510             headerLength=MBCS_HEADER_V5_MIN_LENGTH;  /* 9 */
1511         }
1512     } else {
1513         header.version[0]=4;
1514         headerLength=MBCS_HEADER_V4_LENGTH;  /* 8 */
1515     }
1516     header.version[1]=4;
1517     /* header.version[2] set above for utf8Friendly data */
1518 
1519     header.options|=(uint32_t)headerLength;
1520 
1521     header.countStates=mbcsData->ucm->states.countStates;
1522     header.countToUFallbacks=mbcsData->countToUFallbacks;
1523 
1524     header.offsetToUCodeUnits=
1525         headerLength*4+
1526         mbcsData->ucm->states.countStates*1024+
1527         mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback);
1528     header.offsetFromUTable=
1529         header.offsetToUCodeUnits+
1530         mbcsData->ucm->states.countToUCodeUnits*2;
1531     header.offsetFromUBytes=
1532         header.offsetFromUTable+
1533         stage1Top*2+
1534         stage2Length;
1535     header.fromUBytesLength=mbcsData->stage3Top;
1536 
1537     top=header.offsetFromUBytes+stageUTF8Length*2;
1538     if(!(header.options&MBCS_OPT_NO_FROM_U)) {
1539         top+=header.fromUBytesLength;
1540     }
1541 
1542     header.flags=(uint8_t)(mbcsData->ucm->states.outputType);
1543 
1544     if(tableType&TABLE_EXT) {
1545         if(top>0xffffff) {
1546             fprintf(stderr, "error: offset 0x%lx to extension table exceeds 0xffffff\n", (long)top);
1547             return 0;
1548         }
1549 
1550         header.flags|=top<<8;
1551     }
1552 
1553     /* write the MBCS data */
1554     udata_writeBlock(pData, &header, headerLength*4);
1555     udata_writeBlock(pData, mbcsData->ucm->states.stateTable, header.countStates*1024);
1556     udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback));
1557     udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->ucm->states.countToUCodeUnits*2);
1558     udata_writeBlock(pData, mbcsData->stage1, stage1Top*2);
1559     if(mbcsData->ucm->states.maxCharLength==1) {
1560         udata_writeBlock(pData, mbcsData->stage2Single+stage2Start, stage2Length);
1561     } else {
1562         udata_writeBlock(pData, mbcsData->stage2+stage2Start, stage2Length);
1563     }
1564     if(!(header.options&MBCS_OPT_NO_FROM_U)) {
1565         udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
1566     }
1567 
1568     if(stageUTF8Length>0) {
1569         udata_writeBlock(pData, mbcsData->stageUTF8, stageUTF8Length*2);
1570     }
1571 
1572     /* return the number of bytes that should have been written */
1573     return top;
1574 }
1575 U_CDECL_END
1576