1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2003-2013, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  ucm.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2003jun20
14 *   created by: Markus W. Scherer
15 *
16 *   This file reads a .ucm file, stores its mappings and sorts them.
17 *   It implements handling of Unicode conversion mappings from .ucm files
18 *   for makeconv, canonucm, rptp2ucm, etc.
19 *
20 *   Unicode code point sequences with a length of more than 1,
21 *   as well as byte sequences with more than 4 bytes or more than one complete
22 *   character sequence are handled to support m:n mappings.
23 */
24 
25 #include "unicode/utypes.h"
26 #include "unicode/ustring.h"
27 #include "cstring.h"
28 #include "cmemory.h"
29 #include "filestrm.h"
30 #include "uarrsort.h"
31 #include "ucnvmbcs.h"
32 #include "ucnv_bld.h"
33 #include "ucnv_ext.h"
34 #include "uparse.h"
35 #include "ucm.h"
36 #include <stdio.h>
37 
38 #if !UCONFIG_NO_CONVERSION
39 
40 /* -------------------------------------------------------------------------- */
41 
42 static void
printMapping(UCMapping * m,UChar32 * codePoints,uint8_t * bytes,FILE * f)43 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
44     int32_t j;
45 
46     for(j=0; j<m->uLen; ++j) {
47         fprintf(f, "<U%04lX>", (long)codePoints[j]);
48     }
49 
50     fputc(' ', f);
51 
52     for(j=0; j<m->bLen; ++j) {
53         fprintf(f, "\\x%02X", bytes[j]);
54     }
55 
56     if(m->f>=0) {
57         fprintf(f, " |%u\n", m->f);
58     } else {
59         fputs("\n", f);
60     }
61 }
62 
63 U_CAPI void U_EXPORT2
ucm_printMapping(UCMTable * table,UCMapping * m,FILE * f)64 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
65     printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
66 }
67 
68 U_CAPI void U_EXPORT2
ucm_printTable(UCMTable * table,FILE * f,UBool byUnicode)69 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
70     UCMapping *m;
71     int32_t i, length;
72 
73     m=table->mappings;
74     length=table->mappingsLength;
75     if(byUnicode) {
76         for(i=0; i<length; ++m, ++i) {
77             ucm_printMapping(table, m, f);
78         }
79     } else {
80         const int32_t *map=table->reverseMap;
81         for(i=0; i<length; ++i) {
82             ucm_printMapping(table, m+map[i], f);
83         }
84     }
85 }
86 
87 /* mapping comparisons ------------------------------------------------------ */
88 
89 static int32_t
compareUnicode(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r)90 compareUnicode(UCMTable *lTable, const UCMapping *l,
91                UCMTable *rTable, const UCMapping *r) {
92     const UChar32 *lu, *ru;
93     int32_t result, i, length;
94 
95     if(l->uLen==1 && r->uLen==1) {
96         /* compare two single code points */
97         return l->u-r->u;
98     }
99 
100     /* get pointers to the code point sequences */
101     lu=UCM_GET_CODE_POINTS(lTable, l);
102     ru=UCM_GET_CODE_POINTS(rTable, r);
103 
104     /* get the minimum length */
105     if(l->uLen<=r->uLen) {
106         length=l->uLen;
107     } else {
108         length=r->uLen;
109     }
110 
111     /* compare the code points */
112     for(i=0; i<length; ++i) {
113         result=lu[i]-ru[i];
114         if(result!=0) {
115             return result;
116         }
117     }
118 
119     /* compare the lengths */
120     return l->uLen-r->uLen;
121 }
122 
123 static int32_t
compareBytes(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r,UBool lexical)124 compareBytes(UCMTable *lTable, const UCMapping *l,
125              UCMTable *rTable, const UCMapping *r,
126              UBool lexical) {
127     const uint8_t *lb, *rb;
128     int32_t result, i, length;
129 
130     /*
131      * A lexical comparison is used for sorting in the builder, to allow
132      * an efficient search for a byte sequence that could be a prefix
133      * of a previously entered byte sequence.
134      *
135      * Comparing by lengths first is for compatibility with old .ucm tools
136      * like canonucm and rptp2ucm.
137      */
138     if(lexical) {
139         /* get the minimum length and continue */
140         if(l->bLen<=r->bLen) {
141             length=l->bLen;
142         } else {
143             length=r->bLen;
144         }
145     } else {
146         /* compare lengths first */
147         result=l->bLen-r->bLen;
148         if(result!=0) {
149             return result;
150         } else {
151             length=l->bLen;
152         }
153     }
154 
155     /* get pointers to the byte sequences */
156     lb=UCM_GET_BYTES(lTable, l);
157     rb=UCM_GET_BYTES(rTable, r);
158 
159     /* compare the bytes */
160     for(i=0; i<length; ++i) {
161         result=lb[i]-rb[i];
162         if(result!=0) {
163             return result;
164         }
165     }
166 
167     /* compare the lengths */
168     return l->bLen-r->bLen;
169 }
170 
171 /* compare UCMappings for sorting */
172 static int32_t
compareMappings(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r,UBool uFirst)173 compareMappings(UCMTable *lTable, const UCMapping *l,
174                 UCMTable *rTable, const UCMapping *r,
175                 UBool uFirst) {
176     int32_t result;
177 
178     /* choose which side to compare first */
179     if(uFirst) {
180         /* Unicode then bytes */
181         result=compareUnicode(lTable, l, rTable, r);
182         if(result==0) {
183             result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
184         }
185     } else {
186         /* bytes then Unicode */
187         result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
188         if(result==0) {
189             result=compareUnicode(lTable, l, rTable, r);
190         }
191     }
192 
193     if(result!=0) {
194         return result;
195     }
196 
197     /* compare the flags */
198     return l->f-r->f;
199 }
200 
201 /* sorting by Unicode first sorts mappings directly */
202 static int32_t
compareMappingsUnicodeFirst(const void * context,const void * left,const void * right)203 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
204     return compareMappings(
205         (UCMTable *)context, (const UCMapping *)left,
206         (UCMTable *)context, (const UCMapping *)right, TRUE);
207 }
208 
209 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
210 static int32_t
compareMappingsBytesFirst(const void * context,const void * left,const void * right)211 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
212     UCMTable *table=(UCMTable *)context;
213     int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
214     return compareMappings(
215         table, table->mappings+l,
216         table, table->mappings+r, FALSE);
217 }
218 
219 U_CAPI void U_EXPORT2
ucm_sortTable(UCMTable * t)220 ucm_sortTable(UCMTable *t) {
221     UErrorCode errorCode;
222     int32_t i;
223 
224     if(t->isSorted) {
225         return;
226     }
227 
228     errorCode=U_ZERO_ERROR;
229 
230     /* 1. sort by Unicode first */
231     uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
232                    compareMappingsUnicodeFirst, t,
233                    FALSE, &errorCode);
234 
235     /* build the reverseMap */
236     if(t->reverseMap==NULL) {
237         /*
238          * allocate mappingsCapacity instead of mappingsLength so that
239          * if mappings are added, the reverseMap need not be
240          * reallocated each time
241          * (see ucm_moveMappings() and ucm_addMapping())
242          */
243         t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
244         if(t->reverseMap==NULL) {
245             fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
246             exit(U_MEMORY_ALLOCATION_ERROR);
247         }
248     }
249     for(i=0; i<t->mappingsLength; ++i) {
250         t->reverseMap[i]=i;
251     }
252 
253     /* 2. sort reverseMap by mappings bytes first */
254     uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
255                    compareMappingsBytesFirst, t,
256                    FALSE, &errorCode);
257 
258     if(U_FAILURE(errorCode)) {
259         fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
260                 u_errorName(errorCode));
261         exit(errorCode);
262     }
263 
264     t->isSorted=TRUE;
265 }
266 
267 /*
268  * remove mappings with their move flag set from the base table
269  * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
270  */
271 U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable * base,UCMTable * ext)272 ucm_moveMappings(UCMTable *base, UCMTable *ext) {
273     UCMapping *mb, *mbLimit;
274     int8_t flag;
275 
276     mb=base->mappings;
277     mbLimit=mb+base->mappingsLength;
278 
279     while(mb<mbLimit) {
280         flag=mb->moveFlag;
281         if(flag!=0) {
282             /* reset the move flag */
283             mb->moveFlag=0;
284 
285             if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
286                 /* add the mapping to the extension table */
287                 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
288             }
289 
290             /* remove this mapping: move the last base mapping down and overwrite the current one */
291             if(mb<(mbLimit-1)) {
292                 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
293             }
294             --mbLimit;
295             --base->mappingsLength;
296             base->isSorted=FALSE;
297         } else {
298             ++mb;
299         }
300     }
301 }
302 
/* result bits of the base/extension checks; they are combined with | */
enum {
    /* at least one mapping was flagged to be moved or removed */
    NEEDS_MOVE=0x01,
    /* at least one inconsistency was reported */
    HAS_ERRORS=0x02
};
307 
308 static uint8_t
checkBaseExtUnicode(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UBool moveToExt,UBool intersectBase)309 checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
310                     UBool moveToExt, UBool intersectBase) {
311     UCMapping *mb, *me, *mbLimit, *meLimit;
312     int32_t cmp;
313     uint8_t result;
314 
315     mb=base->mappings;
316     mbLimit=mb+base->mappingsLength;
317 
318     me=ext->mappings;
319     meLimit=me+ext->mappingsLength;
320 
321     result=0;
322 
323     for(;;) {
324         /* skip irrelevant mappings on both sides */
325         for(;;) {
326             if(mb==mbLimit) {
327                 return result;
328             }
329 
330             if((0<=mb->f && mb->f<=2) || mb->f==4) {
331                 break;
332             }
333 
334             ++mb;
335         }
336 
337         for(;;) {
338             if(me==meLimit) {
339                 return result;
340             }
341 
342             if((0<=me->f && me->f<=2) || me->f==4) {
343                 break;
344             }
345 
346             ++me;
347         }
348 
349         /* compare the base and extension mappings */
350         cmp=compareUnicode(base, mb, ext, me);
351         if(cmp<0) {
352             if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
353                 /*
354                  * mapping in base but not in ext, move it
355                  *
356                  * if ext is DBCS, move DBCS mappings here
357                  * and check SBCS ones for Unicode prefix below
358                  */
359                 mb->moveFlag|=UCM_MOVE_TO_EXT;
360                 result|=NEEDS_MOVE;
361 
362             /* does mb map from an input sequence that is a prefix of me's? */
363             } else if( mb->uLen<me->uLen &&
364                 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
365             ) {
366                 if(moveToExt) {
367                     /* mark this mapping to be moved to the extension table */
368                     mb->moveFlag|=UCM_MOVE_TO_EXT;
369                     result|=NEEDS_MOVE;
370                 } else {
371                     fprintf(stderr,
372                             "ucm error: the base table contains a mapping whose input sequence\n"
373                             "           is a prefix of the input sequence of an extension mapping\n");
374                     ucm_printMapping(base, mb, stderr);
375                     ucm_printMapping(ext, me, stderr);
376                     result|=HAS_ERRORS;
377                 }
378             }
379 
380             ++mb;
381         } else if(cmp==0) {
382             /*
383              * same output: remove the extension mapping,
384              * otherwise treat as an error
385              */
386             if( mb->f==me->f && mb->bLen==me->bLen &&
387                 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
388             ) {
389                 me->moveFlag|=UCM_REMOVE_MAPPING;
390                 result|=NEEDS_MOVE;
391             } else if(intersectBase) {
392                 /* mapping in base but not in ext, move it */
393                 mb->moveFlag|=UCM_MOVE_TO_EXT;
394                 result|=NEEDS_MOVE;
395             } else {
396                 fprintf(stderr,
397                         "ucm error: the base table contains a mapping whose input sequence\n"
398                         "           is the same as the input sequence of an extension mapping\n"
399                         "           but it maps differently\n");
400                 ucm_printMapping(base, mb, stderr);
401                 ucm_printMapping(ext, me, stderr);
402                 result|=HAS_ERRORS;
403             }
404 
405             ++mb;
406         } else /* cmp>0 */ {
407             ++me;
408         }
409     }
410 }
411 
412 static uint8_t
checkBaseExtBytes(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UBool moveToExt,UBool intersectBase)413 checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
414                   UBool moveToExt, UBool intersectBase) {
415     UCMapping *mb, *me;
416     int32_t *baseMap, *extMap;
417     int32_t b, e, bLimit, eLimit, cmp;
418     uint8_t result;
419     UBool isSISO;
420 
421     baseMap=base->reverseMap;
422     extMap=ext->reverseMap;
423 
424     b=e=0;
425     bLimit=base->mappingsLength;
426     eLimit=ext->mappingsLength;
427 
428     result=0;
429 
430     isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
431 
432     for(;;) {
433         /* skip irrelevant mappings on both sides */
434         for(;; ++b) {
435             if(b==bLimit) {
436                 return result;
437             }
438             mb=base->mappings+baseMap[b];
439 
440             if(intersectBase==2 && mb->bLen==1) {
441                 /*
442                  * comparing a base against a DBCS extension:
443                  * leave SBCS base mappings alone
444                  */
445                 continue;
446             }
447 
448             if(mb->f==0 || mb->f==3) {
449                 break;
450             }
451         }
452 
453         for(;;) {
454             if(e==eLimit) {
455                 return result;
456             }
457             me=ext->mappings+extMap[e];
458 
459             if(me->f==0 || me->f==3) {
460                 break;
461             }
462 
463             ++e;
464         }
465 
466         /* compare the base and extension mappings */
467         cmp=compareBytes(base, mb, ext, me, TRUE);
468         if(cmp<0) {
469             if(intersectBase) {
470                 /* mapping in base but not in ext, move it */
471                 mb->moveFlag|=UCM_MOVE_TO_EXT;
472                 result|=NEEDS_MOVE;
473 
474             /*
475              * does mb map from an input sequence that is a prefix of me's?
476              * for SI/SO tables, a single byte is never a prefix because it
477              * occurs in a separate single-byte state
478              */
479             } else if( mb->bLen<me->bLen &&
480                 (!isSISO || mb->bLen>1) &&
481                 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
482             ) {
483                 if(moveToExt) {
484                     /* mark this mapping to be moved to the extension table */
485                     mb->moveFlag|=UCM_MOVE_TO_EXT;
486                     result|=NEEDS_MOVE;
487                 } else {
488                     fprintf(stderr,
489                             "ucm error: the base table contains a mapping whose input sequence\n"
490                             "           is a prefix of the input sequence of an extension mapping\n");
491                     ucm_printMapping(base, mb, stderr);
492                     ucm_printMapping(ext, me, stderr);
493                     result|=HAS_ERRORS;
494                 }
495             }
496 
497             ++b;
498         } else if(cmp==0) {
499             /*
500              * same output: remove the extension mapping,
501              * otherwise treat as an error
502              */
503             if( mb->f==me->f && mb->uLen==me->uLen &&
504                 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
505             ) {
506                 me->moveFlag|=UCM_REMOVE_MAPPING;
507                 result|=NEEDS_MOVE;
508             } else if(intersectBase) {
509                 /* mapping in base but not in ext, move it */
510                 mb->moveFlag|=UCM_MOVE_TO_EXT;
511                 result|=NEEDS_MOVE;
512             } else {
513                 fprintf(stderr,
514                         "ucm error: the base table contains a mapping whose input sequence\n"
515                         "           is the same as the input sequence of an extension mapping\n"
516                         "           but it maps differently\n");
517                 ucm_printMapping(base, mb, stderr);
518                 ucm_printMapping(ext, me, stderr);
519                 result|=HAS_ERRORS;
520             }
521 
522             ++b;
523         } else /* cmp>0 */ {
524             ++e;
525         }
526     }
527 }
528 
529 U_CAPI UBool U_EXPORT2
ucm_checkValidity(UCMTable * table,UCMStates * baseStates)530 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
531     UCMapping *m, *mLimit;
532     int32_t count;
533     UBool isOK;
534 
535     m=table->mappings;
536     mLimit=m+table->mappingsLength;
537     isOK=TRUE;
538 
539     while(m<mLimit) {
540         count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
541         if(count<1) {
542             ucm_printMapping(table, m, stderr);
543             isOK=FALSE;
544         }
545         ++m;
546     }
547 
548     return isOK;
549 }
550 
551 U_CAPI UBool U_EXPORT2
ucm_checkBaseExt(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UCMTable * moveTarget,UBool intersectBase)552 ucm_checkBaseExt(UCMStates *baseStates,
553                  UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
554                  UBool intersectBase) {
555     uint8_t result;
556 
557     /* if we have an extension table, we must always use precision flags */
558     if(base->flagsType&UCM_FLAGS_IMPLICIT) {
559         fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
560         return FALSE;
561     }
562     if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
563         fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
564         return FALSE;
565     }
566 
567     /* checking requires both tables to be sorted */
568     ucm_sortTable(base);
569     ucm_sortTable(ext);
570 
571     /* check */
572     result=
573         checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
574         checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
575 
576     if(result&HAS_ERRORS) {
577         return FALSE;
578     }
579 
580     if(result&NEEDS_MOVE) {
581         ucm_moveMappings(ext, NULL);
582         ucm_moveMappings(base, moveTarget);
583         ucm_sortTable(base);
584         ucm_sortTable(ext);
585         if(moveTarget!=NULL) {
586             ucm_sortTable(moveTarget);
587         }
588     }
589 
590     return TRUE;
591 }
592 
593 /* merge tables for rptp2ucm ------------------------------------------------ */
594 
595 U_CAPI void U_EXPORT2
ucm_mergeTables(UCMTable * fromUTable,UCMTable * toUTable,const uint8_t * subchar,int32_t subcharLength,uint8_t subchar1)596 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
597                 const uint8_t *subchar, int32_t subcharLength,
598                 uint8_t subchar1) {
599     UCMapping *fromUMapping, *toUMapping;
600     int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
601 
602     ucm_sortTable(fromUTable);
603     ucm_sortTable(toUTable);
604 
605     fromUMapping=fromUTable->mappings;
606     toUMapping=toUTable->mappings;
607 
608     fromUTop=fromUTable->mappingsLength;
609     toUTop=toUTable->mappingsLength;
610 
611     fromUIndex=toUIndex=0;
612 
613     while(fromUIndex<fromUTop && toUIndex<toUTop) {
614         cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
615         if(cmp==0) {
616             /* equal: roundtrip, nothing to do (flags are initially 0) */
617             ++fromUMapping;
618             ++toUMapping;
619 
620             ++fromUIndex;
621             ++toUIndex;
622         } else if(cmp<0) {
623             /*
624              * the fromU mapping does not have a toU counterpart:
625              * fallback Unicode->codepage
626              */
627             if( (fromUMapping->bLen==subcharLength &&
628                  0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
629                 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
630             ) {
631                 fromUMapping->f=2; /* SUB mapping */
632             } else {
633                 fromUMapping->f=1; /* normal fallback */
634             }
635 
636             ++fromUMapping;
637             ++fromUIndex;
638         } else {
639             /*
640              * the toU mapping does not have a fromU counterpart:
641              * (reverse) fallback codepage->Unicode, copy it to the fromU table
642              */
643 
644             /* ignore reverse fallbacks to Unicode SUB */
645             if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
646                 toUMapping->f=3; /* reverse fallback */
647                 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
648 
649                 /* the table may have been reallocated */
650                 fromUMapping=fromUTable->mappings+fromUIndex;
651             }
652 
653             ++toUMapping;
654             ++toUIndex;
655         }
656     }
657 
658     /* either one or both tables are exhausted */
659     while(fromUIndex<fromUTop) {
660         /* leftover fromU mappings are fallbacks */
661         if( (fromUMapping->bLen==subcharLength &&
662              0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
663             (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
664         ) {
665             fromUMapping->f=2; /* SUB mapping */
666         } else {
667             fromUMapping->f=1; /* normal fallback */
668         }
669 
670         ++fromUMapping;
671         ++fromUIndex;
672     }
673 
674     while(toUIndex<toUTop) {
675         /* leftover toU mappings are reverse fallbacks */
676 
677         /* ignore reverse fallbacks to Unicode SUB */
678         if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
679             toUMapping->f=3; /* reverse fallback */
680             ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
681         }
682 
683         ++toUMapping;
684         ++toUIndex;
685     }
686 
687     fromUTable->isSorted=FALSE;
688 }
689 
690 /* separate extension mappings out of base table for rptp2ucm --------------- */
691 
692 U_CAPI UBool U_EXPORT2
ucm_separateMappings(UCMFile * ucm,UBool isSISO)693 ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
694     UCMTable *table;
695     UCMapping *m, *mLimit;
696     int32_t type;
697     UBool needsMove, isOK;
698 
699     table=ucm->base;
700     m=table->mappings;
701     mLimit=m+table->mappingsLength;
702 
703     needsMove=FALSE;
704     isOK=TRUE;
705 
706     for(; m<mLimit; ++m) {
707         if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
708             fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
709             ucm_printMapping(table, m, stderr);
710             m->moveFlag|=UCM_REMOVE_MAPPING;
711             needsMove=TRUE;
712             continue;
713         }
714 
715         type=ucm_mappingType(
716                 &ucm->states, m,
717                 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
718         if(type<0) {
719             /* illegal byte sequence */
720             printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
721             isOK=FALSE;
722         } else if(type>0) {
723             m->moveFlag|=UCM_MOVE_TO_EXT;
724             needsMove=TRUE;
725         }
726     }
727 
728     if(!isOK) {
729         return FALSE;
730     }
731     if(needsMove) {
732         ucm_moveMappings(ucm->base, ucm->ext);
733         return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
734     } else {
735         ucm_sortTable(ucm->base);
736         return TRUE;
737     }
738 }
739 
740 /* ucm parser --------------------------------------------------------------- */
741 
742 U_CAPI int8_t U_EXPORT2
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES],const char * line,const char ** ps)743 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
744     const char *s=*ps;
745     char *end;
746     uint8_t byte;
747     int8_t bLen;
748 
749     bLen=0;
750     for(;;) {
751         /* skip an optional plus sign */
752         if(bLen>0 && *s=='+') {
753             ++s;
754         }
755         if(*s!='\\') {
756             break;
757         }
758 
759         if( s[1]!='x' ||
760             (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
761         ) {
762             fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
763             return -1;
764         }
765 
766         if(bLen==UCNV_EXT_MAX_BYTES) {
767             fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
768             return -1;
769         }
770         bytes[bLen++]=byte;
771         s=end;
772     }
773 
774     *ps=s;
775     return bLen;
776 }
777 
778 /* parse a mapping line; must not be empty */
779 U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES],const char * line)780 ucm_parseMappingLine(UCMapping *m,
781                      UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
782                      uint8_t bytes[UCNV_EXT_MAX_BYTES],
783                      const char *line) {
784     const char *s;
785     char *end;
786     UChar32 cp;
787     int32_t u16Length;
788     int8_t uLen, bLen, f;
789 
790     s=line;
791     uLen=bLen=0;
792 
793     /* parse code points */
794     for(;;) {
795         /* skip an optional plus sign */
796         if(uLen>0 && *s=='+') {
797             ++s;
798         }
799         if(*s!='<') {
800             break;
801         }
802 
803         if( s[1]!='U' ||
804             (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
805             *end!='>'
806         ) {
807             fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
808             return FALSE;
809         }
810         if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
811             fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
812             return FALSE;
813         }
814 
815         if(uLen==UCNV_EXT_MAX_UCHARS) {
816             fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
817             return FALSE;
818         }
819         codePoints[uLen++]=cp;
820         s=end+1;
821     }
822 
823     if(uLen==0) {
824         fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
825         return FALSE;
826     } else if(uLen==1) {
827         m->u=codePoints[0];
828     } else {
829         UErrorCode errorCode=U_ZERO_ERROR;
830         u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
831         if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
832             u16Length>UCNV_EXT_MAX_UCHARS
833         ) {
834             fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
835             return FALSE;
836         }
837     }
838 
839     s=u_skipWhitespace(s);
840 
841     /* parse bytes */
842     bLen=ucm_parseBytes(bytes, line, &s);
843 
844     if(bLen<0) {
845         return FALSE;
846     } else if(bLen==0) {
847         fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
848         return FALSE;
849     } else if(bLen<=4) {
850         uprv_memcpy(m->b.bytes, bytes, bLen);
851     }
852 
853     /* skip everything until the fallback indicator, even the start of a comment */
854     for(;;) {
855         if(*s==0) {
856             f=-1; /* no fallback indicator */
857             break;
858         } else if(*s=='|') {
859             f=(int8_t)(s[1]-'0');
860             if((uint8_t)f>4) {
861                 fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
862                 return FALSE;
863             }
864             break;
865         }
866         ++s;
867     }
868 
869     m->uLen=uLen;
870     m->bLen=bLen;
871     m->f=f;
872     return TRUE;
873 }
874 
875 /* general APIs ------------------------------------------------------------- */
876 
877 U_CAPI UCMTable * U_EXPORT2
ucm_openTable()878 ucm_openTable() {
879     UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
880     if(table==NULL) {
881         fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
882         exit(U_MEMORY_ALLOCATION_ERROR);
883     }
884 
885     memset(table, 0, sizeof(UCMTable));
886     return table;
887 }
888 
889 U_CAPI void U_EXPORT2
ucm_closeTable(UCMTable * table)890 ucm_closeTable(UCMTable *table) {
891     if(table!=NULL) {
892         uprv_free(table->mappings);
893         uprv_free(table->codePoints);
894         uprv_free(table->bytes);
895         uprv_free(table->reverseMap);
896         uprv_free(table);
897     }
898 }
899 
900 U_CAPI void U_EXPORT2
ucm_resetTable(UCMTable * table)901 ucm_resetTable(UCMTable *table) {
902     if(table!=NULL) {
903         table->mappingsLength=0;
904         table->flagsType=0;
905         table->unicodeMask=0;
906         table->bytesLength=table->codePointsLength=0;
907         table->isSorted=FALSE;
908     }
909 }
910 
911 U_CAPI void U_EXPORT2
ucm_addMapping(UCMTable * table,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])912 ucm_addMapping(UCMTable *table,
913                UCMapping *m,
914                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
915                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
916     UCMapping *tm;
917     UChar32 c;
918     int32_t idx;
919 
920     if(table->mappingsLength>=table->mappingsCapacity) {
921         /* make the mappings array larger */
922         if(table->mappingsCapacity==0) {
923             table->mappingsCapacity=1000;
924         } else {
925             table->mappingsCapacity*=10;
926         }
927         table->mappings=(UCMapping *)uprv_realloc(table->mappings,
928                                              table->mappingsCapacity*sizeof(UCMapping));
929         if(table->mappings==NULL) {
930             fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
931                             (int)table->mappingsCapacity);
932             exit(U_MEMORY_ALLOCATION_ERROR);
933         }
934 
935         if(table->reverseMap!=NULL) {
936             /* the reverseMap must be reallocated in a new sort */
937             uprv_free(table->reverseMap);
938             table->reverseMap=NULL;
939         }
940     }
941 
942     if(m->uLen>1 && table->codePointsCapacity==0) {
943         table->codePointsCapacity=10000;
944         table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
945         if(table->codePoints==NULL) {
946             fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
947                             (int)table->codePointsCapacity);
948             exit(U_MEMORY_ALLOCATION_ERROR);
949         }
950     }
951 
952     if(m->bLen>4 && table->bytesCapacity==0) {
953         table->bytesCapacity=10000;
954         table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
955         if(table->bytes==NULL) {
956             fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
957                             (int)table->bytesCapacity);
958             exit(U_MEMORY_ALLOCATION_ERROR);
959         }
960     }
961 
962     if(m->uLen>1) {
963         idx=table->codePointsLength;
964         table->codePointsLength+=m->uLen;
965         if(table->codePointsLength>table->codePointsCapacity) {
966             fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
967             exit(U_MEMORY_ALLOCATION_ERROR);
968         }
969 
970         uprv_memcpy(table->codePoints+idx, codePoints, m->uLen*4);
971         m->u=idx;
972     }
973 
974     if(m->bLen>4) {
975         idx=table->bytesLength;
976         table->bytesLength+=m->bLen;
977         if(table->bytesLength>table->bytesCapacity) {
978             fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
979             exit(U_MEMORY_ALLOCATION_ERROR);
980         }
981 
982         uprv_memcpy(table->bytes+idx, bytes, m->bLen);
983         m->b.idx=idx;
984     }
985 
986     /* set unicodeMask */
987     for(idx=0; idx<m->uLen; ++idx) {
988         c=codePoints[idx];
989         if(c>=0x10000) {
990             table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
991         } else if(U_IS_SURROGATE(c)) {
992             table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
993         }
994     }
995 
996     /* set flagsType */
997     if(m->f<0) {
998         table->flagsType|=UCM_FLAGS_IMPLICIT;
999     } else {
1000         table->flagsType|=UCM_FLAGS_EXPLICIT;
1001     }
1002 
1003     tm=table->mappings+table->mappingsLength++;
1004     uprv_memcpy(tm, m, sizeof(UCMapping));
1005 
1006     table->isSorted=FALSE;
1007 }
1008 
1009 U_CAPI UCMFile * U_EXPORT2
ucm_open()1010 ucm_open() {
1011     UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
1012     if(ucm==NULL) {
1013         fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
1014         exit(U_MEMORY_ALLOCATION_ERROR);
1015     }
1016 
1017     memset(ucm, 0, sizeof(UCMFile));
1018 
1019     ucm->base=ucm_openTable();
1020     ucm->ext=ucm_openTable();
1021 
1022     ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
1023     ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
1024     ucm->states.outputType=-1;
1025     ucm->states.minCharLength=ucm->states.maxCharLength=1;
1026 
1027     return ucm;
1028 }
1029 
1030 U_CAPI void U_EXPORT2
ucm_close(UCMFile * ucm)1031 ucm_close(UCMFile *ucm) {
1032     if(ucm!=NULL) {
1033         ucm_closeTable(ucm->base);
1034         ucm_closeTable(ucm->ext);
1035         uprv_free(ucm);
1036     }
1037 }
1038 
1039 U_CAPI int32_t U_EXPORT2
ucm_mappingType(UCMStates * baseStates,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])1040 ucm_mappingType(UCMStates *baseStates,
1041                 UCMapping *m,
1042                 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1043                 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1044     /* check validity of the bytes and count the characters in them */
1045     int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
1046     if(count<1) {
1047         /* illegal byte sequence */
1048         return -1;
1049     }
1050 
1051     /*
1052      * Suitable for an ICU conversion base table means:
1053      * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
1054      * - precision flag 0..3
1055      * - SBCS: any 1:1 mapping
1056      *         (the table stores additional bits to distinguish mapping types)
1057      * - MBCS: not a |2 SUB mapping for <subchar1>
1058      * - MBCS: not a |1 fallback to 0x00
1059      * - MBCS: not a multi-byte mapping with leading 0x00 bytes
1060      *
1061      * Further restrictions for fromUnicode tables
1062      * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
1063      *
1064      * All of the MBCS fromUnicode specific tests could be removed from here,
1065      * but the ones above are for unusual mappings, and removing the tests
1066      * from here would change canonucm output which seems gratuitous.
1067      * (Markus Scherer 2006-nov-28)
1068      *
1069      * Exception: All implicit mappings (f<0) that need to be moved
1070      * because of fromUnicode restrictions _must_ be moved here because
1071      * makeconv uses a hack for moving mappings only for the fromUnicode table
1072      * that only works with non-negative values of f.
1073      */
1074     if( m->uLen==1 && count==1 && m->f<=3 &&
1075         (baseStates->maxCharLength==1 ||
1076             !((m->f==2 && m->bLen==1) ||
1077               (m->f==1 && bytes[0]==0) ||
1078               (m->f<=1 && m->bLen>1 && bytes[0]==0)))
1079     ) {
1080         return 0; /* suitable for a base table */
1081     } else {
1082         return 1; /* needs to go into an extension table */
1083     }
1084 }
1085 
1086 U_CAPI UBool U_EXPORT2
ucm_addMappingAuto(UCMFile * ucm,UBool forBase,UCMStates * baseStates,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])1087 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
1088                    UCMapping *m,
1089                    UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1090                    uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1091     int32_t type;
1092 
1093     if(m->f==2 && m->uLen>1) {
1094         fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1095         printMapping(m, codePoints, bytes, stderr);
1096         return FALSE;
1097     }
1098 
1099     if(baseStates!=NULL) {
1100         /* check validity of the bytes and count the characters in them */
1101         type=ucm_mappingType(baseStates, m, codePoints, bytes);
1102         if(type<0) {
1103             /* illegal byte sequence */
1104             printMapping(m, codePoints, bytes, stderr);
1105             return FALSE;
1106         }
1107     } else {
1108         /* not used - adding a mapping for an extension-only table before its base table is read */
1109         type=1;
1110     }
1111 
1112     /*
1113      * Add the mapping to the base table if this is requested and suitable.
1114      * Otherwise, add it to the extension table.
1115      */
1116     if(forBase && type==0) {
1117         ucm_addMapping(ucm->base, m, codePoints, bytes);
1118     } else {
1119         ucm_addMapping(ucm->ext, m, codePoints, bytes);
1120     }
1121 
1122     return TRUE;
1123 }
1124 
1125 U_CAPI UBool U_EXPORT2
ucm_addMappingFromLine(UCMFile * ucm,const char * line,UBool forBase,UCMStates * baseStates)1126 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
1127     UCMapping m={ 0 };
1128     UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
1129     uint8_t bytes[UCNV_EXT_MAX_BYTES];
1130 
1131     const char *s;
1132 
1133     /* ignore empty and comment lines */
1134     if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
1135         return TRUE;
1136     }
1137 
1138     return
1139         ucm_parseMappingLine(&m, codePoints, bytes, line) &&
1140         ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
1141 }
1142 
1143 U_CAPI void U_EXPORT2
ucm_readTable(UCMFile * ucm,FileStream * convFile,UBool forBase,UCMStates * baseStates,UErrorCode * pErrorCode)1144 ucm_readTable(UCMFile *ucm, FileStream* convFile,
1145               UBool forBase, UCMStates *baseStates,
1146               UErrorCode *pErrorCode) {
1147     char line[500];
1148     char *end;
1149     UBool isOK;
1150 
1151     if(U_FAILURE(*pErrorCode)) {
1152         return;
1153     }
1154 
1155     isOK=TRUE;
1156 
1157     for(;;) {
1158         /* read the next line */
1159         if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
1160             fprintf(stderr, "incomplete charmap section\n");
1161             isOK=FALSE;
1162             break;
1163         }
1164 
1165         /* remove CR LF */
1166         end=uprv_strchr(line, 0);
1167         while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
1168             --end;
1169         }
1170         *end=0;
1171 
1172         /* ignore empty and comment lines */
1173         if(line[0]==0 || line[0]=='#') {
1174             continue;
1175         }
1176 
1177         /* stop at the end of the mapping table */
1178         if(0==uprv_strcmp(line, "END CHARMAP")) {
1179             break;
1180         }
1181 
1182         isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
1183     }
1184 
1185     if(!isOK) {
1186         *pErrorCode=U_INVALID_TABLE_FORMAT;
1187     }
1188 }
1189 #endif
1190