1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2003-2013, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucm.c
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2003jun20
16 *   created by: Markus W. Scherer
17 *
18 *   This file reads a .ucm file, stores its mappings and sorts them.
19 *   It implements handling of Unicode conversion mappings from .ucm files
20 *   for makeconv, canonucm, rptp2ucm, etc.
21 *
22 *   Unicode code point sequences with a length of more than 1,
23 *   as well as byte sequences with more than 4 bytes or more than one complete
24 *   character sequence are handled to support m:n mappings.
25 */
26 
27 #include "unicode/utypes.h"
28 #include "unicode/ustring.h"
29 #include "cstring.h"
30 #include "cmemory.h"
31 #include "filestrm.h"
32 #include "uarrsort.h"
33 #include "ucnvmbcs.h"
34 #include "ucnv_bld.h"
35 #include "ucnv_ext.h"
36 #include "uparse.h"
37 #include "ucm.h"
38 #include <stdio.h>
39 
40 #if !UCONFIG_NO_CONVERSION
41 
42 /* -------------------------------------------------------------------------- */
43 
44 static void
printMapping(UCMapping * m,UChar32 * codePoints,uint8_t * bytes,FILE * f)45 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
46     int32_t j;
47 
48     for(j=0; j<m->uLen; ++j) {
49         fprintf(f, "<U%04lX>", (long)codePoints[j]);
50     }
51 
52     fputc(' ', f);
53 
54     for(j=0; j<m->bLen; ++j) {
55         fprintf(f, "\\x%02X", bytes[j]);
56     }
57 
58     if(m->f>=0) {
59         fprintf(f, " |%u\n", m->f);
60     } else {
61         fputs("\n", f);
62     }
63 }
64 
65 U_CAPI void U_EXPORT2
ucm_printMapping(UCMTable * table,UCMapping * m,FILE * f)66 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
67     printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
68 }
69 
70 U_CAPI void U_EXPORT2
ucm_printTable(UCMTable * table,FILE * f,UBool byUnicode)71 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
72     UCMapping *m;
73     int32_t i, length;
74 
75     m=table->mappings;
76     length=table->mappingsLength;
77     if(byUnicode) {
78         for(i=0; i<length; ++m, ++i) {
79             ucm_printMapping(table, m, f);
80         }
81     } else {
82         const int32_t *map=table->reverseMap;
83         for(i=0; i<length; ++i) {
84             ucm_printMapping(table, m+map[i], f);
85         }
86     }
87 }
88 
89 /* mapping comparisons ------------------------------------------------------ */
90 
91 static int32_t
compareUnicode(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r)92 compareUnicode(UCMTable *lTable, const UCMapping *l,
93                UCMTable *rTable, const UCMapping *r) {
94     const UChar32 *lu, *ru;
95     int32_t result, i, length;
96 
97     if(l->uLen==1 && r->uLen==1) {
98         /* compare two single code points */
99         return l->u-r->u;
100     }
101 
102     /* get pointers to the code point sequences */
103     lu=UCM_GET_CODE_POINTS(lTable, l);
104     ru=UCM_GET_CODE_POINTS(rTable, r);
105 
106     /* get the minimum length */
107     if(l->uLen<=r->uLen) {
108         length=l->uLen;
109     } else {
110         length=r->uLen;
111     }
112 
113     /* compare the code points */
114     for(i=0; i<length; ++i) {
115         result=lu[i]-ru[i];
116         if(result!=0) {
117             return result;
118         }
119     }
120 
121     /* compare the lengths */
122     return l->uLen-r->uLen;
123 }
124 
125 static int32_t
compareBytes(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r,UBool lexical)126 compareBytes(UCMTable *lTable, const UCMapping *l,
127              UCMTable *rTable, const UCMapping *r,
128              UBool lexical) {
129     const uint8_t *lb, *rb;
130     int32_t result, i, length;
131 
132     /*
133      * A lexical comparison is used for sorting in the builder, to allow
134      * an efficient search for a byte sequence that could be a prefix
135      * of a previously entered byte sequence.
136      *
137      * Comparing by lengths first is for compatibility with old .ucm tools
138      * like canonucm and rptp2ucm.
139      */
140     if(lexical) {
141         /* get the minimum length and continue */
142         if(l->bLen<=r->bLen) {
143             length=l->bLen;
144         } else {
145             length=r->bLen;
146         }
147     } else {
148         /* compare lengths first */
149         result=l->bLen-r->bLen;
150         if(result!=0) {
151             return result;
152         } else {
153             length=l->bLen;
154         }
155     }
156 
157     /* get pointers to the byte sequences */
158     lb=UCM_GET_BYTES(lTable, l);
159     rb=UCM_GET_BYTES(rTable, r);
160 
161     /* compare the bytes */
162     for(i=0; i<length; ++i) {
163         result=lb[i]-rb[i];
164         if(result!=0) {
165             return result;
166         }
167     }
168 
169     /* compare the lengths */
170     return l->bLen-r->bLen;
171 }
172 
173 /* compare UCMappings for sorting */
174 static int32_t
compareMappings(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r,UBool uFirst)175 compareMappings(UCMTable *lTable, const UCMapping *l,
176                 UCMTable *rTable, const UCMapping *r,
177                 UBool uFirst) {
178     int32_t result;
179 
180     /* choose which side to compare first */
181     if(uFirst) {
182         /* Unicode then bytes */
183         result=compareUnicode(lTable, l, rTable, r);
184         if(result==0) {
185             result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
186         }
187     } else {
188         /* bytes then Unicode */
189         result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
190         if(result==0) {
191             result=compareUnicode(lTable, l, rTable, r);
192         }
193     }
194 
195     if(result!=0) {
196         return result;
197     }
198 
199     /* compare the flags */
200     return l->f-r->f;
201 }
202 
203 /* sorting by Unicode first sorts mappings directly */
204 static int32_t
compareMappingsUnicodeFirst(const void * context,const void * left,const void * right)205 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
206     return compareMappings(
207         (UCMTable *)context, (const UCMapping *)left,
208         (UCMTable *)context, (const UCMapping *)right, TRUE);
209 }
210 
211 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
212 static int32_t
compareMappingsBytesFirst(const void * context,const void * left,const void * right)213 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
214     UCMTable *table=(UCMTable *)context;
215     int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
216     return compareMappings(
217         table, table->mappings+l,
218         table, table->mappings+r, FALSE);
219 }
220 
221 U_CAPI void U_EXPORT2
ucm_sortTable(UCMTable * t)222 ucm_sortTable(UCMTable *t) {
223     UErrorCode errorCode;
224     int32_t i;
225 
226     if(t->isSorted) {
227         return;
228     }
229 
230     errorCode=U_ZERO_ERROR;
231 
232     /* 1. sort by Unicode first */
233     uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
234                    compareMappingsUnicodeFirst, t,
235                    FALSE, &errorCode);
236 
237     /* build the reverseMap */
238     if(t->reverseMap==NULL) {
239         /*
240          * allocate mappingsCapacity instead of mappingsLength so that
241          * if mappings are added, the reverseMap need not be
242          * reallocated each time
243          * (see ucm_moveMappings() and ucm_addMapping())
244          */
245         t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
246         if(t->reverseMap==NULL) {
247             fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
248             exit(U_MEMORY_ALLOCATION_ERROR);
249         }
250     }
251     for(i=0; i<t->mappingsLength; ++i) {
252         t->reverseMap[i]=i;
253     }
254 
255     /* 2. sort reverseMap by mappings bytes first */
256     uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
257                    compareMappingsBytesFirst, t,
258                    FALSE, &errorCode);
259 
260     if(U_FAILURE(errorCode)) {
261         fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
262                 u_errorName(errorCode));
263         exit(errorCode);
264     }
265 
266     t->isSorted=TRUE;
267 }
268 
269 /*
270  * remove mappings with their move flag set from the base table
271  * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
272  */
273 U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable * base,UCMTable * ext)274 ucm_moveMappings(UCMTable *base, UCMTable *ext) {
275     UCMapping *mb, *mbLimit;
276     int8_t flag;
277 
278     mb=base->mappings;
279     mbLimit=mb+base->mappingsLength;
280 
281     while(mb<mbLimit) {
282         flag=mb->moveFlag;
283         if(flag!=0) {
284             /* reset the move flag */
285             mb->moveFlag=0;
286 
287             if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
288                 /* add the mapping to the extension table */
289                 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
290             }
291 
292             /* remove this mapping: move the last base mapping down and overwrite the current one */
293             if(mb<(mbLimit-1)) {
294                 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
295             }
296             --mbLimit;
297             --base->mappingsLength;
298             base->isSorted=FALSE;
299         } else {
300             ++mb;
301         }
302     }
303 }
304 
/* result bits returned by checkBaseExtUnicode() and checkBaseExtBytes() */
enum {
    NEEDS_MOVE=0x01,    /* at least one mapping had its moveFlag set */
    HAS_ERRORS=0x02     /* at least one error was reported */
};
309 
310 static uint8_t
checkBaseExtUnicode(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UBool moveToExt,UBool intersectBase)311 checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
312                     UBool moveToExt, UBool intersectBase) {
313     UCMapping *mb, *me, *mbLimit, *meLimit;
314     int32_t cmp;
315     uint8_t result;
316 
317     mb=base->mappings;
318     mbLimit=mb+base->mappingsLength;
319 
320     me=ext->mappings;
321     meLimit=me+ext->mappingsLength;
322 
323     result=0;
324 
325     for(;;) {
326         /* skip irrelevant mappings on both sides */
327         for(;;) {
328             if(mb==mbLimit) {
329                 return result;
330             }
331 
332             if((0<=mb->f && mb->f<=2) || mb->f==4) {
333                 break;
334             }
335 
336             ++mb;
337         }
338 
339         for(;;) {
340             if(me==meLimit) {
341                 return result;
342             }
343 
344             if((0<=me->f && me->f<=2) || me->f==4) {
345                 break;
346             }
347 
348             ++me;
349         }
350 
351         /* compare the base and extension mappings */
352         cmp=compareUnicode(base, mb, ext, me);
353         if(cmp<0) {
354             if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
355                 /*
356                  * mapping in base but not in ext, move it
357                  *
358                  * if ext is DBCS, move DBCS mappings here
359                  * and check SBCS ones for Unicode prefix below
360                  */
361                 mb->moveFlag|=UCM_MOVE_TO_EXT;
362                 result|=NEEDS_MOVE;
363 
364             /* does mb map from an input sequence that is a prefix of me's? */
365             } else if( mb->uLen<me->uLen &&
366                 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
367             ) {
368                 if(moveToExt) {
369                     /* mark this mapping to be moved to the extension table */
370                     mb->moveFlag|=UCM_MOVE_TO_EXT;
371                     result|=NEEDS_MOVE;
372                 } else {
373                     fprintf(stderr,
374                             "ucm error: the base table contains a mapping whose input sequence\n"
375                             "           is a prefix of the input sequence of an extension mapping\n");
376                     ucm_printMapping(base, mb, stderr);
377                     ucm_printMapping(ext, me, stderr);
378                     result|=HAS_ERRORS;
379                 }
380             }
381 
382             ++mb;
383         } else if(cmp==0) {
384             /*
385              * same output: remove the extension mapping,
386              * otherwise treat as an error
387              */
388             if( mb->f==me->f && mb->bLen==me->bLen &&
389                 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
390             ) {
391                 me->moveFlag|=UCM_REMOVE_MAPPING;
392                 result|=NEEDS_MOVE;
393             } else if(intersectBase) {
394                 /* mapping in base but not in ext, move it */
395                 mb->moveFlag|=UCM_MOVE_TO_EXT;
396                 result|=NEEDS_MOVE;
397             } else {
398                 fprintf(stderr,
399                         "ucm error: the base table contains a mapping whose input sequence\n"
400                         "           is the same as the input sequence of an extension mapping\n"
401                         "           but it maps differently\n");
402                 ucm_printMapping(base, mb, stderr);
403                 ucm_printMapping(ext, me, stderr);
404                 result|=HAS_ERRORS;
405             }
406 
407             ++mb;
408         } else /* cmp>0 */ {
409             ++me;
410         }
411     }
412 }
413 
414 static uint8_t
checkBaseExtBytes(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UBool moveToExt,UBool intersectBase)415 checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
416                   UBool moveToExt, UBool intersectBase) {
417     UCMapping *mb, *me;
418     int32_t *baseMap, *extMap;
419     int32_t b, e, bLimit, eLimit, cmp;
420     uint8_t result;
421     UBool isSISO;
422 
423     baseMap=base->reverseMap;
424     extMap=ext->reverseMap;
425 
426     b=e=0;
427     bLimit=base->mappingsLength;
428     eLimit=ext->mappingsLength;
429 
430     result=0;
431 
432     isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
433 
434     for(;;) {
435         /* skip irrelevant mappings on both sides */
436         for(;; ++b) {
437             if(b==bLimit) {
438                 return result;
439             }
440             mb=base->mappings+baseMap[b];
441 
442             if(intersectBase==2 && mb->bLen==1) {
443                 /*
444                  * comparing a base against a DBCS extension:
445                  * leave SBCS base mappings alone
446                  */
447                 continue;
448             }
449 
450             if(mb->f==0 || mb->f==3) {
451                 break;
452             }
453         }
454 
455         for(;;) {
456             if(e==eLimit) {
457                 return result;
458             }
459             me=ext->mappings+extMap[e];
460 
461             if(me->f==0 || me->f==3) {
462                 break;
463             }
464 
465             ++e;
466         }
467 
468         /* compare the base and extension mappings */
469         cmp=compareBytes(base, mb, ext, me, TRUE);
470         if(cmp<0) {
471             if(intersectBase) {
472                 /* mapping in base but not in ext, move it */
473                 mb->moveFlag|=UCM_MOVE_TO_EXT;
474                 result|=NEEDS_MOVE;
475 
476             /*
477              * does mb map from an input sequence that is a prefix of me's?
478              * for SI/SO tables, a single byte is never a prefix because it
479              * occurs in a separate single-byte state
480              */
481             } else if( mb->bLen<me->bLen &&
482                 (!isSISO || mb->bLen>1) &&
483                 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
484             ) {
485                 if(moveToExt) {
486                     /* mark this mapping to be moved to the extension table */
487                     mb->moveFlag|=UCM_MOVE_TO_EXT;
488                     result|=NEEDS_MOVE;
489                 } else {
490                     fprintf(stderr,
491                             "ucm error: the base table contains a mapping whose input sequence\n"
492                             "           is a prefix of the input sequence of an extension mapping\n");
493                     ucm_printMapping(base, mb, stderr);
494                     ucm_printMapping(ext, me, stderr);
495                     result|=HAS_ERRORS;
496                 }
497             }
498 
499             ++b;
500         } else if(cmp==0) {
501             /*
502              * same output: remove the extension mapping,
503              * otherwise treat as an error
504              */
505             if( mb->f==me->f && mb->uLen==me->uLen &&
506                 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
507             ) {
508                 me->moveFlag|=UCM_REMOVE_MAPPING;
509                 result|=NEEDS_MOVE;
510             } else if(intersectBase) {
511                 /* mapping in base but not in ext, move it */
512                 mb->moveFlag|=UCM_MOVE_TO_EXT;
513                 result|=NEEDS_MOVE;
514             } else {
515                 fprintf(stderr,
516                         "ucm error: the base table contains a mapping whose input sequence\n"
517                         "           is the same as the input sequence of an extension mapping\n"
518                         "           but it maps differently\n");
519                 ucm_printMapping(base, mb, stderr);
520                 ucm_printMapping(ext, me, stderr);
521                 result|=HAS_ERRORS;
522             }
523 
524             ++b;
525         } else /* cmp>0 */ {
526             ++e;
527         }
528     }
529 }
530 
531 U_CAPI UBool U_EXPORT2
ucm_checkValidity(UCMTable * table,UCMStates * baseStates)532 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
533     UCMapping *m, *mLimit;
534     int32_t count;
535     UBool isOK;
536 
537     m=table->mappings;
538     mLimit=m+table->mappingsLength;
539     isOK=TRUE;
540 
541     while(m<mLimit) {
542         count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
543         if(count<1) {
544             ucm_printMapping(table, m, stderr);
545             isOK=FALSE;
546         }
547         ++m;
548     }
549 
550     return isOK;
551 }
552 
553 U_CAPI UBool U_EXPORT2
ucm_checkBaseExt(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UCMTable * moveTarget,UBool intersectBase)554 ucm_checkBaseExt(UCMStates *baseStates,
555                  UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
556                  UBool intersectBase) {
557     uint8_t result;
558 
559     /* if we have an extension table, we must always use precision flags */
560     if(base->flagsType&UCM_FLAGS_IMPLICIT) {
561         fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
562         return FALSE;
563     }
564     if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
565         fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
566         return FALSE;
567     }
568 
569     /* checking requires both tables to be sorted */
570     ucm_sortTable(base);
571     ucm_sortTable(ext);
572 
573     /* check */
574     result=
575         checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
576         checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
577 
578     if(result&HAS_ERRORS) {
579         return FALSE;
580     }
581 
582     if(result&NEEDS_MOVE) {
583         ucm_moveMappings(ext, NULL);
584         ucm_moveMappings(base, moveTarget);
585         ucm_sortTable(base);
586         ucm_sortTable(ext);
587         if(moveTarget!=NULL) {
588             ucm_sortTable(moveTarget);
589         }
590     }
591 
592     return TRUE;
593 }
594 
595 /* merge tables for rptp2ucm ------------------------------------------------ */
596 
597 U_CAPI void U_EXPORT2
ucm_mergeTables(UCMTable * fromUTable,UCMTable * toUTable,const uint8_t * subchar,int32_t subcharLength,uint8_t subchar1)598 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
599                 const uint8_t *subchar, int32_t subcharLength,
600                 uint8_t subchar1) {
601     UCMapping *fromUMapping, *toUMapping;
602     int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
603 
604     ucm_sortTable(fromUTable);
605     ucm_sortTable(toUTable);
606 
607     fromUMapping=fromUTable->mappings;
608     toUMapping=toUTable->mappings;
609 
610     fromUTop=fromUTable->mappingsLength;
611     toUTop=toUTable->mappingsLength;
612 
613     fromUIndex=toUIndex=0;
614 
615     while(fromUIndex<fromUTop && toUIndex<toUTop) {
616         cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
617         if(cmp==0) {
618             /* equal: roundtrip, nothing to do (flags are initially 0) */
619             ++fromUMapping;
620             ++toUMapping;
621 
622             ++fromUIndex;
623             ++toUIndex;
624         } else if(cmp<0) {
625             /*
626              * the fromU mapping does not have a toU counterpart:
627              * fallback Unicode->codepage
628              */
629             if( (fromUMapping->bLen==subcharLength &&
630                  0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
631                 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
632             ) {
633                 fromUMapping->f=2; /* SUB mapping */
634             } else {
635                 fromUMapping->f=1; /* normal fallback */
636             }
637 
638             ++fromUMapping;
639             ++fromUIndex;
640         } else {
641             /*
642              * the toU mapping does not have a fromU counterpart:
643              * (reverse) fallback codepage->Unicode, copy it to the fromU table
644              */
645 
646             /* ignore reverse fallbacks to Unicode SUB */
647             if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
648                 toUMapping->f=3; /* reverse fallback */
649                 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
650 
651                 /* the table may have been reallocated */
652                 fromUMapping=fromUTable->mappings+fromUIndex;
653             }
654 
655             ++toUMapping;
656             ++toUIndex;
657         }
658     }
659 
660     /* either one or both tables are exhausted */
661     while(fromUIndex<fromUTop) {
662         /* leftover fromU mappings are fallbacks */
663         if( (fromUMapping->bLen==subcharLength &&
664              0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
665             (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
666         ) {
667             fromUMapping->f=2; /* SUB mapping */
668         } else {
669             fromUMapping->f=1; /* normal fallback */
670         }
671 
672         ++fromUMapping;
673         ++fromUIndex;
674     }
675 
676     while(toUIndex<toUTop) {
677         /* leftover toU mappings are reverse fallbacks */
678 
679         /* ignore reverse fallbacks to Unicode SUB */
680         if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
681             toUMapping->f=3; /* reverse fallback */
682             ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
683         }
684 
685         ++toUMapping;
686         ++toUIndex;
687     }
688 
689     fromUTable->isSorted=FALSE;
690 }
691 
692 /* separate extension mappings out of base table for rptp2ucm --------------- */
693 
694 U_CAPI UBool U_EXPORT2
ucm_separateMappings(UCMFile * ucm,UBool isSISO)695 ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
696     UCMTable *table;
697     UCMapping *m, *mLimit;
698     int32_t type;
699     UBool needsMove, isOK;
700 
701     table=ucm->base;
702     m=table->mappings;
703     mLimit=m+table->mappingsLength;
704 
705     needsMove=FALSE;
706     isOK=TRUE;
707 
708     for(; m<mLimit; ++m) {
709         if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
710             fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
711             ucm_printMapping(table, m, stderr);
712             m->moveFlag|=UCM_REMOVE_MAPPING;
713             needsMove=TRUE;
714             continue;
715         }
716 
717         type=ucm_mappingType(
718                 &ucm->states, m,
719                 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
720         if(type<0) {
721             /* illegal byte sequence */
722             printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
723             isOK=FALSE;
724         } else if(type>0) {
725             m->moveFlag|=UCM_MOVE_TO_EXT;
726             needsMove=TRUE;
727         }
728     }
729 
730     if(!isOK) {
731         return FALSE;
732     }
733     if(needsMove) {
734         ucm_moveMappings(ucm->base, ucm->ext);
735         return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
736     } else {
737         ucm_sortTable(ucm->base);
738         return TRUE;
739     }
740 }
741 
742 /* ucm parser --------------------------------------------------------------- */
743 
744 U_CAPI int8_t U_EXPORT2
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES],const char * line,const char ** ps)745 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
746     const char *s=*ps;
747     char *end;
748     uint8_t byte;
749     int8_t bLen;
750 
751     bLen=0;
752     for(;;) {
753         /* skip an optional plus sign */
754         if(bLen>0 && *s=='+') {
755             ++s;
756         }
757         if(*s!='\\') {
758             break;
759         }
760 
761         if( s[1]!='x' ||
762             (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
763         ) {
764             fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
765             return -1;
766         }
767 
768         if(bLen==UCNV_EXT_MAX_BYTES) {
769             fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
770             return -1;
771         }
772         bytes[bLen++]=byte;
773         s=end;
774     }
775 
776     *ps=s;
777     return bLen;
778 }
779 
780 /* parse a mapping line; must not be empty */
781 U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES],const char * line)782 ucm_parseMappingLine(UCMapping *m,
783                      UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
784                      uint8_t bytes[UCNV_EXT_MAX_BYTES],
785                      const char *line) {
786     const char *s;
787     char *end;
788     UChar32 cp;
789     int32_t u16Length;
790     int8_t uLen, bLen, f;
791 
792     s=line;
793     uLen=bLen=0;
794 
795     /* parse code points */
796     for(;;) {
797         /* skip an optional plus sign */
798         if(uLen>0 && *s=='+') {
799             ++s;
800         }
801         if(*s!='<') {
802             break;
803         }
804 
805         if( s[1]!='U' ||
806             (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
807             *end!='>'
808         ) {
809             fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
810             return FALSE;
811         }
812         if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
813             fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
814             return FALSE;
815         }
816 
817         if(uLen==UCNV_EXT_MAX_UCHARS) {
818             fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
819             return FALSE;
820         }
821         codePoints[uLen++]=cp;
822         s=end+1;
823     }
824 
825     if(uLen==0) {
826         fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
827         return FALSE;
828     } else if(uLen==1) {
829         m->u=codePoints[0];
830     } else {
831         UErrorCode errorCode=U_ZERO_ERROR;
832         u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
833         if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
834             u16Length>UCNV_EXT_MAX_UCHARS
835         ) {
836             fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
837             return FALSE;
838         }
839     }
840 
841     s=u_skipWhitespace(s);
842 
843     /* parse bytes */
844     bLen=ucm_parseBytes(bytes, line, &s);
845 
846     if(bLen<0) {
847         return FALSE;
848     } else if(bLen==0) {
849         fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
850         return FALSE;
851     } else if(bLen<=4) {
852         uprv_memcpy(m->b.bytes, bytes, bLen);
853     }
854 
855     /* skip everything until the fallback indicator, even the start of a comment */
856     for(;;) {
857         if(*s==0) {
858             f=-1; /* no fallback indicator */
859             break;
860         } else if(*s=='|') {
861             f=(int8_t)(s[1]-'0');
862             if((uint8_t)f>4) {
863                 fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
864                 return FALSE;
865             }
866             break;
867         }
868         ++s;
869     }
870 
871     m->uLen=uLen;
872     m->bLen=bLen;
873     m->f=f;
874     return TRUE;
875 }
876 
877 /* general APIs ------------------------------------------------------------- */
878 
879 U_CAPI UCMTable * U_EXPORT2
ucm_openTable()880 ucm_openTable() {
881     UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
882     if(table==NULL) {
883         fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
884         exit(U_MEMORY_ALLOCATION_ERROR);
885     }
886 
887     memset(table, 0, sizeof(UCMTable));
888     return table;
889 }
890 
891 U_CAPI void U_EXPORT2
ucm_closeTable(UCMTable * table)892 ucm_closeTable(UCMTable *table) {
893     if(table!=NULL) {
894         uprv_free(table->mappings);
895         uprv_free(table->codePoints);
896         uprv_free(table->bytes);
897         uprv_free(table->reverseMap);
898         uprv_free(table);
899     }
900 }
901 
902 U_CAPI void U_EXPORT2
ucm_resetTable(UCMTable * table)903 ucm_resetTable(UCMTable *table) {
904     if(table!=NULL) {
905         table->mappingsLength=0;
906         table->flagsType=0;
907         table->unicodeMask=0;
908         table->bytesLength=table->codePointsLength=0;
909         table->isSorted=FALSE;
910     }
911 }
912 
913 U_CAPI void U_EXPORT2
ucm_addMapping(UCMTable * table,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])914 ucm_addMapping(UCMTable *table,
915                UCMapping *m,
916                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
917                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
918     UCMapping *tm;
919     UChar32 c;
920     int32_t idx;
921 
922     if(table->mappingsLength>=table->mappingsCapacity) {
923         /* make the mappings array larger */
924         if(table->mappingsCapacity==0) {
925             table->mappingsCapacity=1000;
926         } else {
927             table->mappingsCapacity*=10;
928         }
929         table->mappings=(UCMapping *)uprv_realloc(table->mappings,
930                                              table->mappingsCapacity*sizeof(UCMapping));
931         if(table->mappings==NULL) {
932             fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
933                             (int)table->mappingsCapacity);
934             exit(U_MEMORY_ALLOCATION_ERROR);
935         }
936 
937         if(table->reverseMap!=NULL) {
938             /* the reverseMap must be reallocated in a new sort */
939             uprv_free(table->reverseMap);
940             table->reverseMap=NULL;
941         }
942     }
943 
944     if(m->uLen>1 && table->codePointsCapacity==0) {
945         table->codePointsCapacity=10000;
946         table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
947         if(table->codePoints==NULL) {
948             fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
949                             (int)table->codePointsCapacity);
950             exit(U_MEMORY_ALLOCATION_ERROR);
951         }
952     }
953 
954     if(m->bLen>4 && table->bytesCapacity==0) {
955         table->bytesCapacity=10000;
956         table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
957         if(table->bytes==NULL) {
958             fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
959                             (int)table->bytesCapacity);
960             exit(U_MEMORY_ALLOCATION_ERROR);
961         }
962     }
963 
964     if(m->uLen>1) {
965         idx=table->codePointsLength;
966         table->codePointsLength+=m->uLen;
967         if(table->codePointsLength>table->codePointsCapacity) {
968             fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
969             exit(U_MEMORY_ALLOCATION_ERROR);
970         }
971 
972         uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4);
973         m->u=idx;
974     }
975 
976     if(m->bLen>4) {
977         idx=table->bytesLength;
978         table->bytesLength+=m->bLen;
979         if(table->bytesLength>table->bytesCapacity) {
980             fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
981             exit(U_MEMORY_ALLOCATION_ERROR);
982         }
983 
984         uprv_memcpy(table->bytes+idx, bytes, m->bLen);
985         m->b.idx=idx;
986     }
987 
988     /* set unicodeMask */
989     for(idx=0; idx<m->uLen; ++idx) {
990         c=codePoints[idx];
991         if(c>=0x10000) {
992             table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
993         } else if(U_IS_SURROGATE(c)) {
994             table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
995         }
996     }
997 
998     /* set flagsType */
999     if(m->f<0) {
1000         table->flagsType|=UCM_FLAGS_IMPLICIT;
1001     } else {
1002         table->flagsType|=UCM_FLAGS_EXPLICIT;
1003     }
1004 
1005     tm=table->mappings+table->mappingsLength++;
1006     uprv_memcpy(tm, m, sizeof(UCMapping));
1007 
1008     table->isSorted=FALSE;
1009 }
1010 
1011 U_CAPI UCMFile * U_EXPORT2
ucm_open()1012 ucm_open() {
1013     UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
1014     if(ucm==NULL) {
1015         fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
1016         exit(U_MEMORY_ALLOCATION_ERROR);
1017     }
1018 
1019     memset(ucm, 0, sizeof(UCMFile));
1020 
1021     ucm->base=ucm_openTable();
1022     ucm->ext=ucm_openTable();
1023 
1024     ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
1025     ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
1026     ucm->states.outputType=-1;
1027     ucm->states.minCharLength=ucm->states.maxCharLength=1;
1028 
1029     return ucm;
1030 }
1031 
1032 U_CAPI void U_EXPORT2
ucm_close(UCMFile * ucm)1033 ucm_close(UCMFile *ucm) {
1034     if(ucm!=NULL) {
1035         ucm_closeTable(ucm->base);
1036         ucm_closeTable(ucm->ext);
1037         uprv_free(ucm);
1038     }
1039 }
1040 
1041 U_CAPI int32_t U_EXPORT2
ucm_mappingType(UCMStates * baseStates,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])1042 ucm_mappingType(UCMStates *baseStates,
1043                 UCMapping *m,
1044                 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1045                 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1046     /* check validity of the bytes and count the characters in them */
1047     int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
1048     if(count<1) {
1049         /* illegal byte sequence */
1050         return -1;
1051     }
1052 
1053     /*
1054      * Suitable for an ICU conversion base table means:
1055      * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
1056      * - precision flag 0..3
1057      * - SBCS: any 1:1 mapping
1058      *         (the table stores additional bits to distinguish mapping types)
1059      * - MBCS: not a |2 SUB mapping for <subchar1>
1060      * - MBCS: not a |1 fallback to 0x00
1061      * - MBCS: not a multi-byte mapping with leading 0x00 bytes
1062      *
1063      * Further restrictions for fromUnicode tables
1064      * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
1065      *
1066      * All of the MBCS fromUnicode specific tests could be removed from here,
1067      * but the ones above are for unusual mappings, and removing the tests
1068      * from here would change canonucm output which seems gratuitous.
1069      * (Markus Scherer 2006-nov-28)
1070      *
1071      * Exception: All implicit mappings (f<0) that need to be moved
1072      * because of fromUnicode restrictions _must_ be moved here because
1073      * makeconv uses a hack for moving mappings only for the fromUnicode table
1074      * that only works with non-negative values of f.
1075      */
1076     if( m->uLen==1 && count==1 && m->f<=3 &&
1077         (baseStates->maxCharLength==1 ||
1078             !((m->f==2 && m->bLen==1) ||
1079               (m->f==1 && bytes[0]==0) ||
1080               (m->f<=1 && m->bLen>1 && bytes[0]==0)))
1081     ) {
1082         return 0; /* suitable for a base table */
1083     } else {
1084         return 1; /* needs to go into an extension table */
1085     }
1086 }
1087 
1088 U_CAPI UBool U_EXPORT2
ucm_addMappingAuto(UCMFile * ucm,UBool forBase,UCMStates * baseStates,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])1089 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
1090                    UCMapping *m,
1091                    UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1092                    uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1093     int32_t type;
1094 
1095     if(m->f==2 && m->uLen>1) {
1096         fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1097         printMapping(m, codePoints, bytes, stderr);
1098         return FALSE;
1099     }
1100 
1101     if(baseStates!=NULL) {
1102         /* check validity of the bytes and count the characters in them */
1103         type=ucm_mappingType(baseStates, m, codePoints, bytes);
1104         if(type<0) {
1105             /* illegal byte sequence */
1106             printMapping(m, codePoints, bytes, stderr);
1107             return FALSE;
1108         }
1109     } else {
1110         /* not used - adding a mapping for an extension-only table before its base table is read */
1111         type=1;
1112     }
1113 
1114     /*
1115      * Add the mapping to the base table if this is requested and suitable.
1116      * Otherwise, add it to the extension table.
1117      */
1118     if(forBase && type==0) {
1119         ucm_addMapping(ucm->base, m, codePoints, bytes);
1120     } else {
1121         ucm_addMapping(ucm->ext, m, codePoints, bytes);
1122     }
1123 
1124     return TRUE;
1125 }
1126 
1127 U_CAPI UBool U_EXPORT2
ucm_addMappingFromLine(UCMFile * ucm,const char * line,UBool forBase,UCMStates * baseStates)1128 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
1129   UCMapping m={ 0, {0}, 0, 0, 0, 0 };
1130     UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
1131     uint8_t bytes[UCNV_EXT_MAX_BYTES];
1132 
1133     const char *s;
1134 
1135     /* ignore empty and comment lines */
1136     if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
1137         return TRUE;
1138     }
1139 
1140     return
1141         ucm_parseMappingLine(&m, codePoints, bytes, line) &&
1142         ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
1143 }
1144 
1145 U_CAPI void U_EXPORT2
ucm_readTable(UCMFile * ucm,FileStream * convFile,UBool forBase,UCMStates * baseStates,UErrorCode * pErrorCode)1146 ucm_readTable(UCMFile *ucm, FileStream* convFile,
1147               UBool forBase, UCMStates *baseStates,
1148               UErrorCode *pErrorCode) {
1149     char line[500];
1150     char *end;
1151     UBool isOK;
1152 
1153     if(U_FAILURE(*pErrorCode)) {
1154         return;
1155     }
1156 
1157     isOK=TRUE;
1158 
1159     for(;;) {
1160         /* read the next line */
1161         if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
1162             fprintf(stderr, "incomplete charmap section\n");
1163             isOK=FALSE;
1164             break;
1165         }
1166 
1167         /* remove CR LF */
1168         end=uprv_strchr(line, 0);
1169         while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
1170             --end;
1171         }
1172         *end=0;
1173 
1174         /* ignore empty and comment lines */
1175         if(line[0]==0 || line[0]=='#') {
1176             continue;
1177         }
1178 
1179         /* stop at the end of the mapping table */
1180         if(0==uprv_strcmp(line, "END CHARMAP")) {
1181             break;
1182         }
1183 
1184         isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
1185     }
1186 
1187     if(!isOK) {
1188         *pErrorCode=U_INVALID_TABLE_FORMAT;
1189     }
1190 }
1191 #endif
1192