1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2001-2014, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  utrie2_builder.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2008sep26 (split off from utrie2.c)
14 *   created by: Markus W. Scherer
15 *
16 *   This is a common implementation of a Unicode trie.
17 *   It is a kind of compressed, serializable table of 16- or 32-bit values associated with
18 *   Unicode code points (0..0x10ffff).
19 *   This is the second common version of a Unicode trie (hence the name UTrie2).
20 *   See utrie2.h for a comparison.
21 *
22 *   This file contains only the builder code.
23 *   See utrie2.c for the runtime and enumeration code.
24 */
25 #ifdef UTRIE2_DEBUG
26 #   include <stdio.h>
27 #endif
28 
29 #include "unicode/utypes.h"
30 #include "cmemory.h"
31 #include "utrie2.h"
32 #include "utrie2_impl.h"
33 
34 #include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */
35 
36 /* Implementation notes ----------------------------------------------------- */
37 
38 /*
39  * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values
40  * have been chosen to minimize trie sizes overall.
41  * Most of the code is flexible enough to work with a range of values,
42  * within certain limits.
43  *
44  * Exception: Support for separate values for lead surrogate code _units_
45  * vs. code _points_ was added after the constants were fixed,
46  * and has not been tested nor particularly designed for different constant values.
47  * (Especially the utrie2_enum() code that jumps to the special LSCP index-2
48  * part and back.)
49  *
50  * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data
51  * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH
52  * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction
53  * remapping) stops working.
54  *
55  * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate()
56  * assumes that a single index-2 block is used for 0x400 code points
57  * corresponding to one lead surrogate.
58  *
59  * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains
60  * more than one Unicode plane, and the split of the index-2 table into a BMP
61  * part and a supplementary part, with a gap in between, would not work.
62  *
63  * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because
64  * there is data with more than 64k distinct values,
65  * for example for Unihan collation with a separate collation weight per
66  * Han character.
67  */
68 
69 /* Building a trie ----------------------------------------------------------*/
70 
71 enum {
72     /** The null index-2 block, following the gap in the index-2 table. */
73     UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH,
74 
75     /** The start of allocated index-2 blocks. */
76     UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH,
77 
78     /**
79      * The null data block.
80      * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
81      * to work with 6-bit trail bytes from 2-byte UTF-8.
82      */
83     UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET,
84 
85     /** The start of allocated data blocks. */
86     UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40,
87 
88     /**
89      * The start of data blocks for U+0800 and above.
90      * Below, compaction uses a block length of 64 for 2-byte UTF-8.
91      * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
92      * Data values for 0x780 code points beyond ASCII.
93      */
94     UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780
95 };
96 
97 /* Start with allocation of 16k data entries. */
98 #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14)
99 
100 /* Grow about 8x each time. */
101 #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17)
102 
103 static int32_t
104 allocIndex2Block(UNewTrie2 *trie);
105 
106 U_CAPI UTrie2 * U_EXPORT2
utrie2_open(uint32_t initialValue,uint32_t errorValue,UErrorCode * pErrorCode)107 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
108     UTrie2 *trie;
109     UNewTrie2 *newTrie;
110     uint32_t *data;
111     int32_t i, j;
112 
113     if(U_FAILURE(*pErrorCode)) {
114         return NULL;
115     }
116 
117     trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
118     newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
119     data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4);
120     if(trie==NULL || newTrie==NULL || data==NULL) {
121         uprv_free(trie);
122         uprv_free(newTrie);
123         uprv_free(data);
124         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
125         return 0;
126     }
127 
128     uprv_memset(trie, 0, sizeof(UTrie2));
129     trie->initialValue=initialValue;
130     trie->errorValue=errorValue;
131     trie->highStart=0x110000;
132     trie->newTrie=newTrie;
133 
134     newTrie->data=data;
135     newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH;
136     newTrie->initialValue=initialValue;
137     newTrie->errorValue=errorValue;
138     newTrie->highStart=0x110000;
139     newTrie->firstFreeBlock=0;  /* no free block in the list */
140     newTrie->isCompacted=FALSE;
141 
142     /*
143      * preallocate and reset
144      * - ASCII
145      * - the bad-UTF-8-data block
146      * - the null data block
147      */
148     for(i=0; i<0x80; ++i) {
149         newTrie->data[i]=initialValue;
150     }
151     for(; i<0xc0; ++i) {
152         newTrie->data[i]=errorValue;
153     }
154     for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
155         newTrie->data[i]=initialValue;
156     }
157     newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET;
158     newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET;
159 
160     /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
161     for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
162         newTrie->index2[i]=j;
163         newTrie->map[i]=1;
164     }
165     /* reference counts for the bad-UTF-8-data block */
166     for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
167         newTrie->map[i]=0;
168     }
169     /*
170      * Reference counts for the null data block: all blocks except for the ASCII blocks.
171      * Plus 1 so that we don't drop this block during compaction.
172      * Plus as many as needed for lead surrogate code points.
173      */
174     /* i==newTrie->dataNullOffset */
175     newTrie->map[i++]=
176         (0x110000>>UTRIE2_SHIFT_2)-
177         (0x80>>UTRIE2_SHIFT_2)+
178         1+
179         UTRIE2_LSCP_INDEX_2_LENGTH;
180     j+=UTRIE2_DATA_BLOCK_LENGTH;
181     for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
182         newTrie->map[i]=0;
183     }
184 
185     /*
186      * set the remaining indexes in the BMP index-2 block
187      * to the null data block
188      */
189     for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
190         newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
191     }
192 
193     /*
194      * Fill the index gap with impossible values so that compaction
195      * does not overlap other index-2 blocks with the gap.
196      */
197     for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
198         newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
199     }
200 
201     /* set the indexes in the null index-2 block */
202     for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
203         newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
204     }
205     newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
206     newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
207 
208     /* set the index-1 indexes for the linear index-2 block */
209     for(i=0, j=0;
210         i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
211         ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
212     ) {
213         newTrie->index1[i]=j;
214     }
215 
216     /* set the remaining index-1 indexes to the null index-2 block */
217     for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
218         newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
219     }
220 
221     /*
222      * Preallocate and reset data for U+0080..U+07ff,
223      * for 2-byte UTF-8 which will be compacted in 64-blocks
224      * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
225      */
226     for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
227         utrie2_set32(trie, i, initialValue, pErrorCode);
228     }
229 
230     return trie;
231 }
232 
233 static UNewTrie2 *
cloneBuilder(const UNewTrie2 * other)234 cloneBuilder(const UNewTrie2 *other) {
235     UNewTrie2 *trie;
236 
237     trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
238     if(trie==NULL) {
239         return NULL;
240     }
241 
242     trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4);
243     if(trie->data==NULL) {
244         uprv_free(trie);
245         return NULL;
246     }
247     trie->dataCapacity=other->dataCapacity;
248 
249     /* clone data */
250     uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1));
251     uprv_memcpy(trie->index2, other->index2, other->index2Length*4);
252     trie->index2NullOffset=other->index2NullOffset;
253     trie->index2Length=other->index2Length;
254 
255     uprv_memcpy(trie->data, other->data, other->dataLength*4);
256     trie->dataNullOffset=other->dataNullOffset;
257     trie->dataLength=other->dataLength;
258 
259     /* reference counters */
260     if(other->isCompacted) {
261         trie->firstFreeBlock=0;
262     } else {
263         uprv_memcpy(trie->map, other->map, (other->dataLength>>UTRIE2_SHIFT_2)*4);
264         trie->firstFreeBlock=other->firstFreeBlock;
265     }
266 
267     trie->initialValue=other->initialValue;
268     trie->errorValue=other->errorValue;
269     trie->highStart=other->highStart;
270     trie->isCompacted=other->isCompacted;
271 
272     return trie;
273 }
274 
275 U_CAPI UTrie2 * U_EXPORT2
utrie2_clone(const UTrie2 * other,UErrorCode * pErrorCode)276 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) {
277     UTrie2 *trie;
278 
279     if(U_FAILURE(*pErrorCode)) {
280         return NULL;
281     }
282     if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
283         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
284         return NULL;
285     }
286 
287     trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
288     if(trie==NULL) {
289         return NULL;
290     }
291     uprv_memcpy(trie, other, sizeof(UTrie2));
292 
293     if(other->memory!=NULL) {
294         trie->memory=uprv_malloc(other->length);
295         if(trie->memory!=NULL) {
296             trie->isMemoryOwned=TRUE;
297             uprv_memcpy(trie->memory, other->memory, other->length);
298 
299             /* make the clone's pointers point to its own memory */
300             trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory);
301             if(other->data16!=NULL) {
302                 trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory);
303             }
304             if(other->data32!=NULL) {
305                 trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory);
306             }
307         }
308     } else /* other->newTrie!=NULL */ {
309         trie->newTrie=cloneBuilder(other->newTrie);
310     }
311 
312     if(trie->memory==NULL && trie->newTrie==NULL) {
313         uprv_free(trie);
314         trie=NULL;
315     }
316     return trie;
317 }
318 
319 typedef struct NewTrieAndStatus {
320     UTrie2 *trie;
321     UErrorCode errorCode;
322     UBool exclusiveLimit;  /* rather than inclusive range end */
323 } NewTrieAndStatus;
324 
325 static UBool U_CALLCONV
copyEnumRange(const void * context,UChar32 start,UChar32 end,uint32_t value)326 copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
327     NewTrieAndStatus *nt=(NewTrieAndStatus *)context;
328     if(value!=nt->trie->initialValue) {
329         if(nt->exclusiveLimit) {
330             --end;
331         }
332         if(start==end) {
333             utrie2_set32(nt->trie, start, value, &nt->errorCode);
334         } else {
335             utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode);
336         }
337         return U_SUCCESS(nt->errorCode);
338     } else {
339         return TRUE;
340     }
341 }
342 
343 #ifdef UTRIE2_DEBUG
344 static void
utrie_printLengths(const UTrie * trie)345 utrie_printLengths(const UTrie *trie) {
346     long indexLength=trie->indexLength;
347     long dataLength=(long)trie->dataLength;
348     long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
349     printf("**UTrieLengths** index:%6ld  data:%6ld  serialized:%6ld\n",
350            indexLength, dataLength, totalLength);
351 }
352 
353 static void
utrie2_printLengths(const UTrie2 * trie,const char * which)354 utrie2_printLengths(const UTrie2 *trie, const char *which) {
355     long indexLength=trie->indexLength;
356     long dataLength=(long)trie->dataLength;
357     long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
358     printf("**UTrie2Lengths(%s)** index:%6ld  data:%6ld  serialized:%6ld\n",
359            which, indexLength, dataLength, totalLength);
360 }
361 #endif
362 
363 U_CAPI UTrie2 * U_EXPORT2
utrie2_cloneAsThawed(const UTrie2 * other,UErrorCode * pErrorCode)364 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) {
365     NewTrieAndStatus context;
366     UChar lead;
367 
368     if(U_FAILURE(*pErrorCode)) {
369         return NULL;
370     }
371     if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
372         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
373         return NULL;
374     }
375     if(other->newTrie!=NULL && !other->newTrie->isCompacted) {
376         return utrie2_clone(other, pErrorCode);  /* clone an unfrozen trie */
377     }
378 
379     /* Clone the frozen trie by enumerating it and building a new one. */
380     context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode);
381     if(U_FAILURE(*pErrorCode)) {
382         return NULL;
383     }
384     context.exclusiveLimit=FALSE;
385     context.errorCode=*pErrorCode;
386     utrie2_enum(other, NULL, copyEnumRange, &context);
387     *pErrorCode=context.errorCode;
388     for(lead=0xd800; lead<0xdc00; ++lead) {
389         uint32_t value;
390         if(other->data32==NULL) {
391             value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead);
392         } else {
393             value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead);
394         }
395         if(value!=other->initialValue) {
396             utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
397         }
398     }
399     if(U_FAILURE(*pErrorCode)) {
400         utrie2_close(context.trie);
401         context.trie=NULL;
402     }
403     return context.trie;
404 }
405 
406 /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */
407 U_CAPI UTrie2 * U_EXPORT2
utrie2_fromUTrie(const UTrie * trie1,uint32_t errorValue,UErrorCode * pErrorCode)408 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) {
409     NewTrieAndStatus context;
410     UChar lead;
411 
412     if(U_FAILURE(*pErrorCode)) {
413         return NULL;
414     }
415     if(trie1==NULL) {
416         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
417         return NULL;
418     }
419     context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode);
420     if(U_FAILURE(*pErrorCode)) {
421         return NULL;
422     }
423     context.exclusiveLimit=TRUE;
424     context.errorCode=*pErrorCode;
425     utrie_enum(trie1, NULL, copyEnumRange, &context);
426     *pErrorCode=context.errorCode;
427     for(lead=0xd800; lead<0xdc00; ++lead) {
428         uint32_t value;
429         if(trie1->data32==NULL) {
430             value=UTRIE_GET16_FROM_LEAD(trie1, lead);
431         } else {
432             value=UTRIE_GET32_FROM_LEAD(trie1, lead);
433         }
434         if(value!=trie1->initialValue) {
435             utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
436         }
437     }
438     if(U_SUCCESS(*pErrorCode)) {
439         utrie2_freeze(context.trie,
440                       trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS,
441                       pErrorCode);
442     }
443 #ifdef UTRIE2_DEBUG
444     if(U_SUCCESS(*pErrorCode)) {
445         utrie_printLengths(trie1);
446         utrie2_printLengths(context.trie, "fromUTrie");
447     }
448 #endif
449     if(U_FAILURE(*pErrorCode)) {
450         utrie2_close(context.trie);
451         context.trie=NULL;
452     }
453     return context.trie;
454 }
455 
456 static inline UBool
isInNullBlock(UNewTrie2 * trie,UChar32 c,UBool forLSCP)457 isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
458     int32_t i2, block;
459 
460     if(U_IS_LEAD(c) && forLSCP) {
461         i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
462             (c>>UTRIE2_SHIFT_2);
463     } else {
464         i2=trie->index1[c>>UTRIE2_SHIFT_1]+
465             ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
466     }
467     block=trie->index2[i2];
468     return (UBool)(block==trie->dataNullOffset);
469 }
470 
471 static int32_t
allocIndex2Block(UNewTrie2 * trie)472 allocIndex2Block(UNewTrie2 *trie) {
473     int32_t newBlock, newTop;
474 
475     newBlock=trie->index2Length;
476     newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
477     if(newTop>UPRV_LENGTHOF(trie->index2)) {
478         /*
479          * Should never occur.
480          * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
481          * or the code writes more values than should be possible.
482          */
483         return -1;
484     }
485     trie->index2Length=newTop;
486     uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4);
487     return newBlock;
488 }
489 
490 static int32_t
getIndex2Block(UNewTrie2 * trie,UChar32 c,UBool forLSCP)491 getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
492     int32_t i1, i2;
493 
494     if(U_IS_LEAD(c) && forLSCP) {
495         return UTRIE2_LSCP_INDEX_2_OFFSET;
496     }
497 
498     i1=c>>UTRIE2_SHIFT_1;
499     i2=trie->index1[i1];
500     if(i2==trie->index2NullOffset) {
501         i2=allocIndex2Block(trie);
502         if(i2<0) {
503             return -1;  /* program error */
504         }
505         trie->index1[i1]=i2;
506     }
507     return i2;
508 }
509 
510 static int32_t
allocDataBlock(UNewTrie2 * trie,int32_t copyBlock)511 allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) {
512     int32_t newBlock, newTop;
513 
514     if(trie->firstFreeBlock!=0) {
515         /* get the first free block */
516         newBlock=trie->firstFreeBlock;
517         trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2];
518     } else {
519         /* get a new block from the high end */
520         newBlock=trie->dataLength;
521         newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
522         if(newTop>trie->dataCapacity) {
523             /* out of memory in the data array */
524             int32_t capacity;
525             uint32_t *data;
526 
527             if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
528                 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
529             } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
530                 capacity=UNEWTRIE2_MAX_DATA_LENGTH;
531             } else {
532                 /*
533                  * Should never occur.
534                  * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
535                  * or the code writes more values than should be possible.
536                  */
537                 return -1;
538             }
539             data=(uint32_t *)uprv_malloc(capacity*4);
540             if(data==NULL) {
541                 return -1;
542             }
543             uprv_memcpy(data, trie->data, trie->dataLength*4);
544             uprv_free(trie->data);
545             trie->data=data;
546             trie->dataCapacity=capacity;
547         }
548         trie->dataLength=newTop;
549     }
550     uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4);
551     trie->map[newBlock>>UTRIE2_SHIFT_2]=0;
552     return newBlock;
553 }
554 
555 /* call when the block's reference counter reaches 0 */
556 static void
releaseDataBlock(UNewTrie2 * trie,int32_t block)557 releaseDataBlock(UNewTrie2 *trie, int32_t block) {
558     /* put this block at the front of the free-block chain */
559     trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock;
560     trie->firstFreeBlock=block;
561 }
562 
563 static inline UBool
isWritableBlock(UNewTrie2 * trie,int32_t block)564 isWritableBlock(UNewTrie2 *trie, int32_t block) {
565     return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]);
566 }
567 
568 static inline void
setIndex2Entry(UNewTrie2 * trie,int32_t i2,int32_t block)569 setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) {
570     int32_t oldBlock;
571     ++trie->map[block>>UTRIE2_SHIFT_2];  /* increment first, in case block==oldBlock! */
572     oldBlock=trie->index2[i2];
573     if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) {
574         releaseDataBlock(trie, oldBlock);
575     }
576     trie->index2[i2]=block;
577 }
578 
579 /**
580  * No error checking for illegal arguments.
581  *
582  * @return -1 if no new data block available (out of memory in data array)
583  * @internal
584  */
585 static int32_t
getDataBlock(UNewTrie2 * trie,UChar32 c,UBool forLSCP)586 getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
587     int32_t i2, oldBlock, newBlock;
588 
589     i2=getIndex2Block(trie, c, forLSCP);
590     if(i2<0) {
591         return -1;  /* program error */
592     }
593 
594     i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
595     oldBlock=trie->index2[i2];
596     if(isWritableBlock(trie, oldBlock)) {
597         return oldBlock;
598     }
599 
600     /* allocate a new data block */
601     newBlock=allocDataBlock(trie, oldBlock);
602     if(newBlock<0) {
603         /* out of memory in the data array */
604         return -1;
605     }
606     setIndex2Entry(trie, i2, newBlock);
607     return newBlock;
608 }
609 
610 /**
611  * @return TRUE if the value was successfully set
612  */
613 static void
set32(UNewTrie2 * trie,UChar32 c,UBool forLSCP,uint32_t value,UErrorCode * pErrorCode)614 set32(UNewTrie2 *trie,
615       UChar32 c, UBool forLSCP, uint32_t value,
616       UErrorCode *pErrorCode) {
617     int32_t block;
618 
619     if(trie==NULL || trie->isCompacted) {
620         *pErrorCode=U_NO_WRITE_PERMISSION;
621         return;
622     }
623 
624     block=getDataBlock(trie, c, forLSCP);
625     if(block<0) {
626         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
627         return;
628     }
629 
630     trie->data[block+(c&UTRIE2_DATA_MASK)]=value;
631 }
632 
633 U_CAPI void U_EXPORT2
utrie2_set32(UTrie2 * trie,UChar32 c,uint32_t value,UErrorCode * pErrorCode)634 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
635     if(U_FAILURE(*pErrorCode)) {
636         return;
637     }
638     if((uint32_t)c>0x10ffff) {
639         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
640         return;
641     }
642     set32(trie->newTrie, c, TRUE, value, pErrorCode);
643 }
644 
645 U_CAPI void U_EXPORT2
utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 * trie,UChar32 c,uint32_t value,UErrorCode * pErrorCode)646 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
647                                      UChar32 c, uint32_t value,
648                                      UErrorCode *pErrorCode) {
649     if(U_FAILURE(*pErrorCode)) {
650         return;
651     }
652     if(!U_IS_LEAD(c)) {
653         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
654         return;
655     }
656     set32(trie->newTrie, c, FALSE, value, pErrorCode);
657 }
658 
659 static void
writeBlock(uint32_t * block,uint32_t value)660 writeBlock(uint32_t *block, uint32_t value) {
661     uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH;
662     while(block<limit) {
663         *block++=value;
664     }
665 }
666 
667 /**
668  * initialValue is ignored if overwrite=TRUE
669  * @internal
670  */
671 static void
fillBlock(uint32_t * block,UChar32 start,UChar32 limit,uint32_t value,uint32_t initialValue,UBool overwrite)672 fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
673           uint32_t value, uint32_t initialValue, UBool overwrite) {
674     uint32_t *pLimit;
675 
676     pLimit=block+limit;
677     block+=start;
678     if(overwrite) {
679         while(block<pLimit) {
680             *block++=value;
681         }
682     } else {
683         while(block<pLimit) {
684             if(*block==initialValue) {
685                 *block=value;
686             }
687             ++block;
688         }
689     }
690 }
691 
692 U_CAPI void U_EXPORT2
utrie2_setRange32(UTrie2 * trie,UChar32 start,UChar32 end,uint32_t value,UBool overwrite,UErrorCode * pErrorCode)693 utrie2_setRange32(UTrie2 *trie,
694                   UChar32 start, UChar32 end,
695                   uint32_t value, UBool overwrite,
696                   UErrorCode *pErrorCode) {
697     /*
698      * repeat value in [start..end]
699      * mark index values for repeat-data blocks by setting bit 31 of the index values
700      * fill around existing values if any, if(overwrite)
701      */
702     UNewTrie2 *newTrie;
703     int32_t block, rest, repeatBlock;
704     UChar32 limit;
705 
706     if(U_FAILURE(*pErrorCode)) {
707         return;
708     }
709     if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) {
710         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
711         return;
712     }
713     newTrie=trie->newTrie;
714     if(newTrie==NULL || newTrie->isCompacted) {
715         *pErrorCode=U_NO_WRITE_PERMISSION;
716         return;
717     }
718     if(!overwrite && value==newTrie->initialValue) {
719         return; /* nothing to do */
720     }
721 
722     limit=end+1;
723     if(start&UTRIE2_DATA_MASK) {
724         UChar32 nextStart;
725 
726         /* set partial block at [start..following block boundary[ */
727         block=getDataBlock(newTrie, start, TRUE);
728         if(block<0) {
729             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
730             return;
731         }
732 
733         nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK;
734         if(nextStart<=limit) {
735             fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
736                       value, newTrie->initialValue, overwrite);
737             start=nextStart;
738         } else {
739             fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
740                       value, newTrie->initialValue, overwrite);
741             return;
742         }
743     }
744 
745     /* number of positions in the last, partial block */
746     rest=limit&UTRIE2_DATA_MASK;
747 
748     /* round down limit to a block boundary */
749     limit&=~UTRIE2_DATA_MASK;
750 
751     /* iterate over all-value blocks */
752     if(value==newTrie->initialValue) {
753         repeatBlock=newTrie->dataNullOffset;
754     } else {
755         repeatBlock=-1;
756     }
757 
758     while(start<limit) {
759         int32_t i2;
760         UBool setRepeatBlock=FALSE;
761 
762         if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) {
763             start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
764             continue;
765         }
766 
767         /* get index value */
768         i2=getIndex2Block(newTrie, start, TRUE);
769         if(i2<0) {
770             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
771             return;
772         }
773         i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
774         block=newTrie->index2[i2];
775         if(isWritableBlock(newTrie, block)) {
776             /* already allocated */
777             if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
778                 /*
779                  * We overwrite all values, and it's not a
780                  * protected (ASCII-linear or 2-byte UTF-8) block:
781                  * replace with the repeatBlock.
782                  */
783                 setRepeatBlock=TRUE;
784             } else {
785                 /* !overwrite, or protected block: just write the values into this block */
786                 fillBlock(newTrie->data+block,
787                           0, UTRIE2_DATA_BLOCK_LENGTH,
788                           value, newTrie->initialValue, overwrite);
789             }
790         } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) {
791             /*
792              * Set the repeatBlock instead of the null block or previous repeat block:
793              *
794              * If !isWritableBlock() then all entries in the block have the same value
795              * because it's the null block or a range block (the repeatBlock from a previous
796              * call to utrie2_setRange32()).
797              * No other blocks are used multiple times before compacting.
798              *
799              * The null block is the only non-writable block with the initialValue because
800              * of the repeatBlock initialization above. (If value==initialValue, then
801              * the repeatBlock will be the null data block.)
802              *
803              * We set our repeatBlock if the desired value differs from the block's value,
804              * and if we overwrite any data or if the data is all initial values
805              * (which is the same as the block being the null block, see above).
806              */
807             setRepeatBlock=TRUE;
808         }
809         if(setRepeatBlock) {
810             if(repeatBlock>=0) {
811                 setIndex2Entry(newTrie, i2, repeatBlock);
812             } else {
813                 /* create and set and fill the repeatBlock */
814                 repeatBlock=getDataBlock(newTrie, start, TRUE);
815                 if(repeatBlock<0) {
816                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
817                     return;
818                 }
819                 writeBlock(newTrie->data+repeatBlock, value);
820             }
821         }
822 
823         start+=UTRIE2_DATA_BLOCK_LENGTH;
824     }
825 
826     if(rest>0) {
827         /* set partial block at [last block boundary..limit[ */
828         block=getDataBlock(newTrie, start, TRUE);
829         if(block<0) {
830             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
831             return;
832         }
833 
834         fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite);
835     }
836 
837     return;
838 }
839 
840 /* compaction --------------------------------------------------------------- */
841 
842 static inline UBool
equal_int32(const int32_t * s,const int32_t * t,int32_t length)843 equal_int32(const int32_t *s, const int32_t *t, int32_t length) {
844     while(length>0 && *s==*t) {
845         ++s;
846         ++t;
847         --length;
848     }
849     return (UBool)(length==0);
850 }
851 
852 static inline UBool
equal_uint32(const uint32_t * s,const uint32_t * t,int32_t length)853 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
854     while(length>0 && *s==*t) {
855         ++s;
856         ++t;
857         --length;
858     }
859     return (UBool)(length==0);
860 }
861 
862 static int32_t
findSameIndex2Block(const int32_t * idx,int32_t index2Length,int32_t otherBlock)863 findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) {
864     int32_t block;
865 
866     /* ensure that we do not even partially get past index2Length */
867     index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
868 
869     for(block=0; block<=index2Length; ++block) {
870         if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
871             return block;
872         }
873     }
874     return -1;
875 }
876 
877 static int32_t
findSameDataBlock(const uint32_t * data,int32_t dataLength,int32_t otherBlock,int32_t blockLength)878 findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) {
879     int32_t block;
880 
881     /* ensure that we do not even partially get past dataLength */
882     dataLength-=blockLength;
883 
884     for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
885         if(equal_uint32(data+block, data+otherBlock, blockLength)) {
886             return block;
887         }
888     }
889     return -1;
890 }
891 
892 /*
893  * Find the start of the last range in the trie by enumerating backward.
894  * Indexes for supplementary code points higher than this will be omitted.
895  */
896 static UChar32
findHighStart(UNewTrie2 * trie,uint32_t highValue)897 findHighStart(UNewTrie2 *trie, uint32_t highValue) {
898     const uint32_t *data32;
899 
900     uint32_t value, initialValue;
901     UChar32 c, prev;
902     int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock;
903 
904     data32=trie->data;
905     initialValue=trie->initialValue;
906 
907     index2NullOffset=trie->index2NullOffset;
908     nullBlock=trie->dataNullOffset;
909 
910     /* set variables for previous range */
911     if(highValue==initialValue) {
912         prevI2Block=index2NullOffset;
913         prevBlock=nullBlock;
914     } else {
915         prevI2Block=-1;
916         prevBlock=-1;
917     }
918     prev=0x110000;
919 
920     /* enumerate index-2 blocks */
921     i1=UNEWTRIE2_INDEX_1_LENGTH;
922     c=prev;
923     while(c>0) {
924         i2Block=trie->index1[--i1];
925         if(i2Block==prevI2Block) {
926             /* the index-2 block is the same as the previous one, and filled with highValue */
927             c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
928             continue;
929         }
930         prevI2Block=i2Block;
931         if(i2Block==index2NullOffset) {
932             /* this is the null index-2 block */
933             if(highValue!=initialValue) {
934                 return c;
935             }
936             c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
937         } else {
938             /* enumerate data blocks for one index-2 block */
939             for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
940                 block=trie->index2[i2Block+ --i2];
941                 if(block==prevBlock) {
942                     /* the block is the same as the previous one, and filled with highValue */
943                     c-=UTRIE2_DATA_BLOCK_LENGTH;
944                     continue;
945                 }
946                 prevBlock=block;
947                 if(block==nullBlock) {
948                     /* this is the null data block */
949                     if(highValue!=initialValue) {
950                         return c;
951                     }
952                     c-=UTRIE2_DATA_BLOCK_LENGTH;
953                 } else {
954                     for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
955                         value=data32[block+ --j];
956                         if(value!=highValue) {
957                             return c;
958                         }
959                         --c;
960                     }
961                 }
962             }
963         }
964     }
965 
966     /* deliver last range */
967     return 0;
968 }
969 
970 /*
971  * Compact a build-time trie.
972  *
973  * The compaction
974  * - removes blocks that are identical with earlier ones
975  * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
976  * - moves blocks in steps of the data granularity
977  * - moves and overlaps blocks that overlap with multiple values in the overlap region
978  *
979  * It does not
980  * - try to move and overlap blocks that are not already adjacent
981  */
982 static void
compactData(UNewTrie2 * trie)983 compactData(UNewTrie2 *trie) {
984     int32_t start, newStart, movedStart;
985     int32_t blockLength, overlap;
986     int32_t i, mapIndex, blockCount;
987 
988     /* do not compact linear-ASCII data */
989     newStart=UTRIE2_DATA_START_OFFSET;
990     for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
991         trie->map[i]=start;
992     }
993 
994     /*
995      * Start with a block length of 64 for 2-byte UTF-8,
996      * then switch to UTRIE2_DATA_BLOCK_LENGTH.
997      */
998     blockLength=64;
999     blockCount=blockLength>>UTRIE2_SHIFT_2;
1000     for(start=newStart; start<trie->dataLength;) {
1001         /*
1002          * start: index of first entry of current block
1003          * newStart: index where the current block is to be moved
1004          *           (right after current end of already-compacted data)
1005          */
1006         if(start==UNEWTRIE2_DATA_0800_OFFSET) {
1007             blockLength=UTRIE2_DATA_BLOCK_LENGTH;
1008             blockCount=1;
1009         }
1010 
1011         /* skip blocks that are not used */
1012         if(trie->map[start>>UTRIE2_SHIFT_2]<=0) {
1013             /* advance start to the next block */
1014             start+=blockLength;
1015 
1016             /* leave newStart with the previous block! */
1017             continue;
1018         }
1019 
1020         /* search for an identical block */
1021         if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength))
1022              >=0
1023         ) {
1024             /* found an identical block, set the other block's index value for the current block */
1025             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1026                 trie->map[mapIndex++]=movedStart;
1027                 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1028             }
1029 
1030             /* advance start to the next block */
1031             start+=blockLength;
1032 
1033             /* leave newStart with the previous block! */
1034             continue;
1035         }
1036 
1037         /* see if the beginning of this block can be overlapped with the end of the previous block */
1038         /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
1039         for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
1040             overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap);
1041             overlap-=UTRIE2_DATA_GRANULARITY) {}
1042 
1043         if(overlap>0 || newStart<start) {
1044             /* some overlap, or just move the whole block */
1045             movedStart=newStart-overlap;
1046             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1047                 trie->map[mapIndex++]=movedStart;
1048                 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1049             }
1050 
1051             /* move the non-overlapping indexes to their new positions */
1052             start+=overlap;
1053             for(i=blockLength-overlap; i>0; --i) {
1054                 trie->data[newStart++]=trie->data[start++];
1055             }
1056         } else /* no overlap && newStart==start */ {
1057             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1058                 trie->map[mapIndex++]=start;
1059                 start+=UTRIE2_DATA_BLOCK_LENGTH;
1060             }
1061             newStart=start;
1062         }
1063     }
1064 
1065     /* now adjust the index-2 table */
1066     for(i=0; i<trie->index2Length; ++i) {
1067         if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
1068             /* Gap indexes are invalid (-1). Skip over the gap. */
1069             i+=UNEWTRIE2_INDEX_GAP_LENGTH;
1070         }
1071         trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2];
1072     }
1073     trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2];
1074 
1075     /* ensure dataLength alignment */
1076     while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1077         trie->data[newStart++]=trie->initialValue;
1078     }
1079 
1080 #ifdef UTRIE2_DEBUG
1081     /* we saved some space */
1082     printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n",
1083             (long)trie->dataLength, (long)newStart);
1084 #endif
1085 
1086     trie->dataLength=newStart;
1087 }
1088 
1089 static void
compactIndex2(UNewTrie2 * trie)1090 compactIndex2(UNewTrie2 *trie) {
1091     int32_t i, start, newStart, movedStart, overlap;
1092 
1093     /* do not compact linear-BMP index-2 blocks */
1094     newStart=UTRIE2_INDEX_2_BMP_LENGTH;
1095     for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
1096         trie->map[i]=start;
1097     }
1098 
1099     /* Reduce the index table gap to what will be needed at runtime. */
1100     newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1);
1101 
1102     for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) {
1103         /*
1104          * start: index of first entry of current block
1105          * newStart: index where the current block is to be moved
1106          *           (right after current end of already-compacted data)
1107          */
1108 
1109         /* search for an identical block */
1110         if( (movedStart=findSameIndex2Block(trie->index2, newStart, start))
1111              >=0
1112         ) {
1113             /* found an identical block, set the other block's index value for the current block */
1114             trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart;
1115 
1116             /* advance start to the next block */
1117             start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1118 
1119             /* leave newStart with the previous block! */
1120             continue;
1121         }
1122 
1123         /* see if the beginning of this block can be overlapped with the end of the previous block */
1124         /* look for maximum overlap with the previous, adjacent block */
1125         for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
1126             overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap);
1127             --overlap) {}
1128 
1129         if(overlap>0 || newStart<start) {
1130             /* some overlap, or just move the whole block */
1131             trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
1132 
1133             /* move the non-overlapping indexes to their new positions */
1134             start+=overlap;
1135             for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
1136                 trie->index2[newStart++]=trie->index2[start++];
1137             }
1138         } else /* no overlap && newStart==start */ {
1139             trie->map[start>>UTRIE2_SHIFT_1_2]=start;
1140             start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1141             newStart=start;
1142         }
1143     }
1144 
1145     /* now adjust the index-1 table */
1146     for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
1147         trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2];
1148     }
1149     trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2];
1150 
1151     /*
1152      * Ensure data table alignment:
1153      * Needs to be granularity-aligned for 16-bit trie
1154      * (so that dataMove will be down-shiftable),
1155      * and 2-aligned for uint32_t data.
1156      */
1157     while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
1158         /* Arbitrary value: 0x3fffc not possible for real data. */
1159         trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT;
1160     }
1161 
1162 #ifdef UTRIE2_DEBUG
1163     /* we saved some space */
1164     printf("compacting UTrie2: count of 16-bit index-2 words %lu->%lu\n",
1165             (long)trie->index2Length, (long)newStart);
1166 #endif
1167 
1168     trie->index2Length=newStart;
1169 }
1170 
1171 static void
compactTrie(UTrie2 * trie,UErrorCode * pErrorCode)1172 compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) {
1173     UNewTrie2 *newTrie;
1174     UChar32 highStart, suppHighStart;
1175     uint32_t highValue;
1176 
1177     newTrie=trie->newTrie;
1178 
1179     /* find highStart and round it up */
1180     highValue=utrie2_get32(trie, 0x10ffff);
1181     highStart=findHighStart(newTrie, highValue);
1182     highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
1183     if(highStart==0x110000) {
1184         highValue=trie->errorValue;
1185     }
1186 
1187     /*
1188      * Set trie->highStart only after utrie2_get32(trie, highStart).
1189      * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
1190      */
1191     trie->highStart=newTrie->highStart=highStart;
1192 
1193 #ifdef UTRIE2_DEBUG
1194     printf("UTrie2: highStart U+%04lx  highValue 0x%lx  initialValue 0x%lx\n",
1195             (long)highStart, (long)highValue, (long)trie->initialValue);
1196 #endif
1197 
1198     if(highStart<0x110000) {
1199         /* Blank out [highStart..10ffff] to release associated data blocks. */
1200         suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
1201         utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode);
1202         if(U_FAILURE(*pErrorCode)) {
1203             return;
1204         }
1205     }
1206 
1207     compactData(newTrie);
1208     if(highStart>0x10000) {
1209         compactIndex2(newTrie);
1210 #ifdef UTRIE2_DEBUG
1211     } else {
1212         printf("UTrie2: highStart U+%04lx  count of 16-bit index-2 words %lu->%lu\n",
1213                 (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET);
1214 #endif
1215     }
1216 
1217     /*
1218      * Store the highValue in the data array and round up the dataLength.
1219      * Must be done after compactData() because that assumes that dataLength
1220      * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
1221      */
1222     newTrie->data[newTrie->dataLength++]=highValue;
1223     while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1224         newTrie->data[newTrie->dataLength++]=trie->initialValue;
1225     }
1226 
1227     newTrie->isCompacted=TRUE;
1228 }
1229 
1230 /* serialization ------------------------------------------------------------ */
1231 
1232 /**
1233  * Maximum length of the runtime index array.
1234  * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
1235  * (The actual maximum length is lower,
1236  * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
1237  */
1238 #define UTRIE2_MAX_INDEX_LENGTH 0xffff
1239 
1240 /**
1241  * Maximum length of the runtime data array.
1242  * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
1243  * and by uint16_t UTrie2Header.shiftedDataLength.
1244  */
1245 #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT)
1246 
1247 /* Compact and internally serialize the trie. */
1248 U_CAPI void U_EXPORT2
utrie2_freeze(UTrie2 * trie,UTrie2ValueBits valueBits,UErrorCode * pErrorCode)1249 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) {
1250     UNewTrie2 *newTrie;
1251     UTrie2Header *header;
1252     uint32_t *p;
1253     uint16_t *dest16;
1254     int32_t i, length;
1255     int32_t allIndexesLength;
1256     int32_t dataMove;  /* >0 if the data is moved to the end of the index array */
1257     UChar32 highStart;
1258 
1259     /* argument check */
1260     if(U_FAILURE(*pErrorCode)) {
1261         return;
1262     }
1263     if( trie==NULL ||
1264         valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits
1265     ) {
1266         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1267         return;
1268     }
1269     newTrie=trie->newTrie;
1270     if(newTrie==NULL) {
1271         /* already frozen */
1272         UTrie2ValueBits frozenValueBits=
1273             trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS;
1274         if(valueBits!=frozenValueBits) {
1275             *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1276         }
1277         return;
1278     }
1279 
1280     /* compact if necessary */
1281     if(!newTrie->isCompacted) {
1282         compactTrie(trie, pErrorCode);
1283         if(U_FAILURE(*pErrorCode)) {
1284             return;
1285         }
1286     }
1287     highStart=trie->highStart;
1288 
1289     if(highStart<=0x10000) {
1290         allIndexesLength=UTRIE2_INDEX_1_OFFSET;
1291     } else {
1292         allIndexesLength=newTrie->index2Length;
1293     }
1294     if(valueBits==UTRIE2_16_VALUE_BITS) {
1295         dataMove=allIndexesLength;
1296     } else {
1297         dataMove=0;
1298     }
1299 
1300     /* are indexLength and dataLength within limits? */
1301     if( /* for unshifted indexLength */
1302         allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
1303         /* for unshifted dataNullOffset */
1304         (dataMove+newTrie->dataNullOffset)>0xffff ||
1305         /* for unshifted 2-byte UTF-8 index-2 values */
1306         (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
1307         /* for shiftedDataLength */
1308         (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH
1309     ) {
1310         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1311         return;
1312     }
1313 
1314     /* calculate the total serialized length */
1315     length=sizeof(UTrie2Header)+allIndexesLength*2;
1316     if(valueBits==UTRIE2_16_VALUE_BITS) {
1317         length+=newTrie->dataLength*2;
1318     } else {
1319         length+=newTrie->dataLength*4;
1320     }
1321 
1322     trie->memory=uprv_malloc(length);
1323     if(trie->memory==NULL) {
1324         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1325         return;
1326     }
1327     trie->length=length;
1328     trie->isMemoryOwned=TRUE;
1329 
1330     trie->indexLength=allIndexesLength;
1331     trie->dataLength=newTrie->dataLength;
1332     if(highStart<=0x10000) {
1333         trie->index2NullOffset=0xffff;
1334     } else {
1335         trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset;
1336     }
1337     trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset);
1338     trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY;
1339 
1340     /* set the header fields */
1341     header=(UTrie2Header *)trie->memory;
1342 
1343     header->signature=UTRIE2_SIG; /* "Tri2" */
1344     header->options=(uint16_t)valueBits;
1345 
1346     header->indexLength=(uint16_t)trie->indexLength;
1347     header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT);
1348     header->index2NullOffset=trie->index2NullOffset;
1349     header->dataNullOffset=trie->dataNullOffset;
1350     header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1);
1351 
1352     /* fill the index and data arrays */
1353     dest16=(uint16_t *)(header+1);
1354     trie->index=dest16;
1355 
1356     /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
1357     p=(uint32_t *)newTrie->index2;
1358     for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) {
1359         *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1360     }
1361 
1362     /* write UTF-8 2-byte index-2 values, not right-shifted */
1363     for(i=0; i<(0xc2-0xc0); ++i) {                                  /* C0..C1 */
1364         *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
1365     }
1366     for(; i<(0xe0-0xc0); ++i) {                                     /* C2..DF */
1367         *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]);
1368     }
1369 
1370     if(highStart>0x10000) {
1371         int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1;
1372         int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length;
1373 
1374         /* write 16-bit index-1 values for supplementary code points */
1375         p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
1376         for(i=index1Length; i>0; --i) {
1377             *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
1378         }
1379 
1380         /*
1381          * write the index-2 array values for supplementary code points,
1382          * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
1383          */
1384         p=(uint32_t *)newTrie->index2+index2Offset;
1385         for(i=newTrie->index2Length-index2Offset; i>0; --i) {
1386             *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1387         }
1388     }
1389 
1390     /* write the 16/32-bit data array */
1391     switch(valueBits) {
1392     case UTRIE2_16_VALUE_BITS:
1393         /* write 16-bit data values */
1394         trie->data16=dest16;
1395         trie->data32=NULL;
1396         p=newTrie->data;
1397         for(i=newTrie->dataLength; i>0; --i) {
1398             *dest16++=(uint16_t)*p++;
1399         }
1400         break;
1401     case UTRIE2_32_VALUE_BITS:
1402         /* write 32-bit data values */
1403         trie->data16=NULL;
1404         trie->data32=(uint32_t *)dest16;
1405         uprv_memcpy(dest16, newTrie->data, newTrie->dataLength*4);
1406         break;
1407     default:
1408         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1409         return;
1410     }
1411 
1412     /* Delete the UNewTrie2. */
1413     uprv_free(newTrie->data);
1414     uprv_free(newTrie);
1415     trie->newTrie=NULL;
1416 }
1417 
1418 /*
1419  * This is here to avoid a dependency from utrie2.cpp on utrie.c.
1420  * This file already depends on utrie.c.
1421  * Otherwise, this should be in utrie2.cpp right after utrie2_swap().
1422  */
1423 U_CAPI int32_t U_EXPORT2
utrie2_swapAnyVersion(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)1424 utrie2_swapAnyVersion(const UDataSwapper *ds,
1425                       const void *inData, int32_t length, void *outData,
1426                       UErrorCode *pErrorCode) {
1427     if(U_SUCCESS(*pErrorCode)) {
1428         switch(utrie2_getVersion(inData, length, TRUE)) {
1429         case 1:
1430             return utrie_swap(ds, inData, length, outData, pErrorCode);
1431         case 2:
1432             return utrie2_swap(ds, inData, length, outData, pErrorCode);
1433         default:
1434             *pErrorCode=U_INVALID_FORMAT_ERROR;
1435             return 0;
1436         }
1437     }
1438     return 0;
1439 }
1440