1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2001-2014, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: utrie2_builder.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2008sep26 (split off from utrie2.c)
14 * created by: Markus W. Scherer
15 *
16 * This is a common implementation of a Unicode trie.
17 * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
18 * Unicode code points (0..0x10ffff).
19 * This is the second common version of a Unicode trie (hence the name UTrie2).
20 * See utrie2.h for a comparison.
21 *
22 * This file contains only the builder code.
23 * See utrie2.c for the runtime and enumeration code.
24 */
25 #ifdef UTRIE2_DEBUG
26 # include <stdio.h>
27 #endif
28
29 #include "unicode/utypes.h"
30 #include "cmemory.h"
31 #include "utrie2.h"
32 #include "utrie2_impl.h"
33
34 #include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */
35
36 /* Implementation notes ----------------------------------------------------- */
37
38 /*
39 * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values
40 * have been chosen to minimize trie sizes overall.
41 * Most of the code is flexible enough to work with a range of values,
42 * within certain limits.
43 *
44 * Exception: Support for separate values for lead surrogate code _units_
45 * vs. code _points_ was added after the constants were fixed,
46 * and has not been tested nor particularly designed for different constant values.
47 * (Especially the utrie2_enum() code that jumps to the special LSCP index-2
48 * part and back.)
49 *
50 * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data
51 * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH
52 * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction
53 * remapping) stops working.
54 *
55 * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate()
56 * assumes that a single index-2 block is used for 0x400 code points
57 * corresponding to one lead surrogate.
58 *
59 * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains
60 * more than one Unicode plane, and the split of the index-2 table into a BMP
61 * part and a supplementary part, with a gap in between, would not work.
62 *
63 * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because
64 * there is data with more than 64k distinct values,
65 * for example for Unihan collation with a separate collation weight per
66 * Han character.
67 */
68
69 /* Building a trie ----------------------------------------------------------*/
70
71 enum {
72 /** The null index-2 block, following the gap in the index-2 table. */
73 UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH,
74
75 /** The start of allocated index-2 blocks. */
76 UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH,
77
78 /**
79 * The null data block.
80 * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
81 * to work with 6-bit trail bytes from 2-byte UTF-8.
82 */
83 UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET,
84
85 /** The start of allocated data blocks. */
86 UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40,
87
88 /**
89 * The start of data blocks for U+0800 and above.
90 * Below, compaction uses a block length of 64 for 2-byte UTF-8.
91 * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
92 * Data values for 0x780 code points beyond ASCII.
93 */
94 UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780
95 };
96
97 /* Start with allocation of 16k data entries. */
98 #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14)
99
100 /* Grow about 8x each time. */
101 #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17)
102
103 static int32_t
104 allocIndex2Block(UNewTrie2 *trie);
105
106 U_CAPI UTrie2 * U_EXPORT2
utrie2_open(uint32_t initialValue,uint32_t errorValue,UErrorCode * pErrorCode)107 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
108 UTrie2 *trie;
109 UNewTrie2 *newTrie;
110 uint32_t *data;
111 int32_t i, j;
112
113 if(U_FAILURE(*pErrorCode)) {
114 return NULL;
115 }
116
117 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
118 newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
119 data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4);
120 if(trie==NULL || newTrie==NULL || data==NULL) {
121 uprv_free(trie);
122 uprv_free(newTrie);
123 uprv_free(data);
124 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
125 return 0;
126 }
127
128 uprv_memset(trie, 0, sizeof(UTrie2));
129 trie->initialValue=initialValue;
130 trie->errorValue=errorValue;
131 trie->highStart=0x110000;
132 trie->newTrie=newTrie;
133
134 newTrie->data=data;
135 newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH;
136 newTrie->initialValue=initialValue;
137 newTrie->errorValue=errorValue;
138 newTrie->highStart=0x110000;
139 newTrie->firstFreeBlock=0; /* no free block in the list */
140 newTrie->isCompacted=FALSE;
141
142 /*
143 * preallocate and reset
144 * - ASCII
145 * - the bad-UTF-8-data block
146 * - the null data block
147 */
148 for(i=0; i<0x80; ++i) {
149 newTrie->data[i]=initialValue;
150 }
151 for(; i<0xc0; ++i) {
152 newTrie->data[i]=errorValue;
153 }
154 for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
155 newTrie->data[i]=initialValue;
156 }
157 newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET;
158 newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET;
159
160 /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
161 for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
162 newTrie->index2[i]=j;
163 newTrie->map[i]=1;
164 }
165 /* reference counts for the bad-UTF-8-data block */
166 for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
167 newTrie->map[i]=0;
168 }
169 /*
170 * Reference counts for the null data block: all blocks except for the ASCII blocks.
171 * Plus 1 so that we don't drop this block during compaction.
172 * Plus as many as needed for lead surrogate code points.
173 */
174 /* i==newTrie->dataNullOffset */
175 newTrie->map[i++]=
176 (0x110000>>UTRIE2_SHIFT_2)-
177 (0x80>>UTRIE2_SHIFT_2)+
178 1+
179 UTRIE2_LSCP_INDEX_2_LENGTH;
180 j+=UTRIE2_DATA_BLOCK_LENGTH;
181 for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
182 newTrie->map[i]=0;
183 }
184
185 /*
186 * set the remaining indexes in the BMP index-2 block
187 * to the null data block
188 */
189 for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
190 newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
191 }
192
193 /*
194 * Fill the index gap with impossible values so that compaction
195 * does not overlap other index-2 blocks with the gap.
196 */
197 for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
198 newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
199 }
200
201 /* set the indexes in the null index-2 block */
202 for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
203 newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
204 }
205 newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
206 newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
207
208 /* set the index-1 indexes for the linear index-2 block */
209 for(i=0, j=0;
210 i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
211 ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
212 ) {
213 newTrie->index1[i]=j;
214 }
215
216 /* set the remaining index-1 indexes to the null index-2 block */
217 for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
218 newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
219 }
220
221 /*
222 * Preallocate and reset data for U+0080..U+07ff,
223 * for 2-byte UTF-8 which will be compacted in 64-blocks
224 * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
225 */
226 for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
227 utrie2_set32(trie, i, initialValue, pErrorCode);
228 }
229
230 return trie;
231 }
232
233 static UNewTrie2 *
cloneBuilder(const UNewTrie2 * other)234 cloneBuilder(const UNewTrie2 *other) {
235 UNewTrie2 *trie;
236
237 trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
238 if(trie==NULL) {
239 return NULL;
240 }
241
242 trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4);
243 if(trie->data==NULL) {
244 uprv_free(trie);
245 return NULL;
246 }
247 trie->dataCapacity=other->dataCapacity;
248
249 /* clone data */
250 uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1));
251 uprv_memcpy(trie->index2, other->index2, other->index2Length*4);
252 trie->index2NullOffset=other->index2NullOffset;
253 trie->index2Length=other->index2Length;
254
255 uprv_memcpy(trie->data, other->data, other->dataLength*4);
256 trie->dataNullOffset=other->dataNullOffset;
257 trie->dataLength=other->dataLength;
258
259 /* reference counters */
260 if(other->isCompacted) {
261 trie->firstFreeBlock=0;
262 } else {
263 uprv_memcpy(trie->map, other->map, (other->dataLength>>UTRIE2_SHIFT_2)*4);
264 trie->firstFreeBlock=other->firstFreeBlock;
265 }
266
267 trie->initialValue=other->initialValue;
268 trie->errorValue=other->errorValue;
269 trie->highStart=other->highStart;
270 trie->isCompacted=other->isCompacted;
271
272 return trie;
273 }
274
275 U_CAPI UTrie2 * U_EXPORT2
utrie2_clone(const UTrie2 * other,UErrorCode * pErrorCode)276 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) {
277 UTrie2 *trie;
278
279 if(U_FAILURE(*pErrorCode)) {
280 return NULL;
281 }
282 if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
283 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
284 return NULL;
285 }
286
287 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
288 if(trie==NULL) {
289 return NULL;
290 }
291 uprv_memcpy(trie, other, sizeof(UTrie2));
292
293 if(other->memory!=NULL) {
294 trie->memory=uprv_malloc(other->length);
295 if(trie->memory!=NULL) {
296 trie->isMemoryOwned=TRUE;
297 uprv_memcpy(trie->memory, other->memory, other->length);
298
299 /* make the clone's pointers point to its own memory */
300 trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory);
301 if(other->data16!=NULL) {
302 trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory);
303 }
304 if(other->data32!=NULL) {
305 trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory);
306 }
307 }
308 } else /* other->newTrie!=NULL */ {
309 trie->newTrie=cloneBuilder(other->newTrie);
310 }
311
312 if(trie->memory==NULL && trie->newTrie==NULL) {
313 uprv_free(trie);
314 trie=NULL;
315 }
316 return trie;
317 }
318
319 typedef struct NewTrieAndStatus {
320 UTrie2 *trie;
321 UErrorCode errorCode;
322 UBool exclusiveLimit; /* rather than inclusive range end */
323 } NewTrieAndStatus;
324
325 static UBool U_CALLCONV
copyEnumRange(const void * context,UChar32 start,UChar32 end,uint32_t value)326 copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
327 NewTrieAndStatus *nt=(NewTrieAndStatus *)context;
328 if(value!=nt->trie->initialValue) {
329 if(nt->exclusiveLimit) {
330 --end;
331 }
332 if(start==end) {
333 utrie2_set32(nt->trie, start, value, &nt->errorCode);
334 } else {
335 utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode);
336 }
337 return U_SUCCESS(nt->errorCode);
338 } else {
339 return TRUE;
340 }
341 }
342
343 #ifdef UTRIE2_DEBUG
344 static void
utrie_printLengths(const UTrie * trie)345 utrie_printLengths(const UTrie *trie) {
346 long indexLength=trie->indexLength;
347 long dataLength=(long)trie->dataLength;
348 long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
349 printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n",
350 indexLength, dataLength, totalLength);
351 }
352
353 static void
utrie2_printLengths(const UTrie2 * trie,const char * which)354 utrie2_printLengths(const UTrie2 *trie, const char *which) {
355 long indexLength=trie->indexLength;
356 long dataLength=(long)trie->dataLength;
357 long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
358 printf("**UTrie2Lengths(%s)** index:%6ld data:%6ld serialized:%6ld\n",
359 which, indexLength, dataLength, totalLength);
360 }
361 #endif
362
363 U_CAPI UTrie2 * U_EXPORT2
utrie2_cloneAsThawed(const UTrie2 * other,UErrorCode * pErrorCode)364 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) {
365 NewTrieAndStatus context;
366 UChar lead;
367
368 if(U_FAILURE(*pErrorCode)) {
369 return NULL;
370 }
371 if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
372 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
373 return NULL;
374 }
375 if(other->newTrie!=NULL && !other->newTrie->isCompacted) {
376 return utrie2_clone(other, pErrorCode); /* clone an unfrozen trie */
377 }
378
379 /* Clone the frozen trie by enumerating it and building a new one. */
380 context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode);
381 if(U_FAILURE(*pErrorCode)) {
382 return NULL;
383 }
384 context.exclusiveLimit=FALSE;
385 context.errorCode=*pErrorCode;
386 utrie2_enum(other, NULL, copyEnumRange, &context);
387 *pErrorCode=context.errorCode;
388 for(lead=0xd800; lead<0xdc00; ++lead) {
389 uint32_t value;
390 if(other->data32==NULL) {
391 value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead);
392 } else {
393 value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead);
394 }
395 if(value!=other->initialValue) {
396 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
397 }
398 }
399 if(U_FAILURE(*pErrorCode)) {
400 utrie2_close(context.trie);
401 context.trie=NULL;
402 }
403 return context.trie;
404 }
405
406 /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */
407 U_CAPI UTrie2 * U_EXPORT2
utrie2_fromUTrie(const UTrie * trie1,uint32_t errorValue,UErrorCode * pErrorCode)408 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) {
409 NewTrieAndStatus context;
410 UChar lead;
411
412 if(U_FAILURE(*pErrorCode)) {
413 return NULL;
414 }
415 if(trie1==NULL) {
416 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
417 return NULL;
418 }
419 context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode);
420 if(U_FAILURE(*pErrorCode)) {
421 return NULL;
422 }
423 context.exclusiveLimit=TRUE;
424 context.errorCode=*pErrorCode;
425 utrie_enum(trie1, NULL, copyEnumRange, &context);
426 *pErrorCode=context.errorCode;
427 for(lead=0xd800; lead<0xdc00; ++lead) {
428 uint32_t value;
429 if(trie1->data32==NULL) {
430 value=UTRIE_GET16_FROM_LEAD(trie1, lead);
431 } else {
432 value=UTRIE_GET32_FROM_LEAD(trie1, lead);
433 }
434 if(value!=trie1->initialValue) {
435 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
436 }
437 }
438 if(U_SUCCESS(*pErrorCode)) {
439 utrie2_freeze(context.trie,
440 trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS,
441 pErrorCode);
442 }
443 #ifdef UTRIE2_DEBUG
444 if(U_SUCCESS(*pErrorCode)) {
445 utrie_printLengths(trie1);
446 utrie2_printLengths(context.trie, "fromUTrie");
447 }
448 #endif
449 if(U_FAILURE(*pErrorCode)) {
450 utrie2_close(context.trie);
451 context.trie=NULL;
452 }
453 return context.trie;
454 }
455
456 static inline UBool
isInNullBlock(UNewTrie2 * trie,UChar32 c,UBool forLSCP)457 isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
458 int32_t i2, block;
459
460 if(U_IS_LEAD(c) && forLSCP) {
461 i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
462 (c>>UTRIE2_SHIFT_2);
463 } else {
464 i2=trie->index1[c>>UTRIE2_SHIFT_1]+
465 ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
466 }
467 block=trie->index2[i2];
468 return (UBool)(block==trie->dataNullOffset);
469 }
470
471 static int32_t
allocIndex2Block(UNewTrie2 * trie)472 allocIndex2Block(UNewTrie2 *trie) {
473 int32_t newBlock, newTop;
474
475 newBlock=trie->index2Length;
476 newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
477 if(newTop>UPRV_LENGTHOF(trie->index2)) {
478 /*
479 * Should never occur.
480 * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
481 * or the code writes more values than should be possible.
482 */
483 return -1;
484 }
485 trie->index2Length=newTop;
486 uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4);
487 return newBlock;
488 }
489
490 static int32_t
getIndex2Block(UNewTrie2 * trie,UChar32 c,UBool forLSCP)491 getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
492 int32_t i1, i2;
493
494 if(U_IS_LEAD(c) && forLSCP) {
495 return UTRIE2_LSCP_INDEX_2_OFFSET;
496 }
497
498 i1=c>>UTRIE2_SHIFT_1;
499 i2=trie->index1[i1];
500 if(i2==trie->index2NullOffset) {
501 i2=allocIndex2Block(trie);
502 if(i2<0) {
503 return -1; /* program error */
504 }
505 trie->index1[i1]=i2;
506 }
507 return i2;
508 }
509
510 static int32_t
allocDataBlock(UNewTrie2 * trie,int32_t copyBlock)511 allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) {
512 int32_t newBlock, newTop;
513
514 if(trie->firstFreeBlock!=0) {
515 /* get the first free block */
516 newBlock=trie->firstFreeBlock;
517 trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2];
518 } else {
519 /* get a new block from the high end */
520 newBlock=trie->dataLength;
521 newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
522 if(newTop>trie->dataCapacity) {
523 /* out of memory in the data array */
524 int32_t capacity;
525 uint32_t *data;
526
527 if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
528 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
529 } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
530 capacity=UNEWTRIE2_MAX_DATA_LENGTH;
531 } else {
532 /*
533 * Should never occur.
534 * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
535 * or the code writes more values than should be possible.
536 */
537 return -1;
538 }
539 data=(uint32_t *)uprv_malloc(capacity*4);
540 if(data==NULL) {
541 return -1;
542 }
543 uprv_memcpy(data, trie->data, trie->dataLength*4);
544 uprv_free(trie->data);
545 trie->data=data;
546 trie->dataCapacity=capacity;
547 }
548 trie->dataLength=newTop;
549 }
550 uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4);
551 trie->map[newBlock>>UTRIE2_SHIFT_2]=0;
552 return newBlock;
553 }
554
555 /* call when the block's reference counter reaches 0 */
556 static void
releaseDataBlock(UNewTrie2 * trie,int32_t block)557 releaseDataBlock(UNewTrie2 *trie, int32_t block) {
558 /* put this block at the front of the free-block chain */
559 trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock;
560 trie->firstFreeBlock=block;
561 }
562
563 static inline UBool
isWritableBlock(UNewTrie2 * trie,int32_t block)564 isWritableBlock(UNewTrie2 *trie, int32_t block) {
565 return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]);
566 }
567
568 static inline void
setIndex2Entry(UNewTrie2 * trie,int32_t i2,int32_t block)569 setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) {
570 int32_t oldBlock;
571 ++trie->map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */
572 oldBlock=trie->index2[i2];
573 if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) {
574 releaseDataBlock(trie, oldBlock);
575 }
576 trie->index2[i2]=block;
577 }
578
579 /**
580 * No error checking for illegal arguments.
581 *
582 * @return -1 if no new data block available (out of memory in data array)
583 * @internal
584 */
585 static int32_t
getDataBlock(UNewTrie2 * trie,UChar32 c,UBool forLSCP)586 getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
587 int32_t i2, oldBlock, newBlock;
588
589 i2=getIndex2Block(trie, c, forLSCP);
590 if(i2<0) {
591 return -1; /* program error */
592 }
593
594 i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
595 oldBlock=trie->index2[i2];
596 if(isWritableBlock(trie, oldBlock)) {
597 return oldBlock;
598 }
599
600 /* allocate a new data block */
601 newBlock=allocDataBlock(trie, oldBlock);
602 if(newBlock<0) {
603 /* out of memory in the data array */
604 return -1;
605 }
606 setIndex2Entry(trie, i2, newBlock);
607 return newBlock;
608 }
609
610 /**
611 * @return TRUE if the value was successfully set
612 */
613 static void
set32(UNewTrie2 * trie,UChar32 c,UBool forLSCP,uint32_t value,UErrorCode * pErrorCode)614 set32(UNewTrie2 *trie,
615 UChar32 c, UBool forLSCP, uint32_t value,
616 UErrorCode *pErrorCode) {
617 int32_t block;
618
619 if(trie==NULL || trie->isCompacted) {
620 *pErrorCode=U_NO_WRITE_PERMISSION;
621 return;
622 }
623
624 block=getDataBlock(trie, c, forLSCP);
625 if(block<0) {
626 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
627 return;
628 }
629
630 trie->data[block+(c&UTRIE2_DATA_MASK)]=value;
631 }
632
633 U_CAPI void U_EXPORT2
utrie2_set32(UTrie2 * trie,UChar32 c,uint32_t value,UErrorCode * pErrorCode)634 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
635 if(U_FAILURE(*pErrorCode)) {
636 return;
637 }
638 if((uint32_t)c>0x10ffff) {
639 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
640 return;
641 }
642 set32(trie->newTrie, c, TRUE, value, pErrorCode);
643 }
644
645 U_CAPI void U_EXPORT2
utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 * trie,UChar32 c,uint32_t value,UErrorCode * pErrorCode)646 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
647 UChar32 c, uint32_t value,
648 UErrorCode *pErrorCode) {
649 if(U_FAILURE(*pErrorCode)) {
650 return;
651 }
652 if(!U_IS_LEAD(c)) {
653 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
654 return;
655 }
656 set32(trie->newTrie, c, FALSE, value, pErrorCode);
657 }
658
659 static void
writeBlock(uint32_t * block,uint32_t value)660 writeBlock(uint32_t *block, uint32_t value) {
661 uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH;
662 while(block<limit) {
663 *block++=value;
664 }
665 }
666
667 /**
668 * initialValue is ignored if overwrite=TRUE
669 * @internal
670 */
671 static void
fillBlock(uint32_t * block,UChar32 start,UChar32 limit,uint32_t value,uint32_t initialValue,UBool overwrite)672 fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
673 uint32_t value, uint32_t initialValue, UBool overwrite) {
674 uint32_t *pLimit;
675
676 pLimit=block+limit;
677 block+=start;
678 if(overwrite) {
679 while(block<pLimit) {
680 *block++=value;
681 }
682 } else {
683 while(block<pLimit) {
684 if(*block==initialValue) {
685 *block=value;
686 }
687 ++block;
688 }
689 }
690 }
691
692 U_CAPI void U_EXPORT2
utrie2_setRange32(UTrie2 * trie,UChar32 start,UChar32 end,uint32_t value,UBool overwrite,UErrorCode * pErrorCode)693 utrie2_setRange32(UTrie2 *trie,
694 UChar32 start, UChar32 end,
695 uint32_t value, UBool overwrite,
696 UErrorCode *pErrorCode) {
697 /*
698 * repeat value in [start..end]
699 * mark index values for repeat-data blocks by setting bit 31 of the index values
700 * fill around existing values if any, if(overwrite)
701 */
702 UNewTrie2 *newTrie;
703 int32_t block, rest, repeatBlock;
704 UChar32 limit;
705
706 if(U_FAILURE(*pErrorCode)) {
707 return;
708 }
709 if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) {
710 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
711 return;
712 }
713 newTrie=trie->newTrie;
714 if(newTrie==NULL || newTrie->isCompacted) {
715 *pErrorCode=U_NO_WRITE_PERMISSION;
716 return;
717 }
718 if(!overwrite && value==newTrie->initialValue) {
719 return; /* nothing to do */
720 }
721
722 limit=end+1;
723 if(start&UTRIE2_DATA_MASK) {
724 UChar32 nextStart;
725
726 /* set partial block at [start..following block boundary[ */
727 block=getDataBlock(newTrie, start, TRUE);
728 if(block<0) {
729 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
730 return;
731 }
732
733 nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK;
734 if(nextStart<=limit) {
735 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
736 value, newTrie->initialValue, overwrite);
737 start=nextStart;
738 } else {
739 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
740 value, newTrie->initialValue, overwrite);
741 return;
742 }
743 }
744
745 /* number of positions in the last, partial block */
746 rest=limit&UTRIE2_DATA_MASK;
747
748 /* round down limit to a block boundary */
749 limit&=~UTRIE2_DATA_MASK;
750
751 /* iterate over all-value blocks */
752 if(value==newTrie->initialValue) {
753 repeatBlock=newTrie->dataNullOffset;
754 } else {
755 repeatBlock=-1;
756 }
757
758 while(start<limit) {
759 int32_t i2;
760 UBool setRepeatBlock=FALSE;
761
762 if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) {
763 start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
764 continue;
765 }
766
767 /* get index value */
768 i2=getIndex2Block(newTrie, start, TRUE);
769 if(i2<0) {
770 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
771 return;
772 }
773 i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
774 block=newTrie->index2[i2];
775 if(isWritableBlock(newTrie, block)) {
776 /* already allocated */
777 if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
778 /*
779 * We overwrite all values, and it's not a
780 * protected (ASCII-linear or 2-byte UTF-8) block:
781 * replace with the repeatBlock.
782 */
783 setRepeatBlock=TRUE;
784 } else {
785 /* !overwrite, or protected block: just write the values into this block */
786 fillBlock(newTrie->data+block,
787 0, UTRIE2_DATA_BLOCK_LENGTH,
788 value, newTrie->initialValue, overwrite);
789 }
790 } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) {
791 /*
792 * Set the repeatBlock instead of the null block or previous repeat block:
793 *
794 * If !isWritableBlock() then all entries in the block have the same value
795 * because it's the null block or a range block (the repeatBlock from a previous
796 * call to utrie2_setRange32()).
797 * No other blocks are used multiple times before compacting.
798 *
799 * The null block is the only non-writable block with the initialValue because
800 * of the repeatBlock initialization above. (If value==initialValue, then
801 * the repeatBlock will be the null data block.)
802 *
803 * We set our repeatBlock if the desired value differs from the block's value,
804 * and if we overwrite any data or if the data is all initial values
805 * (which is the same as the block being the null block, see above).
806 */
807 setRepeatBlock=TRUE;
808 }
809 if(setRepeatBlock) {
810 if(repeatBlock>=0) {
811 setIndex2Entry(newTrie, i2, repeatBlock);
812 } else {
813 /* create and set and fill the repeatBlock */
814 repeatBlock=getDataBlock(newTrie, start, TRUE);
815 if(repeatBlock<0) {
816 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
817 return;
818 }
819 writeBlock(newTrie->data+repeatBlock, value);
820 }
821 }
822
823 start+=UTRIE2_DATA_BLOCK_LENGTH;
824 }
825
826 if(rest>0) {
827 /* set partial block at [last block boundary..limit[ */
828 block=getDataBlock(newTrie, start, TRUE);
829 if(block<0) {
830 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
831 return;
832 }
833
834 fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite);
835 }
836
837 return;
838 }
839
840 /* compaction --------------------------------------------------------------- */
841
842 static inline UBool
equal_int32(const int32_t * s,const int32_t * t,int32_t length)843 equal_int32(const int32_t *s, const int32_t *t, int32_t length) {
844 while(length>0 && *s==*t) {
845 ++s;
846 ++t;
847 --length;
848 }
849 return (UBool)(length==0);
850 }
851
852 static inline UBool
equal_uint32(const uint32_t * s,const uint32_t * t,int32_t length)853 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
854 while(length>0 && *s==*t) {
855 ++s;
856 ++t;
857 --length;
858 }
859 return (UBool)(length==0);
860 }
861
862 static int32_t
findSameIndex2Block(const int32_t * idx,int32_t index2Length,int32_t otherBlock)863 findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) {
864 int32_t block;
865
866 /* ensure that we do not even partially get past index2Length */
867 index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
868
869 for(block=0; block<=index2Length; ++block) {
870 if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
871 return block;
872 }
873 }
874 return -1;
875 }
876
877 static int32_t
findSameDataBlock(const uint32_t * data,int32_t dataLength,int32_t otherBlock,int32_t blockLength)878 findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) {
879 int32_t block;
880
881 /* ensure that we do not even partially get past dataLength */
882 dataLength-=blockLength;
883
884 for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
885 if(equal_uint32(data+block, data+otherBlock, blockLength)) {
886 return block;
887 }
888 }
889 return -1;
890 }
891
892 /*
893 * Find the start of the last range in the trie by enumerating backward.
894 * Indexes for supplementary code points higher than this will be omitted.
895 */
896 static UChar32
findHighStart(UNewTrie2 * trie,uint32_t highValue)897 findHighStart(UNewTrie2 *trie, uint32_t highValue) {
898 const uint32_t *data32;
899
900 uint32_t value, initialValue;
901 UChar32 c, prev;
902 int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock;
903
904 data32=trie->data;
905 initialValue=trie->initialValue;
906
907 index2NullOffset=trie->index2NullOffset;
908 nullBlock=trie->dataNullOffset;
909
910 /* set variables for previous range */
911 if(highValue==initialValue) {
912 prevI2Block=index2NullOffset;
913 prevBlock=nullBlock;
914 } else {
915 prevI2Block=-1;
916 prevBlock=-1;
917 }
918 prev=0x110000;
919
920 /* enumerate index-2 blocks */
921 i1=UNEWTRIE2_INDEX_1_LENGTH;
922 c=prev;
923 while(c>0) {
924 i2Block=trie->index1[--i1];
925 if(i2Block==prevI2Block) {
926 /* the index-2 block is the same as the previous one, and filled with highValue */
927 c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
928 continue;
929 }
930 prevI2Block=i2Block;
931 if(i2Block==index2NullOffset) {
932 /* this is the null index-2 block */
933 if(highValue!=initialValue) {
934 return c;
935 }
936 c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
937 } else {
938 /* enumerate data blocks for one index-2 block */
939 for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
940 block=trie->index2[i2Block+ --i2];
941 if(block==prevBlock) {
942 /* the block is the same as the previous one, and filled with highValue */
943 c-=UTRIE2_DATA_BLOCK_LENGTH;
944 continue;
945 }
946 prevBlock=block;
947 if(block==nullBlock) {
948 /* this is the null data block */
949 if(highValue!=initialValue) {
950 return c;
951 }
952 c-=UTRIE2_DATA_BLOCK_LENGTH;
953 } else {
954 for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
955 value=data32[block+ --j];
956 if(value!=highValue) {
957 return c;
958 }
959 --c;
960 }
961 }
962 }
963 }
964 }
965
966 /* deliver last range */
967 return 0;
968 }
969
970 /*
971 * Compact a build-time trie.
972 *
973 * The compaction
974 * - removes blocks that are identical with earlier ones
975 * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
976 * - moves blocks in steps of the data granularity
977 * - moves and overlaps blocks that overlap with multiple values in the overlap region
978 *
979 * It does not
980 * - try to move and overlap blocks that are not already adjacent
981 */
982 static void
compactData(UNewTrie2 * trie)983 compactData(UNewTrie2 *trie) {
984 int32_t start, newStart, movedStart;
985 int32_t blockLength, overlap;
986 int32_t i, mapIndex, blockCount;
987
988 /* do not compact linear-ASCII data */
989 newStart=UTRIE2_DATA_START_OFFSET;
990 for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
991 trie->map[i]=start;
992 }
993
994 /*
995 * Start with a block length of 64 for 2-byte UTF-8,
996 * then switch to UTRIE2_DATA_BLOCK_LENGTH.
997 */
998 blockLength=64;
999 blockCount=blockLength>>UTRIE2_SHIFT_2;
1000 for(start=newStart; start<trie->dataLength;) {
1001 /*
1002 * start: index of first entry of current block
1003 * newStart: index where the current block is to be moved
1004 * (right after current end of already-compacted data)
1005 */
1006 if(start==UNEWTRIE2_DATA_0800_OFFSET) {
1007 blockLength=UTRIE2_DATA_BLOCK_LENGTH;
1008 blockCount=1;
1009 }
1010
1011 /* skip blocks that are not used */
1012 if(trie->map[start>>UTRIE2_SHIFT_2]<=0) {
1013 /* advance start to the next block */
1014 start+=blockLength;
1015
1016 /* leave newStart with the previous block! */
1017 continue;
1018 }
1019
1020 /* search for an identical block */
1021 if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength))
1022 >=0
1023 ) {
1024 /* found an identical block, set the other block's index value for the current block */
1025 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1026 trie->map[mapIndex++]=movedStart;
1027 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1028 }
1029
1030 /* advance start to the next block */
1031 start+=blockLength;
1032
1033 /* leave newStart with the previous block! */
1034 continue;
1035 }
1036
1037 /* see if the beginning of this block can be overlapped with the end of the previous block */
1038 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
1039 for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
1040 overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap);
1041 overlap-=UTRIE2_DATA_GRANULARITY) {}
1042
1043 if(overlap>0 || newStart<start) {
1044 /* some overlap, or just move the whole block */
1045 movedStart=newStart-overlap;
1046 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1047 trie->map[mapIndex++]=movedStart;
1048 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1049 }
1050
1051 /* move the non-overlapping indexes to their new positions */
1052 start+=overlap;
1053 for(i=blockLength-overlap; i>0; --i) {
1054 trie->data[newStart++]=trie->data[start++];
1055 }
1056 } else /* no overlap && newStart==start */ {
1057 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1058 trie->map[mapIndex++]=start;
1059 start+=UTRIE2_DATA_BLOCK_LENGTH;
1060 }
1061 newStart=start;
1062 }
1063 }
1064
1065 /* now adjust the index-2 table */
1066 for(i=0; i<trie->index2Length; ++i) {
1067 if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
1068 /* Gap indexes are invalid (-1). Skip over the gap. */
1069 i+=UNEWTRIE2_INDEX_GAP_LENGTH;
1070 }
1071 trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2];
1072 }
1073 trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2];
1074
1075 /* ensure dataLength alignment */
1076 while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1077 trie->data[newStart++]=trie->initialValue;
1078 }
1079
1080 #ifdef UTRIE2_DEBUG
1081 /* we saved some space */
1082 printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n",
1083 (long)trie->dataLength, (long)newStart);
1084 #endif
1085
1086 trie->dataLength=newStart;
1087 }
1088
1089 static void
compactIndex2(UNewTrie2 * trie)1090 compactIndex2(UNewTrie2 *trie) {
1091 int32_t i, start, newStart, movedStart, overlap;
1092
1093 /* do not compact linear-BMP index-2 blocks */
1094 newStart=UTRIE2_INDEX_2_BMP_LENGTH;
1095 for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
1096 trie->map[i]=start;
1097 }
1098
1099 /* Reduce the index table gap to what will be needed at runtime. */
1100 newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1);
1101
1102 for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) {
1103 /*
1104 * start: index of first entry of current block
1105 * newStart: index where the current block is to be moved
1106 * (right after current end of already-compacted data)
1107 */
1108
1109 /* search for an identical block */
1110 if( (movedStart=findSameIndex2Block(trie->index2, newStart, start))
1111 >=0
1112 ) {
1113 /* found an identical block, set the other block's index value for the current block */
1114 trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart;
1115
1116 /* advance start to the next block */
1117 start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1118
1119 /* leave newStart with the previous block! */
1120 continue;
1121 }
1122
1123 /* see if the beginning of this block can be overlapped with the end of the previous block */
1124 /* look for maximum overlap with the previous, adjacent block */
1125 for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
1126 overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap);
1127 --overlap) {}
1128
1129 if(overlap>0 || newStart<start) {
1130 /* some overlap, or just move the whole block */
1131 trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
1132
1133 /* move the non-overlapping indexes to their new positions */
1134 start+=overlap;
1135 for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
1136 trie->index2[newStart++]=trie->index2[start++];
1137 }
1138 } else /* no overlap && newStart==start */ {
1139 trie->map[start>>UTRIE2_SHIFT_1_2]=start;
1140 start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1141 newStart=start;
1142 }
1143 }
1144
1145 /* now adjust the index-1 table */
1146 for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
1147 trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2];
1148 }
1149 trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2];
1150
1151 /*
1152 * Ensure data table alignment:
1153 * Needs to be granularity-aligned for 16-bit trie
1154 * (so that dataMove will be down-shiftable),
1155 * and 2-aligned for uint32_t data.
1156 */
1157 while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
1158 /* Arbitrary value: 0x3fffc not possible for real data. */
1159 trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT;
1160 }
1161
1162 #ifdef UTRIE2_DEBUG
1163 /* we saved some space */
1164 printf("compacting UTrie2: count of 16-bit index-2 words %lu->%lu\n",
1165 (long)trie->index2Length, (long)newStart);
1166 #endif
1167
1168 trie->index2Length=newStart;
1169 }
1170
1171 static void
compactTrie(UTrie2 * trie,UErrorCode * pErrorCode)1172 compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) {
1173 UNewTrie2 *newTrie;
1174 UChar32 highStart, suppHighStart;
1175 uint32_t highValue;
1176
1177 newTrie=trie->newTrie;
1178
1179 /* find highStart and round it up */
1180 highValue=utrie2_get32(trie, 0x10ffff);
1181 highStart=findHighStart(newTrie, highValue);
1182 highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
1183 if(highStart==0x110000) {
1184 highValue=trie->errorValue;
1185 }
1186
1187 /*
1188 * Set trie->highStart only after utrie2_get32(trie, highStart).
1189 * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
1190 */
1191 trie->highStart=newTrie->highStart=highStart;
1192
1193 #ifdef UTRIE2_DEBUG
1194 printf("UTrie2: highStart U+%04lx highValue 0x%lx initialValue 0x%lx\n",
1195 (long)highStart, (long)highValue, (long)trie->initialValue);
1196 #endif
1197
1198 if(highStart<0x110000) {
1199 /* Blank out [highStart..10ffff] to release associated data blocks. */
1200 suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
1201 utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode);
1202 if(U_FAILURE(*pErrorCode)) {
1203 return;
1204 }
1205 }
1206
1207 compactData(newTrie);
1208 if(highStart>0x10000) {
1209 compactIndex2(newTrie);
1210 #ifdef UTRIE2_DEBUG
1211 } else {
1212 printf("UTrie2: highStart U+%04lx count of 16-bit index-2 words %lu->%lu\n",
1213 (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET);
1214 #endif
1215 }
1216
1217 /*
1218 * Store the highValue in the data array and round up the dataLength.
1219 * Must be done after compactData() because that assumes that dataLength
1220 * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
1221 */
1222 newTrie->data[newTrie->dataLength++]=highValue;
1223 while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1224 newTrie->data[newTrie->dataLength++]=trie->initialValue;
1225 }
1226
1227 newTrie->isCompacted=TRUE;
1228 }
1229
1230 /* serialization ------------------------------------------------------------ */
1231
1232 /**
1233 * Maximum length of the runtime index array.
1234 * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
1235 * (The actual maximum length is lower,
1236 * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
1237 */
1238 #define UTRIE2_MAX_INDEX_LENGTH 0xffff
1239
1240 /**
1241 * Maximum length of the runtime data array.
1242 * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
1243 * and by uint16_t UTrie2Header.shiftedDataLength.
1244 */
1245 #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT)
1246
1247 /* Compact and internally serialize the trie. */
1248 U_CAPI void U_EXPORT2
utrie2_freeze(UTrie2 * trie,UTrie2ValueBits valueBits,UErrorCode * pErrorCode)1249 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) {
1250 UNewTrie2 *newTrie;
1251 UTrie2Header *header;
1252 uint32_t *p;
1253 uint16_t *dest16;
1254 int32_t i, length;
1255 int32_t allIndexesLength;
1256 int32_t dataMove; /* >0 if the data is moved to the end of the index array */
1257 UChar32 highStart;
1258
1259 /* argument check */
1260 if(U_FAILURE(*pErrorCode)) {
1261 return;
1262 }
1263 if( trie==NULL ||
1264 valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits
1265 ) {
1266 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1267 return;
1268 }
1269 newTrie=trie->newTrie;
1270 if(newTrie==NULL) {
1271 /* already frozen */
1272 UTrie2ValueBits frozenValueBits=
1273 trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS;
1274 if(valueBits!=frozenValueBits) {
1275 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1276 }
1277 return;
1278 }
1279
1280 /* compact if necessary */
1281 if(!newTrie->isCompacted) {
1282 compactTrie(trie, pErrorCode);
1283 if(U_FAILURE(*pErrorCode)) {
1284 return;
1285 }
1286 }
1287 highStart=trie->highStart;
1288
1289 if(highStart<=0x10000) {
1290 allIndexesLength=UTRIE2_INDEX_1_OFFSET;
1291 } else {
1292 allIndexesLength=newTrie->index2Length;
1293 }
1294 if(valueBits==UTRIE2_16_VALUE_BITS) {
1295 dataMove=allIndexesLength;
1296 } else {
1297 dataMove=0;
1298 }
1299
1300 /* are indexLength and dataLength within limits? */
1301 if( /* for unshifted indexLength */
1302 allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
1303 /* for unshifted dataNullOffset */
1304 (dataMove+newTrie->dataNullOffset)>0xffff ||
1305 /* for unshifted 2-byte UTF-8 index-2 values */
1306 (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
1307 /* for shiftedDataLength */
1308 (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH
1309 ) {
1310 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1311 return;
1312 }
1313
1314 /* calculate the total serialized length */
1315 length=sizeof(UTrie2Header)+allIndexesLength*2;
1316 if(valueBits==UTRIE2_16_VALUE_BITS) {
1317 length+=newTrie->dataLength*2;
1318 } else {
1319 length+=newTrie->dataLength*4;
1320 }
1321
1322 trie->memory=uprv_malloc(length);
1323 if(trie->memory==NULL) {
1324 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1325 return;
1326 }
1327 trie->length=length;
1328 trie->isMemoryOwned=TRUE;
1329
1330 trie->indexLength=allIndexesLength;
1331 trie->dataLength=newTrie->dataLength;
1332 if(highStart<=0x10000) {
1333 trie->index2NullOffset=0xffff;
1334 } else {
1335 trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset;
1336 }
1337 trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset);
1338 trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY;
1339
1340 /* set the header fields */
1341 header=(UTrie2Header *)trie->memory;
1342
1343 header->signature=UTRIE2_SIG; /* "Tri2" */
1344 header->options=(uint16_t)valueBits;
1345
1346 header->indexLength=(uint16_t)trie->indexLength;
1347 header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT);
1348 header->index2NullOffset=trie->index2NullOffset;
1349 header->dataNullOffset=trie->dataNullOffset;
1350 header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1);
1351
1352 /* fill the index and data arrays */
1353 dest16=(uint16_t *)(header+1);
1354 trie->index=dest16;
1355
1356 /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
1357 p=(uint32_t *)newTrie->index2;
1358 for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) {
1359 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1360 }
1361
1362 /* write UTF-8 2-byte index-2 values, not right-shifted */
1363 for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */
1364 *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
1365 }
1366 for(; i<(0xe0-0xc0); ++i) { /* C2..DF */
1367 *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]);
1368 }
1369
1370 if(highStart>0x10000) {
1371 int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1;
1372 int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length;
1373
1374 /* write 16-bit index-1 values for supplementary code points */
1375 p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
1376 for(i=index1Length; i>0; --i) {
1377 *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
1378 }
1379
1380 /*
1381 * write the index-2 array values for supplementary code points,
1382 * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
1383 */
1384 p=(uint32_t *)newTrie->index2+index2Offset;
1385 for(i=newTrie->index2Length-index2Offset; i>0; --i) {
1386 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1387 }
1388 }
1389
1390 /* write the 16/32-bit data array */
1391 switch(valueBits) {
1392 case UTRIE2_16_VALUE_BITS:
1393 /* write 16-bit data values */
1394 trie->data16=dest16;
1395 trie->data32=NULL;
1396 p=newTrie->data;
1397 for(i=newTrie->dataLength; i>0; --i) {
1398 *dest16++=(uint16_t)*p++;
1399 }
1400 break;
1401 case UTRIE2_32_VALUE_BITS:
1402 /* write 32-bit data values */
1403 trie->data16=NULL;
1404 trie->data32=(uint32_t *)dest16;
1405 uprv_memcpy(dest16, newTrie->data, newTrie->dataLength*4);
1406 break;
1407 default:
1408 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1409 return;
1410 }
1411
1412 /* Delete the UNewTrie2. */
1413 uprv_free(newTrie->data);
1414 uprv_free(newTrie);
1415 trie->newTrie=NULL;
1416 }
1417
1418 /*
1419 * This is here to avoid a dependency from utrie2.cpp on utrie.c.
1420 * This file already depends on utrie.c.
1421 * Otherwise, this should be in utrie2.cpp right after utrie2_swap().
1422 */
1423 U_CAPI int32_t U_EXPORT2
utrie2_swapAnyVersion(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)1424 utrie2_swapAnyVersion(const UDataSwapper *ds,
1425 const void *inData, int32_t length, void *outData,
1426 UErrorCode *pErrorCode) {
1427 if(U_SUCCESS(*pErrorCode)) {
1428 switch(utrie2_getVersion(inData, length, TRUE)) {
1429 case 1:
1430 return utrie_swap(ds, inData, length, outData, pErrorCode);
1431 case 2:
1432 return utrie2_swap(ds, inData, length, outData, pErrorCode);
1433 default:
1434 *pErrorCode=U_INVALID_FORMAT_ERROR;
1435 return 0;
1436 }
1437 }
1438 return 0;
1439 }
1440