1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2002-2011, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  propsvec.c
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2002feb22
16 *   created by: Markus W. Scherer
17 *
18 *   Store bits (Unicode character properties) in bit set vectors.
19 */
20 
21 #include <stdlib.h>
22 #include "unicode/utypes.h"
23 #include "cmemory.h"
24 #include "utrie.h"
25 #include "utrie2.h"
26 #include "uarrsort.h"
27 #include "propsvec.h"
28 #include "uassert.h"
29 
30 struct UPropsVectors {
31     uint32_t *v;
32     int32_t columns;  /* number of columns, plus two for start & limit values */
33     int32_t maxRows;
34     int32_t rows;
35     int32_t prevRow;  /* search optimization: remember last row seen */
36     UBool isCompacted;
37 };
38 
39 #define UPVEC_INITIAL_ROWS (1<<12)
40 #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
41 #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
42 
43 U_CAPI UPropsVectors * U_EXPORT2
upvec_open(int32_t columns,UErrorCode * pErrorCode)44 upvec_open(int32_t columns, UErrorCode *pErrorCode) {
45     UPropsVectors *pv;
46     uint32_t *v, *row;
47     uint32_t cp;
48 
49     if(U_FAILURE(*pErrorCode)) {
50         return NULL;
51     }
52     if(columns<1) {
53         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
54         return NULL;
55     }
56     columns+=2; /* count range start and limit columns */
57 
58     pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
59     v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
60     if(pv==NULL || v==NULL) {
61         uprv_free(pv);
62         uprv_free(v);
63         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
64         return NULL;
65     }
66     uprv_memset(pv, 0, sizeof(UPropsVectors));
67     pv->v=v;
68     pv->columns=columns;
69     pv->maxRows=UPVEC_INITIAL_ROWS;
70     pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
71 
72     /* set the all-Unicode row and the special-value rows */
73     row=pv->v;
74     uprv_memset(row, 0, pv->rows*columns*4);
75     row[0]=0;
76     row[1]=0x110000;
77     row+=columns;
78     for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
79         row[0]=cp;
80         row[1]=cp+1;
81         row+=columns;
82     }
83     return pv;
84 }
85 
86 U_CAPI void U_EXPORT2
upvec_close(UPropsVectors * pv)87 upvec_close(UPropsVectors *pv) {
88     if(pv!=NULL) {
89         uprv_free(pv->v);
90         uprv_free(pv);
91     }
92 }
93 
94 static uint32_t *
_findRow(UPropsVectors * pv,UChar32 rangeStart)95 _findRow(UPropsVectors *pv, UChar32 rangeStart) {
96     uint32_t *row;
97     int32_t columns, i, start, limit, prevRow;
98 
99     columns=pv->columns;
100     limit=pv->rows;
101     prevRow=pv->prevRow;
102 
103     /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
104     row=pv->v+prevRow*columns;
105     if(rangeStart>=(UChar32)row[0]) {
106         if(rangeStart<(UChar32)row[1]) {
107             /* same row as last seen */
108             return row;
109         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
110             /* next row after the last one */
111             pv->prevRow=prevRow+1;
112             return row;
113         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
114             /* second row after the last one */
115             pv->prevRow=prevRow+2;
116             return row;
117         } else if((rangeStart-(UChar32)row[1])<10) {
118             /* we are close, continue looping */
119             prevRow+=2;
120             do {
121                 ++prevRow;
122                 row+=columns;
123             } while(rangeStart>=(UChar32)row[1]);
124             pv->prevRow=prevRow;
125             return row;
126         }
127     } else if(rangeStart<(UChar32)pv->v[1]) {
128         /* the very first row */
129         pv->prevRow=0;
130         return pv->v;
131     }
132 
133     /* do a binary search for the start of the range */
134     start=0;
135     while(start<limit-1) {
136         i=(start+limit)/2;
137         row=pv->v+i*columns;
138         if(rangeStart<(UChar32)row[0]) {
139             limit=i;
140         } else if(rangeStart<(UChar32)row[1]) {
141             pv->prevRow=i;
142             return row;
143         } else {
144             start=i;
145         }
146     }
147 
148     /* must be found because all ranges together always cover all of Unicode */
149     pv->prevRow=start;
150     return pv->v+start*columns;
151 }
152 
153 U_CAPI void U_EXPORT2
upvec_setValue(UPropsVectors * pv,UChar32 start,UChar32 end,int32_t column,uint32_t value,uint32_t mask,UErrorCode * pErrorCode)154 upvec_setValue(UPropsVectors *pv,
155                UChar32 start, UChar32 end,
156                int32_t column,
157                uint32_t value, uint32_t mask,
158                UErrorCode *pErrorCode) {
159     uint32_t *firstRow, *lastRow;
160     int32_t columns;
161     UChar32 limit;
162     UBool splitFirstRow, splitLastRow;
163 
164     /* argument checking */
165     if(U_FAILURE(*pErrorCode)) {
166         return;
167     }
168     if( pv==NULL ||
169         start<0 || start>end || end>UPVEC_MAX_CP ||
170         column<0 || column>=(pv->columns-2)
171     ) {
172         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
173         return;
174     }
175     if(pv->isCompacted) {
176         *pErrorCode=U_NO_WRITE_PERMISSION;
177         return;
178     }
179     limit=end+1;
180 
181     /* initialize */
182     columns=pv->columns;
183     column+=2; /* skip range start and limit columns */
184     value&=mask;
185 
186     /* find the rows whose ranges overlap with the input range */
187 
188     /* find the first and last rows, always successful */
189     firstRow=_findRow(pv, start);
190     lastRow=_findRow(pv, end);
191 
192     /*
193      * Rows need to be split if they partially overlap with the
194      * input range (only possible for the first and last rows)
195      * and if their value differs from the input value.
196      */
197     splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
198     splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
199 
200     /* split first/last rows if necessary */
201     if(splitFirstRow || splitLastRow) {
202         int32_t count, rows;
203 
204         rows=pv->rows;
205         if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
206             uint32_t *newVectors;
207             int32_t newMaxRows;
208 
209             if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
210                 newMaxRows=UPVEC_MEDIUM_ROWS;
211             } else if(pv->maxRows<UPVEC_MAX_ROWS) {
212                 newMaxRows=UPVEC_MAX_ROWS;
213             } else {
214                 /* Implementation bug, or UPVEC_MAX_ROWS too low. */
215                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
216                 return;
217             }
218             newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
219             if(newVectors==NULL) {
220                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
221                 return;
222             }
223             uprv_memcpy(newVectors, pv->v, (size_t)rows*columns*4);
224             firstRow=newVectors+(firstRow-pv->v);
225             lastRow=newVectors+(lastRow-pv->v);
226             uprv_free(pv->v);
227             pv->v=newVectors;
228             pv->maxRows=newMaxRows;
229         }
230 
231         /* count the number of row cells to move after the last row, and move them */
232         count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
233         if(count>0) {
234             uprv_memmove(
235                 lastRow+(1+splitFirstRow+splitLastRow)*columns,
236                 lastRow+columns,
237                 count*4);
238         }
239         pv->rows=rows+splitFirstRow+splitLastRow;
240 
241         /* split the first row, and move the firstRow pointer to the second part */
242         if(splitFirstRow) {
243             /* copy all affected rows up one and move the lastRow pointer */
244             count = (int32_t)((lastRow-firstRow)+columns);
245             uprv_memmove(firstRow+columns, firstRow, (size_t)count*4);
246             lastRow+=columns;
247 
248             /* split the range and move the firstRow pointer */
249             firstRow[1]=firstRow[columns]=(uint32_t)start;
250             firstRow+=columns;
251         }
252 
253         /* split the last row */
254         if(splitLastRow) {
255             /* copy the last row data */
256             uprv_memcpy(lastRow+columns, lastRow, (size_t)columns*4);
257 
258             /* split the range and move the firstRow pointer */
259             lastRow[1]=lastRow[columns]=(uint32_t)limit;
260         }
261     }
262 
263     /* set the "row last seen" to the last row for the range */
264     pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
265 
266     /* set the input value in all remaining rows */
267     firstRow+=column;
268     lastRow+=column;
269     mask=~mask;
270     for(;;) {
271         *firstRow=(*firstRow&mask)|value;
272         if(firstRow==lastRow) {
273             break;
274         }
275         firstRow+=columns;
276     }
277 }
278 
279 U_CAPI uint32_t U_EXPORT2
upvec_getValue(const UPropsVectors * pv,UChar32 c,int32_t column)280 upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
281     uint32_t *row;
282     UPropsVectors *ncpv;
283 
284     if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
285         return 0;
286     }
287     ncpv=(UPropsVectors *)pv;
288     row=_findRow(ncpv, c);
289     return row[2+column];
290 }
291 
292 U_CAPI uint32_t * U_EXPORT2
upvec_getRow(const UPropsVectors * pv,int32_t rowIndex,UChar32 * pRangeStart,UChar32 * pRangeEnd)293 upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
294              UChar32 *pRangeStart, UChar32 *pRangeEnd) {
295     uint32_t *row;
296     int32_t columns;
297 
298     if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
299         return NULL;
300     }
301 
302     columns=pv->columns;
303     row=pv->v+rowIndex*columns;
304     if(pRangeStart!=NULL) {
305         *pRangeStart=(UChar32)row[0];
306     }
307     if(pRangeEnd!=NULL) {
308         *pRangeEnd=(UChar32)row[1]-1;
309     }
310     return row+2;
311 }
312 
313 static int32_t U_CALLCONV
upvec_compareRows(const void * context,const void * l,const void * r)314 upvec_compareRows(const void *context, const void *l, const void *r) {
315     const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
316     const UPropsVectors *pv=(const UPropsVectors *)context;
317     int32_t i, count, columns;
318 
319     count=columns=pv->columns; /* includes start/limit columns */
320 
321     /* start comparing after start/limit but wrap around to them */
322     i=2;
323     do {
324         if(left[i]!=right[i]) {
325             return left[i]<right[i] ? -1 : 1;
326         }
327         if(++i==columns) {
328             i=0;
329         }
330     } while(--count>0);
331 
332     return 0;
333 }
334 
335 U_CAPI void U_EXPORT2
upvec_compact(UPropsVectors * pv,UPVecCompactHandler * handler,void * context,UErrorCode * pErrorCode)336 upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
337     uint32_t *row;
338     int32_t i, columns, valueColumns, rows, count;
339     UChar32 start, limit;
340 
341     /* argument checking */
342     if(U_FAILURE(*pErrorCode)) {
343         return;
344     }
345     if(handler==NULL) {
346         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
347         return;
348     }
349     if(pv->isCompacted) {
350         return;
351     }
352 
353     /* Set the flag now: Sorting and compacting destroys the builder data structure. */
354     pv->isCompacted=TRUE;
355 
356     rows=pv->rows;
357     columns=pv->columns;
358     U_ASSERT(columns>=3); /* upvec_open asserts this */
359     valueColumns=columns-2; /* not counting start & limit */
360 
361     /* sort the properties vectors to find unique vector values */
362     uprv_sortArray(pv->v, rows, columns*4,
363                    upvec_compareRows, pv, FALSE, pErrorCode);
364     if(U_FAILURE(*pErrorCode)) {
365         return;
366     }
367 
368     /*
369      * Find and set the special values.
370      * This has to do almost the same work as the compaction below,
371      * to find the indexes where the special-value rows will move.
372      */
373     row=pv->v;
374     count=-valueColumns;
375     for(i=0; i<rows; ++i) {
376         start=(UChar32)row[0];
377 
378         /* count a new values vector if it is different from the current one */
379         if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
380             count+=valueColumns;
381         }
382 
383         if(start>=UPVEC_FIRST_SPECIAL_CP) {
384             handler(context, start, start, count, row+2, valueColumns, pErrorCode);
385             if(U_FAILURE(*pErrorCode)) {
386                 return;
387             }
388         }
389 
390         row+=columns;
391     }
392 
393     /* count is at the beginning of the last vector, add valueColumns to include that last vector */
394     count+=valueColumns;
395 
396     /* Call the handler once more to signal the start of delivering real values. */
397     handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
398             count, row-valueColumns, valueColumns, pErrorCode);
399     if(U_FAILURE(*pErrorCode)) {
400         return;
401     }
402 
403     /*
404      * Move vector contents up to a contiguous array with only unique
405      * vector values, and call the handler function for each vector.
406      *
407      * This destroys the Properties Vector structure and replaces it
408      * with an array of just vector values.
409      */
410     row=pv->v;
411     count=-valueColumns;
412     for(i=0; i<rows; ++i) {
413         /* fetch these first before memmove() may overwrite them */
414         start=(UChar32)row[0];
415         limit=(UChar32)row[1];
416 
417         /* add a new values vector if it is different from the current one */
418         if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
419             count+=valueColumns;
420             uprv_memmove(pv->v+count, row+2, (size_t)valueColumns*4);
421         }
422 
423         if(start<UPVEC_FIRST_SPECIAL_CP) {
424             handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
425             if(U_FAILURE(*pErrorCode)) {
426                 return;
427             }
428         }
429 
430         row+=columns;
431     }
432 
433     /* count is at the beginning of the last vector, add one to include that last vector */
434     pv->rows=count/valueColumns+1;
435 }
436 
437 U_CAPI const uint32_t * U_EXPORT2
upvec_getArray(const UPropsVectors * pv,int32_t * pRows,int32_t * pColumns)438 upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
439     if(!pv->isCompacted) {
440         return NULL;
441     }
442     if(pRows!=NULL) {
443         *pRows=pv->rows;
444     }
445     if(pColumns!=NULL) {
446         *pColumns=pv->columns-2;
447     }
448     return pv->v;
449 }
450 
451 U_CAPI uint32_t * U_EXPORT2
upvec_cloneArray(const UPropsVectors * pv,int32_t * pRows,int32_t * pColumns,UErrorCode * pErrorCode)452 upvec_cloneArray(const UPropsVectors *pv,
453                  int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
454     uint32_t *clonedArray;
455     int32_t byteLength;
456 
457     if(U_FAILURE(*pErrorCode)) {
458         return NULL;
459     }
460     if(!pv->isCompacted) {
461         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
462         return NULL;
463     }
464     byteLength=pv->rows*(pv->columns-2)*4;
465     clonedArray=(uint32_t *)uprv_malloc(byteLength);
466     if(clonedArray==NULL) {
467         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
468         return NULL;
469     }
470     uprv_memcpy(clonedArray, pv->v, byteLength);
471     if(pRows!=NULL) {
472         *pRows=pv->rows;
473     }
474     if(pColumns!=NULL) {
475         *pColumns=pv->columns-2;
476     }
477     return clonedArray;
478 }
479 
480 U_CAPI UTrie2 * U_EXPORT2
upvec_compactToUTrie2WithRowIndexes(UPropsVectors * pv,UErrorCode * pErrorCode)481 upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
482     UPVecToUTrie2Context toUTrie2={ NULL, 0, 0, 0 };
483     upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
484     utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
485     if(U_FAILURE(*pErrorCode)) {
486         utrie2_close(toUTrie2.trie);
487         toUTrie2.trie=NULL;
488     }
489     return toUTrie2.trie;
490 }
491 
492 /*
493  * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
494  * some 16-bit field and builds and returns a UTrie2.
495  */
496 
497 U_CAPI void U_CALLCONV
upvec_compactToUTrie2Handler(void * context,UChar32 start,UChar32 end,int32_t rowIndex,uint32_t * row,int32_t columns,UErrorCode * pErrorCode)498 upvec_compactToUTrie2Handler(void *context,
499                              UChar32 start, UChar32 end,
500                              int32_t rowIndex, uint32_t *row, int32_t columns,
501                              UErrorCode *pErrorCode) {
502     (void)row;
503     (void)columns;
504     UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
505     if(start<UPVEC_FIRST_SPECIAL_CP) {
506         utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode);
507     } else {
508         switch(start) {
509         case UPVEC_INITIAL_VALUE_CP:
510             toUTrie2->initialValue=rowIndex;
511             break;
512         case UPVEC_ERROR_VALUE_CP:
513             toUTrie2->errorValue=rowIndex;
514             break;
515         case UPVEC_START_REAL_VALUES_CP:
516             toUTrie2->maxValue=rowIndex;
517             if(rowIndex>0xffff) {
518                 /* too many rows for a 16-bit trie */
519                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
520             } else {
521                 toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
522                                            toUTrie2->errorValue, pErrorCode);
523             }
524             break;
525         default:
526             break;
527         }
528     }
529 }
530