1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2002-2011, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  propsvec.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002feb22
14 *   created by: Markus W. Scherer
15 *
16 *   Store bits (Unicode character properties) in bit set vectors.
17 */
18 
19 #include <stdlib.h>
20 #include "unicode/utypes.h"
21 #include "cmemory.h"
22 #include "utrie.h"
23 #include "utrie2.h"
24 #include "uarrsort.h"
25 #include "propsvec.h"
26 #include "uassert.h"
27 
28 struct UPropsVectors {
29     uint32_t *v;
30     int32_t columns;  /* number of columns, plus two for start & limit values */
31     int32_t maxRows;
32     int32_t rows;
33     int32_t prevRow;  /* search optimization: remember last row seen */
34     UBool isCompacted;
35 };
36 
37 #define UPVEC_INITIAL_ROWS (1<<12)
38 #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
39 #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
40 
41 U_CAPI UPropsVectors * U_EXPORT2
upvec_open(int32_t columns,UErrorCode * pErrorCode)42 upvec_open(int32_t columns, UErrorCode *pErrorCode) {
43     UPropsVectors *pv;
44     uint32_t *v, *row;
45     uint32_t cp;
46 
47     if(U_FAILURE(*pErrorCode)) {
48         return NULL;
49     }
50     if(columns<1) {
51         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
52         return NULL;
53     }
54     columns+=2; /* count range start and limit columns */
55 
56     pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
57     v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
58     if(pv==NULL || v==NULL) {
59         uprv_free(pv);
60         uprv_free(v);
61         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
62         return NULL;
63     }
64     uprv_memset(pv, 0, sizeof(UPropsVectors));
65     pv->v=v;
66     pv->columns=columns;
67     pv->maxRows=UPVEC_INITIAL_ROWS;
68     pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
69 
70     /* set the all-Unicode row and the special-value rows */
71     row=pv->v;
72     uprv_memset(row, 0, pv->rows*columns*4);
73     row[0]=0;
74     row[1]=0x110000;
75     row+=columns;
76     for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
77         row[0]=cp;
78         row[1]=cp+1;
79         row+=columns;
80     }
81     return pv;
82 }
83 
84 U_CAPI void U_EXPORT2
upvec_close(UPropsVectors * pv)85 upvec_close(UPropsVectors *pv) {
86     if(pv!=NULL) {
87         uprv_free(pv->v);
88         uprv_free(pv);
89     }
90 }
91 
92 static uint32_t *
_findRow(UPropsVectors * pv,UChar32 rangeStart)93 _findRow(UPropsVectors *pv, UChar32 rangeStart) {
94     uint32_t *row;
95     int32_t columns, i, start, limit, prevRow;
96 
97     columns=pv->columns;
98     limit=pv->rows;
99     prevRow=pv->prevRow;
100 
101     /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
102     row=pv->v+prevRow*columns;
103     if(rangeStart>=(UChar32)row[0]) {
104         if(rangeStart<(UChar32)row[1]) {
105             /* same row as last seen */
106             return row;
107         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
108             /* next row after the last one */
109             pv->prevRow=prevRow+1;
110             return row;
111         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
112             /* second row after the last one */
113             pv->prevRow=prevRow+2;
114             return row;
115         } else if((rangeStart-(UChar32)row[1])<10) {
116             /* we are close, continue looping */
117             prevRow+=2;
118             do {
119                 ++prevRow;
120                 row+=columns;
121             } while(rangeStart>=(UChar32)row[1]);
122             pv->prevRow=prevRow;
123             return row;
124         }
125     } else if(rangeStart<(UChar32)pv->v[1]) {
126         /* the very first row */
127         pv->prevRow=0;
128         return pv->v;
129     }
130 
131     /* do a binary search for the start of the range */
132     start=0;
133     while(start<limit-1) {
134         i=(start+limit)/2;
135         row=pv->v+i*columns;
136         if(rangeStart<(UChar32)row[0]) {
137             limit=i;
138         } else if(rangeStart<(UChar32)row[1]) {
139             pv->prevRow=i;
140             return row;
141         } else {
142             start=i;
143         }
144     }
145 
146     /* must be found because all ranges together always cover all of Unicode */
147     pv->prevRow=start;
148     return pv->v+start*columns;
149 }
150 
151 U_CAPI void U_EXPORT2
upvec_setValue(UPropsVectors * pv,UChar32 start,UChar32 end,int32_t column,uint32_t value,uint32_t mask,UErrorCode * pErrorCode)152 upvec_setValue(UPropsVectors *pv,
153                UChar32 start, UChar32 end,
154                int32_t column,
155                uint32_t value, uint32_t mask,
156                UErrorCode *pErrorCode) {
157     uint32_t *firstRow, *lastRow;
158     int32_t columns;
159     UChar32 limit;
160     UBool splitFirstRow, splitLastRow;
161 
162     /* argument checking */
163     if(U_FAILURE(*pErrorCode)) {
164         return;
165     }
166     if( pv==NULL ||
167         start<0 || start>end || end>UPVEC_MAX_CP ||
168         column<0 || column>=(pv->columns-2)
169     ) {
170         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
171         return;
172     }
173     if(pv->isCompacted) {
174         *pErrorCode=U_NO_WRITE_PERMISSION;
175         return;
176     }
177     limit=end+1;
178 
179     /* initialize */
180     columns=pv->columns;
181     column+=2; /* skip range start and limit columns */
182     value&=mask;
183 
184     /* find the rows whose ranges overlap with the input range */
185 
186     /* find the first and last rows, always successful */
187     firstRow=_findRow(pv, start);
188     lastRow=_findRow(pv, end);
189 
190     /*
191      * Rows need to be split if they partially overlap with the
192      * input range (only possible for the first and last rows)
193      * and if their value differs from the input value.
194      */
195     splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
196     splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
197 
198     /* split first/last rows if necessary */
199     if(splitFirstRow || splitLastRow) {
200         int32_t count, rows;
201 
202         rows=pv->rows;
203         if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
204             uint32_t *newVectors;
205             int32_t newMaxRows;
206 
207             if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
208                 newMaxRows=UPVEC_MEDIUM_ROWS;
209             } else if(pv->maxRows<UPVEC_MAX_ROWS) {
210                 newMaxRows=UPVEC_MAX_ROWS;
211             } else {
212                 /* Implementation bug, or UPVEC_MAX_ROWS too low. */
213                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
214                 return;
215             }
216             newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
217             if(newVectors==NULL) {
218                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
219                 return;
220             }
221             uprv_memcpy(newVectors, pv->v, rows*columns*4);
222             firstRow=newVectors+(firstRow-pv->v);
223             lastRow=newVectors+(lastRow-pv->v);
224             uprv_free(pv->v);
225             pv->v=newVectors;
226             pv->maxRows=newMaxRows;
227         }
228 
229         /* count the number of row cells to move after the last row, and move them */
230         count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
231         if(count>0) {
232             uprv_memmove(
233                 lastRow+(1+splitFirstRow+splitLastRow)*columns,
234                 lastRow+columns,
235                 count*4);
236         }
237         pv->rows=rows+splitFirstRow+splitLastRow;
238 
239         /* split the first row, and move the firstRow pointer to the second part */
240         if(splitFirstRow) {
241             /* copy all affected rows up one and move the lastRow pointer */
242             count = (int32_t)((lastRow-firstRow)+columns);
243             uprv_memmove(firstRow+columns, firstRow, count*4);
244             lastRow+=columns;
245 
246             /* split the range and move the firstRow pointer */
247             firstRow[1]=firstRow[columns]=(uint32_t)start;
248             firstRow+=columns;
249         }
250 
251         /* split the last row */
252         if(splitLastRow) {
253             /* copy the last row data */
254             uprv_memcpy(lastRow+columns, lastRow, columns*4);
255 
256             /* split the range and move the firstRow pointer */
257             lastRow[1]=lastRow[columns]=(uint32_t)limit;
258         }
259     }
260 
261     /* set the "row last seen" to the last row for the range */
262     pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
263 
264     /* set the input value in all remaining rows */
265     firstRow+=column;
266     lastRow+=column;
267     mask=~mask;
268     for(;;) {
269         *firstRow=(*firstRow&mask)|value;
270         if(firstRow==lastRow) {
271             break;
272         }
273         firstRow+=columns;
274     }
275 }
276 
277 U_CAPI uint32_t U_EXPORT2
upvec_getValue(const UPropsVectors * pv,UChar32 c,int32_t column)278 upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
279     uint32_t *row;
280     UPropsVectors *ncpv;
281 
282     if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
283         return 0;
284     }
285     ncpv=(UPropsVectors *)pv;
286     row=_findRow(ncpv, c);
287     return row[2+column];
288 }
289 
290 U_CAPI uint32_t * U_EXPORT2
upvec_getRow(const UPropsVectors * pv,int32_t rowIndex,UChar32 * pRangeStart,UChar32 * pRangeEnd)291 upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
292              UChar32 *pRangeStart, UChar32 *pRangeEnd) {
293     uint32_t *row;
294     int32_t columns;
295 
296     if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
297         return NULL;
298     }
299 
300     columns=pv->columns;
301     row=pv->v+rowIndex*columns;
302     if(pRangeStart!=NULL) {
303         *pRangeStart=(UChar32)row[0];
304     }
305     if(pRangeEnd!=NULL) {
306         *pRangeEnd=(UChar32)row[1]-1;
307     }
308     return row+2;
309 }
310 
311 static int32_t U_CALLCONV
upvec_compareRows(const void * context,const void * l,const void * r)312 upvec_compareRows(const void *context, const void *l, const void *r) {
313     const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
314     const UPropsVectors *pv=(const UPropsVectors *)context;
315     int32_t i, count, columns;
316 
317     count=columns=pv->columns; /* includes start/limit columns */
318 
319     /* start comparing after start/limit but wrap around to them */
320     i=2;
321     do {
322         if(left[i]!=right[i]) {
323             return left[i]<right[i] ? -1 : 1;
324         }
325         if(++i==columns) {
326             i=0;
327         }
328     } while(--count>0);
329 
330     return 0;
331 }
332 
333 U_CAPI void U_EXPORT2
upvec_compact(UPropsVectors * pv,UPVecCompactHandler * handler,void * context,UErrorCode * pErrorCode)334 upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
335     uint32_t *row;
336     int32_t i, columns, valueColumns, rows, count;
337     UChar32 start, limit;
338 
339     /* argument checking */
340     if(U_FAILURE(*pErrorCode)) {
341         return;
342     }
343     if(handler==NULL) {
344         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
345         return;
346     }
347     if(pv->isCompacted) {
348         return;
349     }
350 
351     /* Set the flag now: Sorting and compacting destroys the builder data structure. */
352     pv->isCompacted=TRUE;
353 
354     rows=pv->rows;
355     columns=pv->columns;
356     U_ASSERT(columns>=3); /* upvec_open asserts this */
357     valueColumns=columns-2; /* not counting start & limit */
358 
359     /* sort the properties vectors to find unique vector values */
360     uprv_sortArray(pv->v, rows, columns*4,
361                    upvec_compareRows, pv, FALSE, pErrorCode);
362     if(U_FAILURE(*pErrorCode)) {
363         return;
364     }
365 
366     /*
367      * Find and set the special values.
368      * This has to do almost the same work as the compaction below,
369      * to find the indexes where the special-value rows will move.
370      */
371     row=pv->v;
372     count=-valueColumns;
373     for(i=0; i<rows; ++i) {
374         start=(UChar32)row[0];
375 
376         /* count a new values vector if it is different from the current one */
377         if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
378             count+=valueColumns;
379         }
380 
381         if(start>=UPVEC_FIRST_SPECIAL_CP) {
382             handler(context, start, start, count, row+2, valueColumns, pErrorCode);
383             if(U_FAILURE(*pErrorCode)) {
384                 return;
385             }
386         }
387 
388         row+=columns;
389     }
390 
391     /* count is at the beginning of the last vector, add valueColumns to include that last vector */
392     count+=valueColumns;
393 
394     /* Call the handler once more to signal the start of delivering real values. */
395     handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
396             count, row-valueColumns, valueColumns, pErrorCode);
397     if(U_FAILURE(*pErrorCode)) {
398         return;
399     }
400 
401     /*
402      * Move vector contents up to a contiguous array with only unique
403      * vector values, and call the handler function for each vector.
404      *
405      * This destroys the Properties Vector structure and replaces it
406      * with an array of just vector values.
407      */
408     row=pv->v;
409     count=-valueColumns;
410     for(i=0; i<rows; ++i) {
411         /* fetch these first before memmove() may overwrite them */
412         start=(UChar32)row[0];
413         limit=(UChar32)row[1];
414 
415         /* add a new values vector if it is different from the current one */
416         if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
417             count+=valueColumns;
418             uprv_memmove(pv->v+count, row+2, valueColumns*4);
419         }
420 
421         if(start<UPVEC_FIRST_SPECIAL_CP) {
422             handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
423             if(U_FAILURE(*pErrorCode)) {
424                 return;
425             }
426         }
427 
428         row+=columns;
429     }
430 
431     /* count is at the beginning of the last vector, add one to include that last vector */
432     pv->rows=count/valueColumns+1;
433 }
434 
435 U_CAPI const uint32_t * U_EXPORT2
upvec_getArray(const UPropsVectors * pv,int32_t * pRows,int32_t * pColumns)436 upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
437     if(!pv->isCompacted) {
438         return NULL;
439     }
440     if(pRows!=NULL) {
441         *pRows=pv->rows;
442     }
443     if(pColumns!=NULL) {
444         *pColumns=pv->columns-2;
445     }
446     return pv->v;
447 }
448 
449 U_CAPI uint32_t * U_EXPORT2
upvec_cloneArray(const UPropsVectors * pv,int32_t * pRows,int32_t * pColumns,UErrorCode * pErrorCode)450 upvec_cloneArray(const UPropsVectors *pv,
451                  int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
452     uint32_t *clonedArray;
453     int32_t byteLength;
454 
455     if(U_FAILURE(*pErrorCode)) {
456         return NULL;
457     }
458     if(!pv->isCompacted) {
459         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
460         return NULL;
461     }
462     byteLength=pv->rows*(pv->columns-2)*4;
463     clonedArray=(uint32_t *)uprv_malloc(byteLength);
464     if(clonedArray==NULL) {
465         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
466         return NULL;
467     }
468     uprv_memcpy(clonedArray, pv->v, byteLength);
469     if(pRows!=NULL) {
470         *pRows=pv->rows;
471     }
472     if(pColumns!=NULL) {
473         *pColumns=pv->columns-2;
474     }
475     return clonedArray;
476 }
477 
478 U_CAPI UTrie2 * U_EXPORT2
upvec_compactToUTrie2WithRowIndexes(UPropsVectors * pv,UErrorCode * pErrorCode)479 upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
480     UPVecToUTrie2Context toUTrie2={ NULL };
481     upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
482     utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
483     if(U_FAILURE(*pErrorCode)) {
484         utrie2_close(toUTrie2.trie);
485         toUTrie2.trie=NULL;
486     }
487     return toUTrie2.trie;
488 }
489 
490 /*
491  * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
492  * some 16-bit field and builds and returns a UTrie2.
493  */
494 
495 U_CAPI void U_CALLCONV
upvec_compactToUTrie2Handler(void * context,UChar32 start,UChar32 end,int32_t rowIndex,uint32_t * row,int32_t columns,UErrorCode * pErrorCode)496 upvec_compactToUTrie2Handler(void *context,
497                              UChar32 start, UChar32 end,
498                              int32_t rowIndex, uint32_t *row, int32_t columns,
499                              UErrorCode *pErrorCode) {
500     UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
501     if(start<UPVEC_FIRST_SPECIAL_CP) {
502         utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode);
503     } else {
504         switch(start) {
505         case UPVEC_INITIAL_VALUE_CP:
506             toUTrie2->initialValue=rowIndex;
507             break;
508         case UPVEC_ERROR_VALUE_CP:
509             toUTrie2->errorValue=rowIndex;
510             break;
511         case UPVEC_START_REAL_VALUES_CP:
512             toUTrie2->maxValue=rowIndex;
513             if(rowIndex>0xffff) {
514                 /* too many rows for a 16-bit trie */
515                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
516             } else {
517                 toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
518                                            toUTrie2->errorValue, pErrorCode);
519             }
520             break;
521         default:
522             break;
523         }
524     }
525 }
526