1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  ******************************************************************************
5  * Copyright (C) 1996-2011, International Business Machines Corporation and   *
6  * others. All Rights Reserved.                                               *
7  ******************************************************************************
8  */
9 
10 /**
11  * Store bits (Unicode character properties) in bit set vectors.
12  *
13  * This is a port of the C++ class UPropsVectors from ICU4C
14  *
15  * @author Shaopeng Jia
16  * @internal
17  */
18 
19 package com.ibm.icu.impl;
20 
21 import java.util.Arrays;
22 import java.util.Comparator;
23 
24 /**
25  * Unicode Properties Vectors associated with code point ranges.
26  *
27  * Rows of primitive integers in a contiguous array store the range limits and
28  * the properties vectors.
29  *
30  * In each row, row[0] contains the start code point and row[1] contains the
31  * limit code point, which is the start of the next range.
32  *
33  * Initially, there is only one range [0..0x110000] with values 0.
34  *
35  * It would be possible to store only one range boundary per row, but
36  * self-contained rows allow to later sort them by contents.
37  */
38 public class PropsVectors {
39     private int v[];
40     private int columns; // number of columns, plus two for start
41     // and limit values
42     private int maxRows;
43     private int rows;
44     private int prevRow; // search optimization: remember last row seen
45     private boolean isCompacted;
46 
47     // internal function to compare elements in v and target. Return true iff
48     // elements in v starting from index1 to index1 + length - 1
49     // are exactly the same as elements in target
50     // starting from index2 to index2 + length - 1
areElementsSame(int index1, int[] target, int index2, int length)51     private boolean areElementsSame(int index1, int[] target, int index2,
52             int length) {
53         for (int i = 0; i < length; ++i) {
54             if (v[index1 + i] != target[index2 + i]) {
55                 return false;
56             }
57         }
58         return true;
59     }
60 
61     // internal function which given rangeStart, returns
62     // index where v[index]<=rangeStart<v[index+1].
63     // The returned index is a multiple of columns, and therefore
64     // points to the start of a row.
findRow(int rangeStart)65     private int findRow(int rangeStart) {
66         int index = 0;
67 
68         // check the vicinity of the last-seen row (start
69         // searching with an unrolled loop)
70 
71         index = prevRow * columns;
72         if (rangeStart >= v[index]) {
73             if (rangeStart < v[index + 1]) {
74                 // same row as last seen
75                 return index;
76             } else {
77                 index += columns;
78                 if (rangeStart < v[index + 1]) {
79                     ++prevRow;
80                     return index;
81                 } else {
82                     index += columns;
83                     if (rangeStart < v[index + 1]) {
84                         prevRow += 2;
85                         return index;
86                     } else if ((rangeStart - v[index + 1]) < 10) {
87                         // we are close, continue looping
88                         prevRow += 2;
89                         do {
90                             ++prevRow;
91                             index += columns;
92                         } while (rangeStart >= v[index + 1]);
93                         return index;
94                     }
95                 }
96             }
97         } else if (rangeStart < v[1]) {
98             // the very first row
99             prevRow = 0;
100             return 0;
101         }
102 
103         // do a binary search for the start of the range
104         int start = 0;
105         int mid = 0;
106         int limit = rows;
107         while (start < limit - 1) {
108             mid = (start + limit) / 2;
109             index = columns * mid;
110             if (rangeStart < v[index]) {
111                 limit = mid;
112             } else if (rangeStart < v[index + 1]) {
113                 prevRow = mid;
114                 return index;
115             } else {
116                 start = mid;
117             }
118         }
119 
120         // must be found because all ranges together always cover
121         // all of Unicode
122         prevRow = start;
123         index = start * columns;
124         return index;
125     }
126 
127     /*
128      * Special pseudo code points for storing the initialValue and the
129      * errorValue which are used to initialize a Trie or similar.
130      */
131     public final static int FIRST_SPECIAL_CP = 0x110000;
132     public final static int INITIAL_VALUE_CP = 0x110000;
133     public final static int ERROR_VALUE_CP = 0x110001;
134     public final static int MAX_CP = 0x110001;
135 
136     public final static int INITIAL_ROWS = 1 << 12;
137     public final static int MEDIUM_ROWS = 1 << 16;
138     public final static int MAX_ROWS = MAX_CP + 1;
139 
140     /*
141      * Constructor.
142      * @param numOfColumns Number of value integers (32-bit int) per row.
143      */
PropsVectors(int numOfColumns)144     public PropsVectors(int numOfColumns) {
145         if (numOfColumns < 1) {
146             throw new IllegalArgumentException("numOfColumns need to be no "
147                     + "less than 1; but it is " + numOfColumns);
148         }
149         columns = numOfColumns + 2; // count range start and limit columns
150         v = new int[INITIAL_ROWS * columns];
151         maxRows = INITIAL_ROWS;
152         rows = 2 + (MAX_CP - FIRST_SPECIAL_CP);
153         prevRow = 0;
154         isCompacted = false;
155         v[0] = 0;
156         v[1] = 0x110000;
157         int index = columns;
158         for (int cp = FIRST_SPECIAL_CP; cp <= MAX_CP; ++cp) {
159             v[index] = cp;
160             v[index + 1] = cp + 1;
161             index += columns;
162         }
163     }
164 
165     /*
166      * In rows for code points [start..end], select the column, reset the mask
167      * bits and set the value bits (ANDed with the mask).
168      *
169      * @throws IllegalArgumentException
170      *
171      * @throws IllegalStateException
172      *
173      * @throws IndexOutOfBoundsException
174      */
setValue(int start, int end, int column, int value, int mask)175     public void setValue(int start, int end, int column, int value, int mask) {
176         if (start < 0 || start > end || end > MAX_CP || column < 0
177                 || column >= (columns - 2)) {
178             throw new IllegalArgumentException();
179         }
180         if (isCompacted) {
181             throw new IllegalStateException("Shouldn't be called after"
182                     + "compact()!");
183         }
184 
185         int firstRow, lastRow;
186         int limit = end + 1;
187         boolean splitFirstRow, splitLastRow;
188         // skip range start and limit columns
189         column += 2;
190         value &= mask;
191 
192         // find the rows whose ranges overlap with the input range
193         // find the first and last row, always successful
194         firstRow = findRow(start);
195         lastRow = findRow(end);
196 
197         /*
198          * Rows need to be split if they partially overlap with the input range
199          * (only possible for the first and last rows) and if their value
200          * differs from the input value.
201          */
202         splitFirstRow = (start != v[firstRow] && value != (v[firstRow + column] & mask));
203         splitLastRow = (limit != v[lastRow + 1] && value != (v[lastRow + column] & mask));
204 
205         // split first/last rows if necessary
206         if (splitFirstRow || splitLastRow) {
207             int rowsToExpand = 0;
208             if (splitFirstRow) {
209                 ++rowsToExpand;
210             }
211             if (splitLastRow) {
212                 ++rowsToExpand;
213             }
214             int newMaxRows = 0;
215             if ((rows + rowsToExpand) > maxRows) {
216                 if (maxRows < MEDIUM_ROWS) {
217                     newMaxRows = MEDIUM_ROWS;
218                 } else if (maxRows < MAX_ROWS) {
219                     newMaxRows = MAX_ROWS;
220                 } else {
221                     throw new IndexOutOfBoundsException(
222                             "MAX_ROWS exceeded! Increase it to a higher value" +
223                             "in the implementation");
224                 }
225                 int[] temp = new int[newMaxRows * columns];
226                 System.arraycopy(v, 0, temp, 0, rows * columns);
227                 v = temp;
228                 maxRows = newMaxRows;
229             }
230 
231             // count the number of row cells to move after the last row,
232             // and move them
233             int count = (rows * columns) - (lastRow + columns);
234             if (count > 0) {
235                 System.arraycopy(v, lastRow + columns, v, lastRow
236                         + (1 + rowsToExpand) * columns, count);
237             }
238             rows += rowsToExpand;
239 
240             // split the first row, and move the firstRow pointer
241             // to the second part
242             if (splitFirstRow) {
243                 // copy all affected rows up one and move the lastRow pointer
244                 count = lastRow - firstRow + columns;
245                 System.arraycopy(v, firstRow, v, firstRow + columns, count);
246                 lastRow += columns;
247 
248                 // split the range and move the firstRow pointer
249                 v[firstRow + 1] = v[firstRow + columns] = start;
250                 firstRow += columns;
251             }
252 
253             // split the last row
254             if (splitLastRow) {
255                 // copy the last row data
256                 System.arraycopy(v, lastRow, v, lastRow + columns, columns);
257 
258                 // split the range and move the firstRow pointer
259                 v[lastRow + 1] = v[lastRow + columns] = limit;
260             }
261         }
262 
263         // set the "row last seen" to the last row for the range
264         prevRow = lastRow / columns;
265 
266         // set the input value in all remaining rows
267         firstRow += column;
268         lastRow += column;
269         mask = ~mask;
270         for (;;) {
271             v[firstRow] = (v[firstRow] & mask) | value;
272             if (firstRow == lastRow) {
273                 break;
274             }
275             firstRow += columns;
276         }
277     }
278 
279     /*
280      * Always returns 0 if called after compact().
281      */
getValue(int c, int column)282     public int getValue(int c, int column) {
283         if (isCompacted || c < 0 || c > MAX_CP || column < 0
284                 || column >= (columns - 2)) {
285             return 0;
286         }
287         int index = findRow(c);
288         return v[index + 2 + column];
289     }
290 
291     /*
292      * Returns an array which contains value elements
293      * in row rowIndex.
294      *
295      * @throws IllegalStateException
296      * @throws IllegalArgumentException
297      */
getRow(int rowIndex)298     public int[] getRow(int rowIndex) {
299         if (isCompacted) {
300             throw new IllegalStateException(
301                     "Illegal Invocation of the method after compact()");
302         }
303         if (rowIndex < 0 || rowIndex > rows) {
304             throw new IllegalArgumentException("rowIndex out of bound!");
305         }
306         int[] rowToReturn = new int[columns - 2];
307         System.arraycopy(v, rowIndex * columns + 2, rowToReturn, 0,
308                          columns - 2);
309         return rowToReturn;
310     }
311 
312     /*
313      * Returns an int which is the start codepoint
314      * in row rowIndex.
315      *
316      * @throws IllegalStateException
317      *
318      * @throws IllegalArgumentException
319      */
getRowStart(int rowIndex)320     public int getRowStart(int rowIndex) {
321         if (isCompacted) {
322             throw new IllegalStateException(
323                     "Illegal Invocation of the method after compact()");
324         }
325         if (rowIndex < 0 || rowIndex > rows) {
326             throw new IllegalArgumentException("rowIndex out of bound!");
327         }
328         return v[rowIndex * columns];
329     }
330 
331     /*
332      * Returns an int which is the limit codepoint
333      * minus 1 in row rowIndex.
334      *
335      * @throws IllegalStateException
336      *
337      * @throws IllegalArgumentException
338      */
getRowEnd(int rowIndex)339     public int getRowEnd(int rowIndex) {
340         if (isCompacted) {
341             throw new IllegalStateException(
342                     "Illegal Invocation of the method after compact()");
343         }
344         if (rowIndex < 0 || rowIndex > rows) {
345             throw new IllegalArgumentException("rowIndex out of bound!");
346         }
347         return v[rowIndex * columns + 1] - 1;
348     }
349 
350     /*
351      * Compact the vectors:
352      * - modify the memory
353      * - keep only unique vectors
354      * - store them contiguously from the beginning of the memory
355      * - for each (non-unique) row, call the respective function in
356      *   CompactHandler
357      *
358      * The handler's rowIndex is the index of the row in the compacted
359      * memory block. Therefore, it starts at 0 increases in increments of the
360      * columns value.
361      *
362      * In a first phase, only special values are delivered (each exactly once).
363      * Then CompactHandler::startRealValues() is called
364      * where rowIndex is the length of the compacted array.
365      * Then, in the second phase, the CompactHandler::setRowIndexForRange() is
366      * called for each row of real values.
367      */
compact(CompactHandler compactor)368     public void compact(CompactHandler compactor) {
369         if (isCompacted) {
370             return;
371         }
372 
373         // Set the flag now: Sorting and compacting destroys the builder
374         // data structure.
375         isCompacted = true;
376         int valueColumns = columns - 2; // not counting start & limit
377 
378         // sort the properties vectors to find unique vector values
379         Integer[] indexArray = new Integer[rows];
380         for (int i = 0; i < rows; ++i) {
381             indexArray[i] = Integer.valueOf(columns * i);
382         }
383 
384         Arrays.sort(indexArray, new Comparator<Integer>() {
385             @Override
386             public int compare(Integer o1, Integer o2) {
387                 int indexOfRow1 = o1.intValue();
388                 int indexOfRow2 = o2.intValue();
389                 int count = columns; // includes start/limit columns
390 
391                 // start comparing after start/limit
392                 // but wrap around to them
393                 int index = 2;
394                 do {
395                     if (v[indexOfRow1 + index] != v[indexOfRow2 + index]) {
396                         return v[indexOfRow1 + index] < v[indexOfRow2 + index] ? -1
397                                 : 1;
398                     }
399                     if (++index == columns) {
400                         index = 0;
401                     }
402                 } while (--count > 0);
403 
404                 return 0;
405             }
406         });
407 
408         /*
409          * Find and set the special values. This has to do almost the same work
410          * as the compaction below, to find the indexes where the special-value
411          * rows will move.
412          */
413         int count = -valueColumns;
414         for (int i = 0; i < rows; ++i) {
415             int start = v[indexArray[i].intValue()];
416 
417             // count a new values vector if it is different
418             // from the current one
419             if (count < 0 || !areElementsSame(indexArray[i].intValue() + 2, v,
420                     indexArray[i-1].intValue() + 2, valueColumns)) {
421                 count += valueColumns;
422             }
423 
424             if (start == INITIAL_VALUE_CP) {
425                 compactor.setRowIndexForInitialValue(count);
426             } else if (start == ERROR_VALUE_CP) {
427                 compactor.setRowIndexForErrorValue(count);
428             }
429         }
430 
431         // count is at the beginning of the last vector,
432         // add valueColumns to include that last vector
433         count += valueColumns;
434 
435         // Call the handler once more to signal the start of
436         // delivering real values.
437         compactor.startRealValues(count);
438 
439         /*
440          * Move vector contents up to a contiguous array with only unique
441          * vector values, and call the handler function for each vector.
442          *
443          * This destroys the Properties Vector structure and replaces it
444          * with an array of just vector values.
445          */
446         int[] temp = new int[count];
447         count = -valueColumns;
448         for (int i = 0; i < rows; ++i) {
449             int start = v[indexArray[i].intValue()];
450             int limit = v[indexArray[i].intValue() + 1];
451 
452             // count a new values vector if it is different
453             // from the current one
454             if (count < 0 || !areElementsSame(indexArray[i].intValue() + 2,
455                     temp, count, valueColumns)) {
456                 count += valueColumns;
457                 System.arraycopy(v, indexArray[i].intValue() + 2, temp, count,
458                         valueColumns);
459             }
460 
461             if (start < FIRST_SPECIAL_CP) {
462                 compactor.setRowIndexForRange(start, limit - 1, count);
463             }
464         }
465         v = temp;
466 
467         // count is at the beginning of the last vector,
468         // add one to include that last vector
469         rows = count / valueColumns + 1;
470     }
471 
472     /*
473      * Get the vectors array after calling compact().
474      *
475      * @throws IllegalStateException
476      */
getCompactedArray()477     public int[] getCompactedArray() {
478         if (!isCompacted) {
479             throw new IllegalStateException(
480                     "Illegal Invocation of the method before compact()");
481         }
482         return v;
483     }
484 
485     /*
486      * Get the number of rows for the compacted array.
487      *
488      * @throws IllegalStateException
489      */
getCompactedRows()490     public int getCompactedRows() {
491         if (!isCompacted) {
492             throw new IllegalStateException(
493                     "Illegal Invocation of the method before compact()");
494         }
495         return rows;
496     }
497 
498     /*
499      * Get the number of columns for the compacted array.
500      *
501      * @throws IllegalStateException
502      */
getCompactedColumns()503     public int getCompactedColumns() {
504         if (!isCompacted) {
505             throw new IllegalStateException(
506                     "Illegal Invocation of the method before compact()");
507         }
508         return columns - 2;
509     }
510 
511     /*
512      * Call compact(), create a IntTrie with indexes into the compacted
513      * vectors array.
514      */
compactToTrieWithRowIndexes()515     public IntTrie compactToTrieWithRowIndexes() {
516         PVecToTrieCompactHandler compactor = new PVecToTrieCompactHandler();
517         compact(compactor);
518         return compactor.builder.serialize(new DefaultGetFoldedValue(
519                 compactor.builder), new DefaultGetFoldingOffset());
520     }
521 
522     // inner class implementation of Trie.DataManipulate
523     private static class DefaultGetFoldingOffset implements Trie.DataManipulate {
524         @Override
getFoldingOffset(int value)525         public int getFoldingOffset(int value) {
526             return value;
527         }
528     }
529 
530     // inner class implementation of TrieBuilder.DataManipulate
531     private static class DefaultGetFoldedValue implements
532             TrieBuilder.DataManipulate {
533         private IntTrieBuilder builder;
534 
DefaultGetFoldedValue(IntTrieBuilder inBuilder)535         public DefaultGetFoldedValue(IntTrieBuilder inBuilder) {
536             builder = inBuilder;
537         }
538 
539         @Override
getFoldedValue(int start, int offset)540         public int getFoldedValue(int start, int offset) {
541             int initialValue = builder.m_initialValue_;
542             int limit = start + 0x400;
543             while (start < limit) {
544                 boolean[] inBlockZero = new boolean[1];
545                 int value = builder.getValue(start, inBlockZero);
546                 if (inBlockZero[0]) {
547                     start += TrieBuilder.DATA_BLOCK_LENGTH;
548                 } else if (value != initialValue) {
549                     return offset;
550                 } else {
551                     ++start;
552                 }
553             }
554             return 0;
555         }
556     }
557 
558     public static interface CompactHandler {
setRowIndexForRange(int start, int end, int rowIndex)559         public void setRowIndexForRange(int start, int end, int rowIndex);
setRowIndexForInitialValue(int rowIndex)560         public void setRowIndexForInitialValue(int rowIndex);
setRowIndexForErrorValue(int rowIndex)561         public void setRowIndexForErrorValue(int rowIndex);
startRealValues(int rowIndex)562         public void startRealValues(int rowIndex);
563     }
564 }