1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ****************************************************************************** 5 * Copyright (C) 1996-2011, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ****************************************************************************** 8 */ 9 10 /** 11 * Store bits (Unicode character properties) in bit set vectors. 12 * 13 * This is a port of the C++ class UPropsVectors from ICU4C 14 * 15 * @author Shaopeng Jia 16 * @internal 17 */ 18 19 package com.ibm.icu.impl; 20 21 import java.util.Arrays; 22 import java.util.Comparator; 23 24 /** 25 * Unicode Properties Vectors associated with code point ranges. 26 * 27 * Rows of primitive integers in a contiguous array store the range limits and 28 * the properties vectors. 29 * 30 * In each row, row[0] contains the start code point and row[1] contains the 31 * limit code point, which is the start of the next range. 32 * 33 * Initially, there is only one range [0..0x110000] with values 0. 34 * 35 * It would be possible to store only one range boundary per row, but 36 * self-contained rows allow to later sort them by contents. 37 */ 38 public class PropsVectors { 39 private int v[]; 40 private int columns; // number of columns, plus two for start 41 // and limit values 42 private int maxRows; 43 private int rows; 44 private int prevRow; // search optimization: remember last row seen 45 private boolean isCompacted; 46 47 // internal function to compare elements in v and target. Return true iff 48 // elements in v starting from index1 to index1 + length - 1 49 // are exactly the same as elements in target 50 // starting from index2 to index2 + length - 1 areElementsSame(int index1, int[] target, int index2, int length)51 private boolean areElementsSame(int index1, int[] target, int index2, 52 int length) { 53 for (int i = 0; i < length; ++i) { 54 if (v[index1 + i] != target[index2 + i]) { 55 return false; 56 } 57 } 58 return true; 59 } 60 61 // internal function which given rangeStart, returns 62 // index where v[index]<=rangeStart<v[index+1]. 63 // The returned index is a multiple of columns, and therefore 64 // points to the start of a row. findRow(int rangeStart)65 private int findRow(int rangeStart) { 66 int index = 0; 67 68 // check the vicinity of the last-seen row (start 69 // searching with an unrolled loop) 70 71 index = prevRow * columns; 72 if (rangeStart >= v[index]) { 73 if (rangeStart < v[index + 1]) { 74 // same row as last seen 75 return index; 76 } else { 77 index += columns; 78 if (rangeStart < v[index + 1]) { 79 ++prevRow; 80 return index; 81 } else { 82 index += columns; 83 if (rangeStart < v[index + 1]) { 84 prevRow += 2; 85 return index; 86 } else if ((rangeStart - v[index + 1]) < 10) { 87 // we are close, continue looping 88 prevRow += 2; 89 do { 90 ++prevRow; 91 index += columns; 92 } while (rangeStart >= v[index + 1]); 93 return index; 94 } 95 } 96 } 97 } else if (rangeStart < v[1]) { 98 // the very first row 99 prevRow = 0; 100 return 0; 101 } 102 103 // do a binary search for the start of the range 104 int start = 0; 105 int mid = 0; 106 int limit = rows; 107 while (start < limit - 1) { 108 mid = (start + limit) / 2; 109 index = columns * mid; 110 if (rangeStart < v[index]) { 111 limit = mid; 112 } else if (rangeStart < v[index + 1]) { 113 prevRow = mid; 114 return index; 115 } else { 116 start = mid; 117 } 118 } 119 120 // must be found because all ranges together always cover 121 // all of Unicode 122 prevRow = start; 123 index = start * columns; 124 return index; 125 } 126 127 /* 128 * Special pseudo code points for storing the initialValue and the 129 * errorValue which are used to initialize a Trie or similar. 130 */ 131 public final static int FIRST_SPECIAL_CP = 0x110000; 132 public final static int INITIAL_VALUE_CP = 0x110000; 133 public final static int ERROR_VALUE_CP = 0x110001; 134 public final static int MAX_CP = 0x110001; 135 136 public final static int INITIAL_ROWS = 1 << 12; 137 public final static int MEDIUM_ROWS = 1 << 16; 138 public final static int MAX_ROWS = MAX_CP + 1; 139 140 /* 141 * Constructor. 142 * @param numOfColumns Number of value integers (32-bit int) per row. 143 */ PropsVectors(int numOfColumns)144 public PropsVectors(int numOfColumns) { 145 if (numOfColumns < 1) { 146 throw new IllegalArgumentException("numOfColumns need to be no " 147 + "less than 1; but it is " + numOfColumns); 148 } 149 columns = numOfColumns + 2; // count range start and limit columns 150 v = new int[INITIAL_ROWS * columns]; 151 maxRows = INITIAL_ROWS; 152 rows = 2 + (MAX_CP - FIRST_SPECIAL_CP); 153 prevRow = 0; 154 isCompacted = false; 155 v[0] = 0; 156 v[1] = 0x110000; 157 int index = columns; 158 for (int cp = FIRST_SPECIAL_CP; cp <= MAX_CP; ++cp) { 159 v[index] = cp; 160 v[index + 1] = cp + 1; 161 index += columns; 162 } 163 } 164 165 /* 166 * In rows for code points [start..end], select the column, reset the mask 167 * bits and set the value bits (ANDed with the mask). 168 * 169 * @throws IllegalArgumentException 170 * 171 * @throws IllegalStateException 172 * 173 * @throws IndexOutOfBoundsException 174 */ setValue(int start, int end, int column, int value, int mask)175 public void setValue(int start, int end, int column, int value, int mask) { 176 if (start < 0 || start > end || end > MAX_CP || column < 0 177 || column >= (columns - 2)) { 178 throw new IllegalArgumentException(); 179 } 180 if (isCompacted) { 181 throw new IllegalStateException("Shouldn't be called after" 182 + "compact()!"); 183 } 184 185 int firstRow, lastRow; 186 int limit = end + 1; 187 boolean splitFirstRow, splitLastRow; 188 // skip range start and limit columns 189 column += 2; 190 value &= mask; 191 192 // find the rows whose ranges overlap with the input range 193 // find the first and last row, always successful 194 firstRow = findRow(start); 195 lastRow = findRow(end); 196 197 /* 198 * Rows need to be split if they partially overlap with the input range 199 * (only possible for the first and last rows) and if their value 200 * differs from the input value. 201 */ 202 splitFirstRow = (start != v[firstRow] && value != (v[firstRow + column] & mask)); 203 splitLastRow = (limit != v[lastRow + 1] && value != (v[lastRow + column] & mask)); 204 205 // split first/last rows if necessary 206 if (splitFirstRow || splitLastRow) { 207 int rowsToExpand = 0; 208 if (splitFirstRow) { 209 ++rowsToExpand; 210 } 211 if (splitLastRow) { 212 ++rowsToExpand; 213 } 214 int newMaxRows = 0; 215 if ((rows + rowsToExpand) > maxRows) { 216 if (maxRows < MEDIUM_ROWS) { 217 newMaxRows = MEDIUM_ROWS; 218 } else if (maxRows < MAX_ROWS) { 219 newMaxRows = MAX_ROWS; 220 } else { 221 throw new IndexOutOfBoundsException( 222 "MAX_ROWS exceeded! Increase it to a higher value" + 223 "in the implementation"); 224 } 225 int[] temp = new int[newMaxRows * columns]; 226 System.arraycopy(v, 0, temp, 0, rows * columns); 227 v = temp; 228 maxRows = newMaxRows; 229 } 230 231 // count the number of row cells to move after the last row, 232 // and move them 233 int count = (rows * columns) - (lastRow + columns); 234 if (count > 0) { 235 System.arraycopy(v, lastRow + columns, v, lastRow 236 + (1 + rowsToExpand) * columns, count); 237 } 238 rows += rowsToExpand; 239 240 // split the first row, and move the firstRow pointer 241 // to the second part 242 if (splitFirstRow) { 243 // copy all affected rows up one and move the lastRow pointer 244 count = lastRow - firstRow + columns; 245 System.arraycopy(v, firstRow, v, firstRow + columns, count); 246 lastRow += columns; 247 248 // split the range and move the firstRow pointer 249 v[firstRow + 1] = v[firstRow + columns] = start; 250 firstRow += columns; 251 } 252 253 // split the last row 254 if (splitLastRow) { 255 // copy the last row data 256 System.arraycopy(v, lastRow, v, lastRow + columns, columns); 257 258 // split the range and move the firstRow pointer 259 v[lastRow + 1] = v[lastRow + columns] = limit; 260 } 261 } 262 263 // set the "row last seen" to the last row for the range 264 prevRow = lastRow / columns; 265 266 // set the input value in all remaining rows 267 firstRow += column; 268 lastRow += column; 269 mask = ~mask; 270 for (;;) { 271 v[firstRow] = (v[firstRow] & mask) | value; 272 if (firstRow == lastRow) { 273 break; 274 } 275 firstRow += columns; 276 } 277 } 278 279 /* 280 * Always returns 0 if called after compact(). 281 */ getValue(int c, int column)282 public int getValue(int c, int column) { 283 if (isCompacted || c < 0 || c > MAX_CP || column < 0 284 || column >= (columns - 2)) { 285 return 0; 286 } 287 int index = findRow(c); 288 return v[index + 2 + column]; 289 } 290 291 /* 292 * Returns an array which contains value elements 293 * in row rowIndex. 294 * 295 * @throws IllegalStateException 296 * @throws IllegalArgumentException 297 */ getRow(int rowIndex)298 public int[] getRow(int rowIndex) { 299 if (isCompacted) { 300 throw new IllegalStateException( 301 "Illegal Invocation of the method after compact()"); 302 } 303 if (rowIndex < 0 || rowIndex > rows) { 304 throw new IllegalArgumentException("rowIndex out of bound!"); 305 } 306 int[] rowToReturn = new int[columns - 2]; 307 System.arraycopy(v, rowIndex * columns + 2, rowToReturn, 0, 308 columns - 2); 309 return rowToReturn; 310 } 311 312 /* 313 * Returns an int which is the start codepoint 314 * in row rowIndex. 315 * 316 * @throws IllegalStateException 317 * 318 * @throws IllegalArgumentException 319 */ getRowStart(int rowIndex)320 public int getRowStart(int rowIndex) { 321 if (isCompacted) { 322 throw new IllegalStateException( 323 "Illegal Invocation of the method after compact()"); 324 } 325 if (rowIndex < 0 || rowIndex > rows) { 326 throw new IllegalArgumentException("rowIndex out of bound!"); 327 } 328 return v[rowIndex * columns]; 329 } 330 331 /* 332 * Returns an int which is the limit codepoint 333 * minus 1 in row rowIndex. 334 * 335 * @throws IllegalStateException 336 * 337 * @throws IllegalArgumentException 338 */ getRowEnd(int rowIndex)339 public int getRowEnd(int rowIndex) { 340 if (isCompacted) { 341 throw new IllegalStateException( 342 "Illegal Invocation of the method after compact()"); 343 } 344 if (rowIndex < 0 || rowIndex > rows) { 345 throw new IllegalArgumentException("rowIndex out of bound!"); 346 } 347 return v[rowIndex * columns + 1] - 1; 348 } 349 350 /* 351 * Compact the vectors: 352 * - modify the memory 353 * - keep only unique vectors 354 * - store them contiguously from the beginning of the memory 355 * - for each (non-unique) row, call the respective function in 356 * CompactHandler 357 * 358 * The handler's rowIndex is the index of the row in the compacted 359 * memory block. Therefore, it starts at 0 increases in increments of the 360 * columns value. 361 * 362 * In a first phase, only special values are delivered (each exactly once). 363 * Then CompactHandler::startRealValues() is called 364 * where rowIndex is the length of the compacted array. 365 * Then, in the second phase, the CompactHandler::setRowIndexForRange() is 366 * called for each row of real values. 367 */ compact(CompactHandler compactor)368 public void compact(CompactHandler compactor) { 369 if (isCompacted) { 370 return; 371 } 372 373 // Set the flag now: Sorting and compacting destroys the builder 374 // data structure. 375 isCompacted = true; 376 int valueColumns = columns - 2; // not counting start & limit 377 378 // sort the properties vectors to find unique vector values 379 Integer[] indexArray = new Integer[rows]; 380 for (int i = 0; i < rows; ++i) { 381 indexArray[i] = Integer.valueOf(columns * i); 382 } 383 384 Arrays.sort(indexArray, new Comparator<Integer>() { 385 @Override 386 public int compare(Integer o1, Integer o2) { 387 int indexOfRow1 = o1.intValue(); 388 int indexOfRow2 = o2.intValue(); 389 int count = columns; // includes start/limit columns 390 391 // start comparing after start/limit 392 // but wrap around to them 393 int index = 2; 394 do { 395 if (v[indexOfRow1 + index] != v[indexOfRow2 + index]) { 396 return v[indexOfRow1 + index] < v[indexOfRow2 + index] ? -1 397 : 1; 398 } 399 if (++index == columns) { 400 index = 0; 401 } 402 } while (--count > 0); 403 404 return 0; 405 } 406 }); 407 408 /* 409 * Find and set the special values. This has to do almost the same work 410 * as the compaction below, to find the indexes where the special-value 411 * rows will move. 412 */ 413 int count = -valueColumns; 414 for (int i = 0; i < rows; ++i) { 415 int start = v[indexArray[i].intValue()]; 416 417 // count a new values vector if it is different 418 // from the current one 419 if (count < 0 || !areElementsSame(indexArray[i].intValue() + 2, v, 420 indexArray[i-1].intValue() + 2, valueColumns)) { 421 count += valueColumns; 422 } 423 424 if (start == INITIAL_VALUE_CP) { 425 compactor.setRowIndexForInitialValue(count); 426 } else if (start == ERROR_VALUE_CP) { 427 compactor.setRowIndexForErrorValue(count); 428 } 429 } 430 431 // count is at the beginning of the last vector, 432 // add valueColumns to include that last vector 433 count += valueColumns; 434 435 // Call the handler once more to signal the start of 436 // delivering real values. 437 compactor.startRealValues(count); 438 439 /* 440 * Move vector contents up to a contiguous array with only unique 441 * vector values, and call the handler function for each vector. 442 * 443 * This destroys the Properties Vector structure and replaces it 444 * with an array of just vector values. 445 */ 446 int[] temp = new int[count]; 447 count = -valueColumns; 448 for (int i = 0; i < rows; ++i) { 449 int start = v[indexArray[i].intValue()]; 450 int limit = v[indexArray[i].intValue() + 1]; 451 452 // count a new values vector if it is different 453 // from the current one 454 if (count < 0 || !areElementsSame(indexArray[i].intValue() + 2, 455 temp, count, valueColumns)) { 456 count += valueColumns; 457 System.arraycopy(v, indexArray[i].intValue() + 2, temp, count, 458 valueColumns); 459 } 460 461 if (start < FIRST_SPECIAL_CP) { 462 compactor.setRowIndexForRange(start, limit - 1, count); 463 } 464 } 465 v = temp; 466 467 // count is at the beginning of the last vector, 468 // add one to include that last vector 469 rows = count / valueColumns + 1; 470 } 471 472 /* 473 * Get the vectors array after calling compact(). 474 * 475 * @throws IllegalStateException 476 */ getCompactedArray()477 public int[] getCompactedArray() { 478 if (!isCompacted) { 479 throw new IllegalStateException( 480 "Illegal Invocation of the method before compact()"); 481 } 482 return v; 483 } 484 485 /* 486 * Get the number of rows for the compacted array. 487 * 488 * @throws IllegalStateException 489 */ getCompactedRows()490 public int getCompactedRows() { 491 if (!isCompacted) { 492 throw new IllegalStateException( 493 "Illegal Invocation of the method before compact()"); 494 } 495 return rows; 496 } 497 498 /* 499 * Get the number of columns for the compacted array. 500 * 501 * @throws IllegalStateException 502 */ getCompactedColumns()503 public int getCompactedColumns() { 504 if (!isCompacted) { 505 throw new IllegalStateException( 506 "Illegal Invocation of the method before compact()"); 507 } 508 return columns - 2; 509 } 510 511 /* 512 * Call compact(), create a IntTrie with indexes into the compacted 513 * vectors array. 514 */ compactToTrieWithRowIndexes()515 public IntTrie compactToTrieWithRowIndexes() { 516 PVecToTrieCompactHandler compactor = new PVecToTrieCompactHandler(); 517 compact(compactor); 518 return compactor.builder.serialize(new DefaultGetFoldedValue( 519 compactor.builder), new DefaultGetFoldingOffset()); 520 } 521 522 // inner class implementation of Trie.DataManipulate 523 private static class DefaultGetFoldingOffset implements Trie.DataManipulate { 524 @Override getFoldingOffset(int value)525 public int getFoldingOffset(int value) { 526 return value; 527 } 528 } 529 530 // inner class implementation of TrieBuilder.DataManipulate 531 private static class DefaultGetFoldedValue implements 532 TrieBuilder.DataManipulate { 533 private IntTrieBuilder builder; 534 DefaultGetFoldedValue(IntTrieBuilder inBuilder)535 public DefaultGetFoldedValue(IntTrieBuilder inBuilder) { 536 builder = inBuilder; 537 } 538 539 @Override getFoldedValue(int start, int offset)540 public int getFoldedValue(int start, int offset) { 541 int initialValue = builder.m_initialValue_; 542 int limit = start + 0x400; 543 while (start < limit) { 544 boolean[] inBlockZero = new boolean[1]; 545 int value = builder.getValue(start, inBlockZero); 546 if (inBlockZero[0]) { 547 start += TrieBuilder.DATA_BLOCK_LENGTH; 548 } else if (value != initialValue) { 549 return offset; 550 } else { 551 ++start; 552 } 553 } 554 return 0; 555 } 556 } 557 558 public static interface CompactHandler { setRowIndexForRange(int start, int end, int rowIndex)559 public void setRowIndexForRange(int start, int end, int rowIndex); setRowIndexForInitialValue(int rowIndex)560 public void setRowIndexForInitialValue(int rowIndex); setRowIndexForErrorValue(int rowIndex)561 public void setRowIndexForErrorValue(int rowIndex); startRealValues(int rowIndex)562 public void startRealValues(int rowIndex); 563 } 564 }