1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2009-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 */ 11 12 package com.ibm.icu.impl; 13 14 import com.ibm.icu.text.UnicodeSet.SpanCondition; 15 import com.ibm.icu.util.OutputInt; 16 17 /** 18 * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points. 19 * 20 * Latin-1: Look up bytes. 21 * 2-byte characters: Bits organized vertically. 22 * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. 23 * Supplementary characters: Binary search over 24 * the supplementary part of the parent set's inversion list. 25 */ 26 public final class BMPSet { 27 public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000); 28 29 /** 30 * One boolean ('true' or 'false') per Latin-1 character. 31 */ 32 private boolean[] latin1Contains; 33 34 /** 35 * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points 36 * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6} 37 * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead) 38 * 39 * Bits for 0..FF are unused (0). 40 */ 41 private int[] table7FF; 42 43 /** 44 * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks 45 * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12} 46 * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit 47 * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed 48 * and set.contains(c) must be called. 49 * 50 * Bits for 0..7FF are unused (0). 51 */ 52 private int[] bmpBlockBits; 53 54 /** 55 * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000, 56 * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are 57 * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points. 58 */ 59 private int[] list4kStarts; 60 61 /** 62 * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for 63 * supplementary code points. The list is terminated with list[listLength-1]=0x110000. 64 */ 65 private final int[] list; 66 private final int listLength; // length used; list may be longer to minimize reallocs 67 BMPSet(final int[] parentList, int parentListLength)68 public BMPSet(final int[] parentList, int parentListLength) { 69 list = parentList; 70 listLength = parentListLength; 71 latin1Contains = new boolean[0x100]; 72 table7FF = new int[64]; 73 bmpBlockBits = new int[64]; 74 list4kStarts = new int[18]; 75 76 /* 77 * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the 78 * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of 79 * indexes is for finding supplementary code points. 80 */ 81 list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1); 82 int i; 83 for (i = 1; i <= 0x10; ++i) { 84 list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1); 85 } 86 list4kStarts[0x11] = listLength - 1; 87 88 initBits(); 89 } 90 BMPSet(final BMPSet otherBMPSet, final int[] newParentList, int newParentListLength)91 public BMPSet(final BMPSet otherBMPSet, final int[] newParentList, int newParentListLength) { 92 list = newParentList; 93 listLength = newParentListLength; 94 latin1Contains = otherBMPSet.latin1Contains.clone(); 95 table7FF = otherBMPSet.table7FF.clone(); 96 bmpBlockBits = otherBMPSet.bmpBlockBits.clone(); 97 list4kStarts = otherBMPSet.list4kStarts.clone(); 98 } 99 contains(int c)100 public boolean contains(int c) { 101 if (c <= 0xff) { 102 return (latin1Contains[c]); 103 } else if (c <= 0x7ff) { 104 return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0); 105 } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) { 106 int lead = c >> 12; 107 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 108 if (twoBits <= 1) { 109 // All 64 code points with the same bits 15..6 110 // are either in the set or not. 111 return (0 != twoBits); 112 } else { 113 // Look up the code point in its 4k block of code points. 114 return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]); 115 } 116 } else if (c <= 0x10ffff) { 117 // surrogate or supplementary code point 118 return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); 119 } else { 120 // Out-of-range code points get false, consistent with long-standing 121 // behavior of UnicodeSet.contains(c). 122 return false; 123 } 124 } 125 126 /** 127 * Span the initial substring for which each character c has spanCondition==contains(c). It must be 128 * spanCondition==0 or 1. 129 * 130 * @param start The start index 131 * @param outCount If not null: Receives the number of code points in the span. 132 * @return the limit (exclusive end) of the span 133 * 134 * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for 135 * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points 136 * as usual in ICU. 137 */ span(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)138 public final int span(CharSequence s, int start, SpanCondition spanCondition, 139 OutputInt outCount) { 140 char c, c2; 141 int i = start; 142 int limit = s.length(); 143 int numSupplementary = 0; 144 if (SpanCondition.NOT_CONTAINED != spanCondition) { 145 // span 146 while (i < limit) { 147 c = s.charAt(i); 148 if (c <= 0xff) { 149 if (!latin1Contains[c]) { 150 break; 151 } 152 } else if (c <= 0x7ff) { 153 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { 154 break; 155 } 156 } else if (c < 0xd800 || 157 c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { 158 int lead = c >> 12; 159 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 160 if (twoBits <= 1) { 161 // All 64 code points with the same bits 15..6 162 // are either in the set or not. 163 if (twoBits == 0) { 164 break; 165 } 166 } else { 167 // Look up the code point in its 4k block of code points. 168 if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 169 break; 170 } 171 } 172 } else { 173 // surrogate pair 174 int supplementary = Character.toCodePoint(c, c2); 175 if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 176 break; 177 } 178 ++numSupplementary; 179 ++i; 180 } 181 ++i; 182 } 183 } else { 184 // span not 185 while (i < limit) { 186 c = s.charAt(i); 187 if (c <= 0xff) { 188 if (latin1Contains[c]) { 189 break; 190 } 191 } else if (c <= 0x7ff) { 192 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { 193 break; 194 } 195 } else if (c < 0xd800 || 196 c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { 197 int lead = c >> 12; 198 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 199 if (twoBits <= 1) { 200 // All 64 code points with the same bits 15..6 201 // are either in the set or not. 202 if (twoBits != 0) { 203 break; 204 } 205 } else { 206 // Look up the code point in its 4k block of code points. 207 if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 208 break; 209 } 210 } 211 } else { 212 // surrogate pair 213 int supplementary = Character.toCodePoint(c, c2); 214 if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 215 break; 216 } 217 ++numSupplementary; 218 ++i; 219 } 220 ++i; 221 } 222 } 223 if (outCount != null) { 224 int spanLength = i - start; 225 outCount.value = spanLength - numSupplementary; // number of code points 226 } 227 return i; 228 } 229 230 /** 231 * Symmetrical with span(). 232 * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >= 233 * limit and spanCondition==0 or 1. 234 * 235 * @return The string index which starts the span (i.e. inclusive). 236 */ spanBack(CharSequence s, int limit, SpanCondition spanCondition)237 public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) { 238 char c, c2; 239 240 if (SpanCondition.NOT_CONTAINED != spanCondition) { 241 // span 242 for (;;) { 243 c = s.charAt(--limit); 244 if (c <= 0xff) { 245 if (!latin1Contains[c]) { 246 break; 247 } 248 } else if (c <= 0x7ff) { 249 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { 250 break; 251 } 252 } else if (c < 0xd800 || 253 c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { 254 int lead = c >> 12; 255 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 256 if (twoBits <= 1) { 257 // All 64 code points with the same bits 15..6 258 // are either in the set or not. 259 if (twoBits == 0) { 260 break; 261 } 262 } else { 263 // Look up the code point in its 4k block of code points. 264 if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 265 break; 266 } 267 } 268 } else { 269 // surrogate pair 270 int supplementary = Character.toCodePoint(c2, c); 271 if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 272 break; 273 } 274 --limit; 275 } 276 if (0 == limit) { 277 return 0; 278 } 279 } 280 } else { 281 // span not 282 for (;;) { 283 c = s.charAt(--limit); 284 if (c <= 0xff) { 285 if (latin1Contains[c]) { 286 break; 287 } 288 } else if (c <= 0x7ff) { 289 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { 290 break; 291 } 292 } else if (c < 0xd800 || 293 c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { 294 int lead = c >> 12; 295 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 296 if (twoBits <= 1) { 297 // All 64 code points with the same bits 15..6 298 // are either in the set or not. 299 if (twoBits != 0) { 300 break; 301 } 302 } else { 303 // Look up the code point in its 4k block of code points. 304 if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 305 break; 306 } 307 } 308 } else { 309 // surrogate pair 310 int supplementary = Character.toCodePoint(c2, c); 311 if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 312 break; 313 } 314 --limit; 315 } 316 if (0 == limit) { 317 return 0; 318 } 319 } 320 } 321 return limit + 1; 322 } 323 324 /** 325 * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800 326 */ set32x64Bits(int[] table, int start, int limit)327 private static void set32x64Bits(int[] table, int start, int limit) { 328 assert (64 == table.length); 329 int lead = start >> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits. 330 int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. 331 332 // Set one bit indicating an all-one block. 333 int bits = 1 << lead; 334 if ((start + 1) == limit) { // Single-character shortcut. 335 table[trail] |= bits; 336 return; 337 } 338 339 int limitLead = limit >> 6; 340 int limitTrail = limit & 0x3f; 341 342 if (lead == limitLead) { 343 // Partial vertical bit column. 344 while (trail < limitTrail) { 345 table[trail++] |= bits; 346 } 347 } else { 348 // Partial vertical bit column, 349 // followed by a bit rectangle, 350 // followed by another partial vertical bit column. 351 if (trail > 0) { 352 do { 353 table[trail++] |= bits; 354 } while (trail < 64); 355 ++lead; 356 } 357 if (lead < limitLead) { 358 bits = ~((1 << lead) - 1); 359 if (limitLead < 0x20) { 360 bits &= (1 << limitLead) - 1; 361 } 362 for (trail = 0; trail < 64; ++trail) { 363 table[trail] |= bits; 364 } 365 } 366 // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. 367 // In that case, bits=1<<limitLead == 1<<0 == 1 368 // (because Java << uses only the lower 5 bits of the shift operand) 369 // but the bits value is not used because trail<limitTrail is already false. 370 bits = 1 << limitLead; 371 for (trail = 0; trail < limitTrail; ++trail) { 372 table[trail] |= bits; 373 } 374 } 375 } 376 initBits()377 private void initBits() { 378 int start, limit; 379 int listIndex = 0; 380 381 // Set latin1Contains[]. 382 do { 383 start = list[listIndex++]; 384 if (listIndex < listLength) { 385 limit = list[listIndex++]; 386 } else { 387 limit = 0x110000; 388 } 389 if (start >= 0x100) { 390 break; 391 } 392 do { 393 latin1Contains[start++] = true; 394 } while (start < limit && start < 0x100); 395 } while (limit <= 0x100); 396 397 // Set table7FF[]. 398 while (start < 0x800) { 399 set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800); 400 if (limit > 0x800) { 401 start = 0x800; 402 break; 403 } 404 405 start = list[listIndex++]; 406 if (listIndex < listLength) { 407 limit = list[listIndex++]; 408 } else { 409 limit = 0x110000; 410 } 411 } 412 413 // Set bmpBlockBits[]. 414 int minStart = 0x800; 415 while (start < 0x10000) { 416 if (limit > 0x10000) { 417 limit = 0x10000; 418 } 419 420 if (start < minStart) { 421 start = minStart; 422 } 423 if (start < limit) { // Else: Another range entirely in a known mixed-value block. 424 if (0 != (start & 0x3f)) { 425 // Mixed-value block of 64 code points. 426 start >>= 6; 427 bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6); 428 start = (start + 1) << 6; // Round up to the next block boundary. 429 minStart = start; // Ignore further ranges in this block. 430 } 431 if (start < limit) { 432 if (start < (limit & ~0x3f)) { 433 // Multiple all-ones blocks of 64 code points each. 434 set32x64Bits(bmpBlockBits, start >> 6, limit >> 6); 435 } 436 437 if (0 != (limit & 0x3f)) { 438 // Mixed-value block of 64 code points. 439 limit >>= 6; 440 bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6); 441 limit = (limit + 1) << 6; // Round up to the next block boundary. 442 minStart = limit; // Ignore further ranges in this block. 443 } 444 } 445 } 446 447 if (limit == 0x10000) { 448 break; 449 } 450 451 start = list[listIndex++]; 452 if (listIndex < listLength) { 453 limit = list[listIndex++]; 454 } else { 455 limit = 0x110000; 456 } 457 } 458 } 459 460 461 /** 462 * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code 463 * points in a certain range. 464 * 465 * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and 466 * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1. 467 * 468 * @param c 469 * a character in a subrange of MIN_VALUE..MAX_VALUE 470 * @param lo 471 * The lowest index to be returned. 472 * @param hi 473 * The highest index to be returned. 474 * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i] 475 */ findCodePoint(int c, int lo, int hi)476 private int findCodePoint(int c, int lo, int hi) { 477 /* Examples: 478 findCodePoint(c) 479 set list[] c=0 1 3 4 7 8 480 === ============== =========== 481 [] [110000] 0 0 0 0 0 0 482 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 483 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 484 [:Any:] [0, 110000] 1 1 1 1 1 1 485 */ 486 487 // Return the smallest i such that c < list[i]. Assume 488 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 489 if (c < list[lo]) 490 return lo; 491 // High runner test. c is often after the last range, so an 492 // initial check for this condition pays off. 493 if (lo >= hi || c >= list[hi - 1]) 494 return hi; 495 // invariant: c >= list[lo] 496 // invariant: c < list[hi] 497 for (;;) { 498 int i = (lo + hi) >>> 1; 499 if (i == lo) { 500 break; // Found! 501 } else if (c < list[i]) { 502 hi = i; 503 } else { 504 lo = i; 505 } 506 } 507 return hi; 508 } 509 containsSlow(int c, int lo, int hi)510 private final boolean containsSlow(int c, int lo, int hi) { 511 return (0 != (findCodePoint(c, lo, hi) & 1)); 512 } 513 } 514