1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * Collation.java, ported from collation.h/.cpp 7 * 8 * C++ version created on: 2010oct27 9 * created by: Markus W. Scherer 10 */ 11 12 package com.ibm.icu.impl.coll; 13 14 /** 15 * Collation v2 basic definitions and static helper functions. 16 * 17 * Data structures except for expansion tables store 32-bit CEs which are 18 * either specials (see tags below) or are compact forms of 64-bit CEs. 19 */ 20 public final class Collation { 21 /** UChar32 U_SENTINEL. 22 * TODO: Create a common, public constant? 23 */ 24 public static final int SENTINEL_CP = -1; 25 26 // ICU4C compare() API returns enum UCollationResult values (with UCOL_ prefix). 27 // ICU4J just returns int. We use these constants for ease of porting. 28 public static final int LESS = -1; 29 public static final int EQUAL = 0; 30 public static final int GREATER = 1; 31 32 // Special sort key bytes for all levels. 33 public static final int TERMINATOR_BYTE = 0; 34 public static final int LEVEL_SEPARATOR_BYTE = 1; 35 36 /** The secondary/tertiary lower limit for tailoring before any root elements. */ 37 static final int BEFORE_WEIGHT16 = 0x100; 38 39 /** 40 * Merge-sort-key separator. 41 * Same as the unique primary and identical-level weights of U+FFFE. 42 * Must not be used as primary compression low terminator. 43 * Otherwise usable. 44 */ 45 public static final int MERGE_SEPARATOR_BYTE = 2; 46 public static final long MERGE_SEPARATOR_PRIMARY = 0x02000000; // U+FFFE 47 static final int MERGE_SEPARATOR_CE32 = 0x02000505; // U+FFFE 48 49 /** 50 * Primary compression low terminator, must be greater than MERGE_SEPARATOR_BYTE. 51 * Reserved value in primary second byte if the lead byte is compressible. 52 * Otherwise usable in all CE weight bytes. 53 */ 54 public static final int PRIMARY_COMPRESSION_LOW_BYTE = 3; 55 /** 56 * Primary compression high terminator. 57 * Reserved value in primary second byte if the lead byte is compressible. 58 * Otherwise usable in all CE weight bytes. 59 */ 60 public static final int PRIMARY_COMPRESSION_HIGH_BYTE = 0xff; 61 62 /** Default secondary/tertiary weight lead byte. */ 63 static final int COMMON_BYTE = 5; 64 public static final int COMMON_WEIGHT16 = 0x0500; 65 /** Middle 16 bits of a CE with a common secondary weight. */ 66 static final int COMMON_SECONDARY_CE = 0x05000000; 67 /** Lower 16 bits of a CE with a common tertiary weight. */ 68 static final int COMMON_TERTIARY_CE = 0x0500; 69 /** Lower 32 bits of a CE with common secondary and tertiary weights. */ 70 public static final int COMMON_SEC_AND_TER_CE = 0x05000500; 71 72 static final int SECONDARY_MASK = 0xffff0000; 73 public static final int CASE_MASK = 0xc000; 74 static final int SECONDARY_AND_CASE_MASK = SECONDARY_MASK | CASE_MASK; 75 /** Only the 2*6 bits for the pure tertiary weight. */ 76 public static final int ONLY_TERTIARY_MASK = 0x3f3f; 77 /** Only the secondary & tertiary bits; no case, no quaternary. */ 78 static final int ONLY_SEC_TER_MASK = SECONDARY_MASK | ONLY_TERTIARY_MASK; 79 /** Case bits and tertiary bits. */ 80 static final int CASE_AND_TERTIARY_MASK = CASE_MASK | ONLY_TERTIARY_MASK; 81 public static final int QUATERNARY_MASK = 0xc0; 82 /** Case bits and quaternary bits. */ 83 public static final int CASE_AND_QUATERNARY_MASK = CASE_MASK | QUATERNARY_MASK; 84 85 static final int UNASSIGNED_IMPLICIT_BYTE = 0xfe; // compressible 86 /** 87 * First unassigned: AlphabeticIndex overflow boundary. 88 * We want a 3-byte primary so that it fits into the root elements table. 89 * 90 * This 3-byte primary will not collide with 91 * any unassigned-implicit 4-byte primaries because 92 * the first few hundred Unicode code points all have real mappings. 93 */ 94 static final long FIRST_UNASSIGNED_PRIMARY = 0xfe040200L; 95 96 static final int TRAIL_WEIGHT_BYTE = 0xff; // not compressible 97 static final long FIRST_TRAILING_PRIMARY = 0xff020200L; // [first trailing] 98 public static final long MAX_PRIMARY = 0xffff0000L; // U+FFFF 99 static final int MAX_REGULAR_CE32 = 0xffff0505; // U+FFFF 100 101 // CE32 value for U+FFFD as well as illegal UTF-8 byte sequences (which behave like U+FFFD). 102 // We use the third-highest primary weight for U+FFFD (as in UCA 6.3+). 103 public static final long FFFD_PRIMARY = MAX_PRIMARY - 0x20000; 104 static final int FFFD_CE32 = MAX_REGULAR_CE32 - 0x20000; 105 106 /** 107 * A CE32 is special if its low byte is this or greater. 108 * Impossible case bits 11 mark special CE32s. 109 * This value itself is used to indicate a fallback to the base collator. 110 */ 111 static final int SPECIAL_CE32_LOW_BYTE = 0xc0; 112 static final int FALLBACK_CE32 = SPECIAL_CE32_LOW_BYTE; 113 /** 114 * Low byte of a long-primary special CE32. 115 */ 116 static final int LONG_PRIMARY_CE32_LOW_BYTE = 0xc1; // SPECIAL_CE32_LOW_BYTE | LONG_PRIMARY_TAG 117 118 static final int UNASSIGNED_CE32 = 0xffffffff; // Compute an unassigned-implicit CE. 119 120 static final int NO_CE32 = 1; 121 122 /** No CE: End of input. Only used in runtime code, not stored in data. */ 123 static final long NO_CE_PRIMARY = 1; // not a left-adjusted weight 124 static final int NO_CE_WEIGHT16 = 0x0100; // weight of LEVEL_SEPARATOR_BYTE 125 public static final long NO_CE = 0x101000100L; // NO_CE_PRIMARY, NO_CE_WEIGHT16, NO_CE_WEIGHT16 126 127 /** Sort key levels. */ 128 129 /** Unspecified level. */ 130 public static final int NO_LEVEL = 0; 131 public static final int PRIMARY_LEVEL = 1; 132 public static final int SECONDARY_LEVEL = 2; 133 public static final int CASE_LEVEL = 3; 134 public static final int TERTIARY_LEVEL = 4; 135 public static final int QUATERNARY_LEVEL = 5; 136 public static final int IDENTICAL_LEVEL = 6; 137 /** Beyond sort key bytes. */ 138 public static final int ZERO_LEVEL = 7; 139 140 /** 141 * Sort key level flags: xx_FLAG = 1 << xx_LEVEL. 142 * In Java, use enum Level with flag() getters, or use EnumSet rather than hand-made bit sets. 143 */ 144 static final int NO_LEVEL_FLAG = 1; 145 static final int PRIMARY_LEVEL_FLAG = 2; 146 static final int SECONDARY_LEVEL_FLAG = 4; 147 static final int CASE_LEVEL_FLAG = 8; 148 static final int TERTIARY_LEVEL_FLAG = 0x10; 149 static final int QUATERNARY_LEVEL_FLAG = 0x20; 150 static final int IDENTICAL_LEVEL_FLAG = 0x40; 151 static final int ZERO_LEVEL_FLAG = 0x80; 152 153 /** 154 * Special-CE32 tags, from bits 3..0 of a special 32-bit CE. 155 * Bits 31..8 are available for tag-specific data. 156 * Bits 5..4: Reserved. May be used in the future to indicate lccc!=0 and tccc!=0. 157 */ 158 159 /** 160 * Fall back to the base collator. 161 * This is the tag value in SPECIAL_CE32_LOW_BYTE and FALLBACK_CE32. 162 * Bits 31..8: Unused, 0. 163 */ 164 static final int FALLBACK_TAG = 0; 165 /** 166 * Long-primary CE with COMMON_SEC_AND_TER_CE. 167 * Bits 31..8: Three-byte primary. 168 */ 169 static final int LONG_PRIMARY_TAG = 1; 170 /** 171 * Long-secondary CE with zero primary. 172 * Bits 31..16: Secondary weight. 173 * Bits 15.. 8: Tertiary weight. 174 */ 175 static final int LONG_SECONDARY_TAG = 2; 176 /** 177 * Unused. 178 * May be used in the future for single-byte secondary CEs (SHORT_SECONDARY_TAG), 179 * storing the secondary in bits 31..24, the ccc in bits 23..16, 180 * and the tertiary in bits 15..8. 181 */ 182 static final int RESERVED_TAG_3 = 3; 183 /** 184 * Latin mini expansions of two simple CEs [pp, 05, tt] [00, ss, 05]. 185 * Bits 31..24: Single-byte primary weight pp of the first CE. 186 * Bits 23..16: Tertiary weight tt of the first CE. 187 * Bits 15.. 8: Secondary weight ss of the second CE. 188 */ 189 static final int LATIN_EXPANSION_TAG = 4; 190 /** 191 * Points to one or more simple/long-primary/long-secondary 32-bit CE32s. 192 * Bits 31..13: Index into int table. 193 * Bits 12.. 8: Length=1..31. 194 */ 195 static final int EXPANSION32_TAG = 5; 196 /** 197 * Points to one or more 64-bit CEs. 198 * Bits 31..13: Index into CE table. 199 * Bits 12.. 8: Length=1..31. 200 */ 201 static final int EXPANSION_TAG = 6; 202 /** 203 * Builder data, used only in the CollationDataBuilder, not in runtime data. 204 * 205 * If bit 8 is 0: Builder context, points to a list of context-sensitive mappings. 206 * Bits 31..13: Index to the builder's list of ConditionalCE32 for this character. 207 * Bits 12.. 9: Unused, 0. 208 * 209 * If bit 8 is 1 (IS_BUILDER_JAMO_CE32): Builder-only jamoCE32 value. 210 * The builder fetches the Jamo CE32 from the trie. 211 * Bits 31..13: Jamo code point. 212 * Bits 12.. 9: Unused, 0. 213 */ 214 static final int BUILDER_DATA_TAG = 7; 215 /** 216 * Points to prefix trie. 217 * Bits 31..13: Index into prefix/contraction data. 218 * Bits 12.. 8: Unused, 0. 219 */ 220 static final int PREFIX_TAG = 8; 221 /** 222 * Points to contraction data. 223 * Bits 31..13: Index into prefix/contraction data. 224 * Bits 12..11: Unused, 0. 225 * Bit 10: CONTRACT_TRAILING_CCC flag. 226 * Bit 9: CONTRACT_NEXT_CCC flag. 227 * Bit 8: CONTRACT_SINGLE_CP_NO_MATCH flag. 228 */ 229 static final int CONTRACTION_TAG = 9; 230 /** 231 * Decimal digit. 232 * Bits 31..13: Index into int table for non-numeric-collation CE32. 233 * Bit 12: Unused, 0. 234 * Bits 11.. 8: Digit value 0..9. 235 */ 236 static final int DIGIT_TAG = 10; 237 /** 238 * Tag for U+0000, for moving the NUL-termination handling 239 * from the regular fastpath into specials-handling code. 240 * Bits 31..8: Unused, 0. 241 */ 242 static final int U0000_TAG = 11; 243 /** 244 * Tag for a Hangul syllable. 245 * Bits 31..9: Unused, 0. 246 * Bit 8: HANGUL_NO_SPECIAL_JAMO flag. 247 */ 248 static final int HANGUL_TAG = 12; 249 /** 250 * Tag for a lead surrogate code unit. 251 * Optional optimization for UTF-16 string processing. 252 * Bits 31..10: Unused, 0. 253 * 9.. 8: =0: All associated supplementary code points are unassigned-implict. 254 * =1: All associated supplementary code points fall back to the base data. 255 * else: (Normally 2) Look up the data for the supplementary code point. 256 */ 257 static final int LEAD_SURROGATE_TAG = 13; 258 /** 259 * Tag for CEs with primary weights in code point order. 260 * Bits 31..13: Index into CE table, for one data "CE". 261 * Bits 12.. 8: Unused, 0. 262 * 263 * This data "CE" has the following bit fields: 264 * Bits 63..32: Three-byte primary pppppp00. 265 * 31.. 8: Start/base code point of the in-order range. 266 * 7: Flag isCompressible primary. 267 * 6.. 0: Per-code point primary-weight increment. 268 */ 269 static final int OFFSET_TAG = 14; 270 /** 271 * Implicit CE tag. Compute an unassigned-implicit CE. 272 * All bits are set (UNASSIGNED_CE32=0xffffffff). 273 */ 274 static final int IMPLICIT_TAG = 15; 275 isAssignedCE32(int ce32)276 static boolean isAssignedCE32(int ce32) { 277 return ce32 != FALLBACK_CE32 && ce32 != UNASSIGNED_CE32; 278 } 279 280 /** 281 * We limit the number of CEs in an expansion 282 * so that we can use a small number of length bits in the data structure, 283 * and so that an implementation can copy CEs at runtime without growing a destination buffer. 284 */ 285 static final int MAX_EXPANSION_LENGTH = 31; 286 static final int MAX_INDEX = 0x7ffff; 287 288 /** 289 * Set if there is no match for the single (no-suffix) character itself. 290 * This is only possible if there is a prefix. 291 * In this case, discontiguous contraction matching cannot add combining marks 292 * starting from an empty suffix. 293 * The default CE32 is used anyway if there is no suffix match. 294 */ 295 static final int CONTRACT_SINGLE_CP_NO_MATCH = 0x100; 296 /** Set if the first character of every contraction suffix has lccc!=0. */ 297 static final int CONTRACT_NEXT_CCC = 0x200; 298 /** Set if any contraction suffix ends with lccc!=0. */ 299 static final int CONTRACT_TRAILING_CCC = 0x400; 300 301 /** For HANGUL_TAG: None of its Jamo CE32s isSpecialCE32(). */ 302 static final int HANGUL_NO_SPECIAL_JAMO = 0x100; 303 304 static final int LEAD_ALL_UNASSIGNED = 0; 305 static final int LEAD_ALL_FALLBACK = 0x100; 306 static final int LEAD_MIXED = 0x200; 307 static final int LEAD_TYPE_MASK = 0x300; 308 makeLongPrimaryCE32(long p)309 static int makeLongPrimaryCE32(long p) { return (int)(p | LONG_PRIMARY_CE32_LOW_BYTE); } 310 311 /** Turns the long-primary CE32 into a primary weight pppppp00. */ primaryFromLongPrimaryCE32(int ce32)312 static long primaryFromLongPrimaryCE32(int ce32) { 313 return (long)ce32 & 0xffffff00L; 314 } ceFromLongPrimaryCE32(int ce32)315 static long ceFromLongPrimaryCE32(int ce32) { 316 return ((long)(ce32 & 0xffffff00) << 32) | COMMON_SEC_AND_TER_CE; 317 } 318 makeLongSecondaryCE32(int lower32)319 static int makeLongSecondaryCE32(int lower32) { 320 return lower32 | SPECIAL_CE32_LOW_BYTE | LONG_SECONDARY_TAG; 321 } ceFromLongSecondaryCE32(int ce32)322 static long ceFromLongSecondaryCE32(int ce32) { 323 return (long)ce32 & 0xffffff00L; 324 } 325 326 /** Makes a special CE32 with tag, index and length. */ makeCE32FromTagIndexAndLength(int tag, int index, int length)327 static int makeCE32FromTagIndexAndLength(int tag, int index, int length) { 328 return (index << 13) | (length << 8) | SPECIAL_CE32_LOW_BYTE | tag; 329 } 330 /** Makes a special CE32 with only tag and index. */ makeCE32FromTagAndIndex(int tag, int index)331 static int makeCE32FromTagAndIndex(int tag, int index) { 332 return (index << 13) | SPECIAL_CE32_LOW_BYTE | tag; 333 } 334 isSpecialCE32(int ce32)335 static boolean isSpecialCE32(int ce32) { 336 return (ce32 & 0xff) >= SPECIAL_CE32_LOW_BYTE; 337 } 338 tagFromCE32(int ce32)339 static int tagFromCE32(int ce32) { 340 return ce32 & 0xf; 341 } 342 hasCE32Tag(int ce32, int tag)343 static boolean hasCE32Tag(int ce32, int tag) { 344 return isSpecialCE32(ce32) && tagFromCE32(ce32) == tag; 345 } 346 isLongPrimaryCE32(int ce32)347 static boolean isLongPrimaryCE32(int ce32) { 348 return hasCE32Tag(ce32, LONG_PRIMARY_TAG); 349 } 350 isSimpleOrLongCE32(int ce32)351 static boolean isSimpleOrLongCE32(int ce32) { 352 return !isSpecialCE32(ce32) || 353 tagFromCE32(ce32) == LONG_PRIMARY_TAG || 354 tagFromCE32(ce32) == LONG_SECONDARY_TAG; 355 } 356 357 /** 358 * @return true if the ce32 yields one or more CEs without further data lookups 359 */ isSelfContainedCE32(int ce32)360 static boolean isSelfContainedCE32(int ce32) { 361 return !isSpecialCE32(ce32) || 362 tagFromCE32(ce32) == LONG_PRIMARY_TAG || 363 tagFromCE32(ce32) == LONG_SECONDARY_TAG || 364 tagFromCE32(ce32) == LATIN_EXPANSION_TAG; 365 } 366 isPrefixCE32(int ce32)367 static boolean isPrefixCE32(int ce32) { 368 return hasCE32Tag(ce32, PREFIX_TAG); 369 } 370 isContractionCE32(int ce32)371 static boolean isContractionCE32(int ce32) { 372 return hasCE32Tag(ce32, CONTRACTION_TAG); 373 } 374 ce32HasContext(int ce32)375 static boolean ce32HasContext(int ce32) { 376 return isSpecialCE32(ce32) && 377 (tagFromCE32(ce32) == PREFIX_TAG || 378 tagFromCE32(ce32) == CONTRACTION_TAG); 379 } 380 381 /** 382 * Get the first of the two Latin-expansion CEs encoded in ce32. 383 * @see LATIN_EXPANSION_TAG 384 */ latinCE0FromCE32(int ce32)385 static long latinCE0FromCE32(int ce32) { 386 return ((long)(ce32 & 0xff000000) << 32) | COMMON_SECONDARY_CE | ((ce32 & 0xff0000) >> 8); 387 } 388 389 /** 390 * Get the second of the two Latin-expansion CEs encoded in ce32. 391 * @see LATIN_EXPANSION_TAG 392 */ latinCE1FromCE32(int ce32)393 static long latinCE1FromCE32(int ce32) { 394 return (((long)ce32 & 0xff00) << 16) | COMMON_TERTIARY_CE; 395 } 396 397 /** 398 * Returns the data index from a special CE32. 399 */ indexFromCE32(int ce32)400 static int indexFromCE32(int ce32) { 401 return ce32 >>> 13; 402 } 403 404 /** 405 * Returns the data length from a ce32. 406 */ lengthFromCE32(int ce32)407 static int lengthFromCE32(int ce32) { 408 return (ce32 >> 8) & 31; 409 } 410 411 /** 412 * Returns the digit value from a DIGIT_TAG ce32. 413 */ digitFromCE32(int ce32)414 static char digitFromCE32(int ce32) { 415 return (char)((ce32 >> 8) & 0xf); 416 } 417 418 /** Returns a 64-bit CE from a simple CE32 (not special). */ ceFromSimpleCE32(int ce32)419 static long ceFromSimpleCE32(int ce32) { 420 // normal form ppppsstt -> pppp0000ss00tt00 421 assert (ce32 & 0xff) < SPECIAL_CE32_LOW_BYTE; 422 return ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | ((ce32 & 0xff) << 8); 423 } 424 425 /** Returns a 64-bit CE from a simple/long-primary/long-secondary CE32. */ 426 static long ceFromCE32(int ce32) { 427 int tertiary = ce32 & 0xff; 428 if(tertiary < SPECIAL_CE32_LOW_BYTE) { 429 // normal form ppppsstt -> pppp0000ss00tt00 430 return ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (tertiary << 8); 431 } else { 432 ce32 -= tertiary; 433 if((tertiary & 0xf) == LONG_PRIMARY_TAG) { 434 // long-primary form ppppppC1 -> pppppp00050000500 435 return ((long)ce32 << 32) | COMMON_SEC_AND_TER_CE; 436 } else { 437 // long-secondary form ssssttC2 -> 00000000sssstt00 438 assert (tertiary & 0xf) == LONG_SECONDARY_TAG; 439 return ce32 & 0xffffffffL; 440 } 441 } 442 } 443 444 /** Creates a CE from a primary weight. */ 445 public static long makeCE(long p) { 446 return (p << 32) | COMMON_SEC_AND_TER_CE; 447 } 448 /** 449 * Creates a CE from a primary weight, 450 * 16-bit secondary/tertiary weights, and a 2-bit quaternary. 451 */ 452 static long makeCE(long p, int s, int t, int q) { 453 return (p << 32) | ((long)s << 16) | t | (q << 6); 454 } 455 456 /** 457 * Increments a 2-byte primary by a code point offset. 458 */ 459 public static long incTwoBytePrimaryByOffset(long basePrimary, boolean isCompressible, 460 int offset) { 461 // Extract the second byte, minus the minimum byte value, 462 // plus the offset, modulo the number of usable byte values, plus the minimum. 463 // Reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary. 464 long primary; 465 if(isCompressible) { 466 offset += ((int)(basePrimary >> 16) & 0xff) - 4; 467 primary = ((offset % 251) + 4) << 16; 468 offset /= 251; 469 } else { 470 offset += ((int)(basePrimary >> 16) & 0xff) - 2; 471 primary = ((offset % 254) + 2) << 16; 472 offset /= 254; 473 } 474 // First byte, assume no further overflow. 475 return primary | ((basePrimary & 0xff000000L) + ((long)offset << 24)); 476 } 477 478 /** 479 * Increments a 3-byte primary by a code point offset. 480 */ 481 public static long incThreeBytePrimaryByOffset(long basePrimary, boolean isCompressible, 482 int offset) { 483 // Extract the third byte, minus the minimum byte value, 484 // plus the offset, modulo the number of usable byte values, plus the minimum. 485 offset += ((int)(basePrimary >> 8) & 0xff) - 2; 486 long primary = ((offset % 254) + 2) << 8; 487 offset /= 254; 488 // Same with the second byte, 489 // but reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary. 490 if(isCompressible) { 491 offset += ((int)(basePrimary >> 16) & 0xff) - 4; 492 primary |= ((offset % 251) + 4) << 16; 493 offset /= 251; 494 } else { 495 offset += ((int)(basePrimary >> 16) & 0xff) - 2; 496 primary |= ((offset % 254) + 2) << 16; 497 offset /= 254; 498 } 499 // First byte, assume no further overflow. 500 return primary | ((basePrimary & 0xff000000L) + ((long)offset << 24)); 501 } 502 503 /** 504 * Decrements a 2-byte primary by one range step (1..0x7f). 505 */ 506 static long decTwoBytePrimaryByOneStep(long basePrimary, boolean isCompressible, int step) { 507 // Extract the second byte, minus the minimum byte value, 508 // minus the step, modulo the number of usable byte values, plus the minimum. 509 // Reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary. 510 // Assume no further underflow for the first byte. 511 assert(0 < step && step <= 0x7f); 512 int byte2 = ((int)(basePrimary >> 16) & 0xff) - step; 513 if(isCompressible) { 514 if(byte2 < 4) { 515 byte2 += 251; 516 basePrimary -= 0x1000000; 517 } 518 } else { 519 if(byte2 < 2) { 520 byte2 += 254; 521 basePrimary -= 0x1000000; 522 } 523 } 524 return (basePrimary & 0xff000000L) | (byte2 << 16); 525 } 526 527 /** 528 * Decrements a 3-byte primary by one range step (1..0x7f). 529 */ 530 static long decThreeBytePrimaryByOneStep(long basePrimary, boolean isCompressible, int step) { 531 // Extract the third byte, minus the minimum byte value, 532 // minus the step, modulo the number of usable byte values, plus the minimum. 533 assert(0 < step && step <= 0x7f); 534 int byte3 = ((int)(basePrimary >> 8) & 0xff) - step; 535 if(byte3 >= 2) { 536 return (basePrimary & 0xffff0000L) | (byte3 << 8); 537 } 538 byte3 += 254; 539 // Same with the second byte, 540 // but reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary. 541 int byte2 = ((int)(basePrimary >> 16) & 0xff) - 1; 542 if(isCompressible) { 543 if(byte2 < 4) { 544 byte2 = 0xfe; 545 basePrimary -= 0x1000000; 546 } 547 } else { 548 if(byte2 < 2) { 549 byte2 = 0xff; 550 basePrimary -= 0x1000000; 551 } 552 } 553 // First byte, assume no further underflow. 554 return (basePrimary & 0xff000000L) | (byte2 << 16) | (byte3 << 8); 555 } 556 557 /** 558 * Computes a 3-byte primary for c's OFFSET_TAG data "CE". 559 */ 560 static long getThreeBytePrimaryForOffsetData(int c, long dataCE) { 561 long p = dataCE >>> 32; // three-byte primary pppppp00 562 int lower32 = (int)dataCE; // base code point b & step s: bbbbbbss (bit 7: isCompressible) 563 int offset = (c - (lower32 >> 8)) * (lower32 & 0x7f); // delta * increment 564 boolean isCompressible = (lower32 & 0x80) != 0; 565 return Collation.incThreeBytePrimaryByOffset(p, isCompressible, offset); 566 } 567 568 /** 569 * Returns the unassigned-character implicit primary weight for any valid code point c. 570 */ 571 static long unassignedPrimaryFromCodePoint(int c) { 572 // Create a gap before U+0000. Use c=-1 for [first unassigned]. 573 ++c; 574 // Fourth byte: 18 values, every 14th byte value (gap of 13). 575 long primary = 2 + (c % 18) * 14; 576 c /= 18; 577 // Third byte: 254 values. 578 primary |= (2 + (c % 254)) << 8; 579 c /= 254; 580 // Second byte: 251 values 04..FE excluding the primary compression bytes. 581 primary |= (4 + (c % 251)) << 16; 582 // One lead byte covers all code points (c < 0x1182B4 = 1*251*254*18). 583 return primary | ((long)UNASSIGNED_IMPLICIT_BYTE << 24); 584 } 585 586 static long unassignedCEFromCodePoint(int c) { 587 return makeCE(unassignedPrimaryFromCodePoint(c)); 588 } 589 590 // private Collation() // No instantiation. 591 } 592