1 /** 2 ******************************************************************************* 3 * Copyright (C) 1996-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 package com.ibm.icu.text; 9 10 import com.ibm.icu.impl.UCharacterProperty; 11 12 /** 13 * <p> 14 * Standalone utility class providing UTF16 character conversions and indexing conversions. 15 * </p> 16 * <p> 17 * Code that uses strings alone rarely need modification. By design, UTF-16 does not allow overlap, 18 * so searching for strings is a safe operation. Similarly, concatenation is always safe. 19 * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the 20 * values for start and end are on those boundaries, since they arose from operations like 21 * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>. 22 * </p> 23 * <strong>Examples:</strong> 24 * <p> 25 * The following examples illustrate use of some of these methods. 
 *
 * <pre>
 * // iteration forwards: Original
 * for (int i = 0; i < s.length(); ++i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration forwards: Changes for UTF-32
 * int ch;
 * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Original
 * for (int i = s.length() - 1; i >= 0; --i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Changes for UTF-32
 * int ch;
 * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 * </pre>
 *
 * <strong>Notes:</strong>
 * <ul>
 * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
 * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
 * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
 * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
 * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
 * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
 * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
 * </li>
 * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
 * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
 * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
 * check for validity if desired.
</li> 70 * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then 71 * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It 72 * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4, 73 * 5.5). </li> 74 * <li> <strong>Optimization:</strong> The method implementations may need optimization if the 75 * compiler doesn't fold static final methods. Since surrogate pairs will form an exceeding small 76 * percentage of all the text in the world, the singleton case should always be optimized for. </li> 77 * </ul> 78 * 79 * @author Mark Davis, with help from Markus Scherer 80 * @stable ICU 2.1 81 */ 82 83 public final class UTF16 { 84 // public variables --------------------------------------------------- 85 86 /** 87 * Value returned in <code><a href="#bounds(java.lang.String, int)"> 88 * bounds()</a></code>. 89 * These values are chosen specifically so that it actually represents the position of the 90 * character [offset16 - (value >> 2), offset16 + (value & 3)] 91 * 92 * @stable ICU 2.1 93 */ 94 public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2, 95 TRAIL_SURROGATE_BOUNDARY = 5; 96 97 /** 98 * The lowest Unicode code point value. 99 * 100 * @stable ICU 2.1 101 */ 102 public static final int CODEPOINT_MIN_VALUE = 0; 103 104 /** 105 * The highest Unicode code point value (scalar value) according to the Unicode Standard. 
106 * 107 * @stable ICU 2.1 108 */ 109 public static final int CODEPOINT_MAX_VALUE = 0x10ffff; 110 111 /** 112 * The minimum value for Supplementary code points 113 * 114 * @stable ICU 2.1 115 */ 116 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; 117 118 /** 119 * Lead surrogate minimum value 120 * 121 * @stable ICU 2.1 122 */ 123 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; 124 125 /** 126 * Trail surrogate minimum value 127 * 128 * @stable ICU 2.1 129 */ 130 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; 131 132 /** 133 * Lead surrogate maximum value 134 * 135 * @stable ICU 2.1 136 */ 137 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; 138 139 /** 140 * Trail surrogate maximum value 141 * 142 * @stable ICU 2.1 143 */ 144 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; 145 146 /** 147 * Surrogate minimum value 148 * 149 * @stable ICU 2.1 150 */ 151 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE; 152 153 /** 154 * Maximum surrogate value 155 * 156 * @stable ICU 2.1 157 */ 158 public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE; 159 160 /** 161 * Lead surrogate bitmask 162 */ 163 private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00; 164 165 /** 166 * Trail surrogate bitmask 167 */ 168 private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00; 169 170 /** 171 * Surrogate bitmask 172 */ 173 private static final int SURROGATE_BITMASK = 0xFFFFF800; 174 175 /** 176 * Lead surrogate bits 177 */ 178 private static final int LEAD_SURROGATE_BITS = 0xD800; 179 180 /** 181 * Trail surrogate bits 182 */ 183 private static final int TRAIL_SURROGATE_BITS = 0xDC00; 184 185 /** 186 * Surrogate bits 187 */ 188 private static final int SURROGATE_BITS = 0xD800; 189 190 // constructor -------------------------------------------------------- 191 192 // /CLOVER:OFF 193 /** 194 * Prevent instance from being created. 
195 */ UTF16()196 private UTF16() { 197 } 198 199 // /CLOVER:ON 200 // public method ------------------------------------------------------ 201 202 /** 203 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 204 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 205 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)"> 206 * UCharacter.isLegal()</a></code> 207 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 208 * character will be returned. If a complete supplementary character is not found the incomplete 209 * character will be returned 210 * 211 * @param source Array of UTF-16 chars 212 * @param offset16 UTF-16 offset to the start of the character. 213 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 214 * of that codepoint are the same as in <code>bounds32()</code>. 215 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 216 * @stable ICU 2.1 217 */ charAt(String source, int offset16)218 public static int charAt(String source, int offset16) { 219 char single = source.charAt(offset16); 220 if (single < LEAD_SURROGATE_MIN_VALUE) { 221 return single; 222 } 223 return _charAt(source, offset16, single); 224 } 225 _charAt(String source, int offset16, char single)226 private static int _charAt(String source, int offset16, char single) { 227 if (single > TRAIL_SURROGATE_MAX_VALUE) { 228 return single; 229 } 230 231 // Convert the UTF-16 surrogate pair if necessary. 232 // For simplicity in usage, and because the frequency of pairs is 233 // low, look both directions. 
234 235 if (single <= LEAD_SURROGATE_MAX_VALUE) { 236 ++offset16; 237 if (source.length() != offset16) { 238 char trail = source.charAt(offset16); 239 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) { 240 return UCharacterProperty.getRawSupplementary(single, trail); 241 } 242 } 243 } else { 244 --offset16; 245 if (offset16 >= 0) { 246 // single is a trail surrogate so 247 char lead = source.charAt(offset16); 248 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) { 249 return UCharacterProperty.getRawSupplementary(lead, single); 250 } 251 } 252 } 253 return single; // return unmatched surrogate 254 } 255 256 /** 257 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 258 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 259 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)"> 260 * UCharacter.isLegal()</a></code> 261 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 262 * character will be returned. If a complete supplementary character is not found the incomplete 263 * character will be returned 264 * 265 * @param source Array of UTF-16 chars 266 * @param offset16 UTF-16 offset to the start of the character. 267 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 268 * of that codepoint are the same as in <code>bounds32()</code>. 269 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 
270 * @stable ICU 2.1 271 */ charAt(CharSequence source, int offset16)272 public static int charAt(CharSequence source, int offset16) { 273 char single = source.charAt(offset16); 274 if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) { 275 return single; 276 } 277 return _charAt(source, offset16, single); 278 } 279 _charAt(CharSequence source, int offset16, char single)280 private static int _charAt(CharSequence source, int offset16, char single) { 281 if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) { 282 return single; 283 } 284 285 // Convert the UTF-16 surrogate pair if necessary. 286 // For simplicity in usage, and because the frequency of pairs is 287 // low, look both directions. 288 289 if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 290 ++offset16; 291 if (source.length() != offset16) { 292 char trail = source.charAt(offset16); 293 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE 294 && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { 295 return UCharacterProperty.getRawSupplementary(single, trail); 296 } 297 } 298 } else { 299 --offset16; 300 if (offset16 >= 0) { 301 // single is a trail surrogate so 302 char lead = source.charAt(offset16); 303 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE 304 && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 305 return UCharacterProperty.getRawSupplementary(lead, single); 306 } 307 } 308 } 309 return single; // return unmatched surrogate 310 } 311 312 /** 313 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 314 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 315 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 316 * </a></code> 317 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 318 * character will be returned. 
If a complete supplementary character is not found the incomplete 319 * character will be returned 320 * 321 * @param source UTF-16 chars string buffer 322 * @param offset16 UTF-16 offset to the start of the character. 323 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 324 * of that codepoint are the same as in <code>bounds32()</code>. 325 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 326 * @stable ICU 2.1 327 */ charAt(StringBuffer source, int offset16)328 public static int charAt(StringBuffer source, int offset16) { 329 if (offset16 < 0 || offset16 >= source.length()) { 330 throw new StringIndexOutOfBoundsException(offset16); 331 } 332 333 char single = source.charAt(offset16); 334 if (!isSurrogate(single)) { 335 return single; 336 } 337 338 // Convert the UTF-16 surrogate pair if necessary. 339 // For simplicity in usage, and because the frequency of pairs is 340 // low, look both directions. 341 342 if (single <= LEAD_SURROGATE_MAX_VALUE) { 343 ++offset16; 344 if (source.length() != offset16) { 345 char trail = source.charAt(offset16); 346 if (isTrailSurrogate(trail)) 347 return UCharacterProperty.getRawSupplementary(single, trail); 348 } 349 } else { 350 --offset16; 351 if (offset16 >= 0) { 352 // single is a trail surrogate so 353 char lead = source.charAt(offset16); 354 if (isLeadSurrogate(lead)) { 355 return UCharacterProperty.getRawSupplementary(lead, single); 356 } 357 } 358 } 359 return single; // return unmatched surrogate 360 } 361 362 /** 363 * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards 364 * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 365 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 366 * </a></code> 367 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 368 * character will be returned. 
If a complete supplementary character is not found the incomplete 369 * character will be returned 370 * 371 * @param source Array of UTF-16 chars 372 * @param start Offset to substring in the source array for analyzing 373 * @param limit Offset to substring in the source array for analyzing 374 * @param offset16 UTF-16 offset relative to start 375 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 376 * of that codepoint are the same as in <code>bounds32()</code>. 377 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit. 378 * @stable ICU 2.1 379 */ charAt(char source[], int start, int limit, int offset16)380 public static int charAt(char source[], int start, int limit, int offset16) { 381 offset16 += start; 382 if (offset16 < start || offset16 >= limit) { 383 throw new ArrayIndexOutOfBoundsException(offset16); 384 } 385 386 char single = source[offset16]; 387 if (!isSurrogate(single)) { 388 return single; 389 } 390 391 // Convert the UTF-16 surrogate pair if necessary. 392 // For simplicity in usage, and because the frequency of pairs is 393 // low, look both directions. 394 if (single <= LEAD_SURROGATE_MAX_VALUE) { 395 offset16++; 396 if (offset16 >= limit) { 397 return single; 398 } 399 char trail = source[offset16]; 400 if (isTrailSurrogate(trail)) { 401 return UCharacterProperty.getRawSupplementary(single, trail); 402 } 403 } else { // isTrailSurrogate(single), so 404 if (offset16 == start) { 405 return single; 406 } 407 offset16--; 408 char lead = source[offset16]; 409 if (isLeadSurrogate(lead)) 410 return UCharacterProperty.getRawSupplementary(lead, single); 411 } 412 return single; // return unmatched surrogate 413 } 414 415 /** 416 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 417 * <code>UTF16.getCharCount()</code>, as well as random access. 
If a validity check is 418 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 419 * </a></code> 420 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 421 * character will be returned. If a complete supplementary character is not found the incomplete 422 * character will be returned 423 * 424 * @param source UTF-16 chars string buffer 425 * @param offset16 UTF-16 offset to the start of the character. 426 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 427 * of that codepoint are the same as in <code>bounds32()</code>. 428 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 429 * @stable ICU 2.1 430 */ charAt(Replaceable source, int offset16)431 public static int charAt(Replaceable source, int offset16) { 432 if (offset16 < 0 || offset16 >= source.length()) { 433 throw new StringIndexOutOfBoundsException(offset16); 434 } 435 436 char single = source.charAt(offset16); 437 if (!isSurrogate(single)) { 438 return single; 439 } 440 441 // Convert the UTF-16 surrogate pair if necessary. 442 // For simplicity in usage, and because the frequency of pairs is 443 // low, look both directions. 444 445 if (single <= LEAD_SURROGATE_MAX_VALUE) { 446 ++offset16; 447 if (source.length() != offset16) { 448 char trail = source.charAt(offset16); 449 if (isTrailSurrogate(trail)) 450 return UCharacterProperty.getRawSupplementary(single, trail); 451 } 452 } else { 453 --offset16; 454 if (offset16 >= 0) { 455 // single is a trail surrogate so 456 char lead = source.charAt(offset16); 457 if (isLeadSurrogate(lead)) { 458 return UCharacterProperty.getRawSupplementary(lead, single); 459 } 460 } 461 } 462 return single; // return unmatched surrogate 463 } 464 465 /** 466 * Determines how many chars this char32 requires. 
If a validity check is required, use <code> 467 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 468 * on char32 before calling. 469 * 470 * @param char32 The input codepoint. 471 * @return 2 if is in supplementary space, otherwise 1. 472 * @stable ICU 2.1 473 */ getCharCount(int char32)474 public static int getCharCount(int char32) { 475 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 476 return 1; 477 } 478 return 2; 479 } 480 481 /** 482 * Returns the type of the boundaries around the char at offset16. Used for random access. 483 * 484 * @param source Text to analyse 485 * @param offset16 UTF-16 offset 486 * @return 487 * <ul> 488 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1] 489 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 490 * are [offset16, offset16 + 2] 491 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 492 * bounds are [offset16 - 1, offset16 + 1] 493 * </ul> 494 * For bit-twiddlers, the return values for these are chosen so that the boundaries 495 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. 496 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 497 * @stable ICU 2.1 498 */ bounds(String source, int offset16)499 public static int bounds(String source, int offset16) { 500 char ch = source.charAt(offset16); 501 if (isSurrogate(ch)) { 502 if (isLeadSurrogate(ch)) { 503 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { 504 return LEAD_SURROGATE_BOUNDARY; 505 } 506 } else { 507 // isTrailSurrogate(ch), so 508 --offset16; 509 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { 510 return TRAIL_SURROGATE_BOUNDARY; 511 } 512 } 513 } 514 return SINGLE_CHAR_BOUNDARY; 515 } 516 517 /** 518 * Returns the type of the boundaries around the char at offset16. Used for random access. 
519 * 520 * @param source String buffer to analyse 521 * @param offset16 UTF16 offset 522 * @return 523 * <ul> 524 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1] 525 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 526 * are [offset16, offset16 + 2] 527 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 528 * bounds are [offset16 - 1, offset16 + 1] 529 * </ul> 530 * For bit-twiddlers, the return values for these are chosen so that the boundaries 531 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. 532 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 533 * @stable ICU 2.1 534 */ bounds(StringBuffer source, int offset16)535 public static int bounds(StringBuffer source, int offset16) { 536 char ch = source.charAt(offset16); 537 if (isSurrogate(ch)) { 538 if (isLeadSurrogate(ch)) { 539 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { 540 return LEAD_SURROGATE_BOUNDARY; 541 } 542 } else { 543 // isTrailSurrogate(ch), so 544 --offset16; 545 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { 546 return TRAIL_SURROGATE_BOUNDARY; 547 } 548 } 549 } 550 return SINGLE_CHAR_BOUNDARY; 551 } 552 553 /** 554 * Returns the type of the boundaries around the char at offset16. Used for random access. Note 555 * that the boundaries are determined with respect to the subarray, hence the char array 556 * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1. 
557 * 558 * @param source Char array to analyse 559 * @param start Offset to substring in the source array for analyzing 560 * @param limit Offset to substring in the source array for analyzing 561 * @param offset16 UTF16 offset relative to start 562 * @return 563 * <ul> 564 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are 565 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 566 * are [offset16, offset16 + 2] 567 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 568 * bounds are [offset16 - 1, offset16 + 1] 569 * </ul> 570 * For bit-twiddlers, the boundary values for these are chosen so that the boundaries 571 * can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)]. 572 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. 573 * @stable ICU 2.1 574 */ bounds(char source[], int start, int limit, int offset16)575 public static int bounds(char source[], int start, int limit, int offset16) { 576 offset16 += start; 577 if (offset16 < start || offset16 >= limit) { 578 throw new ArrayIndexOutOfBoundsException(offset16); 579 } 580 char ch = source[offset16]; 581 if (isSurrogate(ch)) { 582 if (isLeadSurrogate(ch)) { 583 ++offset16; 584 if (offset16 < limit && isTrailSurrogate(source[offset16])) { 585 return LEAD_SURROGATE_BOUNDARY; 586 } 587 } else { // isTrailSurrogate(ch), so 588 --offset16; 589 if (offset16 >= start && isLeadSurrogate(source[offset16])) { 590 return TRAIL_SURROGATE_BOUNDARY; 591 } 592 } 593 } 594 return SINGLE_CHAR_BOUNDARY; 595 } 596 597 /** 598 * Determines whether the code value is a surrogate. 599 * 600 * @param char16 The input character. 601 * @return true If the input character is a surrogate. 
602 * @stable ICU 2.1 603 */ isSurrogate(char char16)604 public static boolean isSurrogate(char char16) { 605 return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS; 606 } 607 608 /** 609 * Determines whether the character is a trail surrogate. 610 * 611 * @param char16 The input character. 612 * @return true If the input character is a trail surrogate. 613 * @stable ICU 2.1 614 */ isTrailSurrogate(char char16)615 public static boolean isTrailSurrogate(char char16) { 616 return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS; 617 } 618 619 /** 620 * Determines whether the character is a lead surrogate. 621 * 622 * @param char16 The input character. 623 * @return true If the input character is a lead surrogate 624 * @stable ICU 2.1 625 */ isLeadSurrogate(char char16)626 public static boolean isLeadSurrogate(char char16) { 627 return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; 628 } 629 630 /** 631 * Returns the lead surrogate. If a validity check is required, use 632 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 633 * before calling. 634 * 635 * @param char32 The input character. 636 * @return lead surrogate if the getCharCount(ch) is 2; <br> 637 * and 0 otherwise (note: 0 is not a valid lead surrogate). 638 * @stable ICU 2.1 639 */ getLeadSurrogate(int char32)640 public static char getLeadSurrogate(int char32) { 641 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 642 return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_)); 643 } 644 return 0; 645 } 646 647 /** 648 * Returns the trail surrogate. If a validity check is required, use 649 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 650 * before calling. 651 * 652 * @param char32 The input character. 
653 * @return the trail surrogate if the getCharCount(ch) is 2; <br> 654 * otherwise the character itself 655 * @stable ICU 2.1 656 */ getTrailSurrogate(int char32)657 public static char getTrailSurrogate(int char32) { 658 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 659 return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_)); 660 } 661 return (char) char32; 662 } 663 664 /** 665 * Convenience method corresponding to String.valueOf(char). Returns a one or two char string 666 * containing the UTF-32 value in UTF16 format. If a validity check is required, use <a 667 * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 before calling. 668 * 669 * @param char32 The input character. 670 * @return string value of char32 in UTF16 format 671 * @exception IllegalArgumentException Thrown if char32 is a invalid codepoint. 672 * @stable ICU 2.1 673 */ valueOf(int char32)674 public static String valueOf(int char32) { 675 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 676 throw new IllegalArgumentException("Illegal codepoint"); 677 } 678 return toString(char32); 679 } 680 681 /** 682 * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or 683 * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate 684 * character, the whole supplementary codepoint will be returned. If a validity check is 685 * required, use <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the 686 * codepoint at offset16 before calling. The result returned will be a newly created String 687 * obtained by calling source.substring(..) with the appropriate indexes. 688 * 689 * @param source The input string. 
690 * @param offset16 The UTF16 index to the codepoint in source 691 * @return string value of char32 in UTF16 format 692 * @stable ICU 2.1 693 */ valueOf(String source, int offset16)694 public static String valueOf(String source, int offset16) { 695 switch (bounds(source, offset16)) { 696 case LEAD_SURROGATE_BOUNDARY: 697 return source.substring(offset16, offset16 + 2); 698 case TRAIL_SURROGATE_BOUNDARY: 699 return source.substring(offset16 - 1, offset16 + 1); 700 default: 701 return source.substring(offset16, offset16 + 1); 702 } 703 } 704 705 /** 706 * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a 707 * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a 708 * surrogate character, the whole supplementary codepoint will be returned. If a validity check 709 * is required, use <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on 710 * the codepoint at offset16 before calling. The result returned will be a newly created String 711 * obtained by calling source.substring(..) with the appropriate indexes. 712 * 713 * @param source The input string buffer. 714 * @param offset16 The UTF16 index to the codepoint in source 715 * @return string value of char32 in UTF16 format 716 * @stable ICU 2.1 717 */ valueOf(StringBuffer source, int offset16)718 public static String valueOf(StringBuffer source, int offset16) { 719 switch (bounds(source, offset16)) { 720 case LEAD_SURROGATE_BOUNDARY: 721 return source.substring(offset16, offset16 + 2); 722 case TRAIL_SURROGATE_BOUNDARY: 723 return source.substring(offset16 - 1, offset16 + 1); 724 default: 725 return source.substring(offset16, offset16 + 1); 726 } 727 } 728 729 /** 730 * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16 731 * format. 
If offset16 indexes a surrogate character, the whole supplementary codepoint will be 732 * returned, except when either the leading or trailing surrogate character lies out of the 733 * specified subarray. In the latter case, only the surrogate character within bounds will be 734 * returned. If a validity check is required, use <a 735 * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the codepoint at 736 * offset16 before calling. The result returned will be a newly created String containing the 737 * relevant characters. 738 * 739 * @param source The input char array. 740 * @param start Start index of the subarray 741 * @param limit End index of the subarray 742 * @param offset16 The UTF16 index to the codepoint in source relative to start 743 * @return string value of char32 in UTF16 format 744 * @stable ICU 2.1 745 */ valueOf(char source[], int start, int limit, int offset16)746 public static String valueOf(char source[], int start, int limit, int offset16) { 747 switch (bounds(source, start, limit, offset16)) { 748 case LEAD_SURROGATE_BOUNDARY: 749 return new String(source, start + offset16, 2); 750 case TRAIL_SURROGATE_BOUNDARY: 751 return new String(source, start + offset16 - 1, 2); 752 } 753 return new String(source, start + offset16, 1); 754 } 755 756 /** 757 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 758 * the <a name="_top_">class description</a> for notes on roundtripping. 759 * 760 * @param source The UTF-16 string 761 * @param offset32 UTF-32 offset 762 * @return UTF-16 offset 763 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 
764 * @stable ICU 2.1 765 */ findOffsetFromCodePoint(String source, int offset32)766 public static int findOffsetFromCodePoint(String source, int offset32) { 767 char ch; 768 int size = source.length(), result = 0, count = offset32; 769 if (offset32 < 0 || offset32 > size) { 770 throw new StringIndexOutOfBoundsException(offset32); 771 } 772 while (result < size && count > 0) { 773 ch = source.charAt(result); 774 if (isLeadSurrogate(ch) && ((result + 1) < size) 775 && isTrailSurrogate(source.charAt(result + 1))) { 776 result++; 777 } 778 779 count--; 780 result++; 781 } 782 if (count != 0) { 783 throw new StringIndexOutOfBoundsException(offset32); 784 } 785 return result; 786 } 787 788 /** 789 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 790 * the <a name="_top_">class description</a> for notes on roundtripping. 791 * 792 * @param source The UTF-16 string buffer 793 * @param offset32 UTF-32 offset 794 * @return UTF-16 offset 795 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 796 * @stable ICU 2.1 797 */ findOffsetFromCodePoint(StringBuffer source, int offset32)798 public static int findOffsetFromCodePoint(StringBuffer source, int offset32) { 799 char ch; 800 int size = source.length(), result = 0, count = offset32; 801 if (offset32 < 0 || offset32 > size) { 802 throw new StringIndexOutOfBoundsException(offset32); 803 } 804 while (result < size && count > 0) { 805 ch = source.charAt(result); 806 if (isLeadSurrogate(ch) && ((result + 1) < size) 807 && isTrailSurrogate(source.charAt(result + 1))) { 808 result++; 809 } 810 811 count--; 812 result++; 813 } 814 if (count != 0) { 815 throw new StringIndexOutOfBoundsException(offset32); 816 } 817 return result; 818 } 819 820 /** 821 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 822 * the <a name="_top_">class description</a> for notes on roundtripping. 
823 * 824 * @param source The UTF-16 char array whose substring is to be analysed 825 * @param start Offset of the substring to be analysed 826 * @param limit Offset of the substring to be analysed 827 * @param offset32 UTF-32 offset relative to start 828 * @return UTF-16 offset relative to start 829 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 830 * @stable ICU 2.1 831 */ findOffsetFromCodePoint(char source[], int start, int limit, int offset32)832 public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) { 833 char ch; 834 int result = start, count = offset32; 835 if (offset32 > limit - start) { 836 throw new ArrayIndexOutOfBoundsException(offset32); 837 } 838 while (result < limit && count > 0) { 839 ch = source[result]; 840 if (isLeadSurrogate(ch) && ((result + 1) < limit) 841 && isTrailSurrogate(source[result + 1])) { 842 result++; 843 } 844 845 count--; 846 result++; 847 } 848 if (count != 0) { 849 throw new ArrayIndexOutOfBoundsException(offset32); 850 } 851 return result - start; 852 } 853 854 /** 855 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given 856 * UTF-16 offset. Used for random access. See the <a name="_top_">class description</a> for 857 * notes on roundtripping.<br> 858 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 859 * of the <strong>lead</strong> of the pair is returned. </i> 860 * <p> 861 * To find the UTF-32 length of a string, use: 862 * 863 * <pre> 864 * len32 = countCodePoint(source, source.length()); 865 * </pre> 866 * 867 * </p> 868 * <p> 869 * 870 * @param source Text to analyse 871 * @param offset16 UTF-16 offset < source text length. 872 * @return UTF-32 offset 873 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 
874 * @stable ICU 2.1 875 */ findCodePointOffset(String source, int offset16)876 public static int findCodePointOffset(String source, int offset16) { 877 if (offset16 < 0 || offset16 > source.length()) { 878 throw new StringIndexOutOfBoundsException(offset16); 879 } 880 881 int result = 0; 882 char ch; 883 boolean hadLeadSurrogate = false; 884 885 for (int i = 0; i < offset16; ++i) { 886 ch = source.charAt(i); 887 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 888 hadLeadSurrogate = false; // count valid trail as zero 889 } else { 890 hadLeadSurrogate = isLeadSurrogate(ch); 891 ++result; // count others as 1 892 } 893 } 894 895 if (offset16 == source.length()) { 896 return result; 897 } 898 899 // end of source being the less significant surrogate character 900 // shift result back to the start of the supplementary character 901 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { 902 result--; 903 } 904 905 return result; 906 } 907 908 /** 909 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 910 * offset. Used for random access. See the <a name="_top_">class description</a> for notes on 911 * roundtripping.<br> 912 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 913 * of the <strong>lead</strong> of the pair is returned. </i> 914 * <p> 915 * To find the UTF-32 length of a string, use: 916 * 917 * <pre> 918 * len32 = countCodePoint(source); 919 * </pre> 920 * 921 * </p> 922 * <p> 923 * 924 * @param source Text to analyse 925 * @param offset16 UTF-16 offset < source text length. 926 * @return UTF-32 offset 927 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 
928 * @stable ICU 2.1 929 */ findCodePointOffset(StringBuffer source, int offset16)930 public static int findCodePointOffset(StringBuffer source, int offset16) { 931 if (offset16 < 0 || offset16 > source.length()) { 932 throw new StringIndexOutOfBoundsException(offset16); 933 } 934 935 int result = 0; 936 char ch; 937 boolean hadLeadSurrogate = false; 938 939 for (int i = 0; i < offset16; ++i) { 940 ch = source.charAt(i); 941 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 942 hadLeadSurrogate = false; // count valid trail as zero 943 } else { 944 hadLeadSurrogate = isLeadSurrogate(ch); 945 ++result; // count others as 1 946 } 947 } 948 949 if (offset16 == source.length()) { 950 return result; 951 } 952 953 // end of source being the less significant surrogate character 954 // shift result back to the start of the supplementary character 955 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { 956 result--; 957 } 958 959 return result; 960 } 961 962 /** 963 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 964 * offset. Used for random access. See the <a name="_top_">class description</a> for notes on 965 * roundtripping.<br> 966 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 967 * of the <strong>lead</strong> of the pair is returned. </i> 968 * <p> 969 * To find the UTF-32 length of a substring, use: 970 * 971 * <pre> 972 * len32 = countCodePoint(source, start, limit); 973 * </pre> 974 * 975 * </p> 976 * <p> 977 * 978 * @param source Text to analyse 979 * @param start Offset of the substring 980 * @param limit Offset of the substring 981 * @param offset16 UTF-16 relative to start 982 * @return UTF-32 offset relative to start 983 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. 
984 * @stable ICU 2.1 985 */ findCodePointOffset(char source[], int start, int limit, int offset16)986 public static int findCodePointOffset(char source[], int start, int limit, int offset16) { 987 offset16 += start; 988 if (offset16 > limit) { 989 throw new StringIndexOutOfBoundsException(offset16); 990 } 991 992 int result = 0; 993 char ch; 994 boolean hadLeadSurrogate = false; 995 996 for (int i = start; i < offset16; ++i) { 997 ch = source[i]; 998 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 999 hadLeadSurrogate = false; // count valid trail as zero 1000 } else { 1001 hadLeadSurrogate = isLeadSurrogate(ch); 1002 ++result; // count others as 1 1003 } 1004 } 1005 1006 if (offset16 == limit) { 1007 return result; 1008 } 1009 1010 // end of source being the less significant surrogate character 1011 // shift result back to the start of the supplementary character 1012 if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) { 1013 result--; 1014 } 1015 1016 return result; 1017 } 1018 1019 /** 1020 * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required, 1021 * use <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 before 1022 * calling. 1023 * 1024 * @param target The buffer to append to 1025 * @param char32 Value to append. 
1026 * @return the updated StringBuffer 1027 * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints 1028 * @stable ICU 2.1 1029 */ append(StringBuffer target, int char32)1030 public static StringBuffer append(StringBuffer target, int char32) { 1031 // Check for irregular values 1032 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1033 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32)); 1034 } 1035 1036 // Write the UTF-16 values 1037 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 1038 target.append(getLeadSurrogate(char32)); 1039 target.append(getTrailSurrogate(char32)); 1040 } else { 1041 target.append((char) char32); 1042 } 1043 return target; 1044 } 1045 1046 /** 1047 * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a 1048 * convenience. 1049 * 1050 * @param target The buffer to append to 1051 * @param cp The code point to append 1052 * @return the updated StringBuffer 1053 * @throws IllegalArgumentException If cp is not a valid code point 1054 * @stable ICU 3.0 1055 */ appendCodePoint(StringBuffer target, int cp)1056 public static StringBuffer appendCodePoint(StringBuffer target, int cp) { 1057 return append(target, cp); 1058 } 1059 1060 /** 1061 * Adds a codepoint to offset16 position of the argument char array. 1062 * 1063 * @param target Char array to be append with the new code point 1064 * @param limit UTF16 offset which the codepoint will be appended. 1065 * @param char32 Code point to be appended 1066 * @return offset after char32 in the array. 1067 * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not 1068 * lie within the range of the Unicode codepoints. 
1069 * @stable ICU 2.1 1070 */ append(char[] target, int limit, int char32)1071 public static int append(char[] target, int limit, int char32) { 1072 // Check for irregular values 1073 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1074 throw new IllegalArgumentException("Illegal codepoint"); 1075 } 1076 // Write the UTF-16 values 1077 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 1078 target[limit++] = getLeadSurrogate(char32); 1079 target[limit++] = getTrailSurrogate(char32); 1080 } else { 1081 target[limit++] = (char) char32; 1082 } 1083 return limit; 1084 } 1085 1086 /** 1087 * Number of codepoints in a UTF16 String 1088 * 1089 * @param source UTF16 string 1090 * @return number of codepoint in string 1091 * @stable ICU 2.1 1092 */ countCodePoint(String source)1093 public static int countCodePoint(String source) { 1094 if (source == null || source.length() == 0) { 1095 return 0; 1096 } 1097 return findCodePointOffset(source, source.length()); 1098 } 1099 1100 /** 1101 * Number of codepoints in a UTF16 String buffer 1102 * 1103 * @param source UTF16 string buffer 1104 * @return number of codepoint in string 1105 * @stable ICU 2.1 1106 */ countCodePoint(StringBuffer source)1107 public static int countCodePoint(StringBuffer source) { 1108 if (source == null || source.length() == 0) { 1109 return 0; 1110 } 1111 return findCodePointOffset(source, source.length()); 1112 } 1113 1114 /** 1115 * Number of codepoints in a UTF16 char array substring 1116 * 1117 * @param source UTF16 char array 1118 * @param start Offset of the substring 1119 * @param limit Offset of the substring 1120 * @return number of codepoint in the substring 1121 * @exception IndexOutOfBoundsException If start and limit are not valid. 
1122 * @stable ICU 2.1 1123 */ countCodePoint(char source[], int start, int limit)1124 public static int countCodePoint(char source[], int start, int limit) { 1125 if (source == null || source.length == 0) { 1126 return 0; 1127 } 1128 return findCodePointOffset(source, start, limit, limit - start); 1129 } 1130 1131 /** 1132 * Set a code point into a UTF16 position. Adjusts target according if we are replacing a 1133 * non-supplementary codepoint with a supplementary and vice versa. 1134 * 1135 * @param target Stringbuffer 1136 * @param offset16 UTF16 position to insert into 1137 * @param char32 Code point 1138 * @stable ICU 2.1 1139 */ setCharAt(StringBuffer target, int offset16, int char32)1140 public static void setCharAt(StringBuffer target, int offset16, int char32) { 1141 int count = 1; 1142 char single = target.charAt(offset16); 1143 1144 if (isSurrogate(single)) { 1145 // pairs of the surrogate with offset16 at the lead char found 1146 if (isLeadSurrogate(single) && (target.length() > offset16 + 1) 1147 && isTrailSurrogate(target.charAt(offset16 + 1))) { 1148 count++; 1149 } else { 1150 // pairs of the surrogate with offset16 at the trail char 1151 // found 1152 if (isTrailSurrogate(single) && (offset16 > 0) 1153 && isLeadSurrogate(target.charAt(offset16 - 1))) { 1154 offset16--; 1155 count++; 1156 } 1157 } 1158 } 1159 target.replace(offset16, offset16 + count, valueOf(char32)); 1160 } 1161 1162 /** 1163 * Set a code point into a UTF16 position in a char array. Adjusts target according if we are 1164 * replacing a non-supplementary codepoint with a supplementary and vice versa. 1165 * 1166 * @param target char array 1167 * @param limit numbers of valid chars in target, different from target.length. limit counts the 1168 * number of chars in target that represents a string, not the size of array target. 
1169 * @param offset16 UTF16 position to insert into 1170 * @param char32 code point 1171 * @return new number of chars in target that represents a string 1172 * @exception IndexOutOfBoundsException if offset16 is out of range 1173 * @stable ICU 2.1 1174 */ setCharAt(char target[], int limit, int offset16, int char32)1175 public static int setCharAt(char target[], int limit, int offset16, int char32) { 1176 if (offset16 >= limit) { 1177 throw new ArrayIndexOutOfBoundsException(offset16); 1178 } 1179 int count = 1; 1180 char single = target[offset16]; 1181 1182 if (isSurrogate(single)) { 1183 // pairs of the surrogate with offset16 at the lead char found 1184 if (isLeadSurrogate(single) && (target.length > offset16 + 1) 1185 && isTrailSurrogate(target[offset16 + 1])) { 1186 count++; 1187 } else { 1188 // pairs of the surrogate with offset16 at the trail char 1189 // found 1190 if (isTrailSurrogate(single) && (offset16 > 0) 1191 && isLeadSurrogate(target[offset16 - 1])) { 1192 offset16--; 1193 count++; 1194 } 1195 } 1196 } 1197 1198 String str = valueOf(char32); 1199 int result = limit; 1200 int strlength = str.length(); 1201 target[offset16] = str.charAt(0); 1202 if (count == strlength) { 1203 if (count == 2) { 1204 target[offset16 + 1] = str.charAt(1); 1205 } 1206 } else { 1207 // this is not exact match in space, we'll have to do some 1208 // shifting 1209 System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit 1210 - (offset16 + count)); 1211 if (count < strlength) { 1212 // char32 is a supplementary character trying to squeeze into 1213 // a non-supplementary space 1214 target[offset16 + 1] = str.charAt(1); 1215 result++; 1216 if (result < target.length) { 1217 target[result] = 0; 1218 } 1219 } else { 1220 // char32 is a non-supplementary character trying to fill 1221 // into a supplementary space 1222 result--; 1223 target[result] = 0; 1224 } 1225 } 1226 return result; 1227 } 1228 1229 /** 1230 * Shifts offset16 by the argument number of 
codepoints 1231 * 1232 * @param source string 1233 * @param offset16 UTF16 position to shift 1234 * @param shift32 number of codepoints to shift 1235 * @return new shifted offset16 1236 * @exception IndexOutOfBoundsException if the new offset16 is out of bounds. 1237 * @stable ICU 2.1 1238 */ moveCodePointOffset(String source, int offset16, int shift32)1239 public static int moveCodePointOffset(String source, int offset16, int shift32) { 1240 int result = offset16; 1241 int size = source.length(); 1242 int count; 1243 char ch; 1244 if (offset16 < 0 || offset16 > size) { 1245 throw new StringIndexOutOfBoundsException(offset16); 1246 } 1247 if (shift32 > 0) { 1248 if (shift32 + offset16 > size) { 1249 throw new StringIndexOutOfBoundsException(offset16); 1250 } 1251 count = shift32; 1252 while (result < size && count > 0) { 1253 ch = source.charAt(result); 1254 if (isLeadSurrogate(ch) && ((result + 1) < size) 1255 && isTrailSurrogate(source.charAt(result + 1))) { 1256 result++; 1257 } 1258 count--; 1259 result++; 1260 } 1261 } else { 1262 if (offset16 + shift32 < 0) { 1263 throw new StringIndexOutOfBoundsException(offset16); 1264 } 1265 for (count = -shift32; count > 0; count--) { 1266 result--; 1267 if (result < 0) { 1268 break; 1269 } 1270 ch = source.charAt(result); 1271 if (isTrailSurrogate(ch) && result > 0 1272 && isLeadSurrogate(source.charAt(result - 1))) { 1273 result--; 1274 } 1275 } 1276 } 1277 if (count != 0) { 1278 throw new StringIndexOutOfBoundsException(shift32); 1279 } 1280 return result; 1281 } 1282 1283 /** 1284 * Shifts offset16 by the argument number of codepoints 1285 * 1286 * @param source String buffer 1287 * @param offset16 UTF16 position to shift 1288 * @param shift32 Number of codepoints to shift 1289 * @return new shifted offset16 1290 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds. 
1291 * @stable ICU 2.1 1292 */ moveCodePointOffset(StringBuffer source, int offset16, int shift32)1293 public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) { 1294 int result = offset16; 1295 int size = source.length(); 1296 int count; 1297 char ch; 1298 if (offset16 < 0 || offset16 > size) { 1299 throw new StringIndexOutOfBoundsException(offset16); 1300 } 1301 if (shift32 > 0) { 1302 if (shift32 + offset16 > size) { 1303 throw new StringIndexOutOfBoundsException(offset16); 1304 } 1305 count = shift32; 1306 while (result < size && count > 0) { 1307 ch = source.charAt(result); 1308 if (isLeadSurrogate(ch) && ((result + 1) < size) 1309 && isTrailSurrogate(source.charAt(result + 1))) { 1310 result++; 1311 } 1312 count--; 1313 result++; 1314 } 1315 } else { 1316 if (offset16 + shift32 < 0) { 1317 throw new StringIndexOutOfBoundsException(offset16); 1318 } 1319 for (count = -shift32; count > 0; count--) { 1320 result--; 1321 if (result < 0) { 1322 break; 1323 } 1324 ch = source.charAt(result); 1325 if (isTrailSurrogate(ch) && result > 0 1326 && isLeadSurrogate(source.charAt(result - 1))) { 1327 result--; 1328 } 1329 } 1330 } 1331 if (count != 0) { 1332 throw new StringIndexOutOfBoundsException(shift32); 1333 } 1334 return result; 1335 } 1336 1337 /** 1338 * Shifts offset16 by the argument number of codepoints within a subarray. 1339 * 1340 * @param source Char array 1341 * @param start Position of the subarray to be performed on 1342 * @param limit Position of the subarray to be performed on 1343 * @param offset16 UTF16 position to shift relative to start 1344 * @param shift32 Number of codepoints to shift 1345 * @return new shifted offset16 relative to start 1346 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the 1347 * subarray bounds are out of range. 
1348 * @stable ICU 2.1 1349 */ moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32)1350 public static int moveCodePointOffset(char source[], int start, int limit, int offset16, 1351 int shift32) { 1352 int size = source.length; 1353 int count; 1354 char ch; 1355 int result = offset16 + start; 1356 if (start < 0 || limit < start) { 1357 throw new StringIndexOutOfBoundsException(start); 1358 } 1359 if (limit > size) { 1360 throw new StringIndexOutOfBoundsException(limit); 1361 } 1362 if (offset16 < 0 || result > limit) { 1363 throw new StringIndexOutOfBoundsException(offset16); 1364 } 1365 if (shift32 > 0) { 1366 if (shift32 + result > size) { 1367 throw new StringIndexOutOfBoundsException(result); 1368 } 1369 count = shift32; 1370 while (result < limit && count > 0) { 1371 ch = source[result]; 1372 if (isLeadSurrogate(ch) && (result + 1 < limit) 1373 && isTrailSurrogate(source[result + 1])) { 1374 result++; 1375 } 1376 count--; 1377 result++; 1378 } 1379 } else { 1380 if (result + shift32 < start) { 1381 throw new StringIndexOutOfBoundsException(result); 1382 } 1383 for (count = -shift32; count > 0; count--) { 1384 result--; 1385 if (result < start) { 1386 break; 1387 } 1388 ch = source[result]; 1389 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) { 1390 result--; 1391 } 1392 } 1393 } 1394 if (count != 0) { 1395 throw new StringIndexOutOfBoundsException(shift32); 1396 } 1397 result -= start; 1398 return result; 1399 } 1400 1401 /** 1402 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the 1403 * middle of a supplementary codepoint, char32 will be inserted after the supplementary 1404 * codepoint. The length of target increases by one if codepoint is non-supplementary, 2 1405 * otherwise. 
1406 * <p> 1407 * The overall effect is exactly as if the argument were converted to a string by the method 1408 * valueOf(char) and the characters in that string were then inserted into target at the 1409 * position indicated by offset16. 1410 * </p> 1411 * <p> 1412 * The offset argument must be greater than or equal to 0, and less than or equal to the length 1413 * of source. 1414 * 1415 * @param target String buffer to insert to 1416 * @param offset16 Offset which char32 will be inserted in 1417 * @param char32 Codepoint to be inserted 1418 * @return a reference to target 1419 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1420 * @stable ICU 2.1 1421 */ insert(StringBuffer target, int offset16, int char32)1422 public static StringBuffer insert(StringBuffer target, int offset16, int char32) { 1423 String str = valueOf(char32); 1424 if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) { 1425 offset16++; 1426 } 1427 target.insert(offset16, str); 1428 return target; 1429 } 1430 1431 /** 1432 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the 1433 * middle of a supplementary codepoint, char32 will be inserted after the supplementary 1434 * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise. 1435 * <p> 1436 * The overall effect is exactly as if the argument were converted to a string by the method 1437 * valueOf(char) and the characters in that string were then inserted into target at the 1438 * position indicated by offset16. 1439 * </p> 1440 * <p> 1441 * The offset argument must be greater than or equal to 0, and less than or equal to the limit. 
1442 * 1443 * @param target Char array to insert to 1444 * @param limit End index of the char array, limit <= target.length 1445 * @param offset16 Offset which char32 will be inserted in 1446 * @param char32 Codepoint to be inserted 1447 * @return new limit size 1448 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1449 * @stable ICU 2.1 1450 */ insert(char target[], int limit, int offset16, int char32)1451 public static int insert(char target[], int limit, int offset16, int char32) { 1452 String str = valueOf(char32); 1453 if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) { 1454 offset16++; 1455 } 1456 int size = str.length(); 1457 if (limit + size > target.length) { 1458 throw new ArrayIndexOutOfBoundsException(offset16 + size); 1459 } 1460 System.arraycopy(target, offset16, target, offset16 + size, limit - offset16); 1461 target[offset16] = str.charAt(0); 1462 if (size == 2) { 1463 target[offset16 + 1] = str.charAt(1); 1464 } 1465 return limit + size; 1466 } 1467 1468 /** 1469 * Removes the codepoint at the specified position in this target (shortening target by 1 1470 * character if the codepoint is a non-supplementary, 2 otherwise). 1471 * 1472 * @param target String buffer to remove codepoint from 1473 * @param offset16 Offset which the codepoint will be removed 1474 * @return a reference to target 1475 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 
1476 * @stable ICU 2.1 1477 */ delete(StringBuffer target, int offset16)1478 public static StringBuffer delete(StringBuffer target, int offset16) { 1479 int count = 1; 1480 switch (bounds(target, offset16)) { 1481 case LEAD_SURROGATE_BOUNDARY: 1482 count++; 1483 break; 1484 case TRAIL_SURROGATE_BOUNDARY: 1485 count++; 1486 offset16--; 1487 break; 1488 } 1489 target.delete(offset16, offset16 + count); 1490 return target; 1491 } 1492 1493 /** 1494 * Removes the codepoint at the specified position in this target (shortening target by 1 1495 * character if the codepoint is a non-supplementary, 2 otherwise). 1496 * 1497 * @param target String buffer to remove codepoint from 1498 * @param limit End index of the char array, limit <= target.length 1499 * @param offset16 Offset which the codepoint will be removed 1500 * @return a new limit size 1501 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1502 * @stable ICU 2.1 1503 */ delete(char target[], int limit, int offset16)1504 public static int delete(char target[], int limit, int offset16) { 1505 int count = 1; 1506 switch (bounds(target, 0, limit, offset16)) { 1507 case LEAD_SURROGATE_BOUNDARY: 1508 count++; 1509 break; 1510 case TRAIL_SURROGATE_BOUNDARY: 1511 count++; 1512 offset16--; 1513 break; 1514 } 1515 System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count)); 1516 target[limit - count] = 0; 1517 return limit - count; 1518 } 1519 1520 /** 1521 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1522 * the argument codepoint. I.e., the smallest index <code>i</code> such that 1523 * <code>UTF16.charAt(source, i) == 1524 * char32</code> is true. 1525 * <p> 1526 * If no such character occurs in this string, then -1 is returned. 
1527 * </p> 1528 * <p> 1529 * Examples:<br> 1530 * UTF16.indexOf("abc", 'a') returns 0<br> 1531 * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br> 1532 * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1533 * </p> 1534 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1535 * characters to its fullest. 1536 * 1537 * @param source UTF16 format Unicode string that will be searched 1538 * @param char32 Codepoint to search for 1539 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1540 * -1 if the codepoint does not occur. 1541 * @stable ICU 2.6 1542 */ indexOf(String source, int char32)1543 public static int indexOf(String source, int char32) { 1544 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1545 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1546 } 1547 // non-surrogate bmp 1548 if (char32 < LEAD_SURROGATE_MIN_VALUE 1549 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1550 return source.indexOf((char) char32); 1551 } 1552 // surrogate 1553 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1554 int result = source.indexOf((char) char32); 1555 if (result >= 0) { 1556 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1557 && isTrailSurrogate(source.charAt(result + 1))) { 1558 return indexOf(source, char32, result + 1); 1559 } 1560 // trail surrogate 1561 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1562 return indexOf(source, char32, result + 1); 1563 } 1564 } 1565 return result; 1566 } 1567 // supplementary 1568 String char32str = toString(char32); 1569 return source.indexOf(char32str); 1570 } 1571 1572 /** 1573 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1574 * the argument string str. 
* This method is implemented based on codepoints, hence a "lead
 * surrogate character + trail surrogate character" is treated as one entity. Hence if str
 * starts with a trail surrogate character at index 0, an occurrence of str in source that is
 * immediately preceded by a lead surrogate character is not a valid match. Vice versa for a
 * lead surrogate that ends str. See example below.
 * <p>
 * If no such string str occurs in this source, then -1 is returned.
 * </p>
 * <p>
 * Examples:<br>
 * UTF16.indexOf("abc", "ab") returns 0<br>
 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
 * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
 * </p>
 * Note this method is provided as support to jdk 1.3, which does not support supplementary
 * characters to its fullest.
 *
 * @param source UTF16 format Unicode string that will be searched
 * @param str UTF16 format Unicode string to search for
 * @return the index of the first occurrence of str in the argument Unicode string, or
 *         -1 if str does not occur.
1595 * @stable ICU 2.6 1596 */ indexOf(String source, String str)1597 public static int indexOf(String source, String str) { 1598 int strLength = str.length(); 1599 // non-surrogate ends 1600 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1601 return source.indexOf(str); 1602 } 1603 1604 int result = source.indexOf(str); 1605 int resultEnd = result + strLength; 1606 if (result >= 0) { 1607 // check last character 1608 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1609 && isTrailSurrogate(source.charAt(resultEnd + 1))) { 1610 return indexOf(source, str, resultEnd + 1); 1611 } 1612 // check first character which is a trail surrogate 1613 if (isTrailSurrogate(str.charAt(0)) && result > 0 1614 && isLeadSurrogate(source.charAt(result - 1))) { 1615 return indexOf(source, str, resultEnd + 1); 1616 } 1617 } 1618 return result; 1619 } 1620 1621 /** 1622 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1623 * the argument codepoint. I.e., the smallest index i such that: <br> 1624 * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true. 1625 * <p> 1626 * If no such character occurs in this string, then -1 is returned. 1627 * </p> 1628 * <p> 1629 * Examples:<br> 1630 * UTF16.indexOf("abc", 'a', 1) returns -1<br> 1631 * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br> 1632 * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br> 1633 * </p> 1634 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1635 * characters to its fullest. 1636 * 1637 * @param source UTF16 format Unicode string that will be searched 1638 * @param char32 Codepoint to search for 1639 * @param fromIndex The index to start the search from. 1640 * @return the index of the first occurrence of the codepoint in the argument Unicode string at 1641 * or after fromIndex, or -1 if the codepoint does not occur. 
1642 * @stable ICU 2.6 1643 */ indexOf(String source, int char32, int fromIndex)1644 public static int indexOf(String source, int char32, int fromIndex) { 1645 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1646 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1647 } 1648 // non-surrogate bmp 1649 if (char32 < LEAD_SURROGATE_MIN_VALUE 1650 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1651 return source.indexOf((char) char32, fromIndex); 1652 } 1653 // surrogate 1654 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1655 int result = source.indexOf((char) char32, fromIndex); 1656 if (result >= 0) { 1657 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1658 && isTrailSurrogate(source.charAt(result + 1))) { 1659 return indexOf(source, char32, result + 1); 1660 } 1661 // trail surrogate 1662 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1663 return indexOf(source, char32, result + 1); 1664 } 1665 } 1666 return result; 1667 } 1668 // supplementary 1669 String char32str = toString(char32); 1670 return source.indexOf(char32str, fromIndex); 1671 } 1672 1673 /** 1674 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1675 * the argument string str. This method is implemented based on codepoints, hence a "lead 1676 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str 1677 * starts with trail surrogate character at index 0, a source with a leading a surrogate 1678 * character before str found at in source will not have a valid match. Vice versa for lead 1679 * surrogates that ends str. See example below. 1680 * <p> 1681 * If no such string str occurs in this source, then -1 is returned. 
1682 * </p> 1683 * <p> 1684 * Examples:<br> 1685 * UTF16.indexOf("abc", "ab", 0) returns 0<br> 1686 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br> 1687 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br> 1688 * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br> 1689 * </p> 1690 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1691 * characters to its fullest. 1692 * 1693 * @param source UTF16 format Unicode string that will be searched 1694 * @param str UTF16 format Unicode string to search for 1695 * @param fromIndex The index to start the search from. 1696 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1697 * -1 if the codepoint does not occur. 1698 * @stable ICU 2.6 1699 */ indexOf(String source, String str, int fromIndex)1700 public static int indexOf(String source, String str, int fromIndex) { 1701 int strLength = str.length(); 1702 // non-surrogate ends 1703 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1704 return source.indexOf(str, fromIndex); 1705 } 1706 1707 int result = source.indexOf(str, fromIndex); 1708 int resultEnd = result + strLength; 1709 if (result >= 0) { 1710 // check last character 1711 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1712 && isTrailSurrogate(source.charAt(resultEnd))) { 1713 return indexOf(source, str, resultEnd + 1); 1714 } 1715 // check first character which is a trail surrogate 1716 if (isTrailSurrogate(str.charAt(0)) && result > 0 1717 && isLeadSurrogate(source.charAt(result - 1))) { 1718 return indexOf(source, str, resultEnd + 1); 1719 } 1720 } 1721 return result; 1722 } 1723 1724 /** 1725 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1726 * the argument codepoint. 
I.e., the index returned is the largest value i such that: 1727 * UTF16.charAt(source, i) == char32 is true. 1728 * <p> 1729 * Examples:<br> 1730 * UTF16.lastIndexOf("abc", 'a') returns 0<br> 1731 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br> 1732 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1733 * </p> 1734 * <p> 1735 * source is searched backwards starting at the last character. 1736 * </p> 1737 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1738 * characters to its fullest. 1739 * 1740 * @param source UTF16 format Unicode string that will be searched 1741 * @param char32 Codepoint to search for 1742 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1743 * does not occur. 1744 * @stable ICU 2.6 1745 */ lastIndexOf(String source, int char32)1746 public static int lastIndexOf(String source, int char32) { 1747 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1748 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1749 } 1750 // non-surrogate bmp 1751 if (char32 < LEAD_SURROGATE_MIN_VALUE 1752 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1753 return source.lastIndexOf((char) char32); 1754 } 1755 // surrogate 1756 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1757 int result = source.lastIndexOf((char) char32); 1758 if (result >= 0) { 1759 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1760 && isTrailSurrogate(source.charAt(result + 1))) { 1761 return lastIndexOf(source, char32, result - 1); 1762 } 1763 // trail surrogate 1764 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1765 return lastIndexOf(source, char32, result - 1); 1766 } 1767 } 1768 return result; 1769 } 1770 // supplementary 1771 String char32str = toString(char32); 1772 return source.lastIndexOf(char32str); 1773 } 1774 1775 /** 1776 * Returns the index within the argument 
UTF16 format Unicode string of the last occurrence of 1777 * the argument string str. This method is implemented based on codepoints, hence a "lead 1778 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str 1779 * starts with trail surrogate character at index 0, a source with a leading a surrogate 1780 * character before str found at in source will not have a valid match. Vice versa for lead 1781 * surrogates that ends str. See example below. 1782 * <p> 1783 * Examples:<br> 1784 * UTF16.lastIndexOf("abc", "a") returns 0<br> 1785 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br> 1786 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br> 1787 * </p> 1788 * <p> 1789 * source is searched backwards starting at the last character. 1790 * </p> 1791 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1792 * characters to its fullest. 1793 * 1794 * @param source UTF16 format Unicode string that will be searched 1795 * @param str UTF16 format Unicode string to search for 1796 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1797 * does not occur. 
1798 * @stable ICU 2.6 1799 */ lastIndexOf(String source, String str)1800 public static int lastIndexOf(String source, String str) { 1801 int strLength = str.length(); 1802 // non-surrogate ends 1803 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1804 return source.lastIndexOf(str); 1805 } 1806 1807 int result = source.lastIndexOf(str); 1808 if (result >= 0) { 1809 // check last character 1810 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1811 && isTrailSurrogate(source.charAt(result + strLength + 1))) { 1812 return lastIndexOf(source, str, result - 1); 1813 } 1814 // check first character which is a trail surrogate 1815 if (isTrailSurrogate(str.charAt(0)) && result > 0 1816 && isLeadSurrogate(source.charAt(result - 1))) { 1817 return lastIndexOf(source, str, result - 1); 1818 } 1819 } 1820 return result; 1821 } 1822 1823 /** 1824 * <p> 1825 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1826 * the argument codepoint, where the result is less than or equals to fromIndex. 1827 * </p> 1828 * <p> 1829 * This method is implemented based on codepoints, hence a single surrogate character will not 1830 * match a supplementary character. 1831 * </p> 1832 * <p> 1833 * source is searched backwards starting at the last character starting at the specified index. 1834 * </p> 1835 * <p> 1836 * Examples:<br> 1837 * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br> 1838 * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br> 1839 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br> 1840 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br> 1841 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1842 * </p> 1843 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1844 * characters to its fullest. 
1845 * 1846 * @param source UTF16 format Unicode string that will be searched 1847 * @param char32 Codepoint to search for 1848 * @param fromIndex the index to start the search from. There is no restriction on the value of 1849 * fromIndex. If it is greater than or equal to the length of this string, it has the 1850 * same effect as if it were equal to one less than the length of this string: this 1851 * entire string may be searched. If it is negative, it has the same effect as if it 1852 * were -1: -1 is returned. 1853 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1854 * does not occur. 1855 * @stable ICU 2.6 1856 */ lastIndexOf(String source, int char32, int fromIndex)1857 public static int lastIndexOf(String source, int char32, int fromIndex) { 1858 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1859 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1860 } 1861 // non-surrogate bmp 1862 if (char32 < LEAD_SURROGATE_MIN_VALUE 1863 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1864 return source.lastIndexOf((char) char32, fromIndex); 1865 } 1866 // surrogate 1867 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1868 int result = source.lastIndexOf((char) char32, fromIndex); 1869 if (result >= 0) { 1870 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1871 && isTrailSurrogate(source.charAt(result + 1))) { 1872 return lastIndexOf(source, char32, result - 1); 1873 } 1874 // trail surrogate 1875 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1876 return lastIndexOf(source, char32, result - 1); 1877 } 1878 } 1879 return result; 1880 } 1881 // supplementary 1882 String char32str = toString(char32); 1883 return source.lastIndexOf(char32str, fromIndex); 1884 } 1885 1886 /** 1887 * <p> 1888 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1889 * the argument string str, 
where the result is less than or equals to fromIndex. 1890 * </p> 1891 * <p> 1892 * This method is implemented based on codepoints, hence a "lead surrogate character + trail 1893 * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate 1894 * character at index 0, a source with a leading a surrogate character before str found at in 1895 * source will not have a valid match. Vice versa for lead surrogates that ends str. 1896 * </p> 1897 * See example below. 1898 * <p> 1899 * Examples:<br> 1900 * UTF16.lastIndexOf("abc", "c", 2) returns 2<br> 1901 * UTF16.lastIndexOf("abc", "c", 1) returns -1<br> 1902 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br> 1903 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br> 1904 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br> 1905 * </p> 1906 * <p> 1907 * source is searched backwards starting at the last character. 1908 * </p> 1909 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1910 * characters to its fullest. 1911 * 1912 * @param source UTF16 format Unicode string that will be searched 1913 * @param str UTF16 format Unicode string to search for 1914 * @param fromIndex the index to start the search from. There is no restriction on the value of 1915 * fromIndex. If it is greater than or equal to the length of this string, it has the 1916 * same effect as if it were equal to one less than the length of this string: this 1917 * entire string may be searched. If it is negative, it has the same effect as if it 1918 * were -1: -1 is returned. 1919 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1920 * does not occur. 
1921 * @stable ICU 2.6 1922 */ lastIndexOf(String source, String str, int fromIndex)1923 public static int lastIndexOf(String source, String str, int fromIndex) { 1924 int strLength = str.length(); 1925 // non-surrogate ends 1926 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1927 return source.lastIndexOf(str, fromIndex); 1928 } 1929 1930 int result = source.lastIndexOf(str, fromIndex); 1931 if (result >= 0) { 1932 // check last character 1933 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1934 && isTrailSurrogate(source.charAt(result + strLength))) { 1935 return lastIndexOf(source, str, result - 1); 1936 } 1937 // check first character which is a trail surrogate 1938 if (isTrailSurrogate(str.charAt(0)) && result > 0 1939 && isLeadSurrogate(source.charAt(result - 1))) { 1940 return lastIndexOf(source, str, result - 1); 1941 } 1942 } 1943 return result; 1944 } 1945 1946 /** 1947 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of 1948 * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16 1949 * format Unicode string source, then source will be returned. Otherwise, a new String object is 1950 * created that represents a codepoint sequence identical to the codepoint sequence represented 1951 * by source, except that every occurrence of oldChar32 is replaced by an occurrence of 1952 * newChar32. 
1953 * <p> 1954 * Examples: <br> 1955 * UTF16.replace("mesquite in your cellar", 'e', 'o');<br> 1956 * returns "mosquito in your collar"<br> 1957 * UTF16.replace("JonL", 'q', 'x');<br> 1958 * returns "JonL" (no change)<br> 1959 * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br> 1960 * returns "Supplementary character !"<br> 1961 * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br> 1962 * returns "Supplementary character \ud800\udc00"<br> 1963 * </p> 1964 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1965 * characters to its fullest. 1966 * 1967 * @param source UTF16 format Unicode string which the codepoint replacements will be based on. 1968 * @param oldChar32 Non-zero old codepoint to be replaced. 1969 * @param newChar32 The new codepoint to replace oldChar32 1970 * @return new String derived from source by replacing every occurrence of oldChar32 with 1971 * newChar32, unless when no oldChar32 is found in source then source will be returned. 
1972 * @stable ICU 2.6 1973 */ replace(String source, int oldChar32, int newChar32)1974 public static String replace(String source, int oldChar32, int newChar32) { 1975 if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) { 1976 throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint"); 1977 } 1978 if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) { 1979 throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint"); 1980 } 1981 1982 int index = indexOf(source, oldChar32); 1983 if (index == -1) { 1984 return source; 1985 } 1986 String newChar32Str = toString(newChar32); 1987 int oldChar32Size = 1; 1988 int newChar32Size = newChar32Str.length(); 1989 StringBuffer result = new StringBuffer(source); 1990 int resultIndex = index; 1991 1992 if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) { 1993 oldChar32Size = 2; 1994 } 1995 1996 while (index != -1) { 1997 int endResultIndex = resultIndex + oldChar32Size; 1998 result.replace(resultIndex, endResultIndex, newChar32Str); 1999 int lastEndIndex = index + oldChar32Size; 2000 index = indexOf(source, oldChar32, lastEndIndex); 2001 resultIndex += newChar32Size + index - lastEndIndex; 2002 } 2003 return result.toString(); 2004 } 2005 2006 /** 2007 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr 2008 * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string 2009 * source, then source will be returned. Otherwise, a new String object is created that 2010 * represents a codepoint sequence identical to the codepoint sequence represented by source, 2011 * except that every occurrence of oldStr is replaced by an occurrence of newStr. 
2012 * <p> 2013 * Examples: <br> 2014 * UTF16.replace("mesquite in your cellar", "e", "o");<br> 2015 * returns "mosquito in your collar"<br> 2016 * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br> 2017 * returns "cat in your cellar"<br> 2018 * UTF16.replace("JonL", "q", "x");<br> 2019 * returns "JonL" (no change)<br> 2020 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!'); <br> 2021 * returns "Supplementary character !"<br> 2022 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!'); <br> 2023 * returns "Supplementary character \ud800\udc00"<br> 2024 * </p> 2025 * Note this method is provided as support to jdk 1.3, which does not support supplementary 2026 * characters to its fullest. 2027 * 2028 * @param source UTF16 format Unicode string which the replacements will be based on. 2029 * @param oldStr Non-zero-length string to be replaced. 2030 * @param newStr The new string to replace oldStr 2031 * @return new String derived from source by replacing every occurrence of oldStr with newStr. 2032 * When no oldStr is found in source, then source will be returned. 2033 * @stable ICU 2.6 2034 */ replace(String source, String oldStr, String newStr)2035 public static String replace(String source, String oldStr, String newStr) { 2036 int index = indexOf(source, oldStr); 2037 if (index == -1) { 2038 return source; 2039 } 2040 int oldStrSize = oldStr.length(); 2041 int newStrSize = newStr.length(); 2042 StringBuffer result = new StringBuffer(source); 2043 int resultIndex = index; 2044 2045 while (index != -1) { 2046 int endResultIndex = resultIndex + oldStrSize; 2047 result.replace(resultIndex, endResultIndex, newStr); 2048 int lastEndIndex = index + oldStrSize; 2049 index = indexOf(source, oldStr, lastEndIndex); 2050 resultIndex += newStrSize + index - lastEndIndex; 2051 } 2052 return result.toString(); 2053 } 2054 2055 /** 2056 * Reverses a UTF16 format Unicode string and replaces source's content with it. 
This method 2057 * will reverse surrogate characters correctly, instead of blindly reversing every character. 2058 * <p> 2059 * Examples:<br> 2060 * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br> 2061 * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS". 2062 * 2063 * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed 2064 * @return a modified source with reversed UTF16 format Unicode string. 2065 * @stable ICU 2.6 2066 */ reverse(StringBuffer source)2067 public static StringBuffer reverse(StringBuffer source) { 2068 int length = source.length(); 2069 StringBuffer result = new StringBuffer(length); 2070 for (int i = length; i-- > 0;) { 2071 char ch = source.charAt(i); 2072 if (isTrailSurrogate(ch) && i > 0) { 2073 char ch2 = source.charAt(i - 1); 2074 if (isLeadSurrogate(ch2)) { 2075 result.append(ch2); 2076 result.append(ch); 2077 --i; 2078 continue; 2079 } 2080 } 2081 result.append(ch); 2082 } 2083 return result; 2084 } 2085 2086 /** 2087 * Check if the string contains more Unicode code points than a certain number. This is more 2088 * efficient than counting all code points in the entire string and comparing that number with a 2089 * threshold. This function may not need to scan the string at all if the length is within a 2090 * certain range, and never needs to count more than 'number + 1' code points. Logically 2091 * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two 2092 * code units. 2093 * 2094 * @param source The input string. 2095 * @param number The number of code points in the string is compared against the 'number' 2096 * parameter. 2097 * @return boolean value for whether the string contains more Unicode code points than 'number'. 
2098 * @stable ICU 2.4 2099 */ hasMoreCodePointsThan(String source, int number)2100 public static boolean hasMoreCodePointsThan(String source, int number) { 2101 if (number < 0) { 2102 return true; 2103 } 2104 if (source == null) { 2105 return false; 2106 } 2107 int length = source.length(); 2108 2109 // length >= 0 known 2110 // source contains at least (length + 1) / 2 code points: <= 2 2111 // chars per cp 2112 if (((length + 1) >> 1) > number) { 2113 return true; 2114 } 2115 2116 // check if source does not even contain enough chars 2117 int maxsupplementary = length - number; 2118 if (maxsupplementary <= 0) { 2119 return false; 2120 } 2121 2122 // there are maxsupplementary = length - number more chars than 2123 // asked-for code points 2124 2125 // count code points until they exceed and also check that there are 2126 // no more than maxsupplementary supplementary code points (char pairs) 2127 int start = 0; 2128 while (true) { 2129 if (length == 0) { 2130 return false; 2131 } 2132 if (number == 0) { 2133 return true; 2134 } 2135 if (isLeadSurrogate(source.charAt(start++)) && start != length 2136 && isTrailSurrogate(source.charAt(start))) { 2137 start++; 2138 if (--maxsupplementary <= 0) { 2139 // too many pairs - too few code points 2140 return false; 2141 } 2142 } 2143 --number; 2144 } 2145 } 2146 2147 /** 2148 * Check if the sub-range of char array, from argument start to limit, contains more Unicode 2149 * code points than a certain number. This is more efficient than counting all code points in 2150 * the entire char array range and comparing that number with a threshold. This function may not 2151 * need to scan the char array at all if start and limit is within a certain range, and never 2152 * needs to count more than 'number + 1' code points. Logically equivalent to 2153 * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one 2154 * or two code units. 
2155 * 2156 * @param source Array of UTF-16 chars 2157 * @param start Offset to substring in the source array for analyzing 2158 * @param limit Offset to substring in the source array for analyzing 2159 * @param number The number of code points in the string is compared against the 'number' 2160 * parameter. 2161 * @return boolean value for whether the string contains more Unicode code points than 'number'. 2162 * @exception IndexOutOfBoundsException Thrown when limit < start 2163 * @stable ICU 2.4 2164 */ hasMoreCodePointsThan(char source[], int start, int limit, int number)2165 public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) { 2166 int length = limit - start; 2167 if (length < 0 || start < 0 || limit < 0) { 2168 throw new IndexOutOfBoundsException( 2169 "Start and limit indexes should be non-negative and start <= limit"); 2170 } 2171 if (number < 0) { 2172 return true; 2173 } 2174 if (source == null) { 2175 return false; 2176 } 2177 2178 // length >= 0 known 2179 // source contains at least (length + 1) / 2 code points: <= 2 2180 // chars per cp 2181 if (((length + 1) >> 1) > number) { 2182 return true; 2183 } 2184 2185 // check if source does not even contain enough chars 2186 int maxsupplementary = length - number; 2187 if (maxsupplementary <= 0) { 2188 return false; 2189 } 2190 2191 // there are maxsupplementary = length - number more chars than 2192 // asked-for code points 2193 2194 // count code points until they exceed and also check that there are 2195 // no more than maxsupplementary supplementary code points (char pairs) 2196 while (true) { 2197 if (length == 0) { 2198 return false; 2199 } 2200 if (number == 0) { 2201 return true; 2202 } 2203 if (isLeadSurrogate(source[start++]) && start != limit 2204 && isTrailSurrogate(source[start])) { 2205 start++; 2206 if (--maxsupplementary <= 0) { 2207 // too many pairs - too few code points 2208 return false; 2209 } 2210 } 2211 --number; 2212 } 2213 } 2214 2215 /** 
2216 * Check if the string buffer contains more Unicode code points than a certain number. This is 2217 * more efficient than counting all code points in the entire string buffer and comparing that 2218 * number with a threshold. This function may not need to scan the string buffer at all if the 2219 * length is within a certain range, and never needs to count more than 'number + 1' code 2220 * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may occupy 2221 * either one or two code units. 2222 * 2223 * @param source The input string buffer. 2224 * @param number The number of code points in the string buffer is compared against the 'number' 2225 * parameter. 2226 * @return boolean value for whether the string buffer contains more Unicode code points than 2227 * 'number'. 2228 * @stable ICU 2.4 2229 */ hasMoreCodePointsThan(StringBuffer source, int number)2230 public static boolean hasMoreCodePointsThan(StringBuffer source, int number) { 2231 if (number < 0) { 2232 return true; 2233 } 2234 if (source == null) { 2235 return false; 2236 } 2237 int length = source.length(); 2238 2239 // length >= 0 known 2240 // source contains at least (length + 1) / 2 code points: <= 2 2241 // chars per cp 2242 if (((length + 1) >> 1) > number) { 2243 return true; 2244 } 2245 2246 // check if source does not even contain enough chars 2247 int maxsupplementary = length - number; 2248 if (maxsupplementary <= 0) { 2249 return false; 2250 } 2251 2252 // there are maxsupplementary = length - number more chars than 2253 // asked-for code points 2254 2255 // count code points until they exceed and also check that there are 2256 // no more than maxsupplementary supplementary code points (char pairs) 2257 int start = 0; 2258 while (true) { 2259 if (length == 0) { 2260 return false; 2261 } 2262 if (number == 0) { 2263 return true; 2264 } 2265 if (isLeadSurrogate(source.charAt(start++)) && start != length 2266 && isTrailSurrogate(source.charAt(start))) { 2267 
start++; 2268 if (--maxsupplementary <= 0) { 2269 // too many pairs - too few code points 2270 return false; 2271 } 2272 } 2273 --number; 2274 } 2275 } 2276 2277 /** 2278 * Cover JDK 1.5 API. Create a String from an array of codePoints. 2279 * 2280 * @param codePoints The code array 2281 * @param offset The start of the text in the code point array 2282 * @param count The number of code points 2283 * @return a String representing the code points between offset and count 2284 * @throws IllegalArgumentException If an invalid code point is encountered 2285 * @throws IndexOutOfBoundsException If the offset or count are out of bounds. 2286 * @stable ICU 3.0 2287 */ newString(int[] codePoints, int offset, int count)2288 public static String newString(int[] codePoints, int offset, int count) { 2289 if (count < 0) { 2290 throw new IllegalArgumentException(); 2291 } 2292 char[] chars = new char[count]; 2293 int w = 0; 2294 for (int r = offset, e = offset + count; r < e; ++r) { 2295 int cp = codePoints[r]; 2296 if (cp < 0 || cp > 0x10ffff) { 2297 throw new IllegalArgumentException(); 2298 } 2299 while (true) { 2300 try { 2301 if (cp < 0x010000) { 2302 chars[w] = (char) cp; 2303 w++; 2304 } else { 2305 chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_)); 2306 chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_)); 2307 w += 2; 2308 } 2309 break; 2310 } catch (IndexOutOfBoundsException ex) { 2311 int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2) 2312 / (r - offset + 1))); 2313 char[] temp = new char[newlen]; 2314 System.arraycopy(chars, 0, temp, 0, w); 2315 chars = temp; 2316 } 2317 } 2318 } 2319 return new String(chars, 0, w); 2320 } 2321 2322 /** 2323 * <p> 2324 * UTF16 string comparator class. 
Allows UTF16 string comparison to be done with the various 2325 * modes 2326 * </p> 2327 * <ul> 2328 * <li> Code point comparison or code unit comparison 2329 * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison 2330 * with special handling for character 'i'. 2331 * </ul> 2332 * <p> 2333 * The code unit or code point comparison differ only when comparing supplementary code points 2334 * (\u10000..\u10ffff) to BMP code points near the end of the BMP (i.e., 2335 * \ue000..\uffff). In code unit comparison, high BMP code points sort after 2336 * supplementary code points because they are stored as pairs of surrogates which are at 2337 * \ud800..\udfff. 2338 * </p> 2339 * 2340 * @see #FOLD_CASE_DEFAULT 2341 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2342 * @stable ICU 2.1 2343 */ 2344 public static final class StringComparator implements java.util.Comparator<String> { 2345 // public constructor ------------------------------------------------ 2346 2347 /** 2348 * Default constructor that does code unit comparison and case sensitive comparison. 2349 * 2350 * @stable ICU 2.1 2351 */ StringComparator()2352 public StringComparator() { 2353 this(false, false, FOLD_CASE_DEFAULT); 2354 } 2355 2356 /** 2357 * Constructor that does comparison based on the argument options. 2358 * 2359 * @param codepointcompare Flag to indicate true for code point comparison or false for code unit 2360 * comparison. 2361 * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison 2362 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only 2363 * when ignorecase is set to true. If ignorecase is false, this option is 2364 * ignored. 
2365 * @see #FOLD_CASE_DEFAULT 2366 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2367 * @throws IllegalArgumentException If foldcaseoption is out of range 2368 * @stable ICU 2.4 2369 */ StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption)2370 public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) { 2371 setCodePointCompare(codepointcompare); 2372 m_ignoreCase_ = ignorecase; 2373 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { 2374 throw new IllegalArgumentException("Invalid fold case option"); 2375 } 2376 m_foldCase_ = foldcaseoption; 2377 } 2378 2379 // public data member ------------------------------------------------ 2380 2381 /** 2382 * Option value for case folding comparison: 2383 * 2384 * <p>Comparison is case insensitive, strings are folded using default mappings defined in 2385 * Unicode data file CaseFolding.txt, before comparison. 2386 * 2387 * @stable ICU 2.4 2388 */ 2389 public static final int FOLD_CASE_DEFAULT = 0; 2390 2391 /** 2392 * Option value for case folding: 2393 * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I 2394 * and dotless i appropriately for Turkic languages (tr, az). 2395 * 2396 * <p>Comparison is case insensitive, strings are folded using modified mappings defined in 2397 * Unicode data file CaseFolding.txt, before comparison. 2398 * 2399 * @stable ICU 2.4 2400 * @see com.ibm.icu.lang.UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I 2401 */ 2402 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1; 2403 2404 // public methods ---------------------------------------------------- 2405 2406 // public setters ---------------------------------------------------- 2407 2408 /** 2409 * Sets the comparison mode to code point compare if flag is true. 
Otherwise comparison mode 2410 * is set to code unit compare 2411 * 2412 * @param flag True for code point compare, false for code unit compare 2413 * @stable ICU 2.4 2414 */ setCodePointCompare(boolean flag)2415 public void setCodePointCompare(boolean flag) { 2416 if (flag) { 2417 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER; 2418 } else { 2419 m_codePointCompare_ = 0; 2420 } 2421 } 2422 2423 /** 2424 * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise 2425 * case sensitive comparison mode if set to false. 2426 * 2427 * @param ignorecase True for case-insitive comparison, false for case sensitive comparison 2428 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only 2429 * when ignorecase is set to true. If ignorecase is false, this option is 2430 * ignored. 2431 * @see #FOLD_CASE_DEFAULT 2432 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2433 * @stable ICU 2.4 2434 */ setIgnoreCase(boolean ignorecase, int foldcaseoption)2435 public void setIgnoreCase(boolean ignorecase, int foldcaseoption) { 2436 m_ignoreCase_ = ignorecase; 2437 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { 2438 throw new IllegalArgumentException("Invalid fold case option"); 2439 } 2440 m_foldCase_ = foldcaseoption; 2441 } 2442 2443 // public getters ---------------------------------------------------- 2444 2445 /** 2446 * Checks if the comparison mode is code point compare. 2447 * 2448 * @return true for code point compare, false for code unit compare 2449 * @stable ICU 2.4 2450 */ getCodePointCompare()2451 public boolean getCodePointCompare() { 2452 return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER; 2453 } 2454 2455 /** 2456 * Checks if Comparator is in the case insensitive mode. 
2457 * 2458 * @return true if Comparator performs case insensitive comparison, false otherwise 2459 * @stable ICU 2.4 2460 */ getIgnoreCase()2461 public boolean getIgnoreCase() { 2462 return m_ignoreCase_; 2463 } 2464 2465 /** 2466 * Gets the fold case options set in Comparator to be used with case insensitive comparison. 2467 * 2468 * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I 2469 * @see #FOLD_CASE_DEFAULT 2470 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2471 * @stable ICU 2.4 2472 */ getIgnoreCaseOption()2473 public int getIgnoreCaseOption() { 2474 return m_foldCase_; 2475 } 2476 2477 // public other methods ---------------------------------------------- 2478 2479 /** 2480 * Compare two strings depending on the options selected during construction. 2481 * 2482 * @param a first source string. 2483 * @param b second source string. 2484 * @return 0 returned if a == b. If a < b, a negative value is returned. Otherwise if a > b, 2485 * a positive value is returned. 2486 * @exception ClassCastException thrown when either a or b is not a String object 2487 * @stable ICU 4.4 2488 */ compare(String a, String b)2489 public int compare(String a, String b) { 2490 if (a == b) { 2491 return 0; 2492 } 2493 if (a == null) { 2494 return -1; 2495 } 2496 if (b == null) { 2497 return 1; 2498 } 2499 2500 if (m_ignoreCase_) { 2501 return compareCaseInsensitive(a, b); 2502 } 2503 return compareCaseSensitive(a, b); 2504 } 2505 2506 // private data member ---------------------------------------------- 2507 2508 /** 2509 * Code unit comparison flag. True if code unit comparison is required. False if code point 2510 * comparison is required. 2511 */ 2512 private int m_codePointCompare_; 2513 2514 /** 2515 * Fold case comparison option. 
2516 */ 2517 private int m_foldCase_; 2518 2519 /** 2520 * Flag indicator if ignore case is to be used during comparison 2521 */ 2522 private boolean m_ignoreCase_; 2523 2524 /** 2525 * Code point order offset for surrogate characters 2526 */ 2527 private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800; 2528 2529 // private method --------------------------------------------------- 2530 2531 /** 2532 * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life 2533 * easier. 2534 * 2535 * @param s1 2536 * first string to compare 2537 * @param s2 2538 * second string to compare 2539 * @return -1 is s1 < s2, 0 if equals, 2540 */ compareCaseInsensitive(String s1, String s2)2541 private int compareCaseInsensitive(String s1, String s2) { 2542 return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_ 2543 | Normalizer.COMPARE_IGNORE_CASE); 2544 } 2545 2546 /** 2547 * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life 2548 * easier. 
2549 * 2550 * @param s1 2551 * first string to compare 2552 * @param s2 2553 * second string to compare 2554 * @return -1 is s1 < s2, 0 if equals, 2555 */ compareCaseSensitive(String s1, String s2)2556 private int compareCaseSensitive(String s1, String s2) { 2557 // compare identical prefixes - they do not need to be fixed up 2558 // limit1 = start1 + min(lenght1, length2) 2559 int length1 = s1.length(); 2560 int length2 = s2.length(); 2561 int minlength = length1; 2562 int result = 0; 2563 if (length1 < length2) { 2564 result = -1; 2565 } else if (length1 > length2) { 2566 result = 1; 2567 minlength = length2; 2568 } 2569 2570 char c1 = 0; 2571 char c2 = 0; 2572 int index = 0; 2573 for (; index < minlength; index++) { 2574 c1 = s1.charAt(index); 2575 c2 = s2.charAt(index); 2576 // check pseudo-limit 2577 if (c1 != c2) { 2578 break; 2579 } 2580 } 2581 2582 if (index == minlength) { 2583 return result; 2584 } 2585 2586 boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER; 2587 // if both values are in or above the surrogate range, fix them up 2588 if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE 2589 && codepointcompare) { 2590 // subtract 0x2800 from BMP code points to make them smaller 2591 // than supplementary ones 2592 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1))) 2593 || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) { 2594 // part of a surrogate pair, leave >=d800 2595 } else { 2596 // BMP code point - may be surrogate code point - make 2597 // < d800 2598 c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_; 2599 } 2600 2601 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1))) 2602 || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) { 2603 // part of a surrogate pair, leave >=d800 2604 } else { 2605 // BMP code point - may be surrogate code point 
- make <d800 2606 c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_; 2607 } 2608 } 2609 2610 // now c1 and c2 are in UTF-32-compatible order 2611 return c1 - c2; 2612 } 2613 } 2614 2615 /** 2616 * Utility for getting a code point from a CharSequence that contains exactly one code point. 2617 * @return the code point IF the string is non-null and consists of a single code point. 2618 * otherwise returns -1. 2619 * @param s to test 2620 * @draft ICU 54 2621 * @provisional This API might change or be removed in a future release. 2622 */ getSingleCodePoint(CharSequence s)2623 public static int getSingleCodePoint(CharSequence s) { 2624 if (s == null || s.length() == 0) { 2625 return -1; 2626 } else if (s.length() == 1) { 2627 return s.charAt(0); 2628 } else if (s.length() > 2) { 2629 return -1; 2630 } 2631 2632 // at this point, len = 2 2633 int cp = Character.codePointAt(s, 0); 2634 if (cp > 0xFFFF) { // is surrogate pair 2635 return cp; 2636 } 2637 return -1; 2638 } 2639 2640 /** 2641 * Utility for comparing a code point to a string without having to create a new string. Returns the same results 2642 * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if 2643 * <pre> 2644 * sc = new StringComparator(true,false,0); 2645 * fast = UTF16.compareCodePoint(codePoint, charSequence) 2646 * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString()) 2647 * </pre> 2648 * then 2649 * </pre> 2650 * Integer.signum(fast) == Integer.signum(slower) 2651 * </pre> 2652 * @param codePoint to test 2653 * @param s to test 2654 * @return equivalent of code point comparator comparing two strings. 2655 * @draft ICU 54 2656 * @provisional This API might change or be removed in a future release. 
2657 */ compareCodePoint(int codePoint, CharSequence s)2658 public static int compareCodePoint(int codePoint, CharSequence s) { 2659 if (s == null) { 2660 return 1; 2661 } 2662 final int strLen = s.length(); 2663 if (strLen == 0) { 2664 return 1; 2665 } 2666 int second = Character.codePointAt(s, 0); 2667 int diff = codePoint - second; 2668 if (diff != 0) { 2669 return diff; 2670 } 2671 return strLen == Character.charCount(codePoint) ? 0 : -1; 2672 } 2673 2674 // private data members ------------------------------------------------- 2675 2676 /** 2677 * Shift value for lead surrogate to form a supplementary character. 2678 */ 2679 private static final int LEAD_SURROGATE_SHIFT_ = 10; 2680 2681 /** 2682 * Mask to retrieve the significant value from a trail surrogate. 2683 */ 2684 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; 2685 2686 /** 2687 * Value that all lead surrogate starts with 2688 */ 2689 private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE 2690 - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); 2691 2692 // private methods ------------------------------------------------------ 2693 2694 /** 2695 * <p> 2696 * Converts argument code point and returns a String object representing the code point's value 2697 * in UTF16 format. 2698 * </p> 2699 * <p> 2700 * This method does not check for the validity of the codepoint, the results are not guaranteed 2701 * if a invalid codepoint is passed as argument. 2702 * </p> 2703 * <p> 2704 * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise. 
* </p>
*
* @param ch
*            code point to convert
* @return string representation of the code point
*/
private static String toString(int ch) {
    if (ch >= SUPPLEMENTARY_MIN_VALUE) {
        // Supplementary code point: emit the lead surrogate followed by the trail surrogate.
        return new StringBuilder()
                .append(getLeadSurrogate(ch))
                .append(getTrailSurrogate(ch))
                .toString();
    }
    // Everything below the supplementary range becomes a single char; no validity check
    // is made, the cast simply keeps the low 16 bits.
    return String.valueOf((char) ch);
}
} // end of the enclosing class
// eof