1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 package com.ibm.icu.impl; 4 5 import java.io.IOException; 6 import java.text.CharacterIterator; 7 import java.util.Locale; 8 9 import com.ibm.icu.lang.UCharacter; 10 import com.ibm.icu.lang.UCharacterCategory; 11 import com.ibm.icu.text.BreakIterator; 12 import com.ibm.icu.text.Edits; 13 import com.ibm.icu.util.ICUUncheckedIOException; 14 import com.ibm.icu.util.ULocale; 15 16 public final class CaseMapImpl { 17 /** 18 * Implementation of UCaseProps.ContextIterator, iterates over a String. 19 * See ustrcase.c/utf16_caseContextIterator(). 20 */ 21 public static final class StringContextIterator implements UCaseProps.ContextIterator { 22 /** 23 * Constructor. 24 * @param src String to iterate over. 25 */ StringContextIterator(CharSequence src)26 public StringContextIterator(CharSequence src) { 27 this.s=src; 28 limit=src.length(); 29 cpStart=cpLimit=index=0; 30 dir=0; 31 } 32 33 /** 34 * Constructor. 35 * @param src String to iterate over. 36 * @param cpStart Start index of the current code point. 37 * @param cpLimit Limit index of the current code point. 38 */ StringContextIterator(CharSequence src, int cpStart, int cpLimit)39 public StringContextIterator(CharSequence src, int cpStart, int cpLimit) { 40 s = src; 41 index = 0; 42 limit = src.length(); 43 this.cpStart = cpStart; 44 this.cpLimit = cpLimit; 45 dir = 0; 46 } 47 48 /** 49 * Set the iteration limit for nextCaseMapCP() to an index within the string. 50 * If the limit parameter is negative or past the string, then the 51 * string length is restored as the iteration limit. 52 * 53 * <p>This limit does not affect the next() function which always 54 * iterates to the very end of the string. 55 * 56 * @param lim The iteration limit. 
57 */ setLimit(int lim)58 public void setLimit(int lim) { 59 if(0<=lim && lim<=s.length()) { 60 limit=lim; 61 } else { 62 limit=s.length(); 63 } 64 } 65 66 /** 67 * Move to the iteration limit without fetching code points up to there. 68 */ moveToLimit()69 public void moveToLimit() { 70 cpStart=cpLimit=limit; 71 } 72 73 /** 74 * Iterate forward through the string to fetch the next code point 75 * to be case-mapped, and set the context indexes for it. 76 * 77 * <p>When the iteration limit is reached (and -1 is returned), 78 * getCPStart() will be at the iteration limit. 79 * 80 * <p>Iteration with next() does not affect the position for nextCaseMapCP(). 81 * 82 * @return The next code point to be case-mapped, or <0 when the iteration is done. 83 */ nextCaseMapCP()84 public int nextCaseMapCP() { 85 cpStart=cpLimit; 86 if(cpLimit<limit) { 87 int c=Character.codePointAt(s, cpLimit); 88 cpLimit+=Character.charCount(c); 89 return c; 90 } else { 91 return -1; 92 } 93 } 94 setCPStartAndLimit(int s, int l)95 public void setCPStartAndLimit(int s, int l) { 96 cpStart = s; 97 cpLimit = l; 98 dir = 0; 99 } 100 /** 101 * Returns the start of the code point that was last returned 102 * by nextCaseMapCP(). 103 */ getCPStart()104 public int getCPStart() { 105 return cpStart; 106 } 107 108 /** 109 * Returns the limit of the code point that was last returned 110 * by nextCaseMapCP(). 
111 */ getCPLimit()112 public int getCPLimit() { 113 return cpLimit; 114 } 115 getCPLength()116 public int getCPLength() { 117 return cpLimit-cpStart; 118 } 119 120 // implement UCaseProps.ContextIterator 121 // The following code is not used anywhere in this private class 122 @Override reset(int direction)123 public void reset(int direction) { 124 if(direction>0) { 125 /* reset for forward iteration */ 126 dir=1; 127 index=cpLimit; 128 } else if(direction<0) { 129 /* reset for backward iteration */ 130 dir=-1; 131 index=cpStart; 132 } else { 133 // not a valid direction 134 dir=0; 135 index=0; 136 } 137 } 138 139 @Override next()140 public int next() { 141 int c; 142 143 if(dir>0 && index<s.length()) { 144 c=Character.codePointAt(s, index); 145 index+=Character.charCount(c); 146 return c; 147 } else if(dir<0 && index>0) { 148 c=Character.codePointBefore(s, index); 149 index-=Character.charCount(c); 150 return c; 151 } 152 return -1; 153 } 154 155 // variables 156 protected CharSequence s; 157 protected int index, limit, cpStart, cpLimit; 158 protected int dir; // 0=initial state >0=forward <0=backward 159 } 160 161 public static final int TITLECASE_WHOLE_STRING = 0x20; 162 public static final int TITLECASE_SENTENCES = 0x40; 163 164 /** 165 * Bit mask for the titlecasing iterator options bit field. 166 * Currently only 3 out of 8 values are used: 167 * 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES. 168 * See stringoptions.h. 169 * @internal 170 */ 171 private static final int TITLECASE_ITERATOR_MASK = 0xe0; 172 173 public static final int TITLECASE_ADJUST_TO_CASED = 0x400; 174 175 /** 176 * Bit mask for the titlecasing index adjustment options bit set. 177 * Currently two bits are defined: 178 * TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED. 179 * See stringoptions.h. 
180 * @internal 181 */ 182 private static final int TITLECASE_ADJUSTMENT_MASK = 0x600; 183 addTitleAdjustmentOption(int options, int newOption)184 public static int addTitleAdjustmentOption(int options, int newOption) { 185 int adjOptions = options & TITLECASE_ADJUSTMENT_MASK; 186 if (adjOptions !=0 && adjOptions != newOption) { 187 throw new IllegalArgumentException("multiple titlecasing index adjustment options"); 188 } 189 return options | newOption; 190 } 191 192 private static final int LNS = 193 (1 << UCharacterCategory.UPPERCASE_LETTER) | 194 (1 << UCharacterCategory.LOWERCASE_LETTER) | 195 (1 << UCharacterCategory.TITLECASE_LETTER) | 196 // Not MODIFIER_LETTER: We count only cased modifier letters. 197 (1 << UCharacterCategory.OTHER_LETTER) | 198 199 (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) | 200 (1 << UCharacterCategory.LETTER_NUMBER) | 201 (1 << UCharacterCategory.OTHER_NUMBER) | 202 203 (1 << UCharacterCategory.MATH_SYMBOL) | 204 (1 << UCharacterCategory.CURRENCY_SYMBOL) | 205 (1 << UCharacterCategory.MODIFIER_SYMBOL) | 206 (1 << UCharacterCategory.OTHER_SYMBOL) | 207 208 (1 << UCharacterCategory.PRIVATE_USE); 209 isLNS(int c)210 private static boolean isLNS(int c) { 211 // Letter, number, symbol, 212 // or a private use code point because those are typically used as letters or numbers. 213 // Consider modifier letters only if they are cased. 
214 int gc = UCharacterProperty.INSTANCE.getType(c); 215 return ((1 << gc) & LNS) != 0 || 216 (gc == UCharacterCategory.MODIFIER_LETTER && 217 UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE); 218 } 219 addTitleIteratorOption(int options, int newOption)220 public static int addTitleIteratorOption(int options, int newOption) { 221 int iterOptions = options & TITLECASE_ITERATOR_MASK; 222 if (iterOptions !=0 && iterOptions != newOption) { 223 throw new IllegalArgumentException("multiple titlecasing iterator options"); 224 } 225 return options | newOption; 226 } 227 getTitleBreakIterator( Locale locale, int options, BreakIterator iter)228 public static BreakIterator getTitleBreakIterator( 229 Locale locale, int options, BreakIterator iter) { 230 options &= TITLECASE_ITERATOR_MASK; 231 if (options != 0 && iter != null) { 232 throw new IllegalArgumentException( 233 "titlecasing iterator option together with an explicit iterator"); 234 } 235 if (iter == null) { 236 switch (options) { 237 case 0: 238 iter = BreakIterator.getWordInstance(locale); 239 break; 240 case TITLECASE_WHOLE_STRING: 241 iter = new WholeStringBreakIterator(); 242 break; 243 case TITLECASE_SENTENCES: 244 iter = BreakIterator.getSentenceInstance(locale); 245 break; 246 default: 247 throw new IllegalArgumentException("unknown titlecasing iterator option"); 248 } 249 } 250 return iter; 251 } 252 getTitleBreakIterator( ULocale locale, int options, BreakIterator iter)253 public static BreakIterator getTitleBreakIterator( 254 ULocale locale, int options, BreakIterator iter) { 255 options &= TITLECASE_ITERATOR_MASK; 256 if (options != 0 && iter != null) { 257 throw new IllegalArgumentException( 258 "titlecasing iterator option together with an explicit iterator"); 259 } 260 if (iter == null) { 261 switch (options) { 262 case 0: 263 iter = BreakIterator.getWordInstance(locale); 264 break; 265 case TITLECASE_WHOLE_STRING: 266 iter = new WholeStringBreakIterator(); 267 break; 268 case TITLECASE_SENTENCES: 269 
iter = BreakIterator.getSentenceInstance(locale); 270 break; 271 default: 272 throw new IllegalArgumentException("unknown titlecasing iterator option"); 273 } 274 } 275 return iter; 276 } 277 278 /** 279 * Omit unchanged text when case-mapping with Edits. 280 */ 281 public static final int OMIT_UNCHANGED_TEXT = 0x4000; 282 283 private static final class WholeStringBreakIterator extends BreakIterator { 284 private int length; 285 notImplemented()286 private static void notImplemented() { 287 throw new UnsupportedOperationException("should not occur"); 288 } 289 290 @Override first()291 public int first() { 292 return 0; 293 } 294 295 @Override last()296 public int last() { 297 notImplemented(); 298 return 0; 299 } 300 301 @Override next(int n)302 public int next(int n) { 303 notImplemented(); 304 return 0; 305 } 306 307 @Override next()308 public int next() { 309 return length; 310 } 311 312 @Override previous()313 public int previous() { 314 notImplemented(); 315 return 0; 316 } 317 318 @Override following(int offset)319 public int following(int offset) { 320 notImplemented(); 321 return 0; 322 } 323 324 @Override current()325 public int current() { 326 notImplemented(); 327 return 0; 328 } 329 330 @Override getText()331 public CharacterIterator getText() { 332 notImplemented(); 333 return null; 334 } 335 336 @Override setText(CharacterIterator newText)337 public void setText(CharacterIterator newText) { 338 length = newText.getEndIndex(); 339 } 340 341 @Override setText(CharSequence newText)342 public void setText(CharSequence newText) { 343 length = newText.length(); 344 } 345 346 @Override setText(String newText)347 public void setText(String newText) { 348 length = newText.length(); 349 } 350 } 351 appendCodePoint(Appendable a, int c)352 private static int appendCodePoint(Appendable a, int c) throws IOException { 353 if (c <= Character.MAX_VALUE) { 354 a.append((char)c); 355 return 1; 356 } else { 357 a.append((char)(0xd7c0 + (c >> 10))); 358 
a.append((char)(Character.MIN_LOW_SURROGATE + (c & 0x3ff))); 359 return 2; 360 } 361 } 362 363 /** 364 * Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}. 365 * @throws IOException 366 */ appendResult(int result, Appendable dest, int cpLength, int options, Edits edits)367 private static void appendResult(int result, Appendable dest, 368 int cpLength, int options, Edits edits) throws IOException { 369 // Decode the result. 370 if (result < 0) { 371 // (not) original code point 372 if (edits != null) { 373 edits.addUnchanged(cpLength); 374 } 375 if ((options & OMIT_UNCHANGED_TEXT) != 0) { 376 return; 377 } 378 appendCodePoint(dest, ~result); 379 } else if (result <= UCaseProps.MAX_STRING_LENGTH) { 380 // The mapping has already been appended to result. 381 if (edits != null) { 382 edits.addReplace(cpLength, result); 383 } 384 } else { 385 // Append the single-code point mapping. 386 int length = appendCodePoint(dest, result); 387 if (edits != null) { 388 edits.addReplace(cpLength, length); 389 } 390 } 391 } 392 appendUnchanged(CharSequence src, int start, int length, Appendable dest, int options, Edits edits)393 private static final void appendUnchanged(CharSequence src, int start, int length, 394 Appendable dest, int options, Edits edits) throws IOException { 395 if (length > 0) { 396 if (edits != null) { 397 edits.addUnchanged(length); 398 } 399 if ((options & OMIT_UNCHANGED_TEXT) != 0) { 400 return; 401 } 402 dest.append(src, start, start + length); 403 } 404 } 405 applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits)406 private static String applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits) { 407 if (!edits.hasChanges()) { 408 return src.toString(); 409 } 410 StringBuilder result = new StringBuilder(src.length() + edits.lengthDelta()); 411 for (Edits.Iterator ei = edits.getCoarseIterator(); ei.next();) { 412 if (ei.hasChange()) { 413 int i = ei.replacementIndex(); 414 
result.append(replacementChars, i, i + ei.newLength()); 415 } else { 416 int i = ei.sourceIndex(); 417 result.append(src, i, i + ei.oldLength()); 418 } 419 } 420 return result.toString(); 421 } 422 423 private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie(); 424 425 /** 426 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account. 427 * caseLocale < 0: Case-folds [srcStart..srcLimit[. 428 */ internalToLower(int caseLocale, int options, CharSequence src, int srcStart, int srcLimit, StringContextIterator iter, Appendable dest, Edits edits)429 private static void internalToLower(int caseLocale, int options, 430 CharSequence src, int srcStart, int srcLimit, StringContextIterator iter, 431 Appendable dest, Edits edits) throws IOException { 432 byte[] latinToLower; 433 if (caseLocale == UCaseProps.LOC_ROOT || 434 (caseLocale >= 0 ? 435 !(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) : 436 (options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) { 437 latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL; 438 } else { 439 latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT; 440 } 441 int prev = srcStart; 442 int srcIndex = srcStart; 443 outerLoop: 444 for (;;) { 445 // fast path for simple cases 446 char lead; 447 for (;;) { 448 if (srcIndex >= srcLimit) { 449 break outerLoop; 450 } 451 lead = src.charAt(srcIndex); 452 int delta; 453 if (lead < UCaseProps.LatinCase.LONG_S) { 454 byte d = latinToLower[lead]; 455 if (d == UCaseProps.LatinCase.EXC) { break; } 456 ++srcIndex; 457 if (d == 0) { continue; } 458 delta = d; 459 } else if (lead >= 0xd800) { 460 break; // surrogate or higher 461 } else { 462 int props = CASE_TRIE.getFromU16SingleLead(lead); 463 if (UCaseProps.propsHasException(props)) { break; } 464 ++srcIndex; 465 if (!UCaseProps.isUpperOrTitleFromProps(props) || 466 (delta = UCaseProps.getDelta(props)) == 0) { 467 continue; 468 } 469 } 470 lead += delta; 471 
appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits); 472 dest.append(lead); 473 if (edits != null) { 474 edits.addReplace(1, 1); 475 } 476 prev = srcIndex; 477 } 478 // slow path 479 int cpStart = srcIndex++; 480 char trail; 481 int c; 482 if (Character.isHighSurrogate(lead) && srcIndex < srcLimit && 483 Character.isLowSurrogate(trail = src.charAt(srcIndex))) { 484 c = Character.toCodePoint(lead, trail); 485 ++srcIndex; 486 } else { 487 c = lead; 488 } 489 if (caseLocale >= 0) { 490 if (iter == null) { 491 iter = new StringContextIterator(src, cpStart, srcIndex); 492 } else { 493 iter.setCPStartAndLimit(cpStart, srcIndex); 494 } 495 c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale); 496 } else { 497 c = UCaseProps.INSTANCE.toFullFolding(c, dest, options); 498 } 499 if (c >= 0) { 500 appendUnchanged(src, prev, cpStart - prev, dest, options, edits); 501 appendResult(c, dest, srcIndex - cpStart, options, edits); 502 prev = srcIndex; 503 } 504 } 505 appendUnchanged(src, prev, srcIndex - prev, dest, options, edits); 506 } 507 internalToUpper(int caseLocale, int options, CharSequence src, Appendable dest, Edits edits)508 private static void internalToUpper(int caseLocale, int options, 509 CharSequence src, Appendable dest, Edits edits) throws IOException { 510 StringContextIterator iter = null; 511 byte[] latinToUpper; 512 if (caseLocale == UCaseProps.LOC_TURKISH) { 513 latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR; 514 } else { 515 latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL; 516 } 517 int prev = 0; 518 int srcIndex = 0; 519 int srcLength = src.length(); 520 outerLoop: 521 for (;;) { 522 // fast path for simple cases 523 char lead; 524 for (;;) { 525 if (srcIndex >= srcLength) { 526 break outerLoop; 527 } 528 lead = src.charAt(srcIndex); 529 int delta; 530 if (lead < UCaseProps.LatinCase.LONG_S) { 531 byte d = latinToUpper[lead]; 532 if (d == UCaseProps.LatinCase.EXC) { break; } 533 ++srcIndex; 534 if (d == 0) { continue; } 535 
delta = d; 536 } else if (lead >= 0xd800) { 537 break; // surrogate or higher 538 } else { 539 int props = CASE_TRIE.getFromU16SingleLead(lead); 540 if (UCaseProps.propsHasException(props)) { break; } 541 ++srcIndex; 542 if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER || 543 (delta = UCaseProps.getDelta(props)) == 0) { 544 continue; 545 } 546 } 547 lead += delta; 548 appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits); 549 dest.append(lead); 550 if (edits != null) { 551 edits.addReplace(1, 1); 552 } 553 prev = srcIndex; 554 } 555 // slow path 556 int cpStart = srcIndex++; 557 char trail; 558 int c; 559 if (Character.isHighSurrogate(lead) && srcIndex < srcLength && 560 Character.isLowSurrogate(trail = src.charAt(srcIndex))) { 561 c = Character.toCodePoint(lead, trail); 562 ++srcIndex; 563 } else { 564 c = lead; 565 } 566 if (iter == null) { 567 iter = new StringContextIterator(src, cpStart, srcIndex); 568 } else { 569 iter.setCPStartAndLimit(cpStart, srcIndex); 570 } 571 c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale); 572 if (c >= 0) { 573 appendUnchanged(src, prev, cpStart - prev, dest, options, edits); 574 appendResult(c, dest, srcIndex - cpStart, options, edits); 575 prev = srcIndex; 576 } 577 } 578 appendUnchanged(src, prev, srcIndex - prev, dest, options, edits); 579 } 580 toLower(int caseLocale, int options, CharSequence src)581 public static String toLower(int caseLocale, int options, CharSequence src) { 582 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 583 if (src.length() == 0) { 584 return src.toString(); 585 } 586 // Collect and apply only changes. 587 // Good if no or few changes. Bad (slow) if many changes. 
588 Edits edits = new Edits(); 589 StringBuilder replacementChars = toLower( 590 caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); 591 return applyEdits(src, replacementChars, edits); 592 } else { 593 return toLower(caseLocale, options, src, 594 new StringBuilder(src.length()), null).toString(); 595 } 596 } 597 toLower(int caseLocale, int options, CharSequence src, A dest, Edits edits)598 public static <A extends Appendable> A toLower(int caseLocale, int options, 599 CharSequence src, A dest, Edits edits) { 600 try { 601 if (edits != null) { 602 edits.reset(); 603 } 604 internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits); 605 return dest; 606 } catch (IOException e) { 607 throw new ICUUncheckedIOException(e); 608 } 609 } 610 toUpper(int caseLocale, int options, CharSequence src)611 public static String toUpper(int caseLocale, int options, CharSequence src) { 612 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 613 if (src.length() == 0) { 614 return src.toString(); 615 } 616 // Collect and apply only changes. 617 // Good if no or few changes. Bad (slow) if many changes. 
618 Edits edits = new Edits(); 619 StringBuilder replacementChars = toUpper( 620 caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); 621 return applyEdits(src, replacementChars, edits); 622 } else { 623 return toUpper(caseLocale, options, src, 624 new StringBuilder(src.length()), null).toString(); 625 } 626 } 627 toUpper(int caseLocale, int options, CharSequence src, A dest, Edits edits)628 public static <A extends Appendable> A toUpper(int caseLocale, int options, 629 CharSequence src, A dest, Edits edits) { 630 try { 631 if (edits != null) { 632 edits.reset(); 633 } 634 if (caseLocale == UCaseProps.LOC_GREEK) { 635 return GreekUpper.toUpper(options, src, dest, edits); 636 } 637 internalToUpper(caseLocale, options, src, dest, edits); 638 return dest; 639 } catch (IOException e) { 640 throw new ICUUncheckedIOException(e); 641 } 642 } 643 toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src)644 public static String toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src) { 645 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 646 if (src.length() == 0) { 647 return src.toString(); 648 } 649 // Collect and apply only changes. 650 // Good if no or few changes. Bad (slow) if many changes. 
651 Edits edits = new Edits(); 652 StringBuilder replacementChars = toTitle( 653 caseLocale, options | OMIT_UNCHANGED_TEXT, iter, src, 654 new StringBuilder(), edits); 655 return applyEdits(src, replacementChars, edits); 656 } else { 657 return toTitle(caseLocale, options, iter, src, 658 new StringBuilder(src.length()), null).toString(); 659 } 660 } 661 toTitle( int caseLocale, int options, BreakIterator titleIter, CharSequence src, A dest, Edits edits)662 public static <A extends Appendable> A toTitle( 663 int caseLocale, int options, BreakIterator titleIter, 664 CharSequence src, A dest, Edits edits) { 665 try { 666 if (edits != null) { 667 edits.reset(); 668 } 669 670 /* set up local variables */ 671 StringContextIterator iter = new StringContextIterator(src); 672 int srcLength = src.length(); 673 int prev=0; 674 boolean isFirstIndex=true; 675 676 /* titlecasing loop */ 677 while(prev<srcLength) { 678 /* find next index where to titlecase */ 679 int index; 680 if(isFirstIndex) { 681 isFirstIndex=false; 682 index=titleIter.first(); 683 } else { 684 index=titleIter.next(); 685 } 686 if(index==BreakIterator.DONE || index>srcLength) { 687 index=srcLength; 688 } 689 690 /* 691 * Segment [prev..index[ into 3 parts: 692 * a) skipped characters (copy as-is) [prev..titleStart[ 693 * b) first letter (titlecase) [titleStart..titleLimit[ 694 * c) subsequent characters (lowercase) [titleLimit..index[ 695 */ 696 if(prev<index) { 697 // Find and copy skipped characters [prev..titleStart[ 698 int titleStart=prev; 699 iter.setLimit(index); 700 int c=iter.nextCaseMapCP(); 701 if ((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0) { 702 // Adjust the titlecasing index to the next cased character, 703 // or to the next letter/number/symbol/private use. 704 // Stop with titleStart<titleLimit<=index 705 // if there is a character to be titlecased, 706 // or else stop with titleStart==titleLimit==index. 
707 boolean toCased = (options&CaseMapImpl.TITLECASE_ADJUST_TO_CASED) != 0; 708 while ((toCased ? 709 UCaseProps.NONE==UCaseProps.INSTANCE.getType(c) : 710 !CaseMapImpl.isLNS(c)) && 711 (c=iter.nextCaseMapCP())>=0) {} 712 // If c<0 then we have only uncased characters in [prev..index[ 713 // and stopped with titleStart==titleLimit==index. 714 titleStart=iter.getCPStart(); 715 if (prev < titleStart) { 716 appendUnchanged(src, prev, titleStart-prev, dest, options, edits); 717 } 718 } 719 720 if(titleStart<index) { 721 int titleLimit=iter.getCPLimit(); 722 // titlecase c which is from [titleStart..titleLimit[ 723 c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale); 724 appendResult(c, dest, iter.getCPLength(), options, edits); 725 726 // Special case Dutch IJ titlecasing 727 if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) { 728 char c1 = src.charAt(titleStart); 729 if ((c1 == 'i' || c1 == 'I')) { 730 char c2 = src.charAt(titleStart+1); 731 if (c2 == 'j') { 732 dest.append('J'); 733 if (edits != null) { 734 edits.addReplace(1, 1); 735 } 736 c = iter.nextCaseMapCP(); 737 titleLimit++; 738 assert c == c2; 739 assert titleLimit == iter.getCPLimit(); 740 } else if (c2 == 'J') { 741 // Keep the capital J from getting lowercased. 742 appendUnchanged(src, titleStart + 1, 1, dest, options, edits); 743 c = iter.nextCaseMapCP(); 744 titleLimit++; 745 assert c == c2; 746 assert titleLimit == iter.getCPLimit(); 747 } 748 } 749 } 750 751 // lowercase [titleLimit..index[ 752 if(titleLimit<index) { 753 if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) { 754 // Normal operation: Lowercase the rest of the word. 755 internalToLower(caseLocale, options, 756 src, titleLimit, index, iter, dest, edits); 757 } else { 758 // Optionally just copy the rest of the word unchanged. 
759 appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits); 760 } 761 iter.moveToLimit(); 762 } 763 } 764 } 765 766 prev=index; 767 } 768 return dest; 769 } catch (IOException e) { 770 throw new ICUUncheckedIOException(e); 771 } 772 } 773 fold(int options, CharSequence src)774 public static String fold(int options, CharSequence src) { 775 if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { 776 if (src.length() == 0) { 777 return src.toString(); 778 } 779 // Collect and apply only changes. 780 // Good if no or few changes. Bad (slow) if many changes. 781 Edits edits = new Edits(); 782 StringBuilder replacementChars = fold( 783 options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits); 784 return applyEdits(src, replacementChars, edits); 785 } else { 786 return fold(options, src, new StringBuilder(src.length()), null).toString(); 787 } 788 } 789 fold(int options, CharSequence src, A dest, Edits edits)790 public static <A extends Appendable> A fold(int options, 791 CharSequence src, A dest, Edits edits) { 792 try { 793 if (edits != null) { 794 edits.reset(); 795 } 796 internalToLower(-1, options, src, 0, src.length(), null, dest, edits); 797 return dest; 798 } catch (IOException e) { 799 throw new ICUUncheckedIOException(e); 800 } 801 } 802 803 private static final class GreekUpper { 804 // Data bits. 805 private static final int UPPER_MASK = 0x3ff; 806 private static final int HAS_VOWEL = 0x1000; 807 private static final int HAS_YPOGEGRAMMENI = 0x2000; 808 private static final int HAS_ACCENT = 0x4000; 809 private static final int HAS_DIALYTIKA = 0x8000; 810 // Further bits during data building and processing, not stored in the data map. 
811 private static final int HAS_COMBINING_DIALYTIKA = 0x10000; 812 private static final int HAS_OTHER_GREEK_DIACRITIC = 0x20000; 813 814 private static final int HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT; 815 private static final int HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA = 816 HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA; 817 private static final int HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA; 818 819 // State bits. 820 private static final int AFTER_CASED = 1; 821 private static final int AFTER_VOWEL_WITH_ACCENT = 2; 822 823 // Data generated by prototype code, see 824 // http://site.icu-project.org/design/case/greek-upper 825 // TODO: Move this data into ucase.icu. 826 private static final char[] data0370 = { 827 // U+0370..03FF 828 0x0370, // Ͱ 829 0x0370, // ͱ 830 0x0372, // Ͳ 831 0x0372, // ͳ 832 0, 833 0, 834 0x0376, // Ͷ 835 0x0376, // ͷ 836 0, 837 0, 838 0x037A, // ͺ 839 0x03FD, // ͻ 840 0x03FE, // ͼ 841 0x03FF, // ͽ 842 0, 843 0x037F, // Ϳ 844 0, 845 0, 846 0, 847 0, 848 0, 849 0, 850 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά 851 0, 852 0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ 853 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή 854 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί 855 0, 856 0x039F | HAS_VOWEL | HAS_ACCENT, // Ό 857 0, 858 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ 859 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ 860 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ 861 0x0391 | HAS_VOWEL, // Α 862 0x0392, // Β 863 0x0393, // Γ 864 0x0394, // Δ 865 0x0395 | HAS_VOWEL, // Ε 866 0x0396, // Ζ 867 0x0397 | HAS_VOWEL, // Η 868 0x0398, // Θ 869 0x0399 | HAS_VOWEL, // Ι 870 0x039A, // Κ 871 0x039B, // Λ 872 0x039C, // Μ 873 0x039D, // Ν 874 0x039E, // Ξ 875 0x039F | HAS_VOWEL, // Ο 876 0x03A0, // Π 877 0x03A1, // Ρ 878 0, 879 0x03A3, // Σ 880 0x03A4, // Τ 881 0x03A5 | HAS_VOWEL, // Υ 882 0x03A6, // Φ 883 0x03A7, // Χ 884 0x03A8, // Ψ 885 0x03A9 | HAS_VOWEL, // Ω 886 0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // Ϊ 887 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // Ϋ 888 0x0391 | 
HAS_VOWEL | HAS_ACCENT, // ά 889 0x0395 | HAS_VOWEL | HAS_ACCENT, // έ 890 0x0397 | HAS_VOWEL | HAS_ACCENT, // ή 891 0x0399 | HAS_VOWEL | HAS_ACCENT, // ί 892 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ 893 0x0391 | HAS_VOWEL, // α 894 0x0392, // β 895 0x0393, // γ 896 0x0394, // δ 897 0x0395 | HAS_VOWEL, // ε 898 0x0396, // ζ 899 0x0397 | HAS_VOWEL, // η 900 0x0398, // θ 901 0x0399 | HAS_VOWEL, // ι 902 0x039A, // κ 903 0x039B, // λ 904 0x039C, // μ 905 0x039D, // ν 906 0x039E, // ξ 907 0x039F | HAS_VOWEL, // ο 908 0x03A0, // π 909 0x03A1, // ρ 910 0x03A3, // ς 911 0x03A3, // σ 912 0x03A4, // τ 913 0x03A5 | HAS_VOWEL, // υ 914 0x03A6, // φ 915 0x03A7, // χ 916 0x03A8, // ψ 917 0x03A9 | HAS_VOWEL, // ω 918 0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // ϊ 919 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // ϋ 920 0x039F | HAS_VOWEL | HAS_ACCENT, // ό 921 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ 922 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ 923 0x03CF, // Ϗ 924 0x0392, // ϐ 925 0x0398, // ϑ 926 0x03D2, // ϒ 927 0x03D2 | HAS_ACCENT, // ϓ 928 0x03D2 | HAS_DIALYTIKA, // ϔ 929 0x03A6, // ϕ 930 0x03A0, // ϖ 931 0x03CF, // ϗ 932 0x03D8, // Ϙ 933 0x03D8, // ϙ 934 0x03DA, // Ϛ 935 0x03DA, // ϛ 936 0x03DC, // Ϝ 937 0x03DC, // ϝ 938 0x03DE, // Ϟ 939 0x03DE, // ϟ 940 0x03E0, // Ϡ 941 0x03E0, // ϡ 942 0, 943 0, 944 0, 945 0, 946 0, 947 0, 948 0, 949 0, 950 0, 951 0, 952 0, 953 0, 954 0, 955 0, 956 0x039A, // ϰ 957 0x03A1, // ϱ 958 0x03F9, // ϲ 959 0x037F, // ϳ 960 0x03F4, // ϴ 961 0x0395 | HAS_VOWEL, // ϵ 962 0, 963 0x03F7, // Ϸ 964 0x03F7, // ϸ 965 0x03F9, // Ϲ 966 0x03FA, // Ϻ 967 0x03FA, // ϻ 968 0x03FC, // ϼ 969 0x03FD, // Ͻ 970 0x03FE, // Ͼ 971 0x03FF, // Ͽ 972 }; 973 974 private static final char[] data1F00 = { 975 // U+1F00..1FFF 976 0x0391 | HAS_VOWEL, // ἀ 977 0x0391 | HAS_VOWEL, // ἁ 978 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἂ 979 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἃ 980 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἄ 981 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἅ 982 0x0391 | HAS_VOWEL | 
HAS_ACCENT, // ἆ 983 0x0391 | HAS_VOWEL | HAS_ACCENT, // ἇ 984 0x0391 | HAS_VOWEL, // Ἀ 985 0x0391 | HAS_VOWEL, // Ἁ 986 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἂ 987 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἃ 988 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἄ 989 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἅ 990 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἆ 991 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἇ 992 0x0395 | HAS_VOWEL, // ἐ 993 0x0395 | HAS_VOWEL, // ἑ 994 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἒ 995 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἓ 996 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἔ 997 0x0395 | HAS_VOWEL | HAS_ACCENT, // ἕ 998 0, 999 0, 1000 0x0395 | HAS_VOWEL, // Ἐ 1001 0x0395 | HAS_VOWEL, // Ἑ 1002 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἒ 1003 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἓ 1004 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἔ 1005 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἕ 1006 0, 1007 0, 1008 0x0397 | HAS_VOWEL, // ἠ 1009 0x0397 | HAS_VOWEL, // ἡ 1010 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἢ 1011 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἣ 1012 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἤ 1013 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἥ 1014 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἦ 1015 0x0397 | HAS_VOWEL | HAS_ACCENT, // ἧ 1016 0x0397 | HAS_VOWEL, // Ἠ 1017 0x0397 | HAS_VOWEL, // Ἡ 1018 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἢ 1019 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἣ 1020 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἤ 1021 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἥ 1022 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἦ 1023 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἧ 1024 0x0399 | HAS_VOWEL, // ἰ 1025 0x0399 | HAS_VOWEL, // ἱ 1026 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἲ 1027 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἳ 1028 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἴ 1029 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἵ 1030 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἶ 1031 0x0399 | HAS_VOWEL | HAS_ACCENT, // ἷ 1032 0x0399 | HAS_VOWEL, // Ἰ 1033 0x0399 | HAS_VOWEL, // Ἱ 1034 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἲ 1035 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἳ 1036 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἴ 
1037 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἵ 1038 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἶ 1039 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἷ 1040 0x039F | HAS_VOWEL, // ὀ 1041 0x039F | HAS_VOWEL, // ὁ 1042 0x039F | HAS_VOWEL | HAS_ACCENT, // ὂ 1043 0x039F | HAS_VOWEL | HAS_ACCENT, // ὃ 1044 0x039F | HAS_VOWEL | HAS_ACCENT, // ὄ 1045 0x039F | HAS_VOWEL | HAS_ACCENT, // ὅ 1046 0, 1047 0, 1048 0x039F | HAS_VOWEL, // Ὀ 1049 0x039F | HAS_VOWEL, // Ὁ 1050 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὂ 1051 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὃ 1052 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὄ 1053 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὅ 1054 0, 1055 0, 1056 0x03A5 | HAS_VOWEL, // ὐ 1057 0x03A5 | HAS_VOWEL, // ὑ 1058 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὒ 1059 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὓ 1060 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὔ 1061 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὕ 1062 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὖ 1063 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὗ 1064 0, 1065 0x03A5 | HAS_VOWEL, // Ὑ 1066 0, 1067 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὓ 1068 0, 1069 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὕ 1070 0, 1071 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὗ 1072 0x03A9 | HAS_VOWEL, // ὠ 1073 0x03A9 | HAS_VOWEL, // ὡ 1074 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὢ 1075 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὣ 1076 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὤ 1077 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὥ 1078 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὦ 1079 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὧ 1080 0x03A9 | HAS_VOWEL, // Ὠ 1081 0x03A9 | HAS_VOWEL, // Ὡ 1082 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὢ 1083 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὣ 1084 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὤ 1085 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὥ 1086 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὦ 1087 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὧ 1088 0x0391 | HAS_VOWEL | HAS_ACCENT, // ὰ 1089 0x0391 | HAS_VOWEL | HAS_ACCENT, // ά 1090 0x0395 | HAS_VOWEL | HAS_ACCENT, // ὲ 1091 0x0395 | HAS_VOWEL | HAS_ACCENT, // έ 1092 0x0397 | HAS_VOWEL | HAS_ACCENT, // ὴ 1093 0x0397 | 
HAS_VOWEL | HAS_ACCENT, // ή 1094 0x0399 | HAS_VOWEL | HAS_ACCENT, // ὶ 1095 0x0399 | HAS_VOWEL | HAS_ACCENT, // ί 1096 0x039F | HAS_VOWEL | HAS_ACCENT, // ὸ 1097 0x039F | HAS_VOWEL | HAS_ACCENT, // ό 1098 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὺ 1099 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ 1100 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὼ 1101 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ 1102 0, 1103 0, 1104 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾀ 1105 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾁ 1106 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾂ 1107 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾃ 1108 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾄ 1109 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾅ 1110 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾆ 1111 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾇ 1112 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾈ 1113 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾉ 1114 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾊ 1115 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾋ 1116 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾌ 1117 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾍ 1118 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾎ 1119 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾏ 1120 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾐ 1121 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾑ 1122 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾒ 1123 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾓ 1124 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾔ 1125 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾕ 1126 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾖ 1127 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾗ 1128 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾘ 1129 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾙ 1130 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾚ 1131 0x0397 | 
HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾛ 1132 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾜ 1133 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾝ 1134 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾞ 1135 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾟ 1136 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾠ 1137 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾡ 1138 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾢ 1139 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾣ 1140 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾤ 1141 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾥ 1142 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾦ 1143 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾧ 1144 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾨ 1145 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾩ 1146 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾪ 1147 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾫ 1148 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾬ 1149 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾭ 1150 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾮ 1151 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾯ 1152 0x0391 | HAS_VOWEL, // ᾰ 1153 0x0391 | HAS_VOWEL, // ᾱ 1154 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾲ 1155 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾳ 1156 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾴ 1157 0, 1158 0x0391 | HAS_VOWEL | HAS_ACCENT, // ᾶ 1159 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾷ 1160 0x0391 | HAS_VOWEL, // Ᾰ 1161 0x0391 | HAS_VOWEL, // Ᾱ 1162 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ὰ 1163 0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά 1164 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾼ 1165 0, 1166 0x0399 | HAS_VOWEL, // ι 1167 0, 1168 0, 1169 0, 1170 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῂ 1171 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῃ 1172 0x0397 | 
HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῄ 1173 0, 1174 0x0397 | HAS_VOWEL | HAS_ACCENT, // ῆ 1175 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῇ 1176 0x0395 | HAS_VOWEL | HAS_ACCENT, // Ὲ 1177 0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ 1178 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ὴ 1179 0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή 1180 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῌ 1181 0, 1182 0, 1183 0, 1184 0x0399 | HAS_VOWEL, // ῐ 1185 0x0399 | HAS_VOWEL, // ῑ 1186 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῒ 1187 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ 1188 0, 1189 0, 1190 0x0399 | HAS_VOWEL | HAS_ACCENT, // ῖ 1191 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῗ 1192 0x0399 | HAS_VOWEL, // Ῐ 1193 0x0399 | HAS_VOWEL, // Ῑ 1194 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ὶ 1195 0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί 1196 0, 1197 0, 1198 0, 1199 0, 1200 0x03A5 | HAS_VOWEL, // ῠ 1201 0x03A5 | HAS_VOWEL, // ῡ 1202 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῢ 1203 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ 1204 0x03A1, // ῤ 1205 0x03A1, // ῥ 1206 0x03A5 | HAS_VOWEL | HAS_ACCENT, // ῦ 1207 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῧ 1208 0x03A5 | HAS_VOWEL, // Ῠ 1209 0x03A5 | HAS_VOWEL, // Ῡ 1210 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὺ 1211 0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ 1212 0x03A1, // Ῥ 1213 0, 1214 0, 1215 0, 1216 0, 1217 0, 1218 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῲ 1219 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῳ 1220 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῴ 1221 0, 1222 0x03A9 | HAS_VOWEL | HAS_ACCENT, // ῶ 1223 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῷ 1224 0x039F | HAS_VOWEL | HAS_ACCENT, // Ὸ 1225 0x039F | HAS_VOWEL | HAS_ACCENT, // Ό 1226 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὼ 1227 0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ 1228 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῼ 1229 0, 1230 0, 1231 0, 1232 }; 1233 1234 // U+2126 Ohm sign 1235 private static 
final char data2126 = 0x03A9 | HAS_VOWEL;  // Ω

        /**
         * Looks up the Greek letter data for a code point.
         *
         * <p>U+0370..U+03FF is served from data0370, U+1F00..U+1FFF from data1F00,
         * and U+2126 from data2126.
         *
         * @param c The code point to look up.
         * @return The table entry for c, or 0 if c lies outside the ranges above.
         */
        private static final int getLetterData(int c) {
            if (0x370 <= c && c <= 0x3ff) {
                return data0370[c - 0x370];
            }
            if (0x1f00 <= c && c <= 0x1fff) {
                return data1F00[c - 0x1f00];
            }
            if (c == 0x2126) {
                return data2126;
            }
            return 0;
        }

        /**
         * Returns a non-zero value for each of the Greek combining diacritics
         * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
         * plus some perispomeni look-alikes.
         *
         * @param c The code point (or UTF-16 code unit) to classify.
         * @return A combination of HAS_ACCENT, HAS_COMBINING_DIALYTIKA, HAS_YPOGEGRAMMENI
         *         or HAS_OTHER_GREEK_DIACRITIC; 0 if c is not one of the recognized marks.
         */
        private static final int getDiacriticData(int c) {
            switch (c) {
            case 0x0308:  // dialytika = diaeresis
                return HAS_COMBINING_DIALYTIKA;
            case 0x0344:  // dialytika tonos
                return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
            case 0x0345:  // ypogegrammeni = iota subscript
                return HAS_YPOGEGRAMMENI;
            case 0x0300:  // varia
            case 0x0301:  // tonos = oxia
            case 0x0302:  // circumflex can look like perispomeni
            case 0x0303:  // tilde can look like perispomeni
            case 0x0311:  // inverted breve can look like perispomeni
            case 0x0342:  // perispomeni
                return HAS_ACCENT;
            case 0x0304:  // macron
            case 0x0306:  // breve
            case 0x0313:  // comma above
            case 0x0314:  // reversed comma above
            case 0x0343:  // koronis
                return HAS_OTHER_GREEK_DIACRITIC;
            default:
                return 0;
            }
        }

        /**
         * Looks ahead from index i, skipping case-ignorable code points, and reports
         * whether the first code point that is not case-ignorable is cased.
         *
         * @param s The text to inspect.
         * @param i The index at which to start looking.
         * @return true if a cased letter follows; false if an uncased, non-ignorable
         *         code point or the end of the text is reached first.
         */
        private static boolean isFollowedByCasedLetter(CharSequence s, int i) {
            for (int pos = i; pos < s.length();) {
                int cp = Character.codePointAt(s, pos);
                int caseType = UCaseProps.INSTANCE.getTypeOrIgnorable(cp);
                if ((caseType & UCaseProps.IGNORABLE) == 0) {
                    // First non-ignorable code point decides the answer.
                    return caseType != UCaseProps.NONE;
                }
                pos += Character.charCount(cp);
            }
            return false;  // Ran off the end: not followed by a cased letter.
        }

        /**
         * Greek string uppercasing with a state machine.
         * Probably simpler than a stateless function that has to figure out complex context-before
         * for each character.
         * TODO: Try to re-consolidate one way or another with the non-Greek function.
         *
         * <p>The state carried from one code point to the next records whether the previous
         * (non-ignorable) code point was cased (AFTER_CASED) and whether the previous letter
         * was a vowel with an accent but without a dialytika (AFTER_VOWEL_WITH_ACCENT).
         *
         * <p>Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8).
         *
         * @param options Option bits; OMIT_UNCHANGED_TEXT is tested here, and the value is
         *                passed on to appendResult() for non-Greek code points.
         * @param src The text to uppercase.
         * @param dest Receives the output.
         * @param edits If not null, receives a replace/unchanged record for each Greek letter
         *              sequence handled here.
         * @return dest
         * @throws IOException if appending to dest fails.
         */
        private static <A extends Appendable> A toUpper(int options,
                CharSequence src, A dest, Edits edits) throws IOException {
            final boolean omitUnchanged = (options & OMIT_UNCHANGED_TEXT) != 0;
            int state = 0;
            int i = 0;
            while (i < src.length()) {
                final int cp = Character.codePointAt(src, i);
                int nextIndex = i + Character.charCount(cp);

                // Work out the "after cased letter" context for the following code point.
                int nextState = 0;
                final int caseType = UCaseProps.INSTANCE.getTypeOrIgnorable(cp);
                if ((caseType & UCaseProps.IGNORABLE) != 0) {
                    // Case-ignorable: carry the previous answer forward.
                    nextState = state & AFTER_CASED;
                } else if (caseType != UCaseProps.NONE) {
                    // Cased.
                    nextState = AFTER_CASED;
                }

                int data = getLetterData(cp);
                if (data <= 0) {
                    // Not a Greek letter known to the tables: use the regular full mapping.
                    int result = UCaseProps.INSTANCE.toFullUpper(
                            cp, null, dest, UCaseProps.LOC_GREEK);
                    appendResult(result, dest, nextIndex - i, options, edits);
                } else {
                    int upper = data & UPPER_MASK;
                    // U+0399 is capital iota, U+03A5 is capital upsilon.
                    final boolean iotaOrUpsilon = upper == '\u0399' || upper == '\u03A5';

                    // Add a dialytika to this iota or ypsilon vowel
                    // if we removed a tonos from the previous vowel,
                    // and that previous vowel did not also have (or gain) a dialytika.
                    // Adding one only to the final vowel in a longer sequence
                    // (which does not occur in normal writing) would require lookahead.
                    // Set the same flag as for preserving an existing dialytika.
                    if ((data & HAS_VOWEL) != 0
                            && (state & AFTER_VOWEL_WITH_ACCENT) != 0
                            && iotaOrUpsilon) {
                        data |= HAS_DIALYTIKA;
                    }

                    // Each ypogegrammeni becomes a trailing, spacing, capital iota.
                    int iotaCount = (data & HAS_YPOGEGRAMMENI) != 0 ? 1 : 0;

                    // Absorb the combining Greek diacritics that follow this letter.
                    for (; nextIndex < src.length(); ++nextIndex) {
                        int marks = getDiacriticData(src.charAt(nextIndex));
                        if (marks == 0) {
                            break;  // not a Greek diacritic
                        }
                        data |= marks;
                        if ((marks & HAS_YPOGEGRAMMENI) != 0) {
                            ++iotaCount;
                        }
                    }

                    if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                        nextState |= AFTER_VOWEL_WITH_ACCENT;
                    }

                    // Map according to Greek rules.
                    // Keep disjunctive "or" (a lone accented eta, U+0397) with (only) a tonos.
                    // We use the same "word boundary" conditions as for the Final_Sigma test.
                    final boolean standaloneAccentedEta =
                            upper == '\u0397'
                            && (data & HAS_ACCENT) != 0
                            && iotaCount == 0
                            && (state & AFTER_CASED) == 0
                            && !isFollowedByCasedLetter(src, nextIndex);
                    boolean addTonos = false;
                    if (standaloneAccentedEta) {
                        // NOTE(review): nextIndex starts at i + charCount and only grows, so
                        // this comparison looks like it can never be true and the tonos is
                        // always emitted as a combining U+0301 -- confirm against the C++ code.
                        if (i == nextIndex) {
                            upper = '\u0389';  // Preserve the precomposed form (eta with tonos).
                        } else {
                            addTonos = true;
                        }
                    } else if ((data & HAS_DIALYTIKA) != 0 && iotaOrUpsilon) {
                        // Preserve a vowel with dialytika in precomposed form if it exists:
                        // U+03AA (iota with dialytika) or U+03AB (upsilon with dialytika).
                        upper = (upper == '\u0399') ? '\u03AA' : '\u03AB';
                        data &= ~HAS_EITHER_DIALYTIKA;
                    }

                    boolean write;
                    if (edits == null && !omitUnchanged) {
                        write = true;  // common, simple usage
                    } else {
                        // Find out first whether we are changing the text.
                        boolean changed = src.charAt(i) != upper || iotaCount > 0;
                        int expected = i + 1;
                        if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                            if (expected >= nextIndex || src.charAt(expected) != 0x308) {
                                changed = true;
                            }
                            ++expected;
                        }
                        if (addTonos) {
                            if (expected >= nextIndex || src.charAt(expected) != 0x301) {
                                changed = true;
                            }
                            ++expected;
                        }
                        final int oldLength = nextIndex - i;
                        final int newLength = (expected - i) + iotaCount;
                        if (oldLength != newLength) {
                            changed = true;
                        }
                        if (changed) {
                            if (edits != null) {
                                edits.addReplace(oldLength, newLength);
                            }
                            write = true;
                        } else {
                            if (edits != null) {
                                edits.addUnchanged(oldLength);
                            }
                            // Write unchanged text?
                            write = !omitUnchanged;
                        }
                    }

                    if (write) {
                        dest.append((char) upper);
                        if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                            dest.append('\u0308');  // restore or add a dialytika
                        }
                        if (addTonos) {
                            dest.append('\u0301');
                        }
                        for (int k = 0; k < iotaCount; ++k) {
                            dest.append('\u0399');  // capital iota for each ypogegrammeni
                        }
                    }
                }

                i = nextIndex;
                state = nextState;
            }
            return dest;
        }
    }
}