1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 package com.ibm.icu.impl;
4 
5 import java.io.IOException;
6 import java.text.CharacterIterator;
7 import java.util.Locale;
8 
9 import com.ibm.icu.lang.UCharacter;
10 import com.ibm.icu.lang.UCharacterCategory;
11 import com.ibm.icu.text.BreakIterator;
12 import com.ibm.icu.text.Edits;
13 import com.ibm.icu.util.ICUUncheckedIOException;
14 import com.ibm.icu.util.ULocale;
15 
16 public final class CaseMapImpl {
17     /**
18      * Implementation of UCaseProps.ContextIterator, iterates over a String.
19      * See ustrcase.c/utf16_caseContextIterator().
20      */
21     public static final class StringContextIterator implements UCaseProps.ContextIterator {
22         /**
23          * Constructor.
24          * @param src String to iterate over.
25          */
StringContextIterator(CharSequence src)26         public StringContextIterator(CharSequence src) {
27             this.s=src;
28             limit=src.length();
29             cpStart=cpLimit=index=0;
30             dir=0;
31         }
32 
33         /**
34          * Constructor.
35          * @param src String to iterate over.
36          * @param cpStart Start index of the current code point.
37          * @param cpLimit Limit index of the current code point.
38          */
StringContextIterator(CharSequence src, int cpStart, int cpLimit)39         public StringContextIterator(CharSequence src, int cpStart, int cpLimit) {
40             s = src;
41             index = 0;
42             limit = src.length();
43             this.cpStart = cpStart;
44             this.cpLimit = cpLimit;
45             dir = 0;
46         }
47 
48         /**
49          * Set the iteration limit for nextCaseMapCP() to an index within the string.
50          * If the limit parameter is negative or past the string, then the
51          * string length is restored as the iteration limit.
52          *
53          * <p>This limit does not affect the next() function which always
54          * iterates to the very end of the string.
55          *
56          * @param lim The iteration limit.
57          */
setLimit(int lim)58         public void setLimit(int lim) {
59             if(0<=lim && lim<=s.length()) {
60                 limit=lim;
61             } else {
62                 limit=s.length();
63             }
64         }
65 
66         /**
67          * Move to the iteration limit without fetching code points up to there.
68          */
moveToLimit()69         public void moveToLimit() {
70             cpStart=cpLimit=limit;
71         }
72 
73         /**
74          * Iterate forward through the string to fetch the next code point
75          * to be case-mapped, and set the context indexes for it.
76          *
77          * <p>When the iteration limit is reached (and -1 is returned),
78          * getCPStart() will be at the iteration limit.
79          *
80          * <p>Iteration with next() does not affect the position for nextCaseMapCP().
81          *
82          * @return The next code point to be case-mapped, or <0 when the iteration is done.
83          */
nextCaseMapCP()84         public int nextCaseMapCP() {
85             cpStart=cpLimit;
86             if(cpLimit<limit) {
87                 int c=Character.codePointAt(s, cpLimit);
88                 cpLimit+=Character.charCount(c);
89                 return c;
90             } else {
91                 return -1;
92             }
93         }
94 
setCPStartAndLimit(int s, int l)95         public void setCPStartAndLimit(int s, int l) {
96             cpStart = s;
97             cpLimit = l;
98             dir = 0;
99         }
100         /**
101          * Returns the start of the code point that was last returned
102          * by nextCaseMapCP().
103          */
getCPStart()104         public int getCPStart() {
105             return cpStart;
106         }
107 
108         /**
109          * Returns the limit of the code point that was last returned
110          * by nextCaseMapCP().
111          */
getCPLimit()112         public int getCPLimit() {
113             return cpLimit;
114         }
115 
getCPLength()116         public int getCPLength() {
117             return cpLimit-cpStart;
118         }
119 
120         // implement UCaseProps.ContextIterator
121         // The following code is not used anywhere in this private class
122         @Override
reset(int direction)123         public void reset(int direction) {
124             if(direction>0) {
125                 /* reset for forward iteration */
126                 dir=1;
127                 index=cpLimit;
128             } else if(direction<0) {
129                 /* reset for backward iteration */
130                 dir=-1;
131                 index=cpStart;
132             } else {
133                 // not a valid direction
134                 dir=0;
135                 index=0;
136             }
137         }
138 
139         @Override
next()140         public int next() {
141             int c;
142 
143             if(dir>0 && index<s.length()) {
144                 c=Character.codePointAt(s, index);
145                 index+=Character.charCount(c);
146                 return c;
147             } else if(dir<0 && index>0) {
148                 c=Character.codePointBefore(s, index);
149                 index-=Character.charCount(c);
150                 return c;
151             }
152             return -1;
153         }
154 
155         // variables
156         protected CharSequence s;
157         protected int index, limit, cpStart, cpLimit;
158         protected int dir; // 0=initial state  >0=forward  <0=backward
159     }
160 
161     public static final int TITLECASE_WHOLE_STRING = 0x20;
162     public static final int TITLECASE_SENTENCES = 0x40;
163 
164     /**
165      * Bit mask for the titlecasing iterator options bit field.
166      * Currently only 3 out of 8 values are used:
167      * 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES.
168      * See stringoptions.h.
169      * @internal
170      */
171     private static final int TITLECASE_ITERATOR_MASK = 0xe0;
172 
173     public static final int TITLECASE_ADJUST_TO_CASED = 0x400;
174 
175     /**
176      * Bit mask for the titlecasing index adjustment options bit set.
177      * Currently two bits are defined:
178      * TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED.
179      * See stringoptions.h.
180      * @internal
181      */
182     private static final int TITLECASE_ADJUSTMENT_MASK = 0x600;
183 
addTitleAdjustmentOption(int options, int newOption)184     public static int addTitleAdjustmentOption(int options, int newOption) {
185         int adjOptions = options & TITLECASE_ADJUSTMENT_MASK;
186         if (adjOptions !=0 && adjOptions != newOption) {
187             throw new IllegalArgumentException("multiple titlecasing index adjustment options");
188         }
189         return options | newOption;
190     }
191 
192     private static final int LNS =
193             (1 << UCharacterCategory.UPPERCASE_LETTER) |
194             (1 << UCharacterCategory.LOWERCASE_LETTER) |
195             (1 << UCharacterCategory.TITLECASE_LETTER) |
196             // Not MODIFIER_LETTER: We count only cased modifier letters.
197             (1 << UCharacterCategory.OTHER_LETTER) |
198 
199             (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) |
200             (1 << UCharacterCategory.LETTER_NUMBER) |
201             (1 << UCharacterCategory.OTHER_NUMBER) |
202 
203             (1 << UCharacterCategory.MATH_SYMBOL) |
204             (1 << UCharacterCategory.CURRENCY_SYMBOL) |
205             (1 << UCharacterCategory.MODIFIER_SYMBOL) |
206             (1 << UCharacterCategory.OTHER_SYMBOL) |
207 
208             (1 << UCharacterCategory.PRIVATE_USE);
209 
isLNS(int c)210     private static boolean isLNS(int c) {
211         // Letter, number, symbol,
212         // or a private use code point because those are typically used as letters or numbers.
213         // Consider modifier letters only if they are cased.
214         int gc = UCharacterProperty.INSTANCE.getType(c);
215         return ((1 << gc) & LNS) != 0 ||
216                 (gc == UCharacterCategory.MODIFIER_LETTER &&
217                     UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE);
218     }
219 
addTitleIteratorOption(int options, int newOption)220     public static int addTitleIteratorOption(int options, int newOption) {
221         int iterOptions = options & TITLECASE_ITERATOR_MASK;
222         if (iterOptions !=0 && iterOptions != newOption) {
223             throw new IllegalArgumentException("multiple titlecasing iterator options");
224         }
225         return options | newOption;
226     }
227 
getTitleBreakIterator( Locale locale, int options, BreakIterator iter)228     public static BreakIterator getTitleBreakIterator(
229             Locale locale, int options, BreakIterator iter) {
230         options &= TITLECASE_ITERATOR_MASK;
231         if (options != 0 && iter != null) {
232             throw new IllegalArgumentException(
233                     "titlecasing iterator option together with an explicit iterator");
234         }
235         if (iter == null) {
236             switch (options) {
237             case 0:
238                 iter = BreakIterator.getWordInstance(locale);
239                 break;
240             case TITLECASE_WHOLE_STRING:
241                 iter = new WholeStringBreakIterator();
242                 break;
243             case TITLECASE_SENTENCES:
244                 iter = BreakIterator.getSentenceInstance(locale);
245                 break;
246             default:
247                 throw new IllegalArgumentException("unknown titlecasing iterator option");
248             }
249         }
250         return iter;
251     }
252 
getTitleBreakIterator( ULocale locale, int options, BreakIterator iter)253     public static BreakIterator getTitleBreakIterator(
254             ULocale locale, int options, BreakIterator iter) {
255         options &= TITLECASE_ITERATOR_MASK;
256         if (options != 0 && iter != null) {
257             throw new IllegalArgumentException(
258                     "titlecasing iterator option together with an explicit iterator");
259         }
260         if (iter == null) {
261             switch (options) {
262             case 0:
263                 iter = BreakIterator.getWordInstance(locale);
264                 break;
265             case TITLECASE_WHOLE_STRING:
266                 iter = new WholeStringBreakIterator();
267                 break;
268             case TITLECASE_SENTENCES:
269                 iter = BreakIterator.getSentenceInstance(locale);
270                 break;
271             default:
272                 throw new IllegalArgumentException("unknown titlecasing iterator option");
273             }
274         }
275         return iter;
276     }
277 
278     /**
279      * Omit unchanged text when case-mapping with Edits.
280      */
281     public static final int OMIT_UNCHANGED_TEXT = 0x4000;
282 
283     private static final class WholeStringBreakIterator extends BreakIterator {
284         private int length;
285 
notImplemented()286         private static void notImplemented() {
287             throw new UnsupportedOperationException("should not occur");
288         }
289 
290         @Override
first()291         public int first() {
292             return 0;
293         }
294 
295         @Override
last()296         public int last() {
297             notImplemented();
298             return 0;
299         }
300 
301         @Override
next(int n)302         public int next(int n) {
303             notImplemented();
304             return 0;
305         }
306 
307         @Override
next()308         public int next() {
309             return length;
310         }
311 
312         @Override
previous()313         public int previous() {
314             notImplemented();
315             return 0;
316         }
317 
318         @Override
following(int offset)319         public int following(int offset) {
320             notImplemented();
321             return 0;
322         }
323 
324         @Override
current()325         public int current() {
326             notImplemented();
327             return 0;
328         }
329 
330         @Override
getText()331         public CharacterIterator getText() {
332             notImplemented();
333             return null;
334         }
335 
336         @Override
setText(CharacterIterator newText)337         public void setText(CharacterIterator newText) {
338             length = newText.getEndIndex();
339         }
340 
341         @Override
setText(CharSequence newText)342         public void setText(CharSequence newText) {
343             length = newText.length();
344         }
345 
346         @Override
setText(String newText)347         public void setText(String newText) {
348             length = newText.length();
349         }
350     }
351 
appendCodePoint(Appendable a, int c)352     private static int appendCodePoint(Appendable a, int c) throws IOException {
353         if (c <= Character.MAX_VALUE) {
354             a.append((char)c);
355             return 1;
356         } else {
357             a.append((char)(0xd7c0 + (c >> 10)));
358             a.append((char)(Character.MIN_LOW_SURROGATE + (c & 0x3ff)));
359             return 2;
360         }
361     }
362 
363     /**
364      * Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}.
365      * @throws IOException
366      */
appendResult(int result, Appendable dest, int cpLength, int options, Edits edits)367     private static void appendResult(int result, Appendable dest,
368             int cpLength, int options, Edits edits) throws IOException {
369         // Decode the result.
370         if (result < 0) {
371             // (not) original code point
372             if (edits != null) {
373                 edits.addUnchanged(cpLength);
374             }
375             if ((options & OMIT_UNCHANGED_TEXT) != 0) {
376                 return;
377             }
378             appendCodePoint(dest, ~result);
379         } else if (result <= UCaseProps.MAX_STRING_LENGTH) {
380             // The mapping has already been appended to result.
381             if (edits != null) {
382                 edits.addReplace(cpLength, result);
383             }
384         } else {
385             // Append the single-code point mapping.
386             int length = appendCodePoint(dest, result);
387             if (edits != null) {
388                 edits.addReplace(cpLength, length);
389             }
390         }
391     }
392 
appendUnchanged(CharSequence src, int start, int length, Appendable dest, int options, Edits edits)393     private static final void appendUnchanged(CharSequence src, int start, int length,
394             Appendable dest, int options, Edits edits) throws IOException {
395         if (length > 0) {
396             if (edits != null) {
397                 edits.addUnchanged(length);
398             }
399             if ((options & OMIT_UNCHANGED_TEXT) != 0) {
400                 return;
401             }
402             dest.append(src, start, start + length);
403         }
404     }
405 
applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits)406     private static String applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits) {
407         if (!edits.hasChanges()) {
408             return src.toString();
409         }
410         StringBuilder result = new StringBuilder(src.length() + edits.lengthDelta());
411         for (Edits.Iterator ei = edits.getCoarseIterator(); ei.next();) {
412             if (ei.hasChange()) {
413                 int i = ei.replacementIndex();
414                 result.append(replacementChars, i, i + ei.newLength());
415             } else {
416                 int i = ei.sourceIndex();
417                 result.append(src, i, i + ei.oldLength());
418             }
419         }
420         return result.toString();
421     }
422 
423     private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie();
424 
425     /**
426      * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
427      * caseLocale < 0: Case-folds [srcStart..srcLimit[.
428      */
internalToLower(int caseLocale, int options, CharSequence src, int srcStart, int srcLimit, StringContextIterator iter, Appendable dest, Edits edits)429     private static void internalToLower(int caseLocale, int options,
430             CharSequence src, int srcStart, int srcLimit, StringContextIterator iter,
431             Appendable dest, Edits edits) throws IOException {
432         byte[] latinToLower;
433         if (caseLocale == UCaseProps.LOC_ROOT ||
434                 (caseLocale >= 0 ?
435                     !(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) :
436                     (options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) {
437             latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL;
438         } else {
439             latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT;
440         }
441         int prev = srcStart;
442         int srcIndex = srcStart;
443         outerLoop:
444         for (;;) {
445             // fast path for simple cases
446             char lead;
447             for (;;) {
448                 if (srcIndex >= srcLimit) {
449                     break outerLoop;
450                 }
451                 lead = src.charAt(srcIndex);
452                 int delta;
453                 if (lead < UCaseProps.LatinCase.LONG_S) {
454                     byte d = latinToLower[lead];
455                     if (d == UCaseProps.LatinCase.EXC) { break; }
456                     ++srcIndex;
457                     if (d == 0) { continue; }
458                     delta = d;
459                 } else if (lead >= 0xd800) {
460                     break;  // surrogate or higher
461                 } else {
462                     int props = CASE_TRIE.getFromU16SingleLead(lead);
463                     if (UCaseProps.propsHasException(props)) { break; }
464                     ++srcIndex;
465                     if (!UCaseProps.isUpperOrTitleFromProps(props) ||
466                             (delta = UCaseProps.getDelta(props)) == 0) {
467                         continue;
468                     }
469                 }
470                 lead += delta;
471                 appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
472                 dest.append(lead);
473                 if (edits != null) {
474                     edits.addReplace(1, 1);
475                 }
476                 prev = srcIndex;
477             }
478             // slow path
479             int cpStart = srcIndex++;
480             char trail;
481             int c;
482             if (Character.isHighSurrogate(lead) && srcIndex < srcLimit &&
483                     Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
484                 c = Character.toCodePoint(lead, trail);
485                 ++srcIndex;
486             } else {
487                 c = lead;
488             }
489             if (caseLocale >= 0) {
490                 if (iter == null) {
491                     iter = new StringContextIterator(src, cpStart, srcIndex);
492                 } else {
493                     iter.setCPStartAndLimit(cpStart, srcIndex);
494                 }
495                 c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
496             } else {
497                 c = UCaseProps.INSTANCE.toFullFolding(c, dest, options);
498             }
499             if (c >= 0) {
500                 appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
501                 appendResult(c, dest, srcIndex - cpStart, options, edits);
502                 prev = srcIndex;
503             }
504         }
505         appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
506     }
507 
internalToUpper(int caseLocale, int options, CharSequence src, Appendable dest, Edits edits)508     private static void internalToUpper(int caseLocale, int options,
509             CharSequence src, Appendable dest, Edits edits) throws IOException {
510         StringContextIterator iter = null;
511         byte[] latinToUpper;
512         if (caseLocale == UCaseProps.LOC_TURKISH) {
513             latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR;
514         } else {
515             latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL;
516         }
517         int prev = 0;
518         int srcIndex = 0;
519         int srcLength = src.length();
520         outerLoop:
521         for (;;) {
522             // fast path for simple cases
523             char lead;
524             for (;;) {
525                 if (srcIndex >= srcLength) {
526                     break outerLoop;
527                 }
528                 lead = src.charAt(srcIndex);
529                 int delta;
530                 if (lead < UCaseProps.LatinCase.LONG_S) {
531                     byte d = latinToUpper[lead];
532                     if (d == UCaseProps.LatinCase.EXC) { break; }
533                     ++srcIndex;
534                     if (d == 0) { continue; }
535                     delta = d;
536                 } else if (lead >= 0xd800) {
537                     break;  // surrogate or higher
538                 } else {
539                     int props = CASE_TRIE.getFromU16SingleLead(lead);
540                     if (UCaseProps.propsHasException(props)) { break; }
541                     ++srcIndex;
542                     if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER ||
543                             (delta = UCaseProps.getDelta(props)) == 0) {
544                         continue;
545                     }
546                 }
547                 lead += delta;
548                 appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
549                 dest.append(lead);
550                 if (edits != null) {
551                     edits.addReplace(1, 1);
552                 }
553                 prev = srcIndex;
554             }
555             // slow path
556             int cpStart = srcIndex++;
557             char trail;
558             int c;
559             if (Character.isHighSurrogate(lead) && srcIndex < srcLength &&
560                     Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
561                 c = Character.toCodePoint(lead, trail);
562                 ++srcIndex;
563             } else {
564                 c = lead;
565             }
566             if (iter == null) {
567                 iter = new StringContextIterator(src, cpStart, srcIndex);
568             } else {
569                 iter.setCPStartAndLimit(cpStart, srcIndex);
570             }
571             c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale);
572             if (c >= 0) {
573                 appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
574                 appendResult(c, dest, srcIndex - cpStart, options, edits);
575                 prev = srcIndex;
576             }
577         }
578         appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
579     }
580 
toLower(int caseLocale, int options, CharSequence src)581     public static String toLower(int caseLocale, int options, CharSequence src) {
582         if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
583             if (src.length() == 0) {
584                 return src.toString();
585             }
586             // Collect and apply only changes.
587             // Good if no or few changes. Bad (slow) if many changes.
588             Edits edits = new Edits();
589             StringBuilder replacementChars = toLower(
590                     caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits);
591             return applyEdits(src, replacementChars, edits);
592         } else {
593             return toLower(caseLocale, options, src,
594                     new StringBuilder(src.length()), null).toString();
595         }
596     }
597 
toLower(int caseLocale, int options, CharSequence src, A dest, Edits edits)598     public static <A extends Appendable> A toLower(int caseLocale, int options,
599             CharSequence src, A dest, Edits edits) {
600         try {
601             if (edits != null) {
602                 edits.reset();
603             }
604             internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits);
605             return dest;
606         } catch (IOException e) {
607             throw new ICUUncheckedIOException(e);
608         }
609     }
610 
toUpper(int caseLocale, int options, CharSequence src)611     public static String toUpper(int caseLocale, int options, CharSequence src) {
612         if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
613             if (src.length() == 0) {
614                 return src.toString();
615             }
616             // Collect and apply only changes.
617             // Good if no or few changes. Bad (slow) if many changes.
618             Edits edits = new Edits();
619             StringBuilder replacementChars = toUpper(
620                     caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits);
621             return applyEdits(src, replacementChars, edits);
622         } else {
623             return toUpper(caseLocale, options, src,
624                     new StringBuilder(src.length()), null).toString();
625         }
626     }
627 
toUpper(int caseLocale, int options, CharSequence src, A dest, Edits edits)628     public static <A extends Appendable> A toUpper(int caseLocale, int options,
629             CharSequence src, A dest, Edits edits) {
630         try {
631             if (edits != null) {
632                 edits.reset();
633             }
634             if (caseLocale == UCaseProps.LOC_GREEK) {
635                 return GreekUpper.toUpper(options, src, dest, edits);
636             }
637             internalToUpper(caseLocale, options, src, dest, edits);
638             return dest;
639         } catch (IOException e) {
640             throw new ICUUncheckedIOException(e);
641         }
642     }
643 
toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src)644     public static String toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src) {
645         if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
646             if (src.length() == 0) {
647                 return src.toString();
648             }
649             // Collect and apply only changes.
650             // Good if no or few changes. Bad (slow) if many changes.
651             Edits edits = new Edits();
652             StringBuilder replacementChars = toTitle(
653                     caseLocale, options | OMIT_UNCHANGED_TEXT, iter, src,
654                     new StringBuilder(), edits);
655             return applyEdits(src, replacementChars, edits);
656         } else {
657             return toTitle(caseLocale, options, iter, src,
658                     new StringBuilder(src.length()), null).toString();
659         }
660     }
661 
toTitle( int caseLocale, int options, BreakIterator titleIter, CharSequence src, A dest, Edits edits)662     public static <A extends Appendable> A toTitle(
663             int caseLocale, int options, BreakIterator titleIter,
664             CharSequence src, A dest, Edits edits) {
665         try {
666             if (edits != null) {
667                 edits.reset();
668             }
669 
670             /* set up local variables */
671             StringContextIterator iter = new StringContextIterator(src);
672             int srcLength = src.length();
673             int prev=0;
674             boolean isFirstIndex=true;
675 
676             /* titlecasing loop */
677             while(prev<srcLength) {
678                 /* find next index where to titlecase */
679                 int index;
680                 if(isFirstIndex) {
681                     isFirstIndex=false;
682                     index=titleIter.first();
683                 } else {
684                     index=titleIter.next();
685                 }
686                 if(index==BreakIterator.DONE || index>srcLength) {
687                     index=srcLength;
688                 }
689 
690                 /*
691                  * Segment [prev..index[ into 3 parts:
692                  * a) skipped characters (copy as-is) [prev..titleStart[
693                  * b) first letter (titlecase)              [titleStart..titleLimit[
694                  * c) subsequent characters (lowercase)                 [titleLimit..index[
695                  */
696                 if(prev<index) {
697                     // Find and copy skipped characters [prev..titleStart[
698                     int titleStart=prev;
699                     iter.setLimit(index);
700                     int c=iter.nextCaseMapCP();
701                     if ((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
702                         // Adjust the titlecasing index to the next cased character,
703                         // or to the next letter/number/symbol/private use.
704                         // Stop with titleStart<titleLimit<=index
705                         // if there is a character to be titlecased,
706                         // or else stop with titleStart==titleLimit==index.
707                         boolean toCased = (options&CaseMapImpl.TITLECASE_ADJUST_TO_CASED) != 0;
708                         while ((toCased ?
709                                     UCaseProps.NONE==UCaseProps.INSTANCE.getType(c) :
710                                         !CaseMapImpl.isLNS(c)) &&
711                                 (c=iter.nextCaseMapCP())>=0) {}
712                         // If c<0 then we have only uncased characters in [prev..index[
713                         // and stopped with titleStart==titleLimit==index.
714                         titleStart=iter.getCPStart();
715                         if (prev < titleStart) {
716                             appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
717                         }
718                     }
719 
720                     if(titleStart<index) {
721                         int titleLimit=iter.getCPLimit();
722                         // titlecase c which is from [titleStart..titleLimit[
723                         c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale);
724                         appendResult(c, dest, iter.getCPLength(), options, edits);
725 
726                         // Special case Dutch IJ titlecasing
727                         if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) {
728                             char c1 = src.charAt(titleStart);
729                             if ((c1 == 'i' || c1 == 'I')) {
730                                 char c2 = src.charAt(titleStart+1);
731                                 if (c2 == 'j') {
732                                     dest.append('J');
733                                     if (edits != null) {
734                                         edits.addReplace(1, 1);
735                                     }
736                                     c = iter.nextCaseMapCP();
737                                     titleLimit++;
738                                     assert c == c2;
739                                     assert titleLimit == iter.getCPLimit();
740                                 } else if (c2 == 'J') {
741                                     // Keep the capital J from getting lowercased.
742                                     appendUnchanged(src, titleStart + 1, 1, dest, options, edits);
743                                     c = iter.nextCaseMapCP();
744                                     titleLimit++;
745                                     assert c == c2;
746                                     assert titleLimit == iter.getCPLimit();
747                                 }
748                             }
749                         }
750 
751                         // lowercase [titleLimit..index[
752                         if(titleLimit<index) {
753                             if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) {
754                                 // Normal operation: Lowercase the rest of the word.
755                                 internalToLower(caseLocale, options,
756                                         src, titleLimit, index, iter, dest, edits);
757                             } else {
758                                 // Optionally just copy the rest of the word unchanged.
759                                 appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits);
760                             }
761                             iter.moveToLimit();
762                         }
763                     }
764                 }
765 
766                 prev=index;
767             }
768             return dest;
769         } catch (IOException e) {
770             throw new ICUUncheckedIOException(e);
771         }
772     }
773 
fold(int options, CharSequence src)774     public static String fold(int options, CharSequence src) {
775         if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
776             if (src.length() == 0) {
777                 return src.toString();
778             }
779             // Collect and apply only changes.
780             // Good if no or few changes. Bad (slow) if many changes.
781             Edits edits = new Edits();
782             StringBuilder replacementChars = fold(
783                     options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits);
784             return applyEdits(src, replacementChars, edits);
785         } else {
786             return fold(options, src, new StringBuilder(src.length()), null).toString();
787         }
788     }
789 
fold(int options, CharSequence src, A dest, Edits edits)790     public static <A extends Appendable> A fold(int options,
791             CharSequence src, A dest, Edits edits) {
792         try {
793             if (edits != null) {
794                 edits.reset();
795             }
796             internalToLower(-1, options, src, 0, src.length(), null, dest, edits);
797             return dest;
798         } catch (IOException e) {
799             throw new ICUUncheckedIOException(e);
800         }
801     }
802 
803     private static final class GreekUpper {
        // Data bits.
        // Each per-letter data value packs the uppercase base letter into its low
        // 10 bits (extracted with UPPER_MASK) and property flags into higher bits.
        // The four HAS_* flags below fit into 16 bits and are therefore usable
        // in the char[] data tables.
        private static final int UPPER_MASK = 0x3ff;
        private static final int HAS_VOWEL = 0x1000;
        private static final int HAS_YPOGEGRAMMENI = 0x2000;
        private static final int HAS_ACCENT = 0x4000;
        private static final int HAS_DIALYTIKA = 0x8000;
        // Further bits during data building and processing, not stored in the data map.
        // (They are above the 16-bit range of a char; getDiacriticData() returns them
        // and toUpper() ORs them into its working copy of the letter data.)
        private static final int HAS_COMBINING_DIALYTIKA = 0x10000;
        private static final int HAS_OTHER_GREEK_DIACRITIC = 0x20000;

        // Convenience combinations of the flags above, used as masks in toUpper().
        private static final int HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
        private static final int HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
                HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
        private static final int HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;

        // State bits.
        // Carried by toUpper() from one letter to the next:
        // AFTER_CASED: the preceding context (skipping case-ignorables) is a cased letter.
        // AFTER_VOWEL_WITH_ACCENT: the previous letter was a vowel with an accent
        // and without a dialytika.
        private static final int AFTER_CASED = 1;
        private static final int AFTER_VOWEL_WITH_ACCENT = 2;
822 
        // Data generated by prototype code, see
        // http://site.icu-project.org/design/case/greek-upper
        // TODO: Move this data into ucase.icu.
        //
        // Layout: one entry per code point, indexed by (code point - 0x370);
        // see getLetterData(). The low 10 bits of an entry (UPPER_MASK) are the
        // uppercase base letter, the remaining bits are HAS_* flags describing the
        // original letter. An entry of 0 means "no Greek-specific data"; toUpper()
        // then falls back to the generic full uppercase mapping.
        private static final char[] data0370 = {
            // U+0370..03FF
            0x0370,  // Ͱ
            0x0370,  // ͱ
            0x0372,  // Ͳ
            0x0372,  // ͳ
            0,
            0,
            0x0376,  // Ͷ
            0x0376,  // ͷ
            0,
            0,
            0x037A,  // ͺ
            0x03FD,  // ͻ
            0x03FE,  // ͼ
            0x03FF,  // ͽ
            0,
            0x037F,  // Ϳ
            0,
            0,
            0,
            0,
            0,
            0,
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ά
            0,
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Έ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ή
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ί
            0,
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ό
            0,
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ύ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ώ
            0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΐ
            0x0391 | HAS_VOWEL,  // Α
            0x0392,  // Β
            0x0393,  // Γ
            0x0394,  // Δ
            0x0395 | HAS_VOWEL,  // Ε
            0x0396,  // Ζ
            0x0397 | HAS_VOWEL,  // Η
            0x0398,  // Θ
            0x0399 | HAS_VOWEL,  // Ι
            0x039A,  // Κ
            0x039B,  // Λ
            0x039C,  // Μ
            0x039D,  // Ν
            0x039E,  // Ξ
            0x039F | HAS_VOWEL,  // Ο
            0x03A0,  // Π
            0x03A1,  // Ρ
            0,
            0x03A3,  // Σ
            0x03A4,  // Τ
            0x03A5 | HAS_VOWEL,  // Υ
            0x03A6,  // Φ
            0x03A7,  // Χ
            0x03A8,  // Ψ
            0x03A9 | HAS_VOWEL,  // Ω
            0x0399 | HAS_VOWEL | HAS_DIALYTIKA,  // Ϊ
            0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,  // Ϋ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ά
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // έ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ή
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ί
            0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΰ
            0x0391 | HAS_VOWEL,  // α
            0x0392,  // β
            0x0393,  // γ
            0x0394,  // δ
            0x0395 | HAS_VOWEL,  // ε
            0x0396,  // ζ
            0x0397 | HAS_VOWEL,  // η
            0x0398,  // θ
            0x0399 | HAS_VOWEL,  // ι
            0x039A,  // κ
            0x039B,  // λ
            0x039C,  // μ
            0x039D,  // ν
            0x039E,  // ξ
            0x039F | HAS_VOWEL,  // ο
            0x03A0,  // π
            0x03A1,  // ρ
            0x03A3,  // ς
            0x03A3,  // σ
            0x03A4,  // τ
            0x03A5 | HAS_VOWEL,  // υ
            0x03A6,  // φ
            0x03A7,  // χ
            0x03A8,  // ψ
            0x03A9 | HAS_VOWEL,  // ω
            0x0399 | HAS_VOWEL | HAS_DIALYTIKA,  // ϊ
            0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,  // ϋ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ό
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ύ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ώ
            0x03CF,  // Ϗ
            0x0392,  // ϐ
            0x0398,  // ϑ
            0x03D2,  // ϒ
            0x03D2 | HAS_ACCENT,  // ϓ
            0x03D2 | HAS_DIALYTIKA,  // ϔ
            0x03A6,  // ϕ
            0x03A0,  // ϖ
            0x03CF,  // ϗ
            0x03D8,  // Ϙ
            0x03D8,  // ϙ
            0x03DA,  // Ϛ
            0x03DA,  // ϛ
            0x03DC,  // Ϝ
            0x03DC,  // ϝ
            0x03DE,  // Ϟ
            0x03DE,  // ϟ
            0x03E0,  // Ϡ
            0x03E0,  // ϡ
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0x039A,  // ϰ
            0x03A1,  // ϱ
            0x03F9,  // ϲ
            0x037F,  // ϳ
            0x03F4,  // ϴ
            0x0395 | HAS_VOWEL,  // ϵ
            0,
            0x03F7,  // Ϸ
            0x03F7,  // ϸ
            0x03F9,  // Ϲ
            0x03FA,  // Ϻ
            0x03FA,  // ϻ
            0x03FC,  // ϼ
            0x03FD,  // Ͻ
            0x03FE,  // Ͼ
            0x03FF,  // Ͽ
        };
973 
        // Letter data for the U+1F00..1FFF block, indexed by (code point - 0x1f00);
        // see getLetterData(). Each entry holds the uppercase base letter in its
        // low 10 bits (UPPER_MASK) plus HAS_* flags describing the original letter.
        // An entry of 0 means "no Greek-specific data".
        private static final char[] data1F00 = {
            // U+1F00..1FFF
            0x0391 | HAS_VOWEL,  // ἀ
            0x0391 | HAS_VOWEL,  // ἁ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἂ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἃ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἄ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἅ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἆ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ἇ
            0x0391 | HAS_VOWEL,  // Ἀ
            0x0391 | HAS_VOWEL,  // Ἁ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἂ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἃ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἄ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἅ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἆ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ἇ
            0x0395 | HAS_VOWEL,  // ἐ
            0x0395 | HAS_VOWEL,  // ἑ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἒ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἓ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἔ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // ἕ
            0,
            0,
            0x0395 | HAS_VOWEL,  // Ἐ
            0x0395 | HAS_VOWEL,  // Ἑ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἒ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἓ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἔ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ἕ
            0,
            0,
            0x0397 | HAS_VOWEL,  // ἠ
            0x0397 | HAS_VOWEL,  // ἡ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἢ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἣ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἤ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἥ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἦ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ἧ
            0x0397 | HAS_VOWEL,  // Ἠ
            0x0397 | HAS_VOWEL,  // Ἡ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἢ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἣ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἤ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἥ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἦ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ἧ
            0x0399 | HAS_VOWEL,  // ἰ
            0x0399 | HAS_VOWEL,  // ἱ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἲ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἳ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἴ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἵ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἶ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ἷ
            0x0399 | HAS_VOWEL,  // Ἰ
            0x0399 | HAS_VOWEL,  // Ἱ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἲ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἳ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἴ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἵ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἶ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ἷ
            0x039F | HAS_VOWEL,  // ὀ
            0x039F | HAS_VOWEL,  // ὁ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ὂ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ὃ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ὄ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ὅ
            0,
            0,
            0x039F | HAS_VOWEL,  // Ὀ
            0x039F | HAS_VOWEL,  // Ὁ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὂ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὃ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὄ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὅ
            0,
            0,
            0x03A5 | HAS_VOWEL,  // ὐ
            0x03A5 | HAS_VOWEL,  // ὑ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὒ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὓ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὔ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὕ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὖ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὗ
            0,
            0x03A5 | HAS_VOWEL,  // Ὑ
            0,
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὓ
            0,
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὕ
            0,
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὗ
            0x03A9 | HAS_VOWEL,  // ὠ
            0x03A9 | HAS_VOWEL,  // ὡ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὢ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὣ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὤ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὥ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὦ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὧ
            0x03A9 | HAS_VOWEL,  // Ὠ
            0x03A9 | HAS_VOWEL,  // Ὡ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὢ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὣ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὤ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὥ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὦ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὧ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ὰ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ά
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // ὲ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // έ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ὴ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ή
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ὶ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ί
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ὸ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // ό
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ὺ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ύ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ὼ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ώ
            0,
            0,
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾀ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾁ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾂ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾃ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾄ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾅ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾆ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾇ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾈ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾉ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾊ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾋ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾌ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾍ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾎ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾏ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾐ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾑ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾒ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾓ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾔ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾕ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾖ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾗ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾘ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾙ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾚ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾛ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾜ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾝ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾞ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾟ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾠ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾡ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾢ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾣ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾤ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾥ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾦ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾧ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾨ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾩ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾪ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾫ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾬ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾭ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾮ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾯ
            0x0391 | HAS_VOWEL,  // ᾰ
            0x0391 | HAS_VOWEL,  // ᾱ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾲ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾳ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾴ
            0,
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // ᾶ
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ᾷ
            0x0391 | HAS_VOWEL,  // Ᾰ
            0x0391 | HAS_VOWEL,  // Ᾱ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ὰ
            0x0391 | HAS_VOWEL | HAS_ACCENT,  // Ά
            0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ᾼ
            0,
            0x0399 | HAS_VOWEL,  // ι
            0,
            0,
            0,
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῂ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῃ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῄ
            0,
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // ῆ
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῇ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Ὲ
            0x0395 | HAS_VOWEL | HAS_ACCENT,  // Έ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ὴ
            0x0397 | HAS_VOWEL | HAS_ACCENT,  // Ή
            0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῌ
            0,
            0,
            0,
            0x0399 | HAS_VOWEL,  // ῐ
            0x0399 | HAS_VOWEL,  // ῑ
            0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῒ
            0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΐ
            0,
            0,
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // ῖ
            0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῗ
            0x0399 | HAS_VOWEL,  // Ῐ
            0x0399 | HAS_VOWEL,  // Ῑ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ὶ
            0x0399 | HAS_VOWEL | HAS_ACCENT,  // Ί
            0,
            0,
            0,
            0,
            0x03A5 | HAS_VOWEL,  // ῠ
            0x03A5 | HAS_VOWEL,  // ῡ
            0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῢ
            0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ΰ
            0x03A1,  // ῤ
            0x03A1,  // ῥ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // ῦ
            0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,  // ῧ
            0x03A5 | HAS_VOWEL,  // Ῠ
            0x03A5 | HAS_VOWEL,  // Ῡ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ὺ
            0x03A5 | HAS_VOWEL | HAS_ACCENT,  // Ύ
            0x03A1,  // Ῥ
            0,
            0,
            0,
            0,
            0,
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῲ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῳ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῴ
            0,
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // ῶ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,  // ῷ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ὸ
            0x039F | HAS_VOWEL | HAS_ACCENT,  // Ό
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ὼ
            0x03A9 | HAS_VOWEL | HAS_ACCENT,  // Ώ
            0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,  // ῼ
            0,
            0,
            0,
        };

        // U+2126 Ohm sign
        // The only code point outside the two tables that getLetterData() maps:
        // it is treated like a plain capital omega (a vowel without diacritics).
        private static final char data2126 = 0x03A9 | HAS_VOWEL;  // Ω
1236 
getLetterData(int c)1237         private static final int getLetterData(int c) {
1238             if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
1239                 return 0;
1240             } else if (c <= 0x3ff) {
1241                 return data0370[c - 0x370];
1242             } else if (c <= 0x1fff) {
1243                 return data1F00[c - 0x1f00];
1244             } else if (c == 0x2126) {
1245                 return data2126;
1246             } else {
1247                 return 0;
1248             }
1249         }
1250 
1251         /**
1252          * Returns a non-zero value for each of the Greek combining diacritics
1253          * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
1254          * plus some perispomeni look-alikes.
1255          */
getDiacriticData(int c)1256         private static final int getDiacriticData(int c) {
1257             switch (c) {
1258             case '\u0300':  // varia
1259             case '\u0301':  // tonos = oxia
1260             case '\u0342':  // perispomeni
1261             case '\u0302':  // circumflex can look like perispomeni
1262             case '\u0303':  // tilde can look like perispomeni
1263             case '\u0311':  // inverted breve can look like perispomeni
1264                 return HAS_ACCENT;
1265             case '\u0308':  // dialytika = diaeresis
1266                 return HAS_COMBINING_DIALYTIKA;
1267             case '\u0344':  // dialytika tonos
1268                 return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
1269             case '\u0345':  // ypogegrammeni = iota subscript
1270                 return HAS_YPOGEGRAMMENI;
1271             case '\u0304':  // macron
1272             case '\u0306':  // breve
1273             case '\u0313':  // comma above
1274             case '\u0314':  // reversed comma above
1275             case '\u0343':  // koronis
1276                 return HAS_OTHER_GREEK_DIACRITIC;
1277             default:
1278                 return 0;
1279             }
1280         }
1281 
isFollowedByCasedLetter(CharSequence s, int i)1282         private static boolean isFollowedByCasedLetter(CharSequence s, int i) {
1283             while (i < s.length()) {
1284                 int c = Character.codePointAt(s, i);
1285                 int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
1286                 if ((type & UCaseProps.IGNORABLE) != 0) {
1287                     // Case-ignorable, continue with the loop.
1288                     i += Character.charCount(c);
1289                 } else if (type != UCaseProps.NONE) {
1290                     return true;  // Followed by cased letter.
1291                 } else {
1292                     return false;  // Uncased and not case-ignorable.
1293                 }
1294             }
1295             return false;  // Not followed by cased letter.
1296         }
1297 
1298         /**
1299          * Greek string uppercasing with a state machine.
1300          * Probably simpler than a stateless function that has to figure out complex context-before
1301          * for each character.
1302          * TODO: Try to re-consolidate one way or another with the non-Greek function.
1303          *
1304          * <p>Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8).
1305          * @throws IOException
1306          */
toUpper(int options, CharSequence src, A dest, Edits edits)1307         private static <A extends Appendable> A toUpper(int options,
1308                 CharSequence src, A dest, Edits edits) throws IOException {
1309             int state = 0;
1310             for (int i = 0; i < src.length();) {
1311                 int c = Character.codePointAt(src, i);
1312                 int nextIndex = i + Character.charCount(c);
1313                 int nextState = 0;
1314                 int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
1315                 if ((type & UCaseProps.IGNORABLE) != 0) {
1316                     // c is case-ignorable
1317                     nextState |= (state & AFTER_CASED);
1318                 } else if (type != UCaseProps.NONE) {
1319                     // c is cased
1320                     nextState |= AFTER_CASED;
1321                 }
1322                 int data = getLetterData(c);
1323                 if (data > 0) {
1324                     int upper = data & UPPER_MASK;
1325                     // Add a dialytika to this iota or ypsilon vowel
1326                     // if we removed a tonos from the previous vowel,
1327                     // and that previous vowel did not also have (or gain) a dialytika.
1328                     // Adding one only to the final vowel in a longer sequence
1329                     // (which does not occur in normal writing) would require lookahead.
1330                     // Set the same flag as for preserving an existing dialytika.
1331                     if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
1332                             (upper == 'Ι' || upper == 'Υ')) {
1333                         data |= HAS_DIALYTIKA;
1334                     }
1335                     int numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
1336                     if ((data & HAS_YPOGEGRAMMENI) != 0) {
1337                         numYpogegrammeni = 1;
1338                     }
1339                     // Skip combining diacritics after this Greek letter.
1340                     while (nextIndex < src.length()) {
1341                         int diacriticData = getDiacriticData(src.charAt(nextIndex));
1342                         if (diacriticData != 0) {
1343                             data |= diacriticData;
1344                             if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
1345                                 ++numYpogegrammeni;
1346                             }
1347                             ++nextIndex;
1348                         } else {
1349                             break;  // not a Greek diacritic
1350                         }
1351                     }
1352                     if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
1353                         nextState |= AFTER_VOWEL_WITH_ACCENT;
1354                     }
1355                     // Map according to Greek rules.
1356                     boolean addTonos = false;
1357                     if (upper == 'Η' &&
1358                             (data & HAS_ACCENT) != 0 &&
1359                             numYpogegrammeni == 0 &&
1360                             (state & AFTER_CASED) == 0 &&
1361                             !isFollowedByCasedLetter(src, nextIndex)) {
1362                         // Keep disjunctive "or" with (only) a tonos.
1363                         // We use the same "word boundary" conditions as for the Final_Sigma test.
1364                         if (i == nextIndex) {
1365                             upper = 'Ή';  // Preserve the precomposed form.
1366                         } else {
1367                             addTonos = true;
1368                         }
1369                     } else if ((data & HAS_DIALYTIKA) != 0) {
1370                         // Preserve a vowel with dialytika in precomposed form if it exists.
1371                         if (upper == 'Ι') {
1372                             upper = 'Ϊ';
1373                             data &= ~HAS_EITHER_DIALYTIKA;
1374                         } else if (upper == 'Υ') {
1375                             upper = 'Ϋ';
1376                             data &= ~HAS_EITHER_DIALYTIKA;
1377                         }
1378                     }
1379 
1380                     boolean change;
1381                     if (edits == null && (options & OMIT_UNCHANGED_TEXT) == 0) {
1382                         change = true;  // common, simple usage
1383                     } else {
1384                         // Find out first whether we are changing the text.
1385                         change = src.charAt(i) != upper || numYpogegrammeni > 0;
1386                         int i2 = i + 1;
1387                         if ((data & HAS_EITHER_DIALYTIKA) != 0) {
1388                             change |= i2 >= nextIndex || src.charAt(i2) != 0x308;
1389                             ++i2;
1390                         }
1391                         if (addTonos) {
1392                             change |= i2 >= nextIndex || src.charAt(i2) != 0x301;
1393                             ++i2;
1394                         }
1395                         int oldLength = nextIndex - i;
1396                         int newLength = (i2 - i) + numYpogegrammeni;
1397                         change |= oldLength != newLength;
1398                         if (change) {
1399                             if (edits != null) {
1400                                 edits.addReplace(oldLength, newLength);
1401                             }
1402                         } else {
1403                             if (edits != null) {
1404                                 edits.addUnchanged(oldLength);
1405                             }
1406                             // Write unchanged text?
1407                             change = (options & OMIT_UNCHANGED_TEXT) == 0;
1408                         }
1409                     }
1410 
1411                     if (change) {
1412                         dest.append((char)upper);
1413                         if ((data & HAS_EITHER_DIALYTIKA) != 0) {
1414                             dest.append('\u0308');  // restore or add a dialytika
1415                         }
1416                         if (addTonos) {
1417                             dest.append('\u0301');
1418                         }
1419                         while (numYpogegrammeni > 0) {
1420                             dest.append('Ι');
1421                             --numYpogegrammeni;
1422                         }
1423                     }
1424                 } else {
1425                     c = UCaseProps.INSTANCE.toFullUpper(c, null, dest, UCaseProps.LOC_GREEK);
1426                     appendResult(c, dest, nextIndex - i, options, edits);
1427                 }
1428                 i = nextIndex;
1429                 state = nextState;
1430             }
1431             return dest;
1432         }
1433     }
1434 }
1435