1 /**
2  *******************************************************************************
3  * Copyright (C) 1996-2014, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7 
8 package com.ibm.icu.text;
9 
10 import com.ibm.icu.impl.UCharacterProperty;
11 
12 /**
13  * <p>
14  * Standalone utility class providing UTF16 character conversions and indexing conversions.
15  * </p>
16  * <p>
17  * Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap,
18  * so searching for strings is a safe operation. Similarly, concatenation is always safe.
19  * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the
20  * values for start and end are on those boundaries, since they arose from operations like
21  * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
22  * </p>
23  * <strong>Examples:</strong>
24  * <p>
25  * The following examples illustrate use of some of these methods.
26  *
27  * <pre>
28  * // iteration forwards: Original
29  * for (int i = 0; i &lt; s.length(); ++i) {
30  *     char ch = s.charAt(i);
31  *     doSomethingWith(ch);
32  * }
33  *
34  * // iteration forwards: Changes for UTF-32
35  * int ch;
36  * for (int i = 0; i &lt; s.length(); i += UTF16.getCharCount(ch)) {
37  *     ch = UTF16.charAt(s, i);
38  *     doSomethingWith(ch);
39  * }
40  *
41  * // iteration backwards: Original
42  * for (int i = s.length() - 1; i &gt;= 0; --i) {
43  *     char ch = s.charAt(i);
44  *     doSomethingWith(ch);
45  * }
46  *
47  * // iteration backwards: Changes for UTF-32
48  * int ch;
49  * for (int i = s.length() - 1; i &gt;= 0; i -= UTF16.getCharCount(ch)) {
50  *     ch = UTF16.charAt(s, i);
51  *     doSomethingWith(ch);
52  * }
53  * </pre>
54  *
55  * <strong>Notes:</strong>
56  * <ul>
57  * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
58  * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
59  * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
60  * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
61  * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
62  * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
63  * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
64  * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
65  * </li>
66  * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
67  * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
68  * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
69  * check for validity if desired. </li>
70  * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then
71  * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It
72  * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4,
73  * 5.5). </li>
74  * <li> <strong>Optimization:</strong> The method implementations may need optimization if the
75  * compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small
76  * percentage of all the text in the world, the singleton case should always be optimized for. </li>
77  * </ul>
78  *
79  * @author Mark Davis, with help from Markus Scherer
80  * @stable ICU 2.1
81  */
82 
83 public final class UTF16 {
84     // public variables ---------------------------------------------------
85 
86     /**
87      * Value returned in <code><a href="#bounds(java.lang.String, int)">
88      * bounds()</a></code>.
89      * These values are chosen specifically so that it actually represents the position of the
90      * character [offset16 - (value >> 2), offset16 + (value & 3)]
91      *
92      * @stable ICU 2.1
93      */
94     public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2,
95             TRAIL_SURROGATE_BOUNDARY = 5;
96 
97     /**
98      * The lowest Unicode code point value.
99      *
100      * @stable ICU 2.1
101      */
102     public static final int CODEPOINT_MIN_VALUE = 0;
103 
104     /**
105      * The highest Unicode code point value (scalar value) according to the Unicode Standard.
106      *
107      * @stable ICU 2.1
108      */
109     public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
110 
111     /**
112      * The minimum value for Supplementary code points
113      *
114      * @stable ICU 2.1
115      */
116     public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
117 
118     /**
119      * Lead surrogate minimum value
120      *
121      * @stable ICU 2.1
122      */
123     public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
124 
125     /**
126      * Trail surrogate minimum value
127      *
128      * @stable ICU 2.1
129      */
130     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
131 
132     /**
133      * Lead surrogate maximum value
134      *
135      * @stable ICU 2.1
136      */
137     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
138 
139     /**
140      * Trail surrogate maximum value
141      *
142      * @stable ICU 2.1
143      */
144     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
145 
146     /**
147      * Surrogate minimum value
148      *
149      * @stable ICU 2.1
150      */
151     public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
152 
153     /**
154      * Maximum surrogate value
155      *
156      * @stable ICU 2.1
157      */
158     public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
159 
160     /**
161      * Lead surrogate bitmask
162      */
163     private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
164 
165     /**
166      * Trail surrogate bitmask
167      */
168     private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
169 
170     /**
171      * Surrogate bitmask
172      */
173     private static final int SURROGATE_BITMASK = 0xFFFFF800;
174 
175     /**
176      * Lead surrogate bits
177      */
178     private static final int LEAD_SURROGATE_BITS = 0xD800;
179 
180     /**
181      * Trail surrogate bits
182      */
183     private static final int TRAIL_SURROGATE_BITS = 0xDC00;
184 
185     /**
186      * Surrogate bits
187      */
188     private static final int SURROGATE_BITS = 0xD800;
189 
190     // constructor --------------------------------------------------------
191 
192     // /CLOVER:OFF
193     /**
194      * Prevent instance from being created.
195      */
UTF16()196     private UTF16() {
197     }
198 
199     // /CLOVER:ON
200     // public method ------------------------------------------------------
201 
202     /**
203      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
204      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
205      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
206      * UCharacter.isLegal()</a></code>
207      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
208      * character will be returned. If a complete supplementary character is not found the incomplete
209      * character will be returned
210      *
211      * @param source String of UTF-16 chars
212      * @param offset16 UTF-16 offset to the start of the character.
213      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
214      *         of that codepoint are the same as in <code>bounds32()</code>.
215      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
216      * @stable ICU 2.1
217      */
charAt(String source, int offset16)218     public static int charAt(String source, int offset16) {
219         char single = source.charAt(offset16);
220         if (single < LEAD_SURROGATE_MIN_VALUE) {
221             return single;
222         }
223         return _charAt(source, offset16, single);
224     }
225 
_charAt(String source, int offset16, char single)226     private static int _charAt(String source, int offset16, char single) {
227         if (single > TRAIL_SURROGATE_MAX_VALUE) {
228             return single;
229         }
230 
231         // Convert the UTF-16 surrogate pair if necessary.
232         // For simplicity in usage, and because the frequency of pairs is
233         // low, look both directions.
234 
235         if (single <= LEAD_SURROGATE_MAX_VALUE) {
236             ++offset16;
237             if (source.length() != offset16) {
238                 char trail = source.charAt(offset16);
239                 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
240                     return UCharacterProperty.getRawSupplementary(single, trail);
241                 }
242             }
243         } else {
244             --offset16;
245             if (offset16 >= 0) {
246                 // single is a trail surrogate so
247                 char lead = source.charAt(offset16);
248                 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
249                     return UCharacterProperty.getRawSupplementary(lead, single);
250                 }
251             }
252         }
253         return single; // return unmatched surrogate
254     }
255 
256     /**
257      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
258      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
259      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
260      * UCharacter.isLegal()</a></code>
261      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
262      * character will be returned. If a complete supplementary character is not found the incomplete
263      * character will be returned
264      *
265      * @param source Character sequence of UTF-16 chars
266      * @param offset16 UTF-16 offset to the start of the character.
267      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
268      *         of that codepoint are the same as in <code>bounds32()</code>.
269      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
270      * @stable ICU 2.1
271      */
charAt(CharSequence source, int offset16)272     public static int charAt(CharSequence source, int offset16) {
273         char single = source.charAt(offset16);
274         if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
275             return single;
276         }
277         return _charAt(source, offset16, single);
278     }
279 
_charAt(CharSequence source, int offset16, char single)280     private static int _charAt(CharSequence source, int offset16, char single) {
281         if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
282             return single;
283         }
284 
285         // Convert the UTF-16 surrogate pair if necessary.
286         // For simplicity in usage, and because the frequency of pairs is
287         // low, look both directions.
288 
289         if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
290             ++offset16;
291             if (source.length() != offset16) {
292                 char trail = source.charAt(offset16);
293                 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
294                         && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
295                     return UCharacterProperty.getRawSupplementary(single, trail);
296                 }
297             }
298         } else {
299             --offset16;
300             if (offset16 >= 0) {
301                 // single is a trail surrogate so
302                 char lead = source.charAt(offset16);
303                 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
304                         && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
305                     return UCharacterProperty.getRawSupplementary(lead, single);
306                 }
307             }
308         }
309         return single; // return unmatched surrogate
310     }
311 
312     /**
313      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
314      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
315      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
316      * </a></code>
317      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
318      * character will be returned. If a complete supplementary character is not found the incomplete
319      * character will be returned
320      *
321      * @param source UTF-16 chars string buffer
322      * @param offset16 UTF-16 offset to the start of the character.
323      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
324      *         of that codepoint are the same as in <code>bounds32()</code>.
325      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
326      * @stable ICU 2.1
327      */
charAt(StringBuffer source, int offset16)328     public static int charAt(StringBuffer source, int offset16) {
329         if (offset16 < 0 || offset16 >= source.length()) {
330             throw new StringIndexOutOfBoundsException(offset16);
331         }
332 
333         char single = source.charAt(offset16);
334         if (!isSurrogate(single)) {
335             return single;
336         }
337 
338         // Convert the UTF-16 surrogate pair if necessary.
339         // For simplicity in usage, and because the frequency of pairs is
340         // low, look both directions.
341 
342         if (single <= LEAD_SURROGATE_MAX_VALUE) {
343             ++offset16;
344             if (source.length() != offset16) {
345                 char trail = source.charAt(offset16);
346                 if (isTrailSurrogate(trail))
347                     return UCharacterProperty.getRawSupplementary(single, trail);
348             }
349         } else {
350             --offset16;
351             if (offset16 >= 0) {
352                 // single is a trail surrogate so
353                 char lead = source.charAt(offset16);
354                 if (isLeadSurrogate(lead)) {
355                     return UCharacterProperty.getRawSupplementary(lead, single);
356                 }
357             }
358         }
359         return single; // return unmatched surrogate
360     }
361 
362     /**
363      * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
364      * (with <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
365      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
366      * </a></code>
367      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
368      * character will be returned. If a complete supplementary character is not found the incomplete
369      * character will be returned
370      *
371      * @param source Array of UTF-16 chars
372      * @param start Offset to substring in the source array for analyzing
373      * @param limit Offset to substring in the source array for analyzing
374      * @param offset16 UTF-16 offset relative to start
375      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
376      *         of that codepoint are the same as in <code>bounds32()</code>.
377      * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
378      * @stable ICU 2.1
379      */
charAt(char source[], int start, int limit, int offset16)380     public static int charAt(char source[], int start, int limit, int offset16) {
381         offset16 += start;
382         if (offset16 < start || offset16 >= limit) {
383             throw new ArrayIndexOutOfBoundsException(offset16);
384         }
385 
386         char single = source[offset16];
387         if (!isSurrogate(single)) {
388             return single;
389         }
390 
391         // Convert the UTF-16 surrogate pair if necessary.
392         // For simplicity in usage, and because the frequency of pairs is
393         // low, look both directions.
394         if (single <= LEAD_SURROGATE_MAX_VALUE) {
395             offset16++;
396             if (offset16 >= limit) {
397                 return single;
398             }
399             char trail = source[offset16];
400             if (isTrailSurrogate(trail)) {
401                 return UCharacterProperty.getRawSupplementary(single, trail);
402             }
403         } else { // isTrailSurrogate(single), so
404             if (offset16 == start) {
405                 return single;
406             }
407             offset16--;
408             char lead = source[offset16];
409             if (isLeadSurrogate(lead))
410                 return UCharacterProperty.getRawSupplementary(lead, single);
411         }
412         return single; // return unmatched surrogate
413     }
414 
415     /**
416      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
417      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
418      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
419      * </a></code>
420      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
421      * character will be returned. If a complete supplementary character is not found the incomplete
422      * character will be returned
423      *
424      * @param source Replaceable text of UTF-16 chars
425      * @param offset16 UTF-16 offset to the start of the character.
426      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
427      *         of that codepoint are the same as in <code>bounds32()</code>.
428      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
429      * @stable ICU 2.1
430      */
charAt(Replaceable source, int offset16)431     public static int charAt(Replaceable source, int offset16) {
432         if (offset16 < 0 || offset16 >= source.length()) {
433             throw new StringIndexOutOfBoundsException(offset16);
434         }
435 
436         char single = source.charAt(offset16);
437         if (!isSurrogate(single)) {
438             return single;
439         }
440 
441         // Convert the UTF-16 surrogate pair if necessary.
442         // For simplicity in usage, and because the frequency of pairs is
443         // low, look both directions.
444 
445         if (single <= LEAD_SURROGATE_MAX_VALUE) {
446             ++offset16;
447             if (source.length() != offset16) {
448                 char trail = source.charAt(offset16);
449                 if (isTrailSurrogate(trail))
450                     return UCharacterProperty.getRawSupplementary(single, trail);
451             }
452         } else {
453             --offset16;
454             if (offset16 >= 0) {
455                 // single is a trail surrogate so
456                 char lead = source.charAt(offset16);
457                 if (isLeadSurrogate(lead)) {
458                     return UCharacterProperty.getRawSupplementary(lead, single);
459                 }
460             }
461         }
462         return single; // return unmatched surrogate
463     }
464 
465     /**
466      * Determines how many chars this char32 requires. If a validity check is required, use <code>
467      * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
468      * on char32 before calling.
469      *
470      * @param char32 The input codepoint.
471      * @return 2 if is in supplementary space, otherwise 1.
472      * @stable ICU 2.1
473      */
getCharCount(int char32)474     public static int getCharCount(int char32) {
475         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
476             return 1;
477         }
478         return 2;
479     }
480 
481     /**
482      * Returns the type of the boundaries around the char at offset16. Used for random access.
483      *
484      * @param source Text to analyse
485      * @param offset16 UTF-16 offset
486      * @return
487      *            <ul>
488      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1]
489      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
490      *            are [offset16, offset16 + 2]
491      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
492      *            bounds are [offset16 - 1, offset16 + 1]
493      *            </ul>
494      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
495      *            can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
496      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
497      * @stable ICU 2.1
498      */
bounds(String source, int offset16)499     public static int bounds(String source, int offset16) {
500         char ch = source.charAt(offset16);
501         if (isSurrogate(ch)) {
502             if (isLeadSurrogate(ch)) {
503                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
504                     return LEAD_SURROGATE_BOUNDARY;
505                 }
506             } else {
507                 // isTrailSurrogate(ch), so
508                 --offset16;
509                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
510                     return TRAIL_SURROGATE_BOUNDARY;
511                 }
512             }
513         }
514         return SINGLE_CHAR_BOUNDARY;
515     }
516 
517     /**
518      * Returns the type of the boundaries around the char at offset16. Used for random access.
519      *
520      * @param source String buffer to analyse
521      * @param offset16 UTF16 offset
522      * @return
523      *            <ul>
524      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
525      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
526      *            are [offset16, offset16 + 2]
527      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
528      *            bounds are [offset16 - 1, offset16 + 1]
529      *            </ul>
530      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
531      *            can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
532      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
533      * @stable ICU 2.1
534      */
bounds(StringBuffer source, int offset16)535     public static int bounds(StringBuffer source, int offset16) {
536         char ch = source.charAt(offset16);
537         if (isSurrogate(ch)) {
538             if (isLeadSurrogate(ch)) {
539                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
540                     return LEAD_SURROGATE_BOUNDARY;
541                 }
542             } else {
543                 // isTrailSurrogate(ch), so
544                 --offset16;
545                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
546                     return TRAIL_SURROGATE_BOUNDARY;
547                 }
548             }
549         }
550         return SINGLE_CHAR_BOUNDARY;
551     }
552 
553     /**
554      * Returns the type of the boundaries around the char at offset16. Used for random access. Note
555      * that the boundaries are determined with respect to the subarray, hence the char array
556      * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
557      *
558      * @param source Char array to analyse
559      * @param start Offset to substring in the source array for analyzing
560      * @param limit Offset to substring in the source array for analyzing
561      * @param offset16 UTF16 offset relative to start
562      * @return
563      *            <ul>
564      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
565      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
566      *            are [offset16, offset16 + 2]
567      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
568      *            bounds are [offset16 - 1, offset16 + 1]
569      *            </ul>
570      *            For bit-twiddlers, the boundary values for these are chosen so that the boundaries
571      *            can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)].
572      * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
573      * @stable ICU 2.1
574      */
bounds(char source[], int start, int limit, int offset16)575     public static int bounds(char source[], int start, int limit, int offset16) {
576         offset16 += start;
577         if (offset16 < start || offset16 >= limit) {
578             throw new ArrayIndexOutOfBoundsException(offset16);
579         }
580         char ch = source[offset16];
581         if (isSurrogate(ch)) {
582             if (isLeadSurrogate(ch)) {
583                 ++offset16;
584                 if (offset16 < limit && isTrailSurrogate(source[offset16])) {
585                     return LEAD_SURROGATE_BOUNDARY;
586                 }
587             } else { // isTrailSurrogate(ch), so
588                 --offset16;
589                 if (offset16 >= start && isLeadSurrogate(source[offset16])) {
590                     return TRAIL_SURROGATE_BOUNDARY;
591                 }
592             }
593         }
594         return SINGLE_CHAR_BOUNDARY;
595     }
596 
597     /**
598      * Determines whether the code value is a surrogate.
599      *
600      * @param char16 The input character.
601      * @return true If the input character is a surrogate.
602      * @stable ICU 2.1
603      */
isSurrogate(char char16)604     public static boolean isSurrogate(char char16) {
605         return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
606     }
607 
608     /**
609      * Determines whether the character is a trail surrogate.
610      *
611      * @param char16 The input character.
612      * @return true If the input character is a trail surrogate.
613      * @stable ICU 2.1
614      */
isTrailSurrogate(char char16)615     public static boolean isTrailSurrogate(char char16) {
616         return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
617     }
618 
619     /**
620      * Determines whether the character is a lead surrogate.
621      *
622      * @param char16 The input character.
623      * @return true If the input character is a lead surrogate
624      * @stable ICU 2.1
625      */
isLeadSurrogate(char char16)626     public static boolean isLeadSurrogate(char char16) {
627         return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
628     }
629 
630     /**
631      * Returns the lead surrogate. If a validity check is required, use
632      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
633      * before calling.
634      *
635      * @param char32 The input character.
636      * @return lead surrogate if the getCharCount(ch) is 2; <br>
637      *         and 0 otherwise (note: 0 is not a valid lead surrogate).
638      * @stable ICU 2.1
639      */
getLeadSurrogate(int char32)640     public static char getLeadSurrogate(int char32) {
641         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
642             return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
643         }
644         return 0;
645     }
646 
647     /**
648      * Returns the trail surrogate. If a validity check is required, use
649      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
650      * before calling.
651      *
652      * @param char32 The input character.
653      * @return the trail surrogate if the getCharCount(ch) is 2; <br>
654      *         otherwise the character itself
655      * @stable ICU 2.1
656      */
getTrailSurrogate(int char32)657     public static char getTrailSurrogate(int char32) {
658         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
659             return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
660         }
661         return (char) char32;
662     }
663 
664     /**
665      * Convenience method corresponding to String.valueOf(char). Returns a one or two char string
666      * containing the UTF-32 value in UTF16 format. If a validity check is required, use <code><a
667      * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 before calling.
668      *
669      * @param char32 The input character.
670      * @return string value of char32 in UTF16 format
671      * @exception IllegalArgumentException Thrown if char32 is a invalid codepoint.
672      * @stable ICU 2.1
673      */
valueOf(int char32)674     public static String valueOf(int char32) {
675         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
676             throw new IllegalArgumentException("Illegal codepoint");
677         }
678         return toString(char32);
679     }
680 
681     /**
682      * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or
683      * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate
684      * character, the whole supplementary codepoint will be returned. If a validity check is
685      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the
686      * codepoint at offset16 before calling. The result returned will be a newly created String
687      * obtained by calling source.substring(..) with the appropriate indexes.
688      *
689      * @param source The input string.
690      * @param offset16 The UTF16 index to the codepoint in source
691      * @return string value of char32 in UTF16 format
692      * @stable ICU 2.1
693      */
valueOf(String source, int offset16)694     public static String valueOf(String source, int offset16) {
695         switch (bounds(source, offset16)) {
696         case LEAD_SURROGATE_BOUNDARY:
697             return source.substring(offset16, offset16 + 2);
698         case TRAIL_SURROGATE_BOUNDARY:
699             return source.substring(offset16 - 1, offset16 + 1);
700         default:
701             return source.substring(offset16, offset16 + 1);
702         }
703     }
704 
705     /**
706      * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a
707      * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a
708      * surrogate character, the whole supplementary codepoint will be returned. If a validity check
709      * is required, use <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
710      * the codepoint at offset16 before calling. The result returned will be a newly created String
711      * obtained by calling source.substring(..) with the appropriate indexes.
712      *
713      * @param source The input string buffer.
714      * @param offset16 The UTF16 index to the codepoint in source
715      * @return string value of char32 in UTF16 format
716      * @stable ICU 2.1
717      */
valueOf(StringBuffer source, int offset16)718     public static String valueOf(StringBuffer source, int offset16) {
719         switch (bounds(source, offset16)) {
720         case LEAD_SURROGATE_BOUNDARY:
721             return source.substring(offset16, offset16 + 2);
722         case TRAIL_SURROGATE_BOUNDARY:
723             return source.substring(offset16 - 1, offset16 + 1);
724         default:
725             return source.substring(offset16, offset16 + 1);
726         }
727     }
728 
729     /**
730      * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16
731      * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be
732      * returned, except when either the leading or trailing surrogate character lies out of the
733      * specified subarray. In the latter case, only the surrogate character within bounds will be
     * returned. If a validity check is required, use <code><a
     * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the codepoint at
736      * offset16 before calling. The result returned will be a newly created String containing the
737      * relevant characters.
738      *
739      * @param source The input char array.
740      * @param start Start index of the subarray
741      * @param limit End index of the subarray
742      * @param offset16 The UTF16 index to the codepoint in source relative to start
743      * @return string value of char32 in UTF16 format
744      * @stable ICU 2.1
745      */
valueOf(char source[], int start, int limit, int offset16)746     public static String valueOf(char source[], int start, int limit, int offset16) {
747         switch (bounds(source, start, limit, offset16)) {
748         case LEAD_SURROGATE_BOUNDARY:
749             return new String(source, start + offset16, 2);
750         case TRAIL_SURROGATE_BOUNDARY:
751             return new String(source, start + offset16 - 1, 2);
752         }
753         return new String(source, start + offset16, 1);
754     }
755 
756     /**
757      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
758      * the <a name="_top_">class description</a> for notes on roundtripping.
759      *
760      * @param source The UTF-16 string
761      * @param offset32 UTF-32 offset
762      * @return UTF-16 offset
763      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
764      * @stable ICU 2.1
765      */
findOffsetFromCodePoint(String source, int offset32)766     public static int findOffsetFromCodePoint(String source, int offset32) {
767         char ch;
768         int size = source.length(), result = 0, count = offset32;
769         if (offset32 < 0 || offset32 > size) {
770             throw new StringIndexOutOfBoundsException(offset32);
771         }
772         while (result < size && count > 0) {
773             ch = source.charAt(result);
774             if (isLeadSurrogate(ch) && ((result + 1) < size)
775                     && isTrailSurrogate(source.charAt(result + 1))) {
776                 result++;
777             }
778 
779             count--;
780             result++;
781         }
782         if (count != 0) {
783             throw new StringIndexOutOfBoundsException(offset32);
784         }
785         return result;
786     }
787 
788     /**
789      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
790      * the <a name="_top_">class description</a> for notes on roundtripping.
791      *
792      * @param source The UTF-16 string buffer
793      * @param offset32 UTF-32 offset
794      * @return UTF-16 offset
795      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
796      * @stable ICU 2.1
797      */
findOffsetFromCodePoint(StringBuffer source, int offset32)798     public static int findOffsetFromCodePoint(StringBuffer source, int offset32) {
799         char ch;
800         int size = source.length(), result = 0, count = offset32;
801         if (offset32 < 0 || offset32 > size) {
802             throw new StringIndexOutOfBoundsException(offset32);
803         }
804         while (result < size && count > 0) {
805             ch = source.charAt(result);
806             if (isLeadSurrogate(ch) && ((result + 1) < size)
807                     && isTrailSurrogate(source.charAt(result + 1))) {
808                 result++;
809             }
810 
811             count--;
812             result++;
813         }
814         if (count != 0) {
815             throw new StringIndexOutOfBoundsException(offset32);
816         }
817         return result;
818     }
819 
820     /**
821      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
822      * the <a name="_top_">class description</a> for notes on roundtripping.
823      *
824      * @param source The UTF-16 char array whose substring is to be analysed
825      * @param start Offset of the substring to be analysed
826      * @param limit Offset of the substring to be analysed
827      * @param offset32 UTF-32 offset relative to start
828      * @return UTF-16 offset relative to start
829      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
830      * @stable ICU 2.1
831      */
findOffsetFromCodePoint(char source[], int start, int limit, int offset32)832     public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) {
833         char ch;
834         int result = start, count = offset32;
835         if (offset32 > limit - start) {
836             throw new ArrayIndexOutOfBoundsException(offset32);
837         }
838         while (result < limit && count > 0) {
839             ch = source[result];
840             if (isLeadSurrogate(ch) && ((result + 1) < limit)
841                     && isTrailSurrogate(source[result + 1])) {
842                 result++;
843             }
844 
845             count--;
846             result++;
847         }
848         if (count != 0) {
849             throw new ArrayIndexOutOfBoundsException(offset32);
850         }
851         return result - start;
852     }
853 
854     /**
855      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given
856      * UTF-16 offset. Used for random access. See the <a name="_top_">class description</a> for
857      * notes on roundtripping.<br>
858      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
859      * of the <strong>lead</strong> of the pair is returned. </i>
860      * <p>
861      * To find the UTF-32 length of a string, use:
862      *
863      * <pre>
864      * len32 = countCodePoint(source, source.length());
865      * </pre>
866      *
867      * </p>
868      * <p>
869      *
870      * @param source Text to analyse
     * @param offset16 UTF-16 offset &lt;= source text length.
872      * @return UTF-32 offset
873      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
874      * @stable ICU 2.1
875      */
findCodePointOffset(String source, int offset16)876     public static int findCodePointOffset(String source, int offset16) {
877         if (offset16 < 0 || offset16 > source.length()) {
878             throw new StringIndexOutOfBoundsException(offset16);
879         }
880 
881         int result = 0;
882         char ch;
883         boolean hadLeadSurrogate = false;
884 
885         for (int i = 0; i < offset16; ++i) {
886             ch = source.charAt(i);
887             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
888                 hadLeadSurrogate = false; // count valid trail as zero
889             } else {
890                 hadLeadSurrogate = isLeadSurrogate(ch);
891                 ++result; // count others as 1
892             }
893         }
894 
895         if (offset16 == source.length()) {
896             return result;
897         }
898 
899         // end of source being the less significant surrogate character
900         // shift result back to the start of the supplementary character
901         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
902             result--;
903         }
904 
905         return result;
906     }
907 
908     /**
909      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
910      * offset. Used for random access. See the <a name="_top_">class description</a> for notes on
911      * roundtripping.<br>
912      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
913      * of the <strong>lead</strong> of the pair is returned. </i>
914      * <p>
915      * To find the UTF-32 length of a string, use:
916      *
917      * <pre>
918      * len32 = countCodePoint(source);
919      * </pre>
920      *
921      * </p>
922      * <p>
923      *
924      * @param source Text to analyse
     * @param offset16 UTF-16 offset &lt;= source text length.
926      * @return UTF-32 offset
927      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
928      * @stable ICU 2.1
929      */
findCodePointOffset(StringBuffer source, int offset16)930     public static int findCodePointOffset(StringBuffer source, int offset16) {
931         if (offset16 < 0 || offset16 > source.length()) {
932             throw new StringIndexOutOfBoundsException(offset16);
933         }
934 
935         int result = 0;
936         char ch;
937         boolean hadLeadSurrogate = false;
938 
939         for (int i = 0; i < offset16; ++i) {
940             ch = source.charAt(i);
941             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
942                 hadLeadSurrogate = false; // count valid trail as zero
943             } else {
944                 hadLeadSurrogate = isLeadSurrogate(ch);
945                 ++result; // count others as 1
946             }
947         }
948 
949         if (offset16 == source.length()) {
950             return result;
951         }
952 
953         // end of source being the less significant surrogate character
954         // shift result back to the start of the supplementary character
955         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
956             result--;
957         }
958 
959         return result;
960     }
961 
962     /**
963      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
964      * offset. Used for random access. See the <a name="_top_">class description</a> for notes on
965      * roundtripping.<br>
966      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
967      * of the <strong>lead</strong> of the pair is returned. </i>
968      * <p>
969      * To find the UTF-32 length of a substring, use:
970      *
971      * <pre>
972      * len32 = countCodePoint(source, start, limit);
973      * </pre>
974      *
975      * </p>
976      * <p>
977      *
978      * @param source Text to analyse
979      * @param start Offset of the substring
980      * @param limit Offset of the substring
981      * @param offset16 UTF-16 relative to start
982      * @return UTF-32 offset relative to start
983      * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
984      * @stable ICU 2.1
985      */
findCodePointOffset(char source[], int start, int limit, int offset16)986     public static int findCodePointOffset(char source[], int start, int limit, int offset16) {
987         offset16 += start;
988         if (offset16 > limit) {
989             throw new StringIndexOutOfBoundsException(offset16);
990         }
991 
992         int result = 0;
993         char ch;
994         boolean hadLeadSurrogate = false;
995 
996         for (int i = start; i < offset16; ++i) {
997             ch = source[i];
998             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
999                 hadLeadSurrogate = false; // count valid trail as zero
1000             } else {
1001                 hadLeadSurrogate = isLeadSurrogate(ch);
1002                 ++result; // count others as 1
1003             }
1004         }
1005 
1006         if (offset16 == limit) {
1007             return result;
1008         }
1009 
1010         // end of source being the less significant surrogate character
1011         // shift result back to the start of the supplementary character
1012         if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) {
1013             result--;
1014         }
1015 
1016         return result;
1017     }
1018 
1019     /**
1020      * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required,
     * use <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 before
1022      * calling.
1023      *
1024      * @param target The buffer to append to
1025      * @param char32 Value to append.
1026      * @return the updated StringBuffer
1027      * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints
1028      * @stable ICU 2.1
1029      */
append(StringBuffer target, int char32)1030     public static StringBuffer append(StringBuffer target, int char32) {
1031         // Check for irregular values
1032         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1033             throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
1034         }
1035 
1036         // Write the UTF-16 values
1037         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1038             target.append(getLeadSurrogate(char32));
1039             target.append(getTrailSurrogate(char32));
1040         } else {
1041             target.append((char) char32);
1042         }
1043         return target;
1044     }
1045 
1046     /**
1047      * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a
1048      * convenience.
1049      *
1050      * @param target The buffer to append to
1051      * @param cp The code point to append
1052      * @return the updated StringBuffer
1053      * @throws IllegalArgumentException If cp is not a valid code point
1054      * @stable ICU 3.0
1055      */
appendCodePoint(StringBuffer target, int cp)1056     public static StringBuffer appendCodePoint(StringBuffer target, int cp) {
1057         return append(target, cp);
1058     }
1059 
1060     /**
     * Appends a codepoint to the argument char array, starting at index limit.
     *
     * @param target Char array to be appended with the new code point
     * @param limit UTF16 offset at which the codepoint will be appended.
     * @param char32 Code point to be appended
     * @return offset after char32 in the array.
     * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode
     *                codepoints.
     * @exception ArrayIndexOutOfBoundsException Thrown if there is not enough space in target for the append.
1069      * @stable ICU 2.1
1070      */
append(char[] target, int limit, int char32)1071     public static int append(char[] target, int limit, int char32) {
1072         // Check for irregular values
1073         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1074             throw new IllegalArgumentException("Illegal codepoint");
1075         }
1076         // Write the UTF-16 values
1077         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1078             target[limit++] = getLeadSurrogate(char32);
1079             target[limit++] = getTrailSurrogate(char32);
1080         } else {
1081             target[limit++] = (char) char32;
1082         }
1083         return limit;
1084     }
1085 
1086     /**
1087      * Number of codepoints in a UTF16 String
1088      *
1089      * @param source UTF16 string
1090      * @return number of codepoint in string
1091      * @stable ICU 2.1
1092      */
countCodePoint(String source)1093     public static int countCodePoint(String source) {
1094         if (source == null || source.length() == 0) {
1095             return 0;
1096         }
1097         return findCodePointOffset(source, source.length());
1098     }
1099 
1100     /**
1101      * Number of codepoints in a UTF16 String buffer
1102      *
1103      * @param source UTF16 string buffer
1104      * @return number of codepoint in string
1105      * @stable ICU 2.1
1106      */
countCodePoint(StringBuffer source)1107     public static int countCodePoint(StringBuffer source) {
1108         if (source == null || source.length() == 0) {
1109             return 0;
1110         }
1111         return findCodePointOffset(source, source.length());
1112     }
1113 
1114     /**
1115      * Number of codepoints in a UTF16 char array substring
1116      *
1117      * @param source UTF16 char array
1118      * @param start Offset of the substring
1119      * @param limit Offset of the substring
1120      * @return number of codepoint in the substring
1121      * @exception IndexOutOfBoundsException If start and limit are not valid.
1122      * @stable ICU 2.1
1123      */
countCodePoint(char source[], int start, int limit)1124     public static int countCodePoint(char source[], int start, int limit) {
1125         if (source == null || source.length == 0) {
1126             return 0;
1127         }
1128         return findCodePointOffset(source, start, limit, limit - start);
1129     }
1130 
1131     /**
     * Set a code point into a UTF16 position. Adjusts target accordingly if we are replacing a
     * non-supplementary codepoint with a supplementary one or vice versa.
1134      *
1135      * @param target Stringbuffer
1136      * @param offset16 UTF16 position to insert into
1137      * @param char32 Code point
1138      * @stable ICU 2.1
1139      */
setCharAt(StringBuffer target, int offset16, int char32)1140     public static void setCharAt(StringBuffer target, int offset16, int char32) {
1141         int count = 1;
1142         char single = target.charAt(offset16);
1143 
1144         if (isSurrogate(single)) {
1145             // pairs of the surrogate with offset16 at the lead char found
1146             if (isLeadSurrogate(single) && (target.length() > offset16 + 1)
1147                     && isTrailSurrogate(target.charAt(offset16 + 1))) {
1148                 count++;
1149             } else {
1150                 // pairs of the surrogate with offset16 at the trail char
1151                 // found
1152                 if (isTrailSurrogate(single) && (offset16 > 0)
1153                         && isLeadSurrogate(target.charAt(offset16 - 1))) {
1154                     offset16--;
1155                     count++;
1156                 }
1157             }
1158         }
1159         target.replace(offset16, offset16 + count, valueOf(char32));
1160     }
1161 
1162     /**
     * Set a code point into a UTF16 position in a char array. Adjusts target accordingly if we are
     * replacing a non-supplementary codepoint with a supplementary one or vice versa.
1165      *
1166      * @param target char array
1167      * @param limit numbers of valid chars in target, different from target.length. limit counts the
1168      *            number of chars in target that represents a string, not the size of array target.
1169      * @param offset16 UTF16 position to insert into
1170      * @param char32 code point
1171      * @return new number of chars in target that represents a string
1172      * @exception IndexOutOfBoundsException if offset16 is out of range
1173      * @stable ICU 2.1
1174      */
setCharAt(char target[], int limit, int offset16, int char32)1175     public static int setCharAt(char target[], int limit, int offset16, int char32) {
1176         if (offset16 >= limit) {
1177             throw new ArrayIndexOutOfBoundsException(offset16);
1178         }
1179         int count = 1;
1180         char single = target[offset16];
1181 
1182         if (isSurrogate(single)) {
1183             // pairs of the surrogate with offset16 at the lead char found
1184             if (isLeadSurrogate(single) && (target.length > offset16 + 1)
1185                     && isTrailSurrogate(target[offset16 + 1])) {
1186                 count++;
1187             } else {
1188                 // pairs of the surrogate with offset16 at the trail char
1189                 // found
1190                 if (isTrailSurrogate(single) && (offset16 > 0)
1191                         && isLeadSurrogate(target[offset16 - 1])) {
1192                     offset16--;
1193                     count++;
1194                 }
1195             }
1196         }
1197 
1198         String str = valueOf(char32);
1199         int result = limit;
1200         int strlength = str.length();
1201         target[offset16] = str.charAt(0);
1202         if (count == strlength) {
1203             if (count == 2) {
1204                 target[offset16 + 1] = str.charAt(1);
1205             }
1206         } else {
1207             // this is not exact match in space, we'll have to do some
1208             // shifting
1209             System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit
1210                     - (offset16 + count));
1211             if (count < strlength) {
1212                 // char32 is a supplementary character trying to squeeze into
1213                 // a non-supplementary space
1214                 target[offset16 + 1] = str.charAt(1);
1215                 result++;
1216                 if (result < target.length) {
1217                     target[result] = 0;
1218                 }
1219             } else {
1220                 // char32 is a non-supplementary character trying to fill
1221                 // into a supplementary space
1222                 result--;
1223                 target[result] = 0;
1224             }
1225         }
1226         return result;
1227     }
1228 
1229     /**
1230      * Shifts offset16 by the argument number of codepoints
1231      *
1232      * @param source string
1233      * @param offset16 UTF16 position to shift
1234      * @param shift32 number of codepoints to shift
1235      * @return new shifted offset16
1236      * @exception IndexOutOfBoundsException if the new offset16 is out of bounds.
1237      * @stable ICU 2.1
1238      */
moveCodePointOffset(String source, int offset16, int shift32)1239     public static int moveCodePointOffset(String source, int offset16, int shift32) {
1240         int result = offset16;
1241         int size = source.length();
1242         int count;
1243         char ch;
1244         if (offset16 < 0 || offset16 > size) {
1245             throw new StringIndexOutOfBoundsException(offset16);
1246         }
1247         if (shift32 > 0) {
1248             if (shift32 + offset16 > size) {
1249                 throw new StringIndexOutOfBoundsException(offset16);
1250             }
1251             count = shift32;
1252             while (result < size && count > 0) {
1253                 ch = source.charAt(result);
1254                 if (isLeadSurrogate(ch) && ((result + 1) < size)
1255                         && isTrailSurrogate(source.charAt(result + 1))) {
1256                     result++;
1257                 }
1258                 count--;
1259                 result++;
1260             }
1261         } else {
1262             if (offset16 + shift32 < 0) {
1263                 throw new StringIndexOutOfBoundsException(offset16);
1264             }
1265             for (count = -shift32; count > 0; count--) {
1266                 result--;
1267                 if (result < 0) {
1268                     break;
1269                 }
1270                 ch = source.charAt(result);
1271                 if (isTrailSurrogate(ch) && result > 0
1272                         && isLeadSurrogate(source.charAt(result - 1))) {
1273                     result--;
1274                 }
1275             }
1276         }
1277         if (count != 0) {
1278             throw new StringIndexOutOfBoundsException(shift32);
1279         }
1280         return result;
1281     }
1282 
1283     /**
1284      * Shifts offset16 by the argument number of codepoints
1285      *
1286      * @param source String buffer
1287      * @param offset16 UTF16 position to shift
1288      * @param shift32 Number of codepoints to shift
1289      * @return new shifted offset16
1290      * @exception IndexOutOfBoundsException If the new offset16 is out of bounds.
1291      * @stable ICU 2.1
1292      */
moveCodePointOffset(StringBuffer source, int offset16, int shift32)1293     public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) {
1294         int result = offset16;
1295         int size = source.length();
1296         int count;
1297         char ch;
1298         if (offset16 < 0 || offset16 > size) {
1299             throw new StringIndexOutOfBoundsException(offset16);
1300         }
1301         if (shift32 > 0) {
1302             if (shift32 + offset16 > size) {
1303                 throw new StringIndexOutOfBoundsException(offset16);
1304             }
1305             count = shift32;
1306             while (result < size && count > 0) {
1307                 ch = source.charAt(result);
1308                 if (isLeadSurrogate(ch) && ((result + 1) < size)
1309                         && isTrailSurrogate(source.charAt(result + 1))) {
1310                     result++;
1311                 }
1312                 count--;
1313                 result++;
1314             }
1315         } else {
1316             if (offset16 + shift32 < 0) {
1317                 throw new StringIndexOutOfBoundsException(offset16);
1318             }
1319             for (count = -shift32; count > 0; count--) {
1320                 result--;
1321                 if (result < 0) {
1322                     break;
1323                 }
1324                 ch = source.charAt(result);
1325                 if (isTrailSurrogate(ch) && result > 0
1326                         && isLeadSurrogate(source.charAt(result - 1))) {
1327                     result--;
1328                 }
1329             }
1330         }
1331         if (count != 0) {
1332             throw new StringIndexOutOfBoundsException(shift32);
1333         }
1334         return result;
1335     }
1336 
1337     /**
1338      * Shifts offset16 by the argument number of codepoints within a subarray.
1339      *
1340      * @param source Char array
1341      * @param start Position of the subarray to be performed on
1342      * @param limit Position of the subarray to be performed on
1343      * @param offset16 UTF16 position to shift relative to start
1344      * @param shift32 Number of codepoints to shift
1345      * @return new shifted offset16 relative to start
1346      * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the
1347      *                subarray bounds are out of range.
1348      * @stable ICU 2.1
1349      */
moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32)1350     public static int moveCodePointOffset(char source[], int start, int limit, int offset16,
1351             int shift32) {
1352         int size = source.length;
1353         int count;
1354         char ch;
1355         int result = offset16 + start;
1356         if (start < 0 || limit < start) {
1357             throw new StringIndexOutOfBoundsException(start);
1358         }
1359         if (limit > size) {
1360             throw new StringIndexOutOfBoundsException(limit);
1361         }
1362         if (offset16 < 0 || result > limit) {
1363             throw new StringIndexOutOfBoundsException(offset16);
1364         }
1365         if (shift32 > 0) {
1366             if (shift32 + result > size) {
1367                 throw new StringIndexOutOfBoundsException(result);
1368             }
1369             count = shift32;
1370             while (result < limit && count > 0) {
1371                 ch = source[result];
1372                 if (isLeadSurrogate(ch) && (result + 1 < limit)
1373                         && isTrailSurrogate(source[result + 1])) {
1374                     result++;
1375                 }
1376                 count--;
1377                 result++;
1378             }
1379         } else {
1380             if (result + shift32 < start) {
1381                 throw new StringIndexOutOfBoundsException(result);
1382             }
1383             for (count = -shift32; count > 0; count--) {
1384                 result--;
1385                 if (result < start) {
1386                     break;
1387                 }
1388                 ch = source[result];
1389                 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
1390                     result--;
1391                 }
1392             }
1393         }
1394         if (count != 0) {
1395             throw new StringIndexOutOfBoundsException(shift32);
1396         }
1397         result -= start;
1398         return result;
1399     }
1400 
1401     /**
1402      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1403      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1404      * codepoint. The length of target increases by one if codepoint is non-supplementary, 2
1405      * otherwise.
1406      * <p>
1407      * The overall effect is exactly as if the argument were converted to a string by the method
1408      * valueOf(char) and the characters in that string were then inserted into target at the
1409      * position indicated by offset16.
1410      * </p>
1411      * <p>
1412      * The offset argument must be greater than or equal to 0, and less than or equal to the length
1413      * of source.
1414      *
1415      * @param target String buffer to insert to
1416      * @param offset16 Offset which char32 will be inserted in
1417      * @param char32 Codepoint to be inserted
1418      * @return a reference to target
1419      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1420      * @stable ICU 2.1
1421      */
insert(StringBuffer target, int offset16, int char32)1422     public static StringBuffer insert(StringBuffer target, int offset16, int char32) {
1423         String str = valueOf(char32);
1424         if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1425             offset16++;
1426         }
1427         target.insert(offset16, str);
1428         return target;
1429     }
1430 
1431     /**
1432      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1433      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1434      * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise.
1435      * <p>
1436      * The overall effect is exactly as if the argument were converted to a string by the method
1437      * valueOf(char) and the characters in that string were then inserted into target at the
1438      * position indicated by offset16.
1439      * </p>
1440      * <p>
1441      * The offset argument must be greater than or equal to 0, and less than or equal to the limit.
1442      *
1443      * @param target Char array to insert to
1444      * @param limit End index of the char array, limit <= target.length
1445      * @param offset16 Offset which char32 will be inserted in
1446      * @param char32 Codepoint to be inserted
1447      * @return new limit size
1448      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1449      * @stable ICU 2.1
1450      */
insert(char target[], int limit, int offset16, int char32)1451     public static int insert(char target[], int limit, int offset16, int char32) {
1452         String str = valueOf(char32);
1453         if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1454             offset16++;
1455         }
1456         int size = str.length();
1457         if (limit + size > target.length) {
1458             throw new ArrayIndexOutOfBoundsException(offset16 + size);
1459         }
1460         System.arraycopy(target, offset16, target, offset16 + size, limit - offset16);
1461         target[offset16] = str.charAt(0);
1462         if (size == 2) {
1463             target[offset16 + 1] = str.charAt(1);
1464         }
1465         return limit + size;
1466     }
1467 
1468     /**
1469      * Removes the codepoint at the specified position in this target (shortening target by 1
1470      * character if the codepoint is a non-supplementary, 2 otherwise).
1471      *
1472      * @param target String buffer to remove codepoint from
1473      * @param offset16 Offset which the codepoint will be removed
1474      * @return a reference to target
1475      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1476      * @stable ICU 2.1
1477      */
delete(StringBuffer target, int offset16)1478     public static StringBuffer delete(StringBuffer target, int offset16) {
1479         int count = 1;
1480         switch (bounds(target, offset16)) {
1481         case LEAD_SURROGATE_BOUNDARY:
1482             count++;
1483             break;
1484         case TRAIL_SURROGATE_BOUNDARY:
1485             count++;
1486             offset16--;
1487             break;
1488         }
1489         target.delete(offset16, offset16 + count);
1490         return target;
1491     }
1492 
1493     /**
1494      * Removes the codepoint at the specified position in this target (shortening target by 1
1495      * character if the codepoint is a non-supplementary, 2 otherwise).
1496      *
1497      * @param target Char array to remove codepoint from
1498      * @param limit End index of the char array, limit <= target.length
1499      * @param offset16 Offset which the codepoint will be removed
1500      * @return a new limit size
1501      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1502      * @stable ICU 2.1
1503      */
delete(char target[], int limit, int offset16)1504     public static int delete(char target[], int limit, int offset16) {
1505         int count = 1;
1506         switch (bounds(target, 0, limit, offset16)) {
1507         case LEAD_SURROGATE_BOUNDARY:
1508             count++;
1509             break;
1510         case TRAIL_SURROGATE_BOUNDARY:
1511             count++;
1512             offset16--;
1513             break;
1514         }
1515         System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count));
1516         target[limit - count] = 0;
1517         return limit - count;
1518     }
1519 
1520     /**
1521      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1522      * the argument codepoint. I.e., the smallest index <code>i</code> such that
1523      * <code>UTF16.charAt(source, i) ==
1524      * char32</code> is true.
1525      * <p>
1526      * If no such character occurs in this string, then -1 is returned.
1527      * </p>
1528      * <p>
1529      * Examples:<br>
1530      * UTF16.indexOf("abc", 'a') returns 0<br>
1531      * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1532      * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1533      * </p>
1534      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1535      * characters to its fullest.
1536      *
1537      * @param source UTF16 format Unicode string that will be searched
1538      * @param char32 Codepoint to search for
1539      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1540      *         -1 if the codepoint does not occur.
1541      * @stable ICU 2.6
1542      */
indexOf(String source, int char32)1543     public static int indexOf(String source, int char32) {
1544         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1545             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1546         }
1547         // non-surrogate bmp
1548         if (char32 < LEAD_SURROGATE_MIN_VALUE
1549                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1550             return source.indexOf((char) char32);
1551         }
1552         // surrogate
1553         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1554             int result = source.indexOf((char) char32);
1555             if (result >= 0) {
1556                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1557                         && isTrailSurrogate(source.charAt(result + 1))) {
1558                     return indexOf(source, char32, result + 1);
1559                 }
1560                 // trail surrogate
1561                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1562                     return indexOf(source, char32, result + 1);
1563                 }
1564             }
1565             return result;
1566         }
1567         // supplementary
1568         String char32str = toString(char32);
1569         return source.indexOf(char32str);
1570     }
1571 
1572     /**
1573      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1574      * the argument string str. This method is implemented based on codepoints, hence a "lead
1575      * surrogate character + trail surrogate character" is treated as one entity. Hence if str
1576      * starts with a trail surrogate character, an occurrence of str in source that is immediately
1577      * preceded by a lead surrogate character is not a valid match. Vice versa for a lead surrogate
1578      * that ends str. See the examples below.
1579      * <p>
1580      * If no such string str occurs in this source, then -1 is returned.
1581      * </p>
1582      * <p>
1583      * Examples:<br>
1584      * UTF16.indexOf("abc", "ab") returns 0<br>
1585      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1586      * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1587      * </p>
1588      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1589      * characters to its fullest.
1590      *
1591      * @param source UTF16 format Unicode string that will be searched
1592      * @param str UTF16 format Unicode string to search for
1593      * @return the index of the first occurrence of str in the argument Unicode string, or -1 if
1594      *         str does not occur.
1595      * @stable ICU 2.6
1596      */
indexOf(String source, String str)1597     public static int indexOf(String source, String str) {
1598         int strLength = str.length();
1599         // non-surrogate ends
1600         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1601             return source.indexOf(str);
1602         }
1603 
1604         int result = source.indexOf(str);
1605         int resultEnd = result + strLength;
1606         if (result >= 0) {
1607             // check last character
1608             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1609                     && isTrailSurrogate(source.charAt(resultEnd + 1))) {
1610                 return indexOf(source, str, resultEnd + 1);
1611             }
1612             // check first character which is a trail surrogate
1613             if (isTrailSurrogate(str.charAt(0)) && result > 0
1614                     && isLeadSurrogate(source.charAt(result - 1))) {
1615                 return indexOf(source, str, resultEnd + 1);
1616             }
1617         }
1618         return result;
1619     }
1620 
1621     /**
1622      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1623      * the argument codepoint. I.e., the smallest index i such that: <br>
1624      * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true.
1625      * <p>
1626      * If no such character occurs in this string, then -1 is returned.
1627      * </p>
1628      * <p>
1629      * Examples:<br>
1630      * UTF16.indexOf("abc", 'a', 1) returns -1<br>
1631      * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br>
1632      * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br>
1633      * </p>
1634      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1635      * characters to its fullest.
1636      *
1637      * @param source UTF16 format Unicode string that will be searched
1638      * @param char32 Codepoint to search for
1639      * @param fromIndex The index to start the search from.
1640      * @return the index of the first occurrence of the codepoint in the argument Unicode string at
1641      *         or after fromIndex, or -1 if the codepoint does not occur.
1642      * @stable ICU 2.6
1643      */
indexOf(String source, int char32, int fromIndex)1644     public static int indexOf(String source, int char32, int fromIndex) {
1645         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1646             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1647         }
1648         // non-surrogate bmp
1649         if (char32 < LEAD_SURROGATE_MIN_VALUE
1650                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1651             return source.indexOf((char) char32, fromIndex);
1652         }
1653         // surrogate
1654         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1655             int result = source.indexOf((char) char32, fromIndex);
1656             if (result >= 0) {
1657                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1658                         && isTrailSurrogate(source.charAt(result + 1))) {
1659                     return indexOf(source, char32, result + 1);
1660                 }
1661                 // trail surrogate
1662                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1663                     return indexOf(source, char32, result + 1);
1664                 }
1665             }
1666             return result;
1667         }
1668         // supplementary
1669         String char32str = toString(char32);
1670         return source.indexOf(char32str, fromIndex);
1671     }
1672 
1673     /**
1674      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1675      * the argument string str. This method is implemented based on codepoints, hence a "lead
1676      * surrogate character + trail surrogate character" is treated as one entity. Hence if str
1677      * starts with a trail surrogate character, an occurrence of str in source that is immediately
1678      * preceded by a lead surrogate character is not a valid match. Vice versa for a lead surrogate
1679      * that ends str. See the examples below.
1680      * <p>
1681      * If no such string str occurs in this source, then -1 is returned.
1682      * </p>
1683      * <p>
1684      * Examples:<br>
1685      * UTF16.indexOf("abc", "ab", 0) returns 0<br>
1686      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
1687      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
1688      * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
1689      * </p>
1690      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1691      * characters to its fullest.
1692      *
1693      * @param source UTF16 format Unicode string that will be searched
1694      * @param str UTF16 format Unicode string to search for
1695      * @param fromIndex The index to start the search from.
1696      * @return the index of the first occurrence of str in the argument Unicode string at or after
1697      *         fromIndex, or -1 if str does not occur.
1698      * @stable ICU 2.6
1699      */
indexOf(String source, String str, int fromIndex)1700     public static int indexOf(String source, String str, int fromIndex) {
1701         int strLength = str.length();
1702         // non-surrogate ends
1703         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1704             return source.indexOf(str, fromIndex);
1705         }
1706 
1707         int result = source.indexOf(str, fromIndex);
1708         int resultEnd = result + strLength;
1709         if (result >= 0) {
1710             // check last character
1711             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1712                     && isTrailSurrogate(source.charAt(resultEnd))) {
1713                 return indexOf(source, str, resultEnd + 1);
1714             }
1715             // check first character which is a trail surrogate
1716             if (isTrailSurrogate(str.charAt(0)) && result > 0
1717                     && isLeadSurrogate(source.charAt(result - 1))) {
1718                 return indexOf(source, str, resultEnd + 1);
1719             }
1720         }
1721         return result;
1722     }
1723 
1724     /**
1725      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1726      * the argument codepoint. I.e., the index returned is the largest value i such that:
1727      * UTF16.charAt(source, i) == char32 is true.
1728      * <p>
1729      * Examples:<br>
1730      * UTF16.lastIndexOf("abc", 'a') returns 0<br>
1731      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1732      * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1733      * </p>
1734      * <p>
1735      * source is searched backwards starting at the last character.
1736      * </p>
1737      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1738      * characters to its fullest.
1739      *
1740      * @param source UTF16 format Unicode string that will be searched
1741      * @param char32 Codepoint to search for
1742      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1743      *         does not occur.
1744      * @stable ICU 2.6
1745      */
lastIndexOf(String source, int char32)1746     public static int lastIndexOf(String source, int char32) {
1747         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1748             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1749         }
1750         // non-surrogate bmp
1751         if (char32 < LEAD_SURROGATE_MIN_VALUE
1752                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1753             return source.lastIndexOf((char) char32);
1754         }
1755         // surrogate
1756         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1757             int result = source.lastIndexOf((char) char32);
1758             if (result >= 0) {
1759                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1760                         && isTrailSurrogate(source.charAt(result + 1))) {
1761                     return lastIndexOf(source, char32, result - 1);
1762                 }
1763                 // trail surrogate
1764                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1765                     return lastIndexOf(source, char32, result - 1);
1766                 }
1767             }
1768             return result;
1769         }
1770         // supplementary
1771         String char32str = toString(char32);
1772         return source.lastIndexOf(char32str);
1773     }
1774 
1775     /**
1776      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1777      * the argument string str. This method is implemented based on codepoints, hence a "lead
1778      * surrogate character + trail surrogate character" is treated as one entity. Hence if str
1779      * starts with a trail surrogate character, an occurrence of str in source that is immediately
1780      * preceded by a lead surrogate character is not a valid match. Vice versa for a lead surrogate
1781      * that ends str. See the examples below.
1782      * <p>
1783      * Examples:<br>
1784      * UTF16.lastIndexOf("abc", "a") returns 0<br>
1785      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1786      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1787      * </p>
1788      * <p>
1789      * source is searched backwards starting at the last character.
1790      * </p>
1791      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1792      * characters to its fullest.
1793      *
1794      * @param source UTF16 format Unicode string that will be searched
1795      * @param str UTF16 format Unicode string to search for
1796      * @return the index of the last occurrence of str in source, or -1 if str does not
1797      *         occur.
1798      * @stable ICU 2.6
1799      */
lastIndexOf(String source, String str)1800     public static int lastIndexOf(String source, String str) {
1801         int strLength = str.length();
1802         // non-surrogate ends
1803         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1804             return source.lastIndexOf(str);
1805         }
1806 
1807         int result = source.lastIndexOf(str);
1808         if (result >= 0) {
1809             // check last character
1810             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1811                     && isTrailSurrogate(source.charAt(result + strLength + 1))) {
1812                 return lastIndexOf(source, str, result - 1);
1813             }
1814             // check first character which is a trail surrogate
1815             if (isTrailSurrogate(str.charAt(0)) && result > 0
1816                     && isLeadSurrogate(source.charAt(result - 1))) {
1817                 return lastIndexOf(source, str, result - 1);
1818             }
1819         }
1820         return result;
1821     }
1822 
1823     /**
1824      * <p>
1825      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1826      * the argument codepoint, where the result is less than or equals to fromIndex.
1827      * </p>
1828      * <p>
1829      * This method is implemented based on codepoints, hence a single surrogate character will not
1830      * match a supplementary character.
1831      * </p>
1832      * <p>
1833      * source is searched backwards starting at the last character starting at the specified index.
1834      * </p>
1835      * <p>
1836      * Examples:<br>
1837      * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
1838      * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
1839      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br>
1840      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br>
1841      * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800, 4) returns -1<br>
1842      * </p>
1843      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1844      * characters to its fullest.
1845      *
1846      * @param source UTF16 format Unicode string that will be searched
1847      * @param char32 Codepoint to search for
1848      * @param fromIndex the index to start the search from. There is no restriction on the value of
1849      *            fromIndex. If it is greater than or equal to the length of this string, it has the
1850      *            same effect as if it were equal to one less than the length of this string: this
1851      *            entire string may be searched. If it is negative, it has the same effect as if it
1852      *            were -1: -1 is returned.
1853      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1854      *         does not occur.
1855      * @stable ICU 2.6
1856      */
lastIndexOf(String source, int char32, int fromIndex)1857     public static int lastIndexOf(String source, int char32, int fromIndex) {
1858         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1859             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1860         }
1861         // non-surrogate bmp
1862         if (char32 < LEAD_SURROGATE_MIN_VALUE
1863                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1864             return source.lastIndexOf((char) char32, fromIndex);
1865         }
1866         // surrogate
1867         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1868             int result = source.lastIndexOf((char) char32, fromIndex);
1869             if (result >= 0) {
1870                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1871                         && isTrailSurrogate(source.charAt(result + 1))) {
1872                     return lastIndexOf(source, char32, result - 1);
1873                 }
1874                 // trail surrogate
1875                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1876                     return lastIndexOf(source, char32, result - 1);
1877                 }
1878             }
1879             return result;
1880         }
1881         // supplementary
1882         String char32str = toString(char32);
1883         return source.lastIndexOf(char32str, fromIndex);
1884     }
1885 
1886     /**
1887      * <p>
1888      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1889      * the argument string str, where the result is less than or equals to fromIndex.
1890      * </p>
1891      * <p>
1892      * This method is implemented based on codepoints, hence a "lead surrogate character + trail
1893      * surrogate character" is treated as one entity. Hence if str starts with a trail surrogate
1894      * character, an occurrence of str in source that is immediately preceded by a lead surrogate
1895      * character is not a valid match. Vice versa for a lead surrogate that ends str.
1896      * </p>
1897      * See example below.
1898      * <p>
1899      * Examples:<br>
1900      * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
1901      * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
1902      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
1903      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
1904      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
1905      * </p>
1906      * <p>
1907      * source is searched backwards starting at the last character.
1908      * </p>
1909      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1910      * characters to its fullest.
1911      *
1912      * @param source UTF16 format Unicode string that will be searched
1913      * @param str UTF16 format Unicode string to search for
1914      * @param fromIndex the index to start the search from. There is no restriction on the value of
1915      *            fromIndex. If it is greater than or equal to the length of this string, it has the
1916      *            same effect as if it were equal to one less than the length of this string: this
1917      *            entire string may be searched. If it is negative, it has the same effect as if it
1918      *            were -1: -1 is returned.
1919      * @return the index of the last occurrence of str in source that is less than or equal to
1920      *         fromIndex, or -1 if str does not occur.
1921      * @stable ICU 2.6
1922      */
lastIndexOf(String source, String str, int fromIndex)1923     public static int lastIndexOf(String source, String str, int fromIndex) {
1924         int strLength = str.length();
1925         // non-surrogate ends
1926         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1927             return source.lastIndexOf(str, fromIndex);
1928         }
1929 
1930         int result = source.lastIndexOf(str, fromIndex);
1931         if (result >= 0) {
1932             // check last character
1933             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1934                     && isTrailSurrogate(source.charAt(result + strLength))) {
1935                 return lastIndexOf(source, str, result - 1);
1936             }
1937             // check first character which is a trail surrogate
1938             if (isTrailSurrogate(str.charAt(0)) && result > 0
1939                     && isLeadSurrogate(source.charAt(result - 1))) {
1940                 return lastIndexOf(source, str, result - 1);
1941             }
1942         }
1943         return result;
1944     }
1945 
1946     /**
1947      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of
1948      * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16
1949      * format Unicode string source, then source will be returned. Otherwise, a new String object is
1950      * created that represents a codepoint sequence identical to the codepoint sequence represented
1951      * by source, except that every occurrence of oldChar32 is replaced by an occurrence of
1952      * newChar32.
1953      * <p>
1954      * Examples: <br>
1955      * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
1956      * returns "mosquito in your collar"<br>
1957      * UTF16.replace("JonL", 'q', 'x');<br>
1958      * returns "JonL" (no change)<br>
1959      * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br>
1960      * returns "Supplementary character !"<br>
1961      * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br>
1962      * returns "Supplementary character \ud800\udc00"<br>
1963      * </p>
1964      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1965      * characters to its fullest.
1966      *
1967      * @param source UTF16 format Unicode string which the codepoint replacements will be based on.
1968      * @param oldChar32 Non-zero old codepoint to be replaced.
1969      * @param newChar32 The new codepoint to replace oldChar32
1970      * @return new String derived from source by replacing every occurrence of oldChar32 with
1971      *         newChar32, unless when no oldChar32 is found in source then source will be returned.
1972      * @stable ICU 2.6
1973      */
replace(String source, int oldChar32, int newChar32)1974     public static String replace(String source, int oldChar32, int newChar32) {
1975         if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) {
1976             throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint");
1977         }
1978         if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) {
1979             throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint");
1980         }
1981 
1982         int index = indexOf(source, oldChar32);
1983         if (index == -1) {
1984             return source;
1985         }
1986         String newChar32Str = toString(newChar32);
1987         int oldChar32Size = 1;
1988         int newChar32Size = newChar32Str.length();
1989         StringBuffer result = new StringBuffer(source);
1990         int resultIndex = index;
1991 
1992         if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) {
1993             oldChar32Size = 2;
1994         }
1995 
1996         while (index != -1) {
1997             int endResultIndex = resultIndex + oldChar32Size;
1998             result.replace(resultIndex, endResultIndex, newChar32Str);
1999             int lastEndIndex = index + oldChar32Size;
2000             index = indexOf(source, oldChar32, lastEndIndex);
2001             resultIndex += newChar32Size + index - lastEndIndex;
2002         }
2003         return result.toString();
2004     }
2005 
2006     /**
2007      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr
2008      * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string
2009      * source, then source will be returned. Otherwise, a new String object is created that
2010      * represents a codepoint sequence identical to the codepoint sequence represented by source,
2011      * except that every occurrence of oldStr is replaced by an occurrence of newStr.
2012      * <p>
2013      * Examples: <br>
2014      * UTF16.replace("mesquite in your cellar", "e", "o");<br>
2015      * returns "mosquito in your collar"<br>
2016      * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br>
2017      * returns "cat in your cellar"<br>
2018      * UTF16.replace("JonL", "q", "x");<br>
2019      * returns "JonL" (no change)<br>
2020      * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", "!"); <br>
2021      * returns "Supplementary character !"<br>
2022      * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", "!"); <br>
2023      * returns "Supplementary character \ud800\udc00"<br>
2024      * </p>
2025      * Note this method is provided as support to jdk 1.3, which does not support supplementary
2026      * characters to its fullest.
2027      *
2028      * @param source UTF16 format Unicode string which the replacements will be based on.
2029      * @param oldStr Non-zero-length string to be replaced.
2030      * @param newStr The new string to replace oldStr
2031      * @return new String derived from source by replacing every occurrence of oldStr with newStr.
2032      *         When no oldStr is found in source, then source will be returned.
2033      * @stable ICU 2.6
2034      */
replace(String source, String oldStr, String newStr)2035     public static String replace(String source, String oldStr, String newStr) {
2036         int index = indexOf(source, oldStr);
2037         if (index == -1) {
2038             return source;
2039         }
2040         int oldStrSize = oldStr.length();
2041         int newStrSize = newStr.length();
2042         StringBuffer result = new StringBuffer(source);
2043         int resultIndex = index;
2044 
2045         while (index != -1) {
2046             int endResultIndex = resultIndex + oldStrSize;
2047             result.replace(resultIndex, endResultIndex, newStr);
2048             int lastEndIndex = index + oldStrSize;
2049             index = indexOf(source, oldStr, lastEndIndex);
2050             resultIndex += newStrSize + index - lastEndIndex;
2051         }
2052         return result.toString();
2053     }
2054 
2055     /**
2056      * Reverses a UTF16 format Unicode string into a new StringBuffer; source itself is left
2057      * unmodified. Surrogate pairs are kept intact instead of blindly reversing every character.
2058      * <p>
2059      * Examples:<br>
2060      * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br>
2061      * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS".
2062      *
2063      * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed
2064      * @return a new StringBuffer containing the reversed UTF16 format Unicode string.
2065      * @stable ICU 2.6
2066      */
reverse(StringBuffer source)2067     public static StringBuffer reverse(StringBuffer source) {
2068         int length = source.length();
2069         StringBuffer result = new StringBuffer(length);
2070         for (int i = length; i-- > 0;) {
2071             char ch = source.charAt(i);
2072             if (isTrailSurrogate(ch) && i > 0) {
2073                 char ch2 = source.charAt(i - 1);
2074                 if (isLeadSurrogate(ch2)) {
2075                     result.append(ch2);
2076                     result.append(ch);
2077                     --i;
2078                     continue;
2079                 }
2080             }
2081             result.append(ch);
2082         }
2083         return result;
2084     }
2085 
2086     /**
2087      * Check if the string contains more Unicode code points than a certain number. This is more
2088      * efficient than counting all code points in the entire string and comparing that number with a
2089      * threshold. This function may not need to scan the string at all if the length is within a
2090      * certain range, and never needs to count more than 'number + 1' code points. Logically
2091      * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two
2092      * code units.
2093      *
2094      * @param source The input string.
2095      * @param number The number of code points in the string is compared against the 'number'
2096      *            parameter.
2097      * @return boolean value for whether the string contains more Unicode code points than 'number'.
2098      * @stable ICU 2.4
2099      */
hasMoreCodePointsThan(String source, int number)2100     public static boolean hasMoreCodePointsThan(String source, int number) {
2101         if (number < 0) {
2102             return true;
2103         }
2104         if (source == null) {
2105             return false;
2106         }
2107         int length = source.length();
2108 
2109         // length >= 0 known
2110         // source contains at least (length + 1) / 2 code points: <= 2
2111         // chars per cp
2112         if (((length + 1) >> 1) > number) {
2113             return true;
2114         }
2115 
2116         // check if source does not even contain enough chars
2117         int maxsupplementary = length - number;
2118         if (maxsupplementary <= 0) {
2119             return false;
2120         }
2121 
2122         // there are maxsupplementary = length - number more chars than
2123         // asked-for code points
2124 
2125         // count code points until they exceed and also check that there are
2126         // no more than maxsupplementary supplementary code points (char pairs)
2127         int start = 0;
2128         while (true) {
2129             if (length == 0) {
2130                 return false;
2131             }
2132             if (number == 0) {
2133                 return true;
2134             }
2135             if (isLeadSurrogate(source.charAt(start++)) && start != length
2136                     && isTrailSurrogate(source.charAt(start))) {
2137                 start++;
2138                 if (--maxsupplementary <= 0) {
2139                     // too many pairs - too few code points
2140                     return false;
2141                 }
2142             }
2143             --number;
2144         }
2145     }
2146 
2147     /**
2148      * Check if the sub-range of char array, from argument start to limit, contains more Unicode
2149      * code points than a certain number. This is more efficient than counting all code points in
2150      * the entire char array range and comparing that number with a threshold. This function may not
2151      * need to scan the char array at all if start and limit is within a certain range, and never
2152      * needs to count more than 'number + 1' code points. Logically equivalent to
2153      * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one
2154      * or two code units.
2155      *
2156      * @param source Array of UTF-16 chars
2157      * @param start Offset to substring in the source array for analyzing
2158      * @param limit Offset to substring in the source array for analyzing
2159      * @param number The number of code points in the string is compared against the 'number'
2160      *            parameter.
2161      * @return boolean value for whether the string contains more Unicode code points than 'number'.
2162      * @exception IndexOutOfBoundsException Thrown when limit &lt; start
2163      * @stable ICU 2.4
2164      */
hasMoreCodePointsThan(char source[], int start, int limit, int number)2165     public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) {
2166         int length = limit - start;
2167         if (length < 0 || start < 0 || limit < 0) {
2168             throw new IndexOutOfBoundsException(
2169                     "Start and limit indexes should be non-negative and start <= limit");
2170         }
2171         if (number < 0) {
2172             return true;
2173         }
2174         if (source == null) {
2175             return false;
2176         }
2177 
2178         // length >= 0 known
2179         // source contains at least (length + 1) / 2 code points: <= 2
2180         // chars per cp
2181         if (((length + 1) >> 1) > number) {
2182             return true;
2183         }
2184 
2185         // check if source does not even contain enough chars
2186         int maxsupplementary = length - number;
2187         if (maxsupplementary <= 0) {
2188             return false;
2189         }
2190 
2191         // there are maxsupplementary = length - number more chars than
2192         // asked-for code points
2193 
2194         // count code points until they exceed and also check that there are
2195         // no more than maxsupplementary supplementary code points (char pairs)
2196         while (true) {
2197             if (length == 0) {
2198                 return false;
2199             }
2200             if (number == 0) {
2201                 return true;
2202             }
2203             if (isLeadSurrogate(source[start++]) && start != limit
2204                     && isTrailSurrogate(source[start])) {
2205                 start++;
2206                 if (--maxsupplementary <= 0) {
2207                     // too many pairs - too few code points
2208                     return false;
2209                 }
2210             }
2211             --number;
2212         }
2213     }
2214 
2215     /**
2216      * Check if the string buffer contains more Unicode code points than a certain number. This is
2217      * more efficient than counting all code points in the entire string buffer and comparing that
2218      * number with a threshold. This function may not need to scan the string buffer at all if the
2219      * length is within a certain range, and never needs to count more than 'number + 1' code
2220      * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may occupy
2221      * either one or two code units.
2222      *
2223      * @param source The input string buffer.
2224      * @param number The number of code points in the string buffer is compared against the 'number'
2225      *            parameter.
2226      * @return boolean value for whether the string buffer contains more Unicode code points than
2227      *         'number'.
2228      * @stable ICU 2.4
2229      */
hasMoreCodePointsThan(StringBuffer source, int number)2230     public static boolean hasMoreCodePointsThan(StringBuffer source, int number) {
2231         if (number < 0) {
2232             return true;
2233         }
2234         if (source == null) {
2235             return false;
2236         }
2237         int length = source.length();
2238 
2239         // length >= 0 known
2240         // source contains at least (length + 1) / 2 code points: <= 2
2241         // chars per cp
2242         if (((length + 1) >> 1) > number) {
2243             return true;
2244         }
2245 
2246         // check if source does not even contain enough chars
2247         int maxsupplementary = length - number;
2248         if (maxsupplementary <= 0) {
2249             return false;
2250         }
2251 
2252         // there are maxsupplementary = length - number more chars than
2253         // asked-for code points
2254 
2255         // count code points until they exceed and also check that there are
2256         // no more than maxsupplementary supplementary code points (char pairs)
2257         int start = 0;
2258         while (true) {
2259             if (length == 0) {
2260                 return false;
2261             }
2262             if (number == 0) {
2263                 return true;
2264             }
2265             if (isLeadSurrogate(source.charAt(start++)) && start != length
2266                     && isTrailSurrogate(source.charAt(start))) {
2267                 start++;
2268                 if (--maxsupplementary <= 0) {
2269                     // too many pairs - too few code points
2270                     return false;
2271                 }
2272             }
2273             --number;
2274         }
2275     }
2276 
2277     /**
2278      * Cover JDK 1.5 API. Create a String from an array of codePoints.
2279      *
2280      * @param codePoints The code array
2281      * @param offset The start of the text in the code point array
2282      * @param count The number of code points
2283      * @return a String representing the code points between offset and count
2284      * @throws IllegalArgumentException If an invalid code point is encountered
2285      * @throws IndexOutOfBoundsException If the offset or count are out of bounds.
2286      * @stable ICU 3.0
2287      */
newString(int[] codePoints, int offset, int count)2288     public static String newString(int[] codePoints, int offset, int count) {
2289         if (count < 0) {
2290             throw new IllegalArgumentException();
2291         }
2292         char[] chars = new char[count];
2293         int w = 0;
2294         for (int r = offset, e = offset + count; r < e; ++r) {
2295             int cp = codePoints[r];
2296             if (cp < 0 || cp > 0x10ffff) {
2297                 throw new IllegalArgumentException();
2298             }
2299             while (true) {
2300                 try {
2301                     if (cp < 0x010000) {
2302                         chars[w] = (char) cp;
2303                         w++;
2304                     } else {
2305                         chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
2306                         chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
2307                         w += 2;
2308                     }
2309                     break;
2310                 } catch (IndexOutOfBoundsException ex) {
2311                     int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
2312                             / (r - offset + 1)));
2313                     char[] temp = new char[newlen];
2314                     System.arraycopy(chars, 0, temp, 0, w);
2315                     chars = temp;
2316                 }
2317             }
2318         }
2319         return new String(chars, 0, w);
2320     }
2321 
2322     /**
2323      * <p>
2324      * UTF16 string comparator class. Allows UTF16 string comparison to be done with the various
2325      * modes
2326      * </p>
2327      * <ul>
2328      * <li> Code point comparison or code unit comparison
2329      * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison
2330      * with special handling for character 'i'.
2331      * </ul>
2332      * <p>
2333      * The code unit or code point comparison differ only when comparing supplementary code points
2334      * (&#92;u10000..&#92;u10ffff) to BMP code points near the end of the BMP (i.e.,
2335      * &#92;ue000..&#92;uffff). In code unit comparison, high BMP code points sort after
2336      * supplementary code points because they are stored as pairs of surrogates which are at
2337      * &#92;ud800..&#92;udfff.
2338      * </p>
2339      *
2340      * @see #FOLD_CASE_DEFAULT
2341      * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2342      * @stable ICU 2.1
2343      */
2344     public static final class StringComparator implements java.util.Comparator<String> {
2345         // public constructor ------------------------------------------------
2346 
2347         /**
2348          * Default constructor that does code unit comparison and case sensitive comparison.
2349          *
2350          * @stable ICU 2.1
2351          */
StringComparator()2352         public StringComparator() {
2353             this(false, false, FOLD_CASE_DEFAULT);
2354         }
2355 
2356         /**
2357          * Constructor that does comparison based on the argument options.
2358          *
2359          * @param codepointcompare Flag to indicate true for code point comparison or false for code unit
2360          *            comparison.
2361          * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison
2362          * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2363          *            when ignorecase is set to true. If ignorecase is false, this option is
2364          *            ignored.
2365          * @see #FOLD_CASE_DEFAULT
2366          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2367          * @throws IllegalArgumentException If foldcaseoption is out of range
2368          * @stable ICU 2.4
2369          */
StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption)2370         public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) {
2371             setCodePointCompare(codepointcompare);
2372             m_ignoreCase_ = ignorecase;
2373             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2374                 throw new IllegalArgumentException("Invalid fold case option");
2375             }
2376             m_foldCase_ = foldcaseoption;
2377         }
2378 
2379         // public data member ------------------------------------------------
2380 
2381         /**
2382          * Option value for case folding comparison:
2383          *
2384          * <p>Comparison is case insensitive, strings are folded using default mappings defined in
2385          * Unicode data file CaseFolding.txt, before comparison.
2386          *
2387          * @stable ICU 2.4
2388          */
2389         public static final int FOLD_CASE_DEFAULT = 0;
2390 
2391         /**
2392          * Option value for case folding:
2393          * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
2394          * and dotless i appropriately for Turkic languages (tr, az).
2395          *
2396          * <p>Comparison is case insensitive, strings are folded using modified mappings defined in
2397          * Unicode data file CaseFolding.txt, before comparison.
2398          *
2399          * @stable ICU 2.4
2400          * @see com.ibm.icu.lang.UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
2401          */
2402         public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
2403 
2404         // public methods ----------------------------------------------------
2405 
2406         // public setters ----------------------------------------------------
2407 
2408         /**
2409          * Sets the comparison mode to code point compare if flag is true. Otherwise comparison mode
2410          * is set to code unit compare
2411          *
2412          * @param flag True for code point compare, false for code unit compare
2413          * @stable ICU 2.4
2414          */
setCodePointCompare(boolean flag)2415         public void setCodePointCompare(boolean flag) {
2416             if (flag) {
2417                 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER;
2418             } else {
2419                 m_codePointCompare_ = 0;
2420             }
2421         }
2422 
2423         /**
2424          * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise
2425          * case sensitive comparison mode if set to false.
2426          *
2427          * @param ignorecase True for case-insitive comparison, false for case sensitive comparison
2428          * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2429          *            when ignorecase is set to true. If ignorecase is false, this option is
2430          *            ignored.
2431          * @see #FOLD_CASE_DEFAULT
2432          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2433          * @stable ICU 2.4
2434          */
setIgnoreCase(boolean ignorecase, int foldcaseoption)2435         public void setIgnoreCase(boolean ignorecase, int foldcaseoption) {
2436             m_ignoreCase_ = ignorecase;
2437             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2438                 throw new IllegalArgumentException("Invalid fold case option");
2439             }
2440             m_foldCase_ = foldcaseoption;
2441         }
2442 
2443         // public getters ----------------------------------------------------
2444 
2445         /**
2446          * Checks if the comparison mode is code point compare.
2447          *
2448          * @return true for code point compare, false for code unit compare
2449          * @stable ICU 2.4
2450          */
getCodePointCompare()2451         public boolean getCodePointCompare() {
2452             return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2453         }
2454 
2455         /**
2456          * Checks if Comparator is in the case insensitive mode.
2457          *
2458          * @return true if Comparator performs case insensitive comparison, false otherwise
2459          * @stable ICU 2.4
2460          */
getIgnoreCase()2461         public boolean getIgnoreCase() {
2462             return m_ignoreCase_;
2463         }
2464 
2465         /**
2466          * Gets the fold case options set in Comparator to be used with case insensitive comparison.
2467          *
2468          * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
2469          * @see #FOLD_CASE_DEFAULT
2470          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2471          * @stable ICU 2.4
2472          */
getIgnoreCaseOption()2473         public int getIgnoreCaseOption() {
2474             return m_foldCase_;
2475         }
2476 
2477         // public other methods ----------------------------------------------
2478 
2479         /**
2480          * Compare two strings depending on the options selected during construction.
2481          *
2482          * @param a first source string.
2483          * @param b second source string.
2484          * @return 0 returned if a == b. If a < b, a negative value is returned. Otherwise if a > b,
2485          *         a positive value is returned.
2486          * @exception ClassCastException thrown when either a or b is not a String object
2487          * @stable ICU 4.4
2488          */
compare(String a, String b)2489         public int compare(String a, String b) {
2490             if (a == b) {
2491                 return 0;
2492             }
2493             if (a == null) {
2494                 return -1;
2495             }
2496             if (b == null) {
2497                 return 1;
2498             }
2499 
2500             if (m_ignoreCase_) {
2501                 return compareCaseInsensitive(a, b);
2502             }
2503             return compareCaseSensitive(a, b);
2504         }
2505 
2506         // private data member ----------------------------------------------
2507 
2508         /**
2509          * Code unit comparison flag. True if code unit comparison is required. False if code point
2510          * comparison is required.
2511          */
2512         private int m_codePointCompare_;
2513 
2514         /**
2515          * Fold case comparison option.
2516          */
2517         private int m_foldCase_;
2518 
2519         /**
2520          * Flag indicator if ignore case is to be used during comparison
2521          */
2522         private boolean m_ignoreCase_;
2523 
2524         /**
2525          * Code point order offset for surrogate characters
2526          */
2527         private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
2528 
2529         // private method ---------------------------------------------------
2530 
2531         /**
2532          * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life
2533          * easier.
2534          *
2535          * @param s1
2536          *            first string to compare
2537          * @param s2
2538          *            second string to compare
2539          * @return -1 is s1 &lt; s2, 0 if equals,
2540          */
compareCaseInsensitive(String s1, String s2)2541         private int compareCaseInsensitive(String s1, String s2) {
2542             return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_
2543                     | Normalizer.COMPARE_IGNORE_CASE);
2544         }
2545 
2546         /**
2547          * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life
2548          * easier.
2549          *
2550          * @param s1
2551          *            first string to compare
2552          * @param s2
2553          *            second string to compare
2554          * @return -1 is s1 &lt; s2, 0 if equals,
2555          */
compareCaseSensitive(String s1, String s2)2556         private int compareCaseSensitive(String s1, String s2) {
2557             // compare identical prefixes - they do not need to be fixed up
2558             // limit1 = start1 + min(lenght1, length2)
2559             int length1 = s1.length();
2560             int length2 = s2.length();
2561             int minlength = length1;
2562             int result = 0;
2563             if (length1 < length2) {
2564                 result = -1;
2565             } else if (length1 > length2) {
2566                 result = 1;
2567                 minlength = length2;
2568             }
2569 
2570             char c1 = 0;
2571             char c2 = 0;
2572             int index = 0;
2573             for (; index < minlength; index++) {
2574                 c1 = s1.charAt(index);
2575                 c2 = s2.charAt(index);
2576                 // check pseudo-limit
2577                 if (c1 != c2) {
2578                     break;
2579                 }
2580             }
2581 
2582             if (index == minlength) {
2583                 return result;
2584             }
2585 
2586             boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2587             // if both values are in or above the surrogate range, fix them up
2588             if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE
2589                     && codepointcompare) {
2590                 // subtract 0x2800 from BMP code points to make them smaller
2591                 // than supplementary ones
2592                 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1)))
2593                         || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) {
2594                     // part of a surrogate pair, leave >=d800
2595                 } else {
2596                     // BMP code point - may be surrogate code point - make
2597                     // < d800
2598                     c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2599                 }
2600 
2601                 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1)))
2602                         || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) {
2603                     // part of a surrogate pair, leave >=d800
2604                 } else {
2605                     // BMP code point - may be surrogate code point - make <d800
2606                     c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2607                 }
2608             }
2609 
2610             // now c1 and c2 are in UTF-32-compatible order
2611             return c1 - c2;
2612         }
2613     }
2614 
2615     /**
2616      * Utility for getting a code point from a CharSequence that contains exactly one code point.
2617      * @return the code point IF the string is non-null and consists of a single code point.
2618      * otherwise returns -1.
2619      * @param s to test
2620      * @draft ICU 54
2621      * @provisional This API might change or be removed in a future release.
2622      */
getSingleCodePoint(CharSequence s)2623     public static int getSingleCodePoint(CharSequence s) {
2624         if (s == null || s.length() == 0) {
2625             return -1;
2626         } else if (s.length() == 1) {
2627             return s.charAt(0);
2628         } else if (s.length() > 2) {
2629             return -1;
2630         }
2631 
2632         // at this point, len = 2
2633         int cp = Character.codePointAt(s, 0);
2634         if (cp > 0xFFFF) { // is surrogate pair
2635             return cp;
2636         }
2637         return -1;
2638     }
2639 
2640     /**
2641      * Utility for comparing a code point to a string without having to create a new string. Returns the same results
2642      * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if
2643      * <pre>
2644      * sc = new StringComparator(true,false,0);
2645      * fast = UTF16.compareCodePoint(codePoint, charSequence)
2646      * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString())
2647      * </pre>
2648      * then
2649      * </pre>
2650      * Integer.signum(fast) == Integer.signum(slower)
2651      * </pre>
2652      * @param codePoint to test
2653      * @param s to test
2654      * @return equivalent of code point comparator comparing two strings.
2655      * @draft ICU 54
2656      * @provisional This API might change or be removed in a future release.
2657      */
compareCodePoint(int codePoint, CharSequence s)2658     public static int compareCodePoint(int codePoint, CharSequence s) {
2659         if (s == null) {
2660             return 1;
2661         }
2662         final int strLen = s.length();
2663         if (strLen == 0) {
2664             return 1;
2665         }
2666         int second = Character.codePointAt(s, 0);
2667         int diff = codePoint - second;
2668         if (diff != 0) {
2669             return diff;
2670         }
2671         return strLen == Character.charCount(codePoint) ? 0 : -1;
2672     }
2673 
2674     // private data members -------------------------------------------------
2675 
    /**
     * Shift value for lead surrogate to form a supplementary character: the number of low-order
     * bits of a code point that are shifted out before the lead surrogate offset is added.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask to retrieve the significant value from a trail surrogate: selects the low ten bits,
     * matching the width given by LEAD_SURROGATE_SHIFT_.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

    /**
     * Value that all lead surrogates start with. It is LEAD_SURROGATE_MIN_VALUE reduced by the
     * shifted SUPPLEMENTARY_MIN_VALUE, so that adding (code point >> LEAD_SURROGATE_SHIFT_) for
     * the smallest supplementary code point gives exactly LEAD_SURROGATE_MIN_VALUE.
     */
    private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
            - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
2691 
2692     // private methods ------------------------------------------------------
2693 
2694     /**
2695      * <p>
2696      * Converts argument code point and returns a String object representing the code point's value
2697      * in UTF16 format.
2698      * </p>
2699      * <p>
2700      * This method does not check for the validity of the codepoint, the results are not guaranteed
2701      * if a invalid codepoint is passed as argument.
2702      * </p>
2703      * <p>
2704      * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise.
2705      * </p>
2706      *
2707      * @param ch
2708      *            code point
2709      * @return string representation of the code point
2710      */
toString(int ch)2711     private static String toString(int ch) {
2712         if (ch < SUPPLEMENTARY_MIN_VALUE) {
2713             return String.valueOf((char) ch);
2714         }
2715 
2716         StringBuilder result = new StringBuilder();
2717         result.append(getLeadSurrogate(ch));
2718         result.append(getTrailSurrogate(ch));
2719         return result.toString();
2720     }
2721 }
2722 // eof
2723