1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /**
4  *******************************************************************************
5  * Copyright (C) 1996-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package com.ibm.icu.text;
11 
12 import com.ibm.icu.impl.Utility;
13 
14 /**
15  * <p>
16  * Standalone utility class providing UTF16 character conversions and indexing conversions.
17  * </p>
18  * <p>
 * Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap,
20  * so searching for strings is a safe operation. Similarly, concatenation is always safe.
21  * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the
22  * values for start and end are on those boundaries, since they arose from operations like
23  * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
24  * </p>
25  * <strong>Examples:</strong>
26  * <p>
27  * The following examples illustrate use of some of these methods.
28  *
29  * <pre>
30  * // iteration forwards: Original
31  * for (int i = 0; i &lt; s.length(); ++i) {
32  *     char ch = s.charAt(i);
33  *     doSomethingWith(ch);
34  * }
35  *
36  * // iteration forwards: Changes for UTF-32
37  * int ch;
38  * for (int i = 0; i &lt; s.length(); i += UTF16.getCharCount(ch)) {
39  *     ch = UTF16.charAt(s, i);
40  *     doSomethingWith(ch);
41  * }
42  *
43  * // iteration backwards: Original
44  * for (int i = s.length() - 1; i &gt;= 0; --i) {
45  *     char ch = s.charAt(i);
46  *     doSomethingWith(ch);
47  * }
48  *
49  * // iteration backwards: Changes for UTF-32
50  * int ch;
 * for (int i = s.length() - 1; i &gt;= 0; i -= UTF16.getCharCount(ch)) {
52  *     ch = UTF16.charAt(s, i);
53  *     doSomethingWith(ch);
54  * }
55  * </pre>
56  *
57  * <strong>Notes:</strong>
58  * <ul>
59  * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
60  * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
61  * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
62  * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
63  * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
 * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
 * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
 * offset to a UTF-32 offset and back if and only if
 * <code>bounds(string, offset16) != TRAIL_SURROGATE_BOUNDARY</code>.
 * </li>
68  * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
 * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
70  * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
71  * check for validity if desired. </li>
72  * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then
73  * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It
74  * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4,
75  * 5.5). </li>
76  * <li> <strong>Optimization:</strong> The method implementations may need optimization if the
 * compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small
78  * percentage of all the text in the world, the singleton case should always be optimized for. </li>
79  * </ul>
80  *
81  * @author Mark Davis, with help from Markus Scherer
82  * @stable ICU 2.1
83  */
84 
85 public final class UTF16 {
86     // public variables ---------------------------------------------------
87 
88     /**
89      * Value returned in {@link #bounds(String, int) bounds()}.
90      * These values are chosen specifically so that it actually represents the position of the
91      * character [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)]
92      *
93      * @stable ICU 2.1
94      */
95     public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2,
96             TRAIL_SURROGATE_BOUNDARY = 5;
97 
98     /**
99      * The lowest Unicode code point value.
100      *
101      * @stable ICU 2.1
102      */
103     public static final int CODEPOINT_MIN_VALUE = 0;
104 
105     /**
106      * The highest Unicode code point value (scalar value) according to the Unicode Standard.
107      *
108      * @stable ICU 2.1
109      */
110     public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
111 
112     /**
113      * The minimum value for Supplementary code points
114      *
115      * @stable ICU 2.1
116      */
117     public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
118 
119     /**
120      * Lead surrogate minimum value
121      *
122      * @stable ICU 2.1
123      */
124     public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
125 
126     /**
127      * Trail surrogate minimum value
128      *
129      * @stable ICU 2.1
130      */
131     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
132 
133     /**
134      * Lead surrogate maximum value
135      *
136      * @stable ICU 2.1
137      */
138     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
139 
140     /**
141      * Trail surrogate maximum value
142      *
143      * @stable ICU 2.1
144      */
145     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
146 
147     /**
148      * Surrogate minimum value
149      *
150      * @stable ICU 2.1
151      */
152     public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
153 
154     /**
155      * Maximum surrogate value
156      *
157      * @stable ICU 2.1
158      */
159     public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
160 
161     /**
162      * Lead surrogate bitmask
163      */
164     private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
165 
166     /**
167      * Trail surrogate bitmask
168      */
169     private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
170 
171     /**
172      * Surrogate bitmask
173      */
174     private static final int SURROGATE_BITMASK = 0xFFFFF800;
175 
176     /**
177      * Lead surrogate bits
178      */
179     private static final int LEAD_SURROGATE_BITS = 0xD800;
180 
181     /**
182      * Trail surrogate bits
183      */
184     private static final int TRAIL_SURROGATE_BITS = 0xDC00;
185 
186     /**
187      * Surrogate bits
188      */
189     private static final int SURROGATE_BITS = 0xD800;
190 
191     // constructor --------------------------------------------------------
192 
193     // /CLOVER:OFF
    /**
     * Prevents instances of this utility class from being created; the constructor is private and
     * intentionally empty.
     */
    private UTF16() {
    }
199 
200     // /CLOVER:ON
201     // public method ------------------------------------------------------
202 
203     /**
204      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
205      * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
206      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
207      * UCharacter.isLegal()</a></code>
208      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
209      * character will be returned. If a complete supplementary character is not found the incomplete
210      * character will be returned
211      *
212      * @param source Array of UTF-16 chars
213      * @param offset16 UTF-16 offset to the start of the character.
214      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
215      *         of that codepoint are the same as in <code>bounds32()</code>.
216      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
217      * @stable ICU 2.1
218      */
charAt(String source, int offset16)219     public static int charAt(String source, int offset16) {
220         char single = source.charAt(offset16);
221         if (single < LEAD_SURROGATE_MIN_VALUE) {
222             return single;
223         }
224         return _charAt(source, offset16, single);
225     }
226 
_charAt(String source, int offset16, char single)227     private static int _charAt(String source, int offset16, char single) {
228         if (single > TRAIL_SURROGATE_MAX_VALUE) {
229             return single;
230         }
231 
232         // Convert the UTF-16 surrogate pair if necessary.
233         // For simplicity in usage, and because the frequency of pairs is
234         // low, look both directions.
235 
236         if (single <= LEAD_SURROGATE_MAX_VALUE) {
237             ++offset16;
238             if (source.length() != offset16) {
239                 char trail = source.charAt(offset16);
240                 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
241                     return Character.toCodePoint(single, trail);
242                 }
243             }
244         } else {
245             --offset16;
246             if (offset16 >= 0) {
247                 // single is a trail surrogate so
248                 char lead = source.charAt(offset16);
249                 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
250                     return Character.toCodePoint(lead, single);
251                 }
252             }
253         }
254         return single; // return unmatched surrogate
255     }
256 
257     /**
258      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
259      * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
260      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
261      * UCharacter.isLegal()</a></code>
262      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
263      * character will be returned. If a complete supplementary character is not found the incomplete
264      * character will be returned
265      *
266      * @param source Array of UTF-16 chars
267      * @param offset16 UTF-16 offset to the start of the character.
268      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
269      *         of that codepoint are the same as in <code>bounds32()</code>.
270      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
271      * @stable ICU 2.1
272      */
charAt(CharSequence source, int offset16)273     public static int charAt(CharSequence source, int offset16) {
274         char single = source.charAt(offset16);
275         if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
276             return single;
277         }
278         return _charAt(source, offset16, single);
279     }
280 
_charAt(CharSequence source, int offset16, char single)281     private static int _charAt(CharSequence source, int offset16, char single) {
282         if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
283             return single;
284         }
285 
286         // Convert the UTF-16 surrogate pair if necessary.
287         // For simplicity in usage, and because the frequency of pairs is
288         // low, look both directions.
289 
290         if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
291             ++offset16;
292             if (source.length() != offset16) {
293                 char trail = source.charAt(offset16);
294                 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
295                         && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
296                     return Character.toCodePoint(single, trail);
297                 }
298             }
299         } else {
300             --offset16;
301             if (offset16 >= 0) {
302                 // single is a trail surrogate so
303                 char lead = source.charAt(offset16);
304                 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
305                         && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
306                     return Character.toCodePoint(lead, single);
307                 }
308             }
309         }
310         return single; // return unmatched surrogate
311     }
312 
313     /**
314      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
315      * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
316      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
317      * </a></code>
318      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
319      * character will be returned. If a complete supplementary character is not found the incomplete
320      * character will be returned
321      *
322      * @param source UTF-16 chars string buffer
323      * @param offset16 UTF-16 offset to the start of the character.
324      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
325      *         of that codepoint are the same as in <code>bounds32()</code>.
326      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
327      * @stable ICU 2.1
328      */
charAt(StringBuffer source, int offset16)329     public static int charAt(StringBuffer source, int offset16) {
330         if (offset16 < 0 || offset16 >= source.length()) {
331             throw new StringIndexOutOfBoundsException(offset16);
332         }
333 
334         char single = source.charAt(offset16);
335         if (!isSurrogate(single)) {
336             return single;
337         }
338 
339         // Convert the UTF-16 surrogate pair if necessary.
340         // For simplicity in usage, and because the frequency of pairs is
341         // low, look both directions.
342 
343         if (single <= LEAD_SURROGATE_MAX_VALUE) {
344             ++offset16;
345             if (source.length() != offset16) {
346                 char trail = source.charAt(offset16);
347                 if (isTrailSurrogate(trail))
348                     return Character.toCodePoint(single, trail);
349             }
350         } else {
351             --offset16;
352             if (offset16 >= 0) {
353                 // single is a trail surrogate so
354                 char lead = source.charAt(offset16);
355                 if (isLeadSurrogate(lead)) {
356                     return Character.toCodePoint(lead, single);
357                 }
358             }
359         }
360         return single; // return unmatched surrogate
361     }
362 
363     /**
364      * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
365      * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
366      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
367      * </a></code>
368      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
369      * character will be returned. If a complete supplementary character is not found the incomplete
370      * character will be returned
371      *
372      * @param source Array of UTF-16 chars
373      * @param start Offset to substring in the source array for analyzing
374      * @param limit Offset to substring in the source array for analyzing
375      * @param offset16 UTF-16 offset relative to start
376      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
377      *         of that codepoint are the same as in <code>bounds32()</code>.
378      * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
379      * @stable ICU 2.1
380      */
charAt(char source[], int start, int limit, int offset16)381     public static int charAt(char source[], int start, int limit, int offset16) {
382         offset16 += start;
383         if (offset16 < start || offset16 >= limit) {
384             throw new ArrayIndexOutOfBoundsException(offset16);
385         }
386 
387         char single = source[offset16];
388         if (!isSurrogate(single)) {
389             return single;
390         }
391 
392         // Convert the UTF-16 surrogate pair if necessary.
393         // For simplicity in usage, and because the frequency of pairs is
394         // low, look both directions.
395         if (single <= LEAD_SURROGATE_MAX_VALUE) {
396             offset16++;
397             if (offset16 >= limit) {
398                 return single;
399             }
400             char trail = source[offset16];
401             if (isTrailSurrogate(trail)) {
402                 return Character.toCodePoint(single, trail);
403             }
404         } else { // isTrailSurrogate(single), so
405             if (offset16 == start) {
406                 return single;
407             }
408             offset16--;
409             char lead = source[offset16];
410             if (isLeadSurrogate(lead))
411                 return Character.toCodePoint(lead, single);
412         }
413         return single; // return unmatched surrogate
414     }
415 
416     /**
417      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
418      * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
419      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
420      * </a></code>
421      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
422      * character will be returned. If a complete supplementary character is not found the incomplete
423      * character will be returned
424      *
425      * @param source UTF-16 chars string buffer
426      * @param offset16 UTF-16 offset to the start of the character.
427      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
428      *         of that codepoint are the same as in <code>bounds32()</code>.
429      * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
430      * @stable ICU 2.1
431      */
charAt(Replaceable source, int offset16)432     public static int charAt(Replaceable source, int offset16) {
433         if (offset16 < 0 || offset16 >= source.length()) {
434             throw new StringIndexOutOfBoundsException(offset16);
435         }
436 
437         char single = source.charAt(offset16);
438         if (!isSurrogate(single)) {
439             return single;
440         }
441 
442         // Convert the UTF-16 surrogate pair if necessary.
443         // For simplicity in usage, and because the frequency of pairs is
444         // low, look both directions.
445 
446         if (single <= LEAD_SURROGATE_MAX_VALUE) {
447             ++offset16;
448             if (source.length() != offset16) {
449                 char trail = source.charAt(offset16);
450                 if (isTrailSurrogate(trail))
451                     return Character.toCodePoint(single, trail);
452             }
453         } else {
454             --offset16;
455             if (offset16 >= 0) {
456                 // single is a trail surrogate so
457                 char lead = source.charAt(offset16);
458                 if (isLeadSurrogate(lead)) {
459                     return Character.toCodePoint(lead, single);
460                 }
461             }
462         }
463         return single; // return unmatched surrogate
464     }
465 
466     /**
467      * Determines how many chars this char32 requires. If a validity check is required, use <code>
468      * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
469      * on char32 before calling.
470      *
471      * @param char32 The input codepoint.
472      * @return 2 if is in supplementary space, otherwise 1.
473      * @stable ICU 2.1
474      */
getCharCount(int char32)475     public static int getCharCount(int char32) {
476         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
477             return 1;
478         }
479         return 2;
480     }
481 
482     /**
483      * Returns the type of the boundaries around the char at offset16. Used for random access.
484      *
485      * @param source Text to analyse
486      * @param offset16 UTF-16 offset
487      * @return
488      *            <ul>
489      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1]
490      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
491      *            are [offset16, offset16 + 2]
492      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
493      *            bounds are [offset16 - 1, offset16 + 1]
494      *            </ul>
495      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
496      *            can be gotten by: [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)].
497      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
498      * @stable ICU 2.1
499      */
bounds(String source, int offset16)500     public static int bounds(String source, int offset16) {
501         char ch = source.charAt(offset16);
502         if (isSurrogate(ch)) {
503             if (isLeadSurrogate(ch)) {
504                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
505                     return LEAD_SURROGATE_BOUNDARY;
506                 }
507             } else {
508                 // isTrailSurrogate(ch), so
509                 --offset16;
510                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
511                     return TRAIL_SURROGATE_BOUNDARY;
512                 }
513             }
514         }
515         return SINGLE_CHAR_BOUNDARY;
516     }
517 
518     /**
519      * Returns the type of the boundaries around the char at offset16. Used for random access.
520      *
521      * @param source String buffer to analyse
522      * @param offset16 UTF16 offset
523      * @return
524      *            <ul>
525      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
526      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
527      *            are [offset16, offset16 + 2]
528      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
529      *            bounds are [offset16 - 1, offset16 + 1]
530      *            </ul>
531      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
532      *            can be gotten by: [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)].
533      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
534      * @stable ICU 2.1
535      */
bounds(StringBuffer source, int offset16)536     public static int bounds(StringBuffer source, int offset16) {
537         char ch = source.charAt(offset16);
538         if (isSurrogate(ch)) {
539             if (isLeadSurrogate(ch)) {
540                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
541                     return LEAD_SURROGATE_BOUNDARY;
542                 }
543             } else {
544                 // isTrailSurrogate(ch), so
545                 --offset16;
546                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
547                     return TRAIL_SURROGATE_BOUNDARY;
548                 }
549             }
550         }
551         return SINGLE_CHAR_BOUNDARY;
552     }
553 
554     /**
555      * Returns the type of the boundaries around the char at offset16. Used for random access. Note
556      * that the boundaries are determined with respect to the subarray, hence the char array
557      * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
558      *
559      * @param source Char array to analyse
560      * @param start Offset to substring in the source array for analyzing
561      * @param limit Offset to substring in the source array for analyzing
562      * @param offset16 UTF16 offset relative to start
563      * @return
564      *            <ul>
565      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are
566      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
567      *            are [offset16, offset16 + 2]
568      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
569      *            bounds are [offset16 - 1, offset16 + 1]
570      *            </ul>
571      *            For bit-twiddlers, the boundary values for these are chosen so that the boundaries
572      *            can be gotten by: [offset16 - (boundvalue &gt;&gt; 2), offset16 + (boundvalue &amp; 3)].
573      * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
574      * @stable ICU 2.1
575      */
bounds(char source[], int start, int limit, int offset16)576     public static int bounds(char source[], int start, int limit, int offset16) {
577         offset16 += start;
578         if (offset16 < start || offset16 >= limit) {
579             throw new ArrayIndexOutOfBoundsException(offset16);
580         }
581         char ch = source[offset16];
582         if (isSurrogate(ch)) {
583             if (isLeadSurrogate(ch)) {
584                 ++offset16;
585                 if (offset16 < limit && isTrailSurrogate(source[offset16])) {
586                     return LEAD_SURROGATE_BOUNDARY;
587                 }
588             } else { // isTrailSurrogate(ch), so
589                 --offset16;
590                 if (offset16 >= start && isLeadSurrogate(source[offset16])) {
591                     return TRAIL_SURROGATE_BOUNDARY;
592                 }
593             }
594         }
595         return SINGLE_CHAR_BOUNDARY;
596     }
597 
598     /**
599      * Determines whether the code value is a surrogate.
600      *
601      * @param char16 The input character.
602      * @return true If the input character is a surrogate.
603      * @stable ICU 2.1
604      */
isSurrogate(char char16)605     public static boolean isSurrogate(char char16) {
606         return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
607     }
608 
609     /**
610      * Determines whether the character is a trail surrogate.
611      *
612      * @param char16 The input character.
613      * @return true If the input character is a trail surrogate.
614      * @stable ICU 2.1
615      */
isTrailSurrogate(char char16)616     public static boolean isTrailSurrogate(char char16) {
617         return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
618     }
619 
620     /**
621      * Determines whether the character is a lead surrogate.
622      *
623      * @param char16 The input character.
624      * @return true If the input character is a lead surrogate
625      * @stable ICU 2.1
626      */
isLeadSurrogate(char char16)627     public static boolean isLeadSurrogate(char char16) {
628         return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
629     }
630 
631     /**
632      * Returns the lead surrogate. If a validity check is required, use
633      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
634      * before calling.
635      *
636      * @param char32 The input character.
637      * @return lead surrogate if the getCharCount(ch) is 2; <br>
638      *         and 0 otherwise (note: 0 is not a valid lead surrogate).
639      * @stable ICU 2.1
640      */
getLeadSurrogate(int char32)641     public static char getLeadSurrogate(int char32) {
642         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
643             return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
644         }
645         return 0;
646     }
647 
648     /**
649      * Returns the trail surrogate. If a validity check is required, use
650      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
651      * before calling.
652      *
653      * @param char32 The input character.
654      * @return the trail surrogate if the getCharCount(ch) is 2; <br>
655      *         otherwise the character itself
656      * @stable ICU 2.1
657      */
getTrailSurrogate(int char32)658     public static char getTrailSurrogate(int char32) {
659         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
660             return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
661         }
662         return (char) char32;
663     }
664 
665     /**
666      * Convenience method corresponding to String.valueOf(char). Returns a one or two char string
667      * containing the UTF-32 value in UTF16 format. If a validity check is required, use
668      * {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on char32 before calling.
669      *
670      * @param char32 The input character.
671      * @return string value of char32 in UTF16 format
672      * @exception IllegalArgumentException Thrown if char32 is a invalid codepoint.
673      * @stable ICU 2.1
674      */
valueOf(int char32)675     public static String valueOf(int char32) {
676         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
677             throw new IllegalArgumentException("Illegal codepoint");
678         }
679         return toString(char32);
680     }
681 
682     /**
683      * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or
684      * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate
685      * character, the whole supplementary codepoint will be returned. If a validity check is
686      * required, use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on the
687      * codepoint at offset16 before calling. The result returned will be a newly created String
688      * obtained by calling source.substring(..) with the appropriate indexes.
689      *
690      * @param source The input string.
691      * @param offset16 The UTF16 index to the codepoint in source
692      * @return string value of char32 in UTF16 format
693      * @stable ICU 2.1
694      */
valueOf(String source, int offset16)695     public static String valueOf(String source, int offset16) {
696         switch (bounds(source, offset16)) {
697         case LEAD_SURROGATE_BOUNDARY:
698             return source.substring(offset16, offset16 + 2);
699         case TRAIL_SURROGATE_BOUNDARY:
700             return source.substring(offset16 - 1, offset16 + 1);
701         default:
702             return source.substring(offset16, offset16 + 1);
703         }
704     }
705 
706     /**
707      * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a
708      * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a
709      * surrogate character, the whole supplementary codepoint will be returned. If a validity check
710      * is required, use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on
711      * the codepoint at offset16 before calling. The result returned will be a newly created String
712      * obtained by calling source.substring(..) with the appropriate indexes.
713      *
714      * @param source The input string buffer.
715      * @param offset16 The UTF16 index to the codepoint in source
716      * @return string value of char32 in UTF16 format
717      * @stable ICU 2.1
718      */
valueOf(StringBuffer source, int offset16)719     public static String valueOf(StringBuffer source, int offset16) {
720         switch (bounds(source, offset16)) {
721         case LEAD_SURROGATE_BOUNDARY:
722             return source.substring(offset16, offset16 + 2);
723         case TRAIL_SURROGATE_BOUNDARY:
724             return source.substring(offset16 - 1, offset16 + 1);
725         default:
726             return source.substring(offset16, offset16 + 1);
727         }
728     }
729 
730     /**
731      * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16
732      * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be
733      * returned, except when either the leading or trailing surrogate character lies out of the
734      * specified subarray. In the latter case, only the surrogate character within bounds will be
735      * returned. If a validity check is required, use
736      * {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on the codepoint at
737      * offset16 before calling. The result returned will be a newly created String containing the
738      * relevant characters.
739      *
740      * @param source The input char array.
741      * @param start Start index of the subarray
742      * @param limit End index of the subarray
743      * @param offset16 The UTF16 index to the codepoint in source relative to start
744      * @return string value of char32 in UTF16 format
745      * @stable ICU 2.1
746      */
valueOf(char source[], int start, int limit, int offset16)747     public static String valueOf(char source[], int start, int limit, int offset16) {
748         switch (bounds(source, start, limit, offset16)) {
749         case LEAD_SURROGATE_BOUNDARY:
750             return new String(source, start + offset16, 2);
751         case TRAIL_SURROGATE_BOUNDARY:
752             return new String(source, start + offset16 - 1, 2);
753         }
754         return new String(source, start + offset16, 1);
755     }
756 
757     /**
758      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
759      * the {@link UTF16 class description} for notes on roundtripping.
760      *
761      * @param source The UTF-16 string
762      * @param offset32 UTF-32 offset
763      * @return UTF-16 offset
764      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
765      * @stable ICU 2.1
766      */
findOffsetFromCodePoint(String source, int offset32)767     public static int findOffsetFromCodePoint(String source, int offset32) {
768         char ch;
769         int size = source.length(), result = 0, count = offset32;
770         if (offset32 < 0 || offset32 > size) {
771             throw new StringIndexOutOfBoundsException(offset32);
772         }
773         while (result < size && count > 0) {
774             ch = source.charAt(result);
775             if (isLeadSurrogate(ch) && ((result + 1) < size)
776                     && isTrailSurrogate(source.charAt(result + 1))) {
777                 result++;
778             }
779 
780             count--;
781             result++;
782         }
783         if (count != 0) {
784             throw new StringIndexOutOfBoundsException(offset32);
785         }
786         return result;
787     }
788 
789     /**
790      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
791      * the {@link UTF16 class description} for notes on roundtripping.
792      *
793      * @param source The UTF-16 string buffer
794      * @param offset32 UTF-32 offset
795      * @return UTF-16 offset
796      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
797      * @stable ICU 2.1
798      */
findOffsetFromCodePoint(StringBuffer source, int offset32)799     public static int findOffsetFromCodePoint(StringBuffer source, int offset32) {
800         char ch;
801         int size = source.length(), result = 0, count = offset32;
802         if (offset32 < 0 || offset32 > size) {
803             throw new StringIndexOutOfBoundsException(offset32);
804         }
805         while (result < size && count > 0) {
806             ch = source.charAt(result);
807             if (isLeadSurrogate(ch) && ((result + 1) < size)
808                     && isTrailSurrogate(source.charAt(result + 1))) {
809                 result++;
810             }
811 
812             count--;
813             result++;
814         }
815         if (count != 0) {
816             throw new StringIndexOutOfBoundsException(offset32);
817         }
818         return result;
819     }
820 
821     /**
822      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
823      * the {@link UTF16 class description} for notes on roundtripping.
824      *
825      * @param source The UTF-16 char array whose substring is to be analysed
826      * @param start Offset of the substring to be analysed
827      * @param limit Offset of the substring to be analysed
828      * @param offset32 UTF-32 offset relative to start
829      * @return UTF-16 offset relative to start
830      * @exception IndexOutOfBoundsException If offset32 is out of bounds.
831      * @stable ICU 2.1
832      */
findOffsetFromCodePoint(char source[], int start, int limit, int offset32)833     public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) {
834         char ch;
835         int result = start, count = offset32;
836         if (offset32 > limit - start) {
837             throw new ArrayIndexOutOfBoundsException(offset32);
838         }
839         while (result < limit && count > 0) {
840             ch = source[result];
841             if (isLeadSurrogate(ch) && ((result + 1) < limit)
842                     && isTrailSurrogate(source[result + 1])) {
843                 result++;
844             }
845 
846             count--;
847             result++;
848         }
849         if (count != 0) {
850             throw new ArrayIndexOutOfBoundsException(offset32);
851         }
852         return result - start;
853     }
854 
855     /**
856      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given
857      * UTF-16 offset. Used for random access. See the {@link UTF16 class description} for
858      * notes on roundtripping.<br>
859      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
860      * of the <strong>lead</strong> of the pair is returned. </i>
861      * <p>
862      * To find the UTF-32 length of a string, use:
863      *
864      * <pre>
865      * len32 = countCodePoint(source, source.length());
866      * </pre>
867      *
868      * @param source Text to analyse
869      * @param offset16 UTF-16 offset &lt; source text length.
870      * @return UTF-32 offset
871      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
872      * @stable ICU 2.1
873      */
findCodePointOffset(String source, int offset16)874     public static int findCodePointOffset(String source, int offset16) {
875         if (offset16 < 0 || offset16 > source.length()) {
876             throw new StringIndexOutOfBoundsException(offset16);
877         }
878 
879         int result = 0;
880         char ch;
881         boolean hadLeadSurrogate = false;
882 
883         for (int i = 0; i < offset16; ++i) {
884             ch = source.charAt(i);
885             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
886                 hadLeadSurrogate = false; // count valid trail as zero
887             } else {
888                 hadLeadSurrogate = isLeadSurrogate(ch);
889                 ++result; // count others as 1
890             }
891         }
892 
893         if (offset16 == source.length()) {
894             return result;
895         }
896 
897         // end of source being the less significant surrogate character
898         // shift result back to the start of the supplementary character
899         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
900             result--;
901         }
902 
903         return result;
904     }
905 
906     /**
907      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
908      * offset. Used for random access. See the {@link UTF16 class description} for notes on
909      * roundtripping.<br>
910      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
911      * of the <strong>lead</strong> of the pair is returned. </i>
912      * <p>
913      * To find the UTF-32 length of a string, use:
914      *
915      * <pre>
916      * len32 = countCodePoint(source);
917      * </pre>
918      *
919      * @param source Text to analyse
920      * @param offset16 UTF-16 offset &lt; source text length.
921      * @return UTF-32 offset
922      * @exception IndexOutOfBoundsException If offset16 is out of bounds.
923      * @stable ICU 2.1
924      */
findCodePointOffset(StringBuffer source, int offset16)925     public static int findCodePointOffset(StringBuffer source, int offset16) {
926         if (offset16 < 0 || offset16 > source.length()) {
927             throw new StringIndexOutOfBoundsException(offset16);
928         }
929 
930         int result = 0;
931         char ch;
932         boolean hadLeadSurrogate = false;
933 
934         for (int i = 0; i < offset16; ++i) {
935             ch = source.charAt(i);
936             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
937                 hadLeadSurrogate = false; // count valid trail as zero
938             } else {
939                 hadLeadSurrogate = isLeadSurrogate(ch);
940                 ++result; // count others as 1
941             }
942         }
943 
944         if (offset16 == source.length()) {
945             return result;
946         }
947 
948         // end of source being the less significant surrogate character
949         // shift result back to the start of the supplementary character
950         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
951             result--;
952         }
953 
954         return result;
955     }
956 
957     /**
958      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
959      * offset. Used for random access. See the {@link UTF16 class description} for notes on
960      * roundtripping.<br>
961      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
962      * of the <strong>lead</strong> of the pair is returned. </i>
963      * <p>
964      * To find the UTF-32 length of a substring, use:
965      *
966      * <pre>
967      * len32 = countCodePoint(source, start, limit);
968      * </pre>
969      *
970      * @param source Text to analyse
971      * @param start Offset of the substring
972      * @param limit Offset of the substring
973      * @param offset16 UTF-16 relative to start
974      * @return UTF-32 offset relative to start
975      * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
976      * @stable ICU 2.1
977      */
findCodePointOffset(char source[], int start, int limit, int offset16)978     public static int findCodePointOffset(char source[], int start, int limit, int offset16) {
979         offset16 += start;
980         if (offset16 > limit) {
981             throw new StringIndexOutOfBoundsException(offset16);
982         }
983 
984         int result = 0;
985         char ch;
986         boolean hadLeadSurrogate = false;
987 
988         for (int i = start; i < offset16; ++i) {
989             ch = source[i];
990             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
991                 hadLeadSurrogate = false; // count valid trail as zero
992             } else {
993                 hadLeadSurrogate = isLeadSurrogate(ch);
994                 ++result; // count others as 1
995             }
996         }
997 
998         if (offset16 == limit) {
999             return result;
1000         }
1001 
1002         // end of source being the less significant surrogate character
1003         // shift result back to the start of the supplementary character
1004         if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) {
1005             result--;
1006         }
1007 
1008         return result;
1009     }
1010 
1011     /**
1012      * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required,
1013      * use {@link com.ibm.icu.lang.UCharacter#isLegal(int)} on char32 before
1014      * calling.
1015      *
1016      * @param target The buffer to append to
1017      * @param char32 Value to append.
1018      * @return the updated StringBuffer
1019      * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints
1020      * @stable ICU 2.1
1021      */
append(StringBuffer target, int char32)1022     public static StringBuffer append(StringBuffer target, int char32) {
1023         // Check for irregular values
1024         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1025             throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
1026         }
1027 
1028         // Write the UTF-16 values
1029         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1030             target.append(getLeadSurrogate(char32));
1031             target.append(getTrailSurrogate(char32));
1032         } else {
1033             target.append((char) char32);
1034         }
1035         return target;
1036     }
1037 
1038     /**
1039      * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a
1040      * convenience.
1041      *
1042      * @param target The buffer to append to
1043      * @param cp The code point to append
1044      * @return the updated StringBuffer
1045      * @throws IllegalArgumentException If cp is not a valid code point
1046      * @stable ICU 3.0
1047      */
appendCodePoint(StringBuffer target, int cp)1048     public static StringBuffer appendCodePoint(StringBuffer target, int cp) {
1049         return append(target, cp);
1050     }
1051 
1052     /**
1053      * Adds a codepoint to offset16 position of the argument char array.
1054      *
1055      * @param target Char array to be append with the new code point
1056      * @param limit UTF16 offset which the codepoint will be appended.
1057      * @param char32 Code point to be appended
1058      * @return offset after char32 in the array.
1059      * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not
1060      *                lie within the range of the Unicode codepoints.
1061      * @stable ICU 2.1
1062      */
append(char[] target, int limit, int char32)1063     public static int append(char[] target, int limit, int char32) {
1064         // Check for irregular values
1065         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1066             throw new IllegalArgumentException("Illegal codepoint");
1067         }
1068         // Write the UTF-16 values
1069         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1070             target[limit++] = getLeadSurrogate(char32);
1071             target[limit++] = getTrailSurrogate(char32);
1072         } else {
1073             target[limit++] = (char) char32;
1074         }
1075         return limit;
1076     }
1077 
1078     /**
1079      * Number of codepoints in a UTF16 String
1080      *
1081      * @param source UTF16 string
1082      * @return number of codepoint in string
1083      * @stable ICU 2.1
1084      */
countCodePoint(String source)1085     public static int countCodePoint(String source) {
1086         if (source == null || source.length() == 0) {
1087             return 0;
1088         }
1089         return findCodePointOffset(source, source.length());
1090     }
1091 
1092     /**
1093      * Number of codepoints in a UTF16 String buffer
1094      *
1095      * @param source UTF16 string buffer
1096      * @return number of codepoint in string
1097      * @stable ICU 2.1
1098      */
countCodePoint(StringBuffer source)1099     public static int countCodePoint(StringBuffer source) {
1100         if (source == null || source.length() == 0) {
1101             return 0;
1102         }
1103         return findCodePointOffset(source, source.length());
1104     }
1105 
1106     /**
1107      * Number of codepoints in a UTF16 char array substring
1108      *
1109      * @param source UTF16 char array
1110      * @param start Offset of the substring
1111      * @param limit Offset of the substring
1112      * @return number of codepoint in the substring
1113      * @exception IndexOutOfBoundsException If start and limit are not valid.
1114      * @stable ICU 2.1
1115      */
countCodePoint(char source[], int start, int limit)1116     public static int countCodePoint(char source[], int start, int limit) {
1117         if (source == null || source.length == 0) {
1118             return 0;
1119         }
1120         return findCodePointOffset(source, start, limit, limit - start);
1121     }
1122 
1123     /**
1124      * Set a code point into a UTF16 position. Adjusts target according if we are replacing a
1125      * non-supplementary codepoint with a supplementary and vice versa.
1126      *
1127      * @param target Stringbuffer
1128      * @param offset16 UTF16 position to insert into
1129      * @param char32 Code point
1130      * @stable ICU 2.1
1131      */
setCharAt(StringBuffer target, int offset16, int char32)1132     public static void setCharAt(StringBuffer target, int offset16, int char32) {
1133         int count = 1;
1134         char single = target.charAt(offset16);
1135 
1136         if (isSurrogate(single)) {
1137             // pairs of the surrogate with offset16 at the lead char found
1138             if (isLeadSurrogate(single) && (target.length() > offset16 + 1)
1139                     && isTrailSurrogate(target.charAt(offset16 + 1))) {
1140                 count++;
1141             } else {
1142                 // pairs of the surrogate with offset16 at the trail char
1143                 // found
1144                 if (isTrailSurrogate(single) && (offset16 > 0)
1145                         && isLeadSurrogate(target.charAt(offset16 - 1))) {
1146                     offset16--;
1147                     count++;
1148                 }
1149             }
1150         }
1151         target.replace(offset16, offset16 + count, valueOf(char32));
1152     }
1153 
1154     /**
1155      * Set a code point into a UTF16 position in a char array. Adjusts target according if we are
1156      * replacing a non-supplementary codepoint with a supplementary and vice versa.
1157      *
1158      * @param target char array
1159      * @param limit numbers of valid chars in target, different from target.length. limit counts the
1160      *            number of chars in target that represents a string, not the size of array target.
1161      * @param offset16 UTF16 position to insert into
1162      * @param char32 code point
1163      * @return new number of chars in target that represents a string
1164      * @exception IndexOutOfBoundsException if offset16 is out of range
1165      * @stable ICU 2.1
1166      */
setCharAt(char target[], int limit, int offset16, int char32)1167     public static int setCharAt(char target[], int limit, int offset16, int char32) {
1168         if (offset16 >= limit) {
1169             throw new ArrayIndexOutOfBoundsException(offset16);
1170         }
1171         int count = 1;
1172         char single = target[offset16];
1173 
1174         if (isSurrogate(single)) {
1175             // pairs of the surrogate with offset16 at the lead char found
1176             if (isLeadSurrogate(single) && (target.length > offset16 + 1)
1177                     && isTrailSurrogate(target[offset16 + 1])) {
1178                 count++;
1179             } else {
1180                 // pairs of the surrogate with offset16 at the trail char
1181                 // found
1182                 if (isTrailSurrogate(single) && (offset16 > 0)
1183                         && isLeadSurrogate(target[offset16 - 1])) {
1184                     offset16--;
1185                     count++;
1186                 }
1187             }
1188         }
1189 
1190         String str = valueOf(char32);
1191         int result = limit;
1192         int strlength = str.length();
1193         target[offset16] = str.charAt(0);
1194         if (count == strlength) {
1195             if (count == 2) {
1196                 target[offset16 + 1] = str.charAt(1);
1197             }
1198         } else {
1199             // this is not exact match in space, we'll have to do some
1200             // shifting
1201             System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit
1202                     - (offset16 + count));
1203             if (count < strlength) {
1204                 // char32 is a supplementary character trying to squeeze into
1205                 // a non-supplementary space
1206                 target[offset16 + 1] = str.charAt(1);
1207                 result++;
1208                 if (result < target.length) {
1209                     target[result] = 0;
1210                 }
1211             } else {
1212                 // char32 is a non-supplementary character trying to fill
1213                 // into a supplementary space
1214                 result--;
1215                 target[result] = 0;
1216             }
1217         }
1218         return result;
1219     }
1220 
1221     /**
1222      * Shifts offset16 by the argument number of codepoints
1223      *
1224      * @param source string
1225      * @param offset16 UTF16 position to shift
1226      * @param shift32 number of codepoints to shift
1227      * @return new shifted offset16
1228      * @exception IndexOutOfBoundsException if the new offset16 is out of bounds.
1229      * @stable ICU 2.1
1230      */
moveCodePointOffset(String source, int offset16, int shift32)1231     public static int moveCodePointOffset(String source, int offset16, int shift32) {
1232         int result = offset16;
1233         int size = source.length();
1234         int count;
1235         char ch;
1236         if (offset16 < 0 || offset16 > size) {
1237             throw new StringIndexOutOfBoundsException(offset16);
1238         }
1239         if (shift32 > 0) {
1240             if (shift32 + offset16 > size) {
1241                 throw new StringIndexOutOfBoundsException(offset16);
1242             }
1243             count = shift32;
1244             while (result < size && count > 0) {
1245                 ch = source.charAt(result);
1246                 if (isLeadSurrogate(ch) && ((result + 1) < size)
1247                         && isTrailSurrogate(source.charAt(result + 1))) {
1248                     result++;
1249                 }
1250                 count--;
1251                 result++;
1252             }
1253         } else {
1254             if (offset16 + shift32 < 0) {
1255                 throw new StringIndexOutOfBoundsException(offset16);
1256             }
1257             for (count = -shift32; count > 0; count--) {
1258                 result--;
1259                 if (result < 0) {
1260                     break;
1261                 }
1262                 ch = source.charAt(result);
1263                 if (isTrailSurrogate(ch) && result > 0
1264                         && isLeadSurrogate(source.charAt(result - 1))) {
1265                     result--;
1266                 }
1267             }
1268         }
1269         if (count != 0) {
1270             throw new StringIndexOutOfBoundsException(shift32);
1271         }
1272         return result;
1273     }
1274 
1275     /**
1276      * Shifts offset16 by the argument number of codepoints
1277      *
1278      * @param source String buffer
1279      * @param offset16 UTF16 position to shift
1280      * @param shift32 Number of codepoints to shift
1281      * @return new shifted offset16
1282      * @exception IndexOutOfBoundsException If the new offset16 is out of bounds.
1283      * @stable ICU 2.1
1284      */
moveCodePointOffset(StringBuffer source, int offset16, int shift32)1285     public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) {
1286         int result = offset16;
1287         int size = source.length();
1288         int count;
1289         char ch;
1290         if (offset16 < 0 || offset16 > size) {
1291             throw new StringIndexOutOfBoundsException(offset16);
1292         }
1293         if (shift32 > 0) {
1294             if (shift32 + offset16 > size) {
1295                 throw new StringIndexOutOfBoundsException(offset16);
1296             }
1297             count = shift32;
1298             while (result < size && count > 0) {
1299                 ch = source.charAt(result);
1300                 if (isLeadSurrogate(ch) && ((result + 1) < size)
1301                         && isTrailSurrogate(source.charAt(result + 1))) {
1302                     result++;
1303                 }
1304                 count--;
1305                 result++;
1306             }
1307         } else {
1308             if (offset16 + shift32 < 0) {
1309                 throw new StringIndexOutOfBoundsException(offset16);
1310             }
1311             for (count = -shift32; count > 0; count--) {
1312                 result--;
1313                 if (result < 0) {
1314                     break;
1315                 }
1316                 ch = source.charAt(result);
1317                 if (isTrailSurrogate(ch) && result > 0
1318                         && isLeadSurrogate(source.charAt(result - 1))) {
1319                     result--;
1320                 }
1321             }
1322         }
1323         if (count != 0) {
1324             throw new StringIndexOutOfBoundsException(shift32);
1325         }
1326         return result;
1327     }
1328 
1329     /**
1330      * Shifts offset16 by the argument number of codepoints within a subarray.
1331      *
1332      * @param source Char array
1333      * @param start Position of the subarray to be performed on
1334      * @param limit Position of the subarray to be performed on
1335      * @param offset16 UTF16 position to shift relative to start
1336      * @param shift32 Number of codepoints to shift
1337      * @return new shifted offset16 relative to start
1338      * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the
1339      *                subarray bounds are out of range.
1340      * @stable ICU 2.1
1341      */
moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32)1342     public static int moveCodePointOffset(char source[], int start, int limit, int offset16,
1343             int shift32) {
1344         int size = source.length;
1345         int count;
1346         char ch;
1347         int result = offset16 + start;
1348         if (start < 0 || limit < start) {
1349             throw new StringIndexOutOfBoundsException(start);
1350         }
1351         if (limit > size) {
1352             throw new StringIndexOutOfBoundsException(limit);
1353         }
1354         if (offset16 < 0 || result > limit) {
1355             throw new StringIndexOutOfBoundsException(offset16);
1356         }
1357         if (shift32 > 0) {
1358             if (shift32 + result > size) {
1359                 throw new StringIndexOutOfBoundsException(result);
1360             }
1361             count = shift32;
1362             while (result < limit && count > 0) {
1363                 ch = source[result];
1364                 if (isLeadSurrogate(ch) && (result + 1 < limit)
1365                         && isTrailSurrogate(source[result + 1])) {
1366                     result++;
1367                 }
1368                 count--;
1369                 result++;
1370             }
1371         } else {
1372             if (result + shift32 < start) {
1373                 throw new StringIndexOutOfBoundsException(result);
1374             }
1375             for (count = -shift32; count > 0; count--) {
1376                 result--;
1377                 if (result < start) {
1378                     break;
1379                 }
1380                 ch = source[result];
1381                 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
1382                     result--;
1383                 }
1384             }
1385         }
1386         if (count != 0) {
1387             throw new StringIndexOutOfBoundsException(shift32);
1388         }
1389         result -= start;
1390         return result;
1391     }
1392 
1393     /**
1394      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1395      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1396      * codepoint. The length of target increases by one if codepoint is non-supplementary, 2
1397      * otherwise.
1398      * <p>
1399      * The overall effect is exactly as if the argument were converted to a string by the method
1400      * valueOf(char) and the characters in that string were then inserted into target at the
1401      * position indicated by offset16.
1402      * </p>
1403      * <p>
1404      * The offset argument must be greater than or equal to 0, and less than or equal to the length
1405      * of source.
1406      *
1407      * @param target String buffer to insert to
1408      * @param offset16 Offset which char32 will be inserted in
1409      * @param char32 Codepoint to be inserted
1410      * @return a reference to target
1411      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1412      * @stable ICU 2.1
1413      */
insert(StringBuffer target, int offset16, int char32)1414     public static StringBuffer insert(StringBuffer target, int offset16, int char32) {
1415         String str = valueOf(char32);
1416         if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1417             offset16++;
1418         }
1419         target.insert(offset16, str);
1420         return target;
1421     }
1422 
1423     /**
1424      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1425      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1426      * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise.
1427      * <p>
1428      * The overall effect is exactly as if the argument were converted to a string by the method
1429      * valueOf(char) and the characters in that string were then inserted into target at the
1430      * position indicated by offset16.
1431      * </p>
1432      * <p>
1433      * The offset argument must be greater than or equal to 0, and less than or equal to the limit.
1434      *
1435      * @param target Char array to insert to
1436      * @param limit End index of the char array, limit &lt;= target.length
1437      * @param offset16 Offset which char32 will be inserted in
1438      * @param char32 Codepoint to be inserted
1439      * @return new limit size
1440      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1441      * @stable ICU 2.1
1442      */
insert(char target[], int limit, int offset16, int char32)1443     public static int insert(char target[], int limit, int offset16, int char32) {
1444         String str = valueOf(char32);
1445         if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1446             offset16++;
1447         }
1448         int size = str.length();
1449         if (limit + size > target.length) {
1450             throw new ArrayIndexOutOfBoundsException(offset16 + size);
1451         }
1452         System.arraycopy(target, offset16, target, offset16 + size, limit - offset16);
1453         target[offset16] = str.charAt(0);
1454         if (size == 2) {
1455             target[offset16 + 1] = str.charAt(1);
1456         }
1457         return limit + size;
1458     }
1459 
1460     /**
1461      * Removes the codepoint at the specified position in this target (shortening target by 1
1462      * character if the codepoint is a non-supplementary, 2 otherwise).
1463      *
1464      * @param target String buffer to remove codepoint from
1465      * @param offset16 Offset which the codepoint will be removed
1466      * @return a reference to target
1467      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1468      * @stable ICU 2.1
1469      */
delete(StringBuffer target, int offset16)1470     public static StringBuffer delete(StringBuffer target, int offset16) {
1471         int count = 1;
1472         switch (bounds(target, offset16)) {
1473         case LEAD_SURROGATE_BOUNDARY:
1474             count++;
1475             break;
1476         case TRAIL_SURROGATE_BOUNDARY:
1477             count++;
1478             offset16--;
1479             break;
1480         }
1481         target.delete(offset16, offset16 + count);
1482         return target;
1483     }
1484 
1485     /**
1486      * Removes the codepoint at the specified position in this target (shortening target by 1
1487      * character if the codepoint is a non-supplementary, 2 otherwise).
1488      *
1489      * @param target String buffer to remove codepoint from
1490      * @param limit End index of the char array, limit &lt;= target.length
1491      * @param offset16 Offset which the codepoint will be removed
1492      * @return a new limit size
1493      * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1494      * @stable ICU 2.1
1495      */
delete(char target[], int limit, int offset16)1496     public static int delete(char target[], int limit, int offset16) {
1497         int count = 1;
1498         switch (bounds(target, 0, limit, offset16)) {
1499         case LEAD_SURROGATE_BOUNDARY:
1500             count++;
1501             break;
1502         case TRAIL_SURROGATE_BOUNDARY:
1503             count++;
1504             offset16--;
1505             break;
1506         }
1507         System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count));
1508         target[limit - count] = 0;
1509         return limit - count;
1510     }
1511 
1512     /**
1513      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1514      * the argument codepoint. I.e., the smallest index <code>i</code> such that
1515      * <code>UTF16.charAt(source, i) ==
1516      * char32</code> is true.
1517      * <p>
1518      * If no such character occurs in this string, then -1 is returned.
1519      * </p>
1520      * <p>
1521      * Examples:<br>
1522      * UTF16.indexOf("abc", 'a') returns 0<br>
1523      * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1524      * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1525      * </p>
1526      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1527      * characters to its fullest.
1528      *
1529      * @param source UTF16 format Unicode string that will be searched
1530      * @param char32 Codepoint to search for
1531      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1532      *         -1 if the codepoint does not occur.
1533      * @stable ICU 2.6
1534      */
indexOf(String source, int char32)1535     public static int indexOf(String source, int char32) {
1536         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1537             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1538         }
1539         // non-surrogate bmp
1540         if (char32 < LEAD_SURROGATE_MIN_VALUE
1541                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1542             return source.indexOf((char) char32);
1543         }
1544         // surrogate
1545         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1546             int result = source.indexOf((char) char32);
1547             if (result >= 0) {
1548                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1549                         && isTrailSurrogate(source.charAt(result + 1))) {
1550                     return indexOf(source, char32, result + 1);
1551                 }
1552                 // trail surrogate
1553                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1554                     return indexOf(source, char32, result + 1);
1555                 }
1556             }
1557             return result;
1558         }
1559         // supplementary
1560         String char32str = toString(char32);
1561         return source.indexOf(char32str);
1562     }
1563 
1564     /**
1565      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1566      * the argument string str. This method is implemented based on codepoints, hence a "lead
1567      * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
1568      * starts with trail surrogate character at index 0, a source with a leading a surrogate
1569      * character before str found at in source will not have a valid match. Vice versa for lead
1570      * surrogates that ends str. See example below.
1571      * <p>
1572      * If no such string str occurs in this source, then -1 is returned.
1573      * </p>
1574      * <p>
1575      * Examples:<br>
1576      * UTF16.indexOf("abc", "ab") returns 0<br>
1577      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1578      * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1579      * </p>
1580      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1581      * characters to its fullest.
1582      *
1583      * @param source UTF16 format Unicode string that will be searched
1584      * @param str UTF16 format Unicode string to search for
1585      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1586      *         -1 if the codepoint does not occur.
1587      * @stable ICU 2.6
1588      */
indexOf(String source, String str)1589     public static int indexOf(String source, String str) {
1590         int strLength = str.length();
1591         // non-surrogate ends
1592         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1593             return source.indexOf(str);
1594         }
1595 
1596         int result = source.indexOf(str);
1597         int resultEnd = result + strLength;
1598         if (result >= 0) {
1599             // check last character
1600             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1601                     && isTrailSurrogate(source.charAt(resultEnd + 1))) {
1602                 return indexOf(source, str, resultEnd + 1);
1603             }
1604             // check first character which is a trail surrogate
1605             if (isTrailSurrogate(str.charAt(0)) && result > 0
1606                     && isLeadSurrogate(source.charAt(result - 1))) {
1607                 return indexOf(source, str, resultEnd + 1);
1608             }
1609         }
1610         return result;
1611     }
1612 
1613     /**
1614      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1615      * the argument codepoint. I.e., the smallest index i such that: <br>
1616      * (UTF16.charAt(source, i) == char32 &amp;&amp; i &gt;= fromIndex) is true.
1617      * <p>
1618      * If no such character occurs in this string, then -1 is returned.
1619      * </p>
1620      * <p>
1621      * Examples:<br>
1622      * UTF16.indexOf("abc", 'a', 1) returns -1<br>
1623      * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br>
1624      * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br>
1625      * </p>
1626      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1627      * characters to its fullest.
1628      *
1629      * @param source UTF16 format Unicode string that will be searched
1630      * @param char32 Codepoint to search for
1631      * @param fromIndex The index to start the search from.
1632      * @return the index of the first occurrence of the codepoint in the argument Unicode string at
1633      *         or after fromIndex, or -1 if the codepoint does not occur.
1634      * @stable ICU 2.6
1635      */
indexOf(String source, int char32, int fromIndex)1636     public static int indexOf(String source, int char32, int fromIndex) {
1637         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1638             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1639         }
1640         // non-surrogate bmp
1641         if (char32 < LEAD_SURROGATE_MIN_VALUE
1642                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1643             return source.indexOf((char) char32, fromIndex);
1644         }
1645         // surrogate
1646         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1647             int result = source.indexOf((char) char32, fromIndex);
1648             if (result >= 0) {
1649                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1650                         && isTrailSurrogate(source.charAt(result + 1))) {
1651                     return indexOf(source, char32, result + 1);
1652                 }
1653                 // trail surrogate
1654                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1655                     return indexOf(source, char32, result + 1);
1656                 }
1657             }
1658             return result;
1659         }
1660         // supplementary
1661         String char32str = toString(char32);
1662         return source.indexOf(char32str, fromIndex);
1663     }
1664 
1665     /**
1666      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1667      * the argument string str. This method is implemented based on codepoints, hence a "lead
1668      * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
1669      * starts with trail surrogate character at index 0, a source with a leading a surrogate
1670      * character before str found at in source will not have a valid match. Vice versa for lead
1671      * surrogates that ends str. See example below.
1672      * <p>
1673      * If no such string str occurs in this source, then -1 is returned.
1674      * </p>
1675      * <p>
1676      * Examples:<br>
1677      * UTF16.indexOf("abc", "ab", 0) returns 0<br>
1678      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
1679      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
1680      * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
1681      * </p>
1682      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1683      * characters to its fullest.
1684      *
1685      * @param source UTF16 format Unicode string that will be searched
1686      * @param str UTF16 format Unicode string to search for
1687      * @param fromIndex The index to start the search from.
1688      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1689      *         -1 if the codepoint does not occur.
1690      * @stable ICU 2.6
1691      */
indexOf(String source, String str, int fromIndex)1692     public static int indexOf(String source, String str, int fromIndex) {
1693         int strLength = str.length();
1694         // non-surrogate ends
1695         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1696             return source.indexOf(str, fromIndex);
1697         }
1698 
1699         int result = source.indexOf(str, fromIndex);
1700         int resultEnd = result + strLength;
1701         if (result >= 0) {
1702             // check last character
1703             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1704                     && isTrailSurrogate(source.charAt(resultEnd))) {
1705                 return indexOf(source, str, resultEnd + 1);
1706             }
1707             // check first character which is a trail surrogate
1708             if (isTrailSurrogate(str.charAt(0)) && result > 0
1709                     && isLeadSurrogate(source.charAt(result - 1))) {
1710                 return indexOf(source, str, resultEnd + 1);
1711             }
1712         }
1713         return result;
1714     }
1715 
1716     /**
1717      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1718      * the argument codepoint. I.e., the index returned is the largest value i such that:
1719      * UTF16.charAt(source, i) == char32 is true.
1720      * <p>
1721      * Examples:<br>
1722      * UTF16.lastIndexOf("abc", 'a') returns 0<br>
1723      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1724      * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1725      * </p>
1726      * <p>
1727      * source is searched backwards starting at the last character.
1728      * </p>
1729      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1730      * characters to its fullest.
1731      *
1732      * @param source UTF16 format Unicode string that will be searched
1733      * @param char32 Codepoint to search for
1734      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1735      *         does not occur.
1736      * @stable ICU 2.6
1737      */
lastIndexOf(String source, int char32)1738     public static int lastIndexOf(String source, int char32) {
1739         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1740             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1741         }
1742         // non-surrogate bmp
1743         if (char32 < LEAD_SURROGATE_MIN_VALUE
1744                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1745             return source.lastIndexOf((char) char32);
1746         }
1747         // surrogate
1748         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1749             int result = source.lastIndexOf((char) char32);
1750             if (result >= 0) {
1751                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1752                         && isTrailSurrogate(source.charAt(result + 1))) {
1753                     return lastIndexOf(source, char32, result - 1);
1754                 }
1755                 // trail surrogate
1756                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1757                     return lastIndexOf(source, char32, result - 1);
1758                 }
1759             }
1760             return result;
1761         }
1762         // supplementary
1763         String char32str = toString(char32);
1764         return source.lastIndexOf(char32str);
1765     }
1766 
1767     /**
1768      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1769      * the argument string str. This method is implemented based on codepoints, hence a "lead
1770      * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
1771      * starts with trail surrogate character at index 0, a source with a leading a surrogate
1772      * character before str found at in source will not have a valid match. Vice versa for lead
1773      * surrogates that ends str. See example below.
1774      * <p>
1775      * Examples:<br>
1776      * UTF16.lastIndexOf("abc", "a") returns 0<br>
1777      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1778      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1779      * </p>
1780      * <p>
1781      * source is searched backwards starting at the last character.
1782      * </p>
1783      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1784      * characters to its fullest.
1785      *
1786      * @param source UTF16 format Unicode string that will be searched
1787      * @param str UTF16 format Unicode string to search for
1788      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1789      *         does not occur.
1790      * @stable ICU 2.6
1791      */
lastIndexOf(String source, String str)1792     public static int lastIndexOf(String source, String str) {
1793         int strLength = str.length();
1794         // non-surrogate ends
1795         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1796             return source.lastIndexOf(str);
1797         }
1798 
1799         int result = source.lastIndexOf(str);
1800         if (result >= 0) {
1801             // check last character
1802             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1803                     && isTrailSurrogate(source.charAt(result + strLength + 1))) {
1804                 return lastIndexOf(source, str, result - 1);
1805             }
1806             // check first character which is a trail surrogate
1807             if (isTrailSurrogate(str.charAt(0)) && result > 0
1808                     && isLeadSurrogate(source.charAt(result - 1))) {
1809                 return lastIndexOf(source, str, result - 1);
1810             }
1811         }
1812         return result;
1813     }
1814 
1815     /**
1816      * <p>
1817      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1818      * the argument codepoint, where the result is less than or equals to fromIndex.
1819      * </p>
1820      * <p>
1821      * This method is implemented based on codepoints, hence a single surrogate character will not
1822      * match a supplementary character.
1823      * </p>
1824      * <p>
1825      * source is searched backwards starting at the last character starting at the specified index.
1826      * </p>
1827      * <p>
1828      * Examples:<br>
1829      * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
1830      * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
1831      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br>
1832      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br>
1833      * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1834      * </p>
1835      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1836      * characters to its fullest.
1837      *
1838      * @param source UTF16 format Unicode string that will be searched
1839      * @param char32 Codepoint to search for
1840      * @param fromIndex the index to start the search from. There is no restriction on the value of
1841      *            fromIndex. If it is greater than or equal to the length of this string, it has the
1842      *            same effect as if it were equal to one less than the length of this string: this
1843      *            entire string may be searched. If it is negative, it has the same effect as if it
1844      *            were -1: -1 is returned.
1845      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1846      *         does not occur.
1847      * @stable ICU 2.6
1848      */
lastIndexOf(String source, int char32, int fromIndex)1849     public static int lastIndexOf(String source, int char32, int fromIndex) {
1850         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1851             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1852         }
1853         // non-surrogate bmp
1854         if (char32 < LEAD_SURROGATE_MIN_VALUE
1855                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1856             return source.lastIndexOf((char) char32, fromIndex);
1857         }
1858         // surrogate
1859         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1860             int result = source.lastIndexOf((char) char32, fromIndex);
1861             if (result >= 0) {
1862                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1863                         && isTrailSurrogate(source.charAt(result + 1))) {
1864                     return lastIndexOf(source, char32, result - 1);
1865                 }
1866                 // trail surrogate
1867                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1868                     return lastIndexOf(source, char32, result - 1);
1869                 }
1870             }
1871             return result;
1872         }
1873         // supplementary
1874         String char32str = toString(char32);
1875         return source.lastIndexOf(char32str, fromIndex);
1876     }
1877 
1878     /**
1879      * <p>
1880      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1881      * the argument string str, where the result is less than or equals to fromIndex.
1882      * </p>
1883      * <p>
1884      * This method is implemented based on codepoints, hence a "lead surrogate character + trail
1885      * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate
1886      * character at index 0, a source with a leading a surrogate character before str found at in
1887      * source will not have a valid match. Vice versa for lead surrogates that ends str.
1888      * </p>
1889      * See example below.
1890      * <p>
1891      * Examples:<br>
1892      * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
1893      * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
1894      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
1895      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
1896      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
1897      * </p>
1898      * <p>
1899      * source is searched backwards starting at the last character.
1900      * </p>
1901      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1902      * characters to its fullest.
1903      *
1904      * @param source UTF16 format Unicode string that will be searched
1905      * @param str UTF16 format Unicode string to search for
1906      * @param fromIndex the index to start the search from. There is no restriction on the value of
1907      *            fromIndex. If it is greater than or equal to the length of this string, it has the
1908      *            same effect as if it were equal to one less than the length of this string: this
1909      *            entire string may be searched. If it is negative, it has the same effect as if it
1910      *            were -1: -1 is returned.
1911      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1912      *         does not occur.
1913      * @stable ICU 2.6
1914      */
lastIndexOf(String source, String str, int fromIndex)1915     public static int lastIndexOf(String source, String str, int fromIndex) {
1916         int strLength = str.length();
1917         // non-surrogate ends
1918         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1919             return source.lastIndexOf(str, fromIndex);
1920         }
1921 
1922         int result = source.lastIndexOf(str, fromIndex);
1923         if (result >= 0) {
1924             // check last character
1925             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1926                     && isTrailSurrogate(source.charAt(result + strLength))) {
1927                 return lastIndexOf(source, str, result - 1);
1928             }
1929             // check first character which is a trail surrogate
1930             if (isTrailSurrogate(str.charAt(0)) && result > 0
1931                     && isLeadSurrogate(source.charAt(result - 1))) {
1932                 return lastIndexOf(source, str, result - 1);
1933             }
1934         }
1935         return result;
1936     }
1937 
1938     /**
1939      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of
1940      * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16
1941      * format Unicode string source, then source will be returned. Otherwise, a new String object is
1942      * created that represents a codepoint sequence identical to the codepoint sequence represented
1943      * by source, except that every occurrence of oldChar32 is replaced by an occurrence of
1944      * newChar32.
1945      * <p>
1946      * Examples: <br>
1947      * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
1948      * returns "mosquito in your collar"<br>
1949      * UTF16.replace("JonL", 'q', 'x');<br>
1950      * returns "JonL" (no change)<br>
1951      * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br>
1952      * returns "Supplementary character !"<br>
1953      * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br>
1954      * returns "Supplementary character \ud800\udc00"<br>
1955      * </p>
1956      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1957      * characters to its fullest.
1958      *
1959      * @param source UTF16 format Unicode string which the codepoint replacements will be based on.
1960      * @param oldChar32 Non-zero old codepoint to be replaced.
1961      * @param newChar32 The new codepoint to replace oldChar32
1962      * @return new String derived from source by replacing every occurrence of oldChar32 with
1963      *         newChar32, unless when no oldChar32 is found in source then source will be returned.
1964      * @stable ICU 2.6
1965      */
replace(String source, int oldChar32, int newChar32)1966     public static String replace(String source, int oldChar32, int newChar32) {
1967         if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) {
1968             throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint");
1969         }
1970         if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) {
1971             throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint");
1972         }
1973 
1974         int index = indexOf(source, oldChar32);
1975         if (index == -1) {
1976             return source;
1977         }
1978         String newChar32Str = toString(newChar32);
1979         int oldChar32Size = 1;
1980         int newChar32Size = newChar32Str.length();
1981         StringBuffer result = new StringBuffer(source);
1982         int resultIndex = index;
1983 
1984         if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) {
1985             oldChar32Size = 2;
1986         }
1987 
1988         while (index != -1) {
1989             int endResultIndex = resultIndex + oldChar32Size;
1990             result.replace(resultIndex, endResultIndex, newChar32Str);
1991             int lastEndIndex = index + oldChar32Size;
1992             index = indexOf(source, oldChar32, lastEndIndex);
1993             resultIndex += newChar32Size + index - lastEndIndex;
1994         }
1995         return result.toString();
1996     }
1997 
1998     /**
1999      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr
2000      * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string
2001      * source, then source will be returned. Otherwise, a new String object is created that
2002      * represents a codepoint sequence identical to the codepoint sequence represented by source,
2003      * except that every occurrence of oldStr is replaced by an occurrence of newStr.
2004      * <p>
2005      * Examples: <br>
2006      * UTF16.replace("mesquite in your cellar", "e", "o");<br>
2007      * returns "mosquito in your collar"<br>
2008      * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br>
2009      * returns "cat in your cellar"<br>
2010      * UTF16.replace("JonL", "q", "x");<br>
2011      * returns "JonL" (no change)<br>
2012      * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!'); <br>
2013      * returns "Supplementary character !"<br>
2014      * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!'); <br>
2015      * returns "Supplementary character \ud800\udc00"<br>
2016      * </p>
2017      * Note this method is provided as support to jdk 1.3, which does not support supplementary
2018      * characters to its fullest.
2019      *
2020      * @param source UTF16 format Unicode string which the replacements will be based on.
2021      * @param oldStr Non-zero-length string to be replaced.
2022      * @param newStr The new string to replace oldStr
2023      * @return new String derived from source by replacing every occurrence of oldStr with newStr.
2024      *         When no oldStr is found in source, then source will be returned.
2025      * @stable ICU 2.6
2026      */
replace(String source, String oldStr, String newStr)2027     public static String replace(String source, String oldStr, String newStr) {
2028         int index = indexOf(source, oldStr);
2029         if (index == -1) {
2030             return source;
2031         }
2032         int oldStrSize = oldStr.length();
2033         int newStrSize = newStr.length();
2034         StringBuffer result = new StringBuffer(source);
2035         int resultIndex = index;
2036 
2037         while (index != -1) {
2038             int endResultIndex = resultIndex + oldStrSize;
2039             result.replace(resultIndex, endResultIndex, newStr);
2040             int lastEndIndex = index + oldStrSize;
2041             index = indexOf(source, oldStr, lastEndIndex);
2042             resultIndex += newStrSize + index - lastEndIndex;
2043         }
2044         return result.toString();
2045     }
2046 
2047     /**
2048      * Reverses a UTF16 format Unicode string and replaces source's content with it. This method
2049      * will reverse surrogate characters correctly, instead of blindly reversing every character.
2050      * <p>
2051      * Examples:<br>
2052      * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br>
2053      * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS".
2054      *
2055      * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed
2056      * @return a modified source with reversed UTF16 format Unicode string.
2057      * @stable ICU 2.6
2058      */
reverse(StringBuffer source)2059     public static StringBuffer reverse(StringBuffer source) {
2060         int length = source.length();
2061         StringBuffer result = new StringBuffer(length);
2062         for (int i = length; i-- > 0;) {
2063             char ch = source.charAt(i);
2064             if (isTrailSurrogate(ch) && i > 0) {
2065                 char ch2 = source.charAt(i - 1);
2066                 if (isLeadSurrogate(ch2)) {
2067                     result.append(ch2);
2068                     result.append(ch);
2069                     --i;
2070                     continue;
2071                 }
2072             }
2073             result.append(ch);
2074         }
2075         return result;
2076     }
2077 
2078     /**
2079      * Check if the string contains more Unicode code points than a certain number. This is more
2080      * efficient than counting all code points in the entire string and comparing that number with a
2081      * threshold. This function may not need to scan the string at all if the length is within a
2082      * certain range, and never needs to count more than 'number + 1' code points. Logically
2083      * equivalent to (countCodePoint(s) &gt; number). A Unicode code point may occupy either one or two
2084      * code units.
2085      *
2086      * @param source The input string.
2087      * @param number The number of code points in the string is compared against the 'number'
2088      *            parameter.
2089      * @return boolean value for whether the string contains more Unicode code points than 'number'.
2090      * @stable ICU 2.4
2091      */
hasMoreCodePointsThan(String source, int number)2092     public static boolean hasMoreCodePointsThan(String source, int number) {
2093         if (number < 0) {
2094             return true;
2095         }
2096         if (source == null) {
2097             return false;
2098         }
2099         int length = source.length();
2100 
2101         // length >= 0 known
2102         // source contains at least (length + 1) / 2 code points: <= 2
2103         // chars per cp
2104         if (((length + 1) >> 1) > number) {
2105             return true;
2106         }
2107 
2108         // check if source does not even contain enough chars
2109         int maxsupplementary = length - number;
2110         if (maxsupplementary <= 0) {
2111             return false;
2112         }
2113 
2114         // there are maxsupplementary = length - number more chars than
2115         // asked-for code points
2116 
2117         // count code points until they exceed and also check that there are
2118         // no more than maxsupplementary supplementary code points (char pairs)
2119         int start = 0;
2120         while (true) {
2121             if (length == 0) {
2122                 return false;
2123             }
2124             if (number == 0) {
2125                 return true;
2126             }
2127             if (isLeadSurrogate(source.charAt(start++)) && start != length
2128                     && isTrailSurrogate(source.charAt(start))) {
2129                 start++;
2130                 if (--maxsupplementary <= 0) {
2131                     // too many pairs - too few code points
2132                     return false;
2133                 }
2134             }
2135             --number;
2136         }
2137     }
2138 
2139     /**
2140      * Check if the sub-range of char array, from argument start to limit, contains more Unicode
2141      * code points than a certain number. This is more efficient than counting all code points in
2142      * the entire char array range and comparing that number with a threshold. This function may not
2143      * need to scan the char array at all if start and limit is within a certain range, and never
2144      * needs to count more than 'number + 1' code points. Logically equivalent to
2145      * (countCodePoint(source, start, limit) &gt; number). A Unicode code point may occupy either one
2146      * or two code units.
2147      *
2148      * @param source Array of UTF-16 chars
2149      * @param start Offset to substring in the source array for analyzing
2150      * @param limit Offset to substring in the source array for analyzing
2151      * @param number The number of code points in the string is compared against the 'number'
2152      *            parameter.
2153      * @return boolean value for whether the string contains more Unicode code points than 'number'.
     * @exception IndexOutOfBoundsException Thrown when start or limit is negative, or when limit &lt; start
2155      * @stable ICU 2.4
2156      */
hasMoreCodePointsThan(char source[], int start, int limit, int number)2157     public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) {
2158         int length = limit - start;
2159         if (length < 0 || start < 0 || limit < 0) {
2160             throw new IndexOutOfBoundsException(
2161                     "Start and limit indexes should be non-negative and start <= limit");
2162         }
2163         if (number < 0) {
2164             return true;
2165         }
2166         if (source == null) {
2167             return false;
2168         }
2169 
2170         // length >= 0 known
2171         // source contains at least (length + 1) / 2 code points: <= 2
2172         // chars per cp
2173         if (((length + 1) >> 1) > number) {
2174             return true;
2175         }
2176 
2177         // check if source does not even contain enough chars
2178         int maxsupplementary = length - number;
2179         if (maxsupplementary <= 0) {
2180             return false;
2181         }
2182 
2183         // there are maxsupplementary = length - number more chars than
2184         // asked-for code points
2185 
2186         // count code points until they exceed and also check that there are
2187         // no more than maxsupplementary supplementary code points (char pairs)
2188         while (true) {
2189             if (length == 0) {
2190                 return false;
2191             }
2192             if (number == 0) {
2193                 return true;
2194             }
2195             if (isLeadSurrogate(source[start++]) && start != limit
2196                     && isTrailSurrogate(source[start])) {
2197                 start++;
2198                 if (--maxsupplementary <= 0) {
2199                     // too many pairs - too few code points
2200                     return false;
2201                 }
2202             }
2203             --number;
2204         }
2205     }
2206 
2207     /**
2208      * Check if the string buffer contains more Unicode code points than a certain number. This is
2209      * more efficient than counting all code points in the entire string buffer and comparing that
2210      * number with a threshold. This function may not need to scan the string buffer at all if the
2211      * length is within a certain range, and never needs to count more than 'number + 1' code
2212      * points. Logically equivalent to (countCodePoint(s) &gt; number). A Unicode code point may
2213      * occupy either one or two code units.
2214      *
2215      * @param source The input string buffer.
2216      * @param number The number of code points in the string buffer is compared against the 'number'
2217      *            parameter.
2218      * @return boolean value for whether the string buffer contains more Unicode code points than
2219      *         'number'.
2220      * @stable ICU 2.4
2221      */
hasMoreCodePointsThan(StringBuffer source, int number)2222     public static boolean hasMoreCodePointsThan(StringBuffer source, int number) {
2223         if (number < 0) {
2224             return true;
2225         }
2226         if (source == null) {
2227             return false;
2228         }
2229         int length = source.length();
2230 
2231         // length >= 0 known
2232         // source contains at least (length + 1) / 2 code points: <= 2
2233         // chars per cp
2234         if (((length + 1) >> 1) > number) {
2235             return true;
2236         }
2237 
2238         // check if source does not even contain enough chars
2239         int maxsupplementary = length - number;
2240         if (maxsupplementary <= 0) {
2241             return false;
2242         }
2243 
2244         // there are maxsupplementary = length - number more chars than
2245         // asked-for code points
2246 
2247         // count code points until they exceed and also check that there are
2248         // no more than maxsupplementary supplementary code points (char pairs)
2249         int start = 0;
2250         while (true) {
2251             if (length == 0) {
2252                 return false;
2253             }
2254             if (number == 0) {
2255                 return true;
2256             }
2257             if (isLeadSurrogate(source.charAt(start++)) && start != length
2258                     && isTrailSurrogate(source.charAt(start))) {
2259                 start++;
2260                 if (--maxsupplementary <= 0) {
2261                     // too many pairs - too few code points
2262                     return false;
2263                 }
2264             }
2265             --number;
2266         }
2267     }
2268 
2269     /**
2270      * Cover JDK 1.5 API. Create a String from an array of codePoints.
2271      *
2272      * @param codePoints The code array
2273      * @param offset The start of the text in the code point array
2274      * @param count The number of code points
     * @return a String representing the count code points that start at offset
     * @throws IllegalArgumentException If count is negative or an invalid code point is encountered
2277      * @throws IndexOutOfBoundsException If the offset or count are out of bounds.
2278      * @stable ICU 3.0
2279      */
newString(int[] codePoints, int offset, int count)2280     public static String newString(int[] codePoints, int offset, int count) {
2281         if (count < 0) {
2282             throw new IllegalArgumentException();
2283         }
2284         char[] chars = new char[count];
2285         int w = 0;
2286         for (int r = offset, e = offset + count; r < e; ++r) {
2287             int cp = codePoints[r];
2288             if (cp < 0 || cp > 0x10ffff) {
2289                 throw new IllegalArgumentException();
2290             }
2291             while (true) {
2292                 try {
2293                     if (cp < 0x010000) {
2294                         chars[w] = (char) cp;
2295                         w++;
2296                     } else {
2297                         chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
2298                         chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
2299                         w += 2;
2300                     }
2301                     break;
2302                 } catch (IndexOutOfBoundsException ex) {
2303                     int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
2304                             / (r - offset + 1)));
2305                     char[] temp = new char[newlen];
2306                     System.arraycopy(chars, 0, temp, 0, w);
2307                     chars = temp;
2308                 }
2309             }
2310         }
2311         return new String(chars, 0, w);
2312     }
2313 
2314     /**
2315      * <p>
2316      * UTF16 string comparator class. Allows UTF16 string comparison to be done with the various
2317      * modes
2318      * </p>
2319      * <ul>
2320      * <li> Code point comparison or code unit comparison
2321      * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison
2322      * with special handling for character 'i'.
2323      * </ul>
2324      * <p>
2325      * The code unit or code point comparison differ only when comparing supplementary code points
2326      * (&#92;u10000..&#92;u10ffff) to BMP code points near the end of the BMP (i.e.,
2327      * &#92;ue000..&#92;uffff). In code unit comparison, high BMP code points sort after
2328      * supplementary code points because they are stored as pairs of surrogates which are at
2329      * &#92;ud800..&#92;udfff.
2330      * </p>
2331      *
2332      * @see #FOLD_CASE_DEFAULT
2333      * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2334      * @stable ICU 2.1
2335      */
2336     public static final class StringComparator implements java.util.Comparator<String> {
2337         // public constructor ------------------------------------------------
2338 
2339         /**
2340          * Default constructor that does code unit comparison and case sensitive comparison.
2341          *
2342          * @stable ICU 2.1
2343          */
StringComparator()2344         public StringComparator() {
2345             this(false, false, FOLD_CASE_DEFAULT);
2346         }
2347 
2348         /**
2349          * Constructor that does comparison based on the argument options.
2350          *
2351          * @param codepointcompare Flag to indicate true for code point comparison or false for code unit
2352          *            comparison.
2353          * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison
2354          * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2355          *            when ignorecase is set to true. If ignorecase is false, this option is
2356          *            ignored.
2357          * @see #FOLD_CASE_DEFAULT
2358          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2359          * @throws IllegalArgumentException If foldcaseoption is out of range
2360          * @stable ICU 2.4
2361          */
StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption)2362         public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) {
2363             setCodePointCompare(codepointcompare);
2364             m_ignoreCase_ = ignorecase;
2365             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2366                 throw new IllegalArgumentException("Invalid fold case option");
2367             }
2368             m_foldCase_ = foldcaseoption;
2369         }
2370 
2371         // public data member ------------------------------------------------
2372 
2373         /**
2374          * Option value for case folding comparison:
2375          *
2376          * <p>Comparison is case insensitive, strings are folded using default mappings defined in
2377          * Unicode data file CaseFolding.txt, before comparison.
2378          *
2379          * @stable ICU 2.4
2380          */
2381         public static final int FOLD_CASE_DEFAULT = 0;
2382 
2383         /**
2384          * Option value for case folding:
2385          * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
2386          * and dotless i appropriately for Turkic languages (tr, az).
2387          *
2388          * <p>Comparison is case insensitive, strings are folded using modified mappings defined in
2389          * Unicode data file CaseFolding.txt, before comparison.
2390          *
2391          * @stable ICU 2.4
2392          * @see com.ibm.icu.lang.UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
2393          */
2394         public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
2395 
2396         // public methods ----------------------------------------------------
2397 
2398         // public setters ----------------------------------------------------
2399 
2400         /**
2401          * Sets the comparison mode to code point compare if flag is true. Otherwise comparison mode
2402          * is set to code unit compare
2403          *
2404          * @param flag True for code point compare, false for code unit compare
2405          * @stable ICU 2.4
2406          */
setCodePointCompare(boolean flag)2407         public void setCodePointCompare(boolean flag) {
2408             if (flag) {
2409                 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER;
2410             } else {
2411                 m_codePointCompare_ = 0;
2412             }
2413         }
2414 
2415         /**
2416          * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise
2417          * case sensitive comparison mode if set to false.
2418          *
2419          * @param ignorecase True for case-insitive comparison, false for case sensitive comparison
2420          * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2421          *            when ignorecase is set to true. If ignorecase is false, this option is
2422          *            ignored.
2423          * @see #FOLD_CASE_DEFAULT
2424          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2425          * @stable ICU 2.4
2426          */
setIgnoreCase(boolean ignorecase, int foldcaseoption)2427         public void setIgnoreCase(boolean ignorecase, int foldcaseoption) {
2428             m_ignoreCase_ = ignorecase;
2429             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2430                 throw new IllegalArgumentException("Invalid fold case option");
2431             }
2432             m_foldCase_ = foldcaseoption;
2433         }
2434 
2435         // public getters ----------------------------------------------------
2436 
2437         /**
2438          * Checks if the comparison mode is code point compare.
2439          *
2440          * @return true for code point compare, false for code unit compare
2441          * @stable ICU 2.4
2442          */
getCodePointCompare()2443         public boolean getCodePointCompare() {
2444             return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2445         }
2446 
2447         /**
2448          * Checks if Comparator is in the case insensitive mode.
2449          *
2450          * @return true if Comparator performs case insensitive comparison, false otherwise
2451          * @stable ICU 2.4
2452          */
getIgnoreCase()2453         public boolean getIgnoreCase() {
2454             return m_ignoreCase_;
2455         }
2456 
2457         /**
2458          * Gets the fold case options set in Comparator to be used with case insensitive comparison.
2459          *
2460          * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
2461          * @see #FOLD_CASE_DEFAULT
2462          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2463          * @stable ICU 2.4
2464          */
getIgnoreCaseOption()2465         public int getIgnoreCaseOption() {
2466             return m_foldCase_;
2467         }
2468 
2469         // public other methods ----------------------------------------------
2470 
2471         /**
2472          * Compare two strings depending on the options selected during construction.
2473          *
2474          * @param a first source string.
2475          * @param b second source string.
2476          * @return 0 returned if a == b. If a &lt; b, a negative value is returned. Otherwise if a &gt; b,
2477          *         a positive value is returned.
2478          * @exception ClassCastException thrown when either a or b is not a String object
2479          * @stable ICU 4.4
2480          */
2481         @Override
compare(String a, String b)2482         public int compare(String a, String b) {
2483             if (Utility.sameObjects(a, b)) {
2484                 return 0;
2485             }
2486             if (a == null) {
2487                 return -1;
2488             }
2489             if (b == null) {
2490                 return 1;
2491             }
2492 
2493             if (m_ignoreCase_) {
2494                 return compareCaseInsensitive(a, b);
2495             }
2496             return compareCaseSensitive(a, b);
2497         }
2498 
2499         // private data member ----------------------------------------------
2500 
2501         /**
2502          * Code unit comparison flag. True if code unit comparison is required. False if code point
2503          * comparison is required.
2504          */
2505         private int m_codePointCompare_;
2506 
2507         /**
2508          * Fold case comparison option.
2509          */
2510         private int m_foldCase_;
2511 
2512         /**
2513          * Flag indicator if ignore case is to be used during comparison
2514          */
2515         private boolean m_ignoreCase_;
2516 
2517         /**
2518          * Code point order offset for surrogate characters
2519          */
2520         private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
2521 
2522         // private method ---------------------------------------------------
2523 
2524         /**
2525          * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life
2526          * easier.
2527          *
2528          * @param s1
2529          *            first string to compare
2530          * @param s2
2531          *            second string to compare
2532          * @return -1 is s1 &lt; s2, 0 if equals,
2533          */
compareCaseInsensitive(String s1, String s2)2534         private int compareCaseInsensitive(String s1, String s2) {
2535             return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_
2536                     | Normalizer.COMPARE_IGNORE_CASE);
2537         }
2538 
2539         /**
2540          * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life
2541          * easier.
2542          *
2543          * @param s1
2544          *            first string to compare
2545          * @param s2
2546          *            second string to compare
2547          * @return -1 is s1 &lt; s2, 0 if equals,
2548          */
compareCaseSensitive(String s1, String s2)2549         private int compareCaseSensitive(String s1, String s2) {
2550             // compare identical prefixes - they do not need to be fixed up
2551             // limit1 = start1 + min(lenght1, length2)
2552             int length1 = s1.length();
2553             int length2 = s2.length();
2554             int minlength = length1;
2555             int result = 0;
2556             if (length1 < length2) {
2557                 result = -1;
2558             } else if (length1 > length2) {
2559                 result = 1;
2560                 minlength = length2;
2561             }
2562 
2563             char c1 = 0;
2564             char c2 = 0;
2565             int index = 0;
2566             for (; index < minlength; index++) {
2567                 c1 = s1.charAt(index);
2568                 c2 = s2.charAt(index);
2569                 // check pseudo-limit
2570                 if (c1 != c2) {
2571                     break;
2572                 }
2573             }
2574 
2575             if (index == minlength) {
2576                 return result;
2577             }
2578 
2579             boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2580             // if both values are in or above the surrogate range, fix them up
2581             if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE
2582                     && codepointcompare) {
2583                 // subtract 0x2800 from BMP code points to make them smaller
2584                 // than supplementary ones
2585                 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1)))
2586                         || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) {
2587                     // part of a surrogate pair, leave >=d800
2588                 } else {
2589                     // BMP code point - may be surrogate code point - make
2590                     // < d800
2591                     c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2592                 }
2593 
2594                 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1)))
2595                         || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) {
2596                     // part of a surrogate pair, leave >=d800
2597                 } else {
2598                     // BMP code point - may be surrogate code point - make <d800
2599                     c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2600                 }
2601             }
2602 
2603             // now c1 and c2 are in UTF-32-compatible order
2604             return c1 - c2;
2605         }
2606     }
2607 
2608     /**
2609      * Utility for getting a code point from a CharSequence that contains exactly one code point.
2610      * @return the code point IF the string is non-null and consists of a single code point.
2611      * otherwise returns -1.
2612      * @param s to test
2613      * @stable ICU 54
2614      */
getSingleCodePoint(CharSequence s)2615     public static int getSingleCodePoint(CharSequence s) {
2616         if (s == null || s.length() == 0) {
2617             return -1;
2618         } else if (s.length() == 1) {
2619             return s.charAt(0);
2620         } else if (s.length() > 2) {
2621             return -1;
2622         }
2623 
2624         // at this point, len = 2
2625         int cp = Character.codePointAt(s, 0);
2626         if (cp > 0xFFFF) { // is surrogate pair
2627             return cp;
2628         }
2629         return -1;
2630     }
2631 
2632     /**
2633      * Utility for comparing a code point to a string without having to create a new string. Returns the same results
2634      * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if
2635      * <pre>
2636      * sc = new StringComparator(true,false,0);
2637      * fast = UTF16.compareCodePoint(codePoint, charSequence)
2638      * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString())
2639      * </pre>
2640      * then
2641      * <pre>
2642      * Integer.signum(fast) == Integer.signum(slower)
2643      * </pre>
2644      * @param codePoint to test
2645      * @param s to test
2646      * @return equivalent of code point comparator comparing two strings.
2647      * @stable ICU 54
2648      */
compareCodePoint(int codePoint, CharSequence s)2649     public static int compareCodePoint(int codePoint, CharSequence s) {
2650         if (s == null) {
2651             return 1;
2652         }
2653         final int strLen = s.length();
2654         if (strLen == 0) {
2655             return 1;
2656         }
2657         int second = Character.codePointAt(s, 0);
2658         int diff = codePoint - second;
2659         if (diff != 0) {
2660             return diff;
2661         }
2662         return strLen == Character.charCount(codePoint) ? 0 : -1;
2663     }
2664 
2665     // private data members -------------------------------------------------
2666 
2667     /**
2668      * Shift value for lead surrogate to form a supplementary character.
2669      */
2670     private static final int LEAD_SURROGATE_SHIFT_ = 10;
2671 
2672     /**
2673      * Mask to retrieve the significant value from a trail surrogate.
2674      */
2675     private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
2676 
2677     /**
     * Offset that is added to a supplementary code point shifted right by LEAD_SURROGATE_SHIFT_
     * to obtain its lead surrogate
2679      */
2680     private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
2681             - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
2682 
2683     // private methods ------------------------------------------------------
2684 
2685     /**
2686      * <p>
2687      * Converts argument code point and returns a String object representing the code point's value
2688      * in UTF16 format.
2689      * </p>
2690      * <p>
2691      * This method does not check for the validity of the codepoint, the results are not guaranteed
2692      * if a invalid codepoint is passed as argument.
2693      * </p>
2694      * <p>
2695      * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise.
2696      * </p>
2697      *
2698      * @param ch
2699      *            code point
2700      * @return string representation of the code point
2701      */
toString(int ch)2702     private static String toString(int ch) {
2703         if (ch < SUPPLEMENTARY_MIN_VALUE) {
2704             return String.valueOf((char) ch);
2705         }
2706 
2707         StringBuilder result = new StringBuilder();
2708         result.append(getLeadSurrogate(ch));
2709         result.append(getTrailSurrogate(ch));
2710         return result.toString();
2711     }
2712 }
2713 // eof
2714