// Copied from ICU4J 57.1
/**
*******************************************************************************
* Copyright (C) 2002-2004, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
8 package com.ibm.icu.dev.test;
9 
/**
 * Utility class for supplementary code point support.
 * This one is written purely for updating the Normalization sample
 * from the unicode.org site. If you want the real thing, use the
 * UTF16 class from ICU4J.
 *
 * <p>All methods are static and operate on UTF-16 code units. None of them
 * validates that a code point argument lies in the Unicode range; unpaired
 * surrogates found in text are returned or counted as individual code points.
 *
 * @author Vladimir Weinstein, Markus Scherer
 */
public class UTF16Util {
    /**
     * The minimum value for Supplementary code points
     */
    public static final int SUPPLEMENTARY_MIN_VALUE  = 0x10000;

    /**
     * Lead surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

    /**
     * Lead surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

    /**
     * Trail surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

    /**
     * Trail surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask selecting the low 10 bits of a code point, i.e. the payload
     * carried by the trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3ff;

    /**
     * Offset to add to combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
            SUPPLEMENTARY_MIN_VALUE
            - (LEAD_SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
            - TRAIL_SURROGATE_MIN_VALUE;

    /**
     * Value added to the high bits of a supplementary code point to obtain
     * its lead surrogate (0xD7C0, i.e. 0xD800 - (0x10000 >> 10)).
     */
    private static final int LEAD_SURROGATE_OFFSET_ =
            LEAD_SURROGATE_MIN_VALUE
            - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    /**
     * Value subtracted from <code>(lead &lt;&lt; 10) + trail</code> to obtain
     * the supplementary code point. This is the negation of
     * {@link #SURROGATE_OFFSET_}.
     */
    static final int suppOffset =
            (LEAD_SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
            + TRAIL_SURROGATE_MIN_VALUE
            - SUPPLEMENTARY_MIN_VALUE;

    /**
     * Method nextCodePoint. Returns the next code point
     * in a string. If the code unit at i is a lead surrogate that is not
     * followed by a trail surrogate, the lead surrogate itself is returned.
     * @param s String in question
     * @param i index from which we want a code point
     * @return int codepoint at index i
     */
    public static final int nextCodePoint(String s, int i) {
        return codePointFrom(s, i);
    }

    /**
     * Method prevCodePoint. Gets the code point preceding
     * index i (predecrement). If the code unit at i - 1 is a trail surrogate
     * that is not preceded by a lead surrogate, the trail surrogate itself is
     * returned.
     * @param s String in question
     * @param i index in string
     * @return int codepoint at index --i
     */
    public static final int prevCodePoint(String s, int i) {
        return codePointBefore(s, i);
    }

    /**
     * Method nextCodePoint. Returns the next code point
     * in a string buffer. If the code unit at i is a lead surrogate that is
     * not followed by a trail surrogate, the lead surrogate itself is returned.
     * @param s StringBuffer in question
     * @param i index from which we want a code point
     * @return int codepoint at index i
     */
    public static final int nextCodePoint(StringBuffer s, int i) {
        return codePointFrom(s, i);
    }

    /**
     * Method prevCodePoint. Gets the code point preceding
     * index i (predecrement). If the code unit at i - 1 is a trail surrogate
     * that is not preceded by a lead surrogate, the trail surrogate itself is
     * returned.
     * @param s StringBuffer in question
     * @param i index in string
     * @return int codepoint at index --i
     */
    public static final int prevCodePoint(StringBuffer s, int i) {
        return codePointBefore(s, i);
    }

    /**
     * Method codePointLength. Returns the length
     * in UTF-16 code units of a given code point
     * @param c code point in question
     * @return int length in UTF-16 code units. Can be 1 or 2
     */
    public static final int codePointLength(int c) {
        return getCharCount(c);
    }

    /**
     * Method appendCodePoint. Appends a code point
     * to a StringBuffer, as one code unit if it is below
     * {@link #SUPPLEMENTARY_MIN_VALUE} and as a surrogate pair otherwise.
     * @param buffer StringBuffer in question
     * @param ch code point to append
     */
    public static final void appendCodePoint(StringBuffer buffer, int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            buffer.append((char) ch);
        } else {
            buffer.append(leadSurrogateOf(ch));
            buffer.append(trailSurrogateOf(ch));
        }
    }

    /**
     * Method insertCodePoint. Inserts a code point in
     * a StringBuffer, as one code unit if it is below
     * {@link #SUPPLEMENTARY_MIN_VALUE} and as a surrogate pair otherwise.
     * @param buffer StringBuffer in question
     * @param i index at which we want code point to be inserted
     * @param ch code point to be inserted
     */
    public static final void insertCodePoint(StringBuffer buffer, int i, int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            buffer.insert(i, (char) ch);
        } else {
            buffer.insert(i, leadSurrogateOf(ch));
            buffer.insert(i + 1, trailSurrogateOf(ch));
        }
    }

    /**
     * Method setCodePointAt. Changes a code point at a
     * given index. Can change the length of the string: replacing a
     * surrogate pair with a single code unit shrinks the buffer by one,
     * replacing a single code unit with a surrogate pair grows it by one.
     * @param buffer StringBuffer in question
     * @param i index at which we want to change the contents
     * @param ch replacement code point
     * @return int difference in resulting StringBuffer length
     */
    public static final int setCodePointAt(StringBuffer buffer, int i, int ch) {
        // The code point currently stored at i decides how many units get replaced.
        boolean oldIsSingleUnit = nextCodePoint(buffer, i) < SUPPLEMENTARY_MIN_VALUE;

        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            buffer.setCharAt(i, (char) ch);
            if (oldIsSingleUnit) {
                return 0;                       // one unit replaces one unit
            }
            buffer.deleteCharAt(i + 1);         // drop the now orphaned trail surrogate
            return -1;
        }

        buffer.setCharAt(i, leadSurrogateOf(ch));
        if (oldIsSingleUnit) {
            buffer.insert(i + 1, trailSurrogateOf(ch));   // make room for the trail
            return 1;
        }
        buffer.setCharAt(i + 1, trailSurrogateOf(ch));    // pair replaces pair
        return 0;
    }

    /**
     * Method countCodePoint. Counts the UTF-32 code points
     * in a UTF-16 encoded string. A trail surrogate immediately following a
     * lead surrogate is not counted; every other code unit counts as one.
     * @param source String in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(String source) {
        return countCodePoints(source);
    }

    /**
     * Method countCodePoint. Counts the UTF-32 code points
     * in a UTF-16 encoded string buffer. A trail surrogate immediately
     * following a lead surrogate is not counted; every other code unit counts
     * as one.
     * @param source StringBuffer in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(StringBuffer source) {
        return countCodePoints(source);
    }

    /**
     * Determines how many chars this char32 requires.
     * No validity check is performed on char32; if one is required, do it
     * before calling (ICU's <code>UCharacter.isLegal()</code> is the check
     * the original documentation points to).
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     */
    public static int getCharCount(int char32) {
        return char32 < SUPPLEMENTARY_MIN_VALUE ? 1 : 2;
    }

    /**
     * Determines whether the code value is a surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16) {
        return TRAIL_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    /**
     * Extract a single UTF-32 value from a substring.
     * Used when iterating forwards or backwards (with
     * {@link #getCharCount(int)}), as well as random access. No validity
     * check is performed on the return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned. If a complete supplementary character is
     * not found the incomplete character will be returned. The neighbouring
     * code unit is only consulted if it lies inside [start, limit).
     * @param source array of UTF-16 chars
     * @param start offset to substring in the source array for analyzing
     * @param limit offset to substring in the source array for analyzing
     * @param offset16 UTF-16 offset relative to start
     * @return UTF-32 value for the UTF-32 value that contains the char at
     *         offset16.
     * @exception IndexOutOfBoundsException thrown if offset16 is not within
     *            the range of start and limit.
     * @stable ICU 2.1
     */
    public static int charAt(char[] source, int start, int limit,
                             int offset16) {
        offset16 += start;   // from here on offset16 is an absolute array index
        if (offset16 < start || offset16 >= limit) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }

        char single = source[offset16];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            // single is a lead surrogate: look forward for its trail.
            offset16++;
            if (offset16 >= limit) {
                return single;
            }
            char trail = source[offset16];
            if (isTrailSurrogate(trail)) {
                return getRawSupplementary(single, trail);
            }
        } else {
            // single is a trail surrogate: look backward for its lead.
            if (offset16 == start) {
                return single;
            }
            offset16--;
            char lead = source[offset16];
            if (isLeadSurrogate(lead)) {
                return getRawSupplementary(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Forms a supplementary code point from the argument characters.<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail) {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    }

    /**
     * Shared implementation of both nextCodePoint overloads: reads the code
     * point that starts at index i of s.
     */
    private static int codePointFrom(CharSequence s, int i) {
        int lead = s.charAt(i);
        if (LEAD_SURROGATE_MIN_VALUE <= lead && lead <= LEAD_SURROGATE_MAX_VALUE
                && ++i < s.length()) {
            int trail = s.charAt(i);
            if (TRAIL_SURROGATE_MIN_VALUE <= trail && trail <= TRAIL_SURROGATE_MAX_VALUE) {
                return (lead << LEAD_SURROGATE_SHIFT_) + trail - suppOffset;
            }
        }
        return lead;
    }

    /**
     * Shared implementation of both prevCodePoint overloads: reads the code
     * point that ends just before index i of s.
     */
    private static int codePointBefore(CharSequence s, int i) {
        int trail = s.charAt(--i);
        if (TRAIL_SURROGATE_MIN_VALUE <= trail && trail <= TRAIL_SURROGATE_MAX_VALUE
                && --i >= 0) {
            int lead = s.charAt(i);
            if (LEAD_SURROGATE_MIN_VALUE <= lead && lead <= LEAD_SURROGATE_MAX_VALUE) {
                return (lead << LEAD_SURROGATE_SHIFT_) + trail - suppOffset;
            }
        }
        return trail;
    }

    /**
     * Shared implementation of both countCodePoint overloads.
     */
    private static int countCodePoints(CharSequence source) {
        int result = 0;
        boolean hadLeadSurrogate = false;

        for (int i = 0; i < source.length(); ++i) {
            char ch = source.charAt(i);
            if (hadLeadSurrogate && isTrailSurrogate(ch)) {
                hadLeadSurrogate = false;           // count valid trail as zero
            } else {
                hadLeadSurrogate = isLeadSurrogate(ch);
                ++result;                           // count others as 1
            }
        }
        return result;
    }

    /**
     * Computes the lead surrogate for a code point; no range check is done.
     */
    private static char leadSurrogateOf(int ch) {
        return (char) (LEAD_SURROGATE_OFFSET_ + (ch >> LEAD_SURROGATE_SHIFT_));
    }

    /**
     * Computes the trail surrogate for a code point; no range check is done.
     */
    private static char trailSurrogateOf(int ch) {
        return (char) (TRAIL_SURROGATE_MIN_VALUE + (ch & TRAIL_SURROGATE_MASK_));
    }
}
379