1 // Copied from ICU4J 57.1 2 /** 3 ******************************************************************************* 4 * Copyright (C) 2002-2004, International Business Machines Corporation and * 5 * others. All Rights Reserved. * 6 ******************************************************************************* 7 */ 8 package com.ibm.icu.dev.test; 9 10 /** 11 * Utility class for supplementary code point 12 * support. This one is written purely for updating 13 * Normalization sample from the unicode.org site. 14 * If you want the real thing, use UTF16 class 15 * from ICU4J 16 * @author Vladimir Weinstein, Markus Scherer 17 */ 18 public class UTF16Util { 19 static final int suppOffset = (0xd800 << 10) + 0xdc00 - 0x10000; 20 21 /** 22 * Method nextCodePoint. Returns the next code point 23 * in a string. 24 * @param s String in question 25 * @param i index from which we want a code point 26 * @return int codepoint at index i 27 */ nextCodePoint(String s, int i)28 public static final int nextCodePoint(String s, int i) { 29 int ch = s.charAt(i); 30 if (0xd800 <= ch && ch <= 0xdbff && ++i < s.length()) { 31 int ch2 = s.charAt(i); 32 if (0xdc00 <= ch2 && ch2 <= 0xdfff) { 33 ch = (ch << 10) + ch2 - suppOffset; 34 } 35 } 36 return ch; 37 } 38 39 /** 40 * Method prevCodePoint. Gets the code point preceding 41 * index i (predecrement). 42 * @param s String in question 43 * @param i index in string 44 * @return int codepoint at index --i 45 */ prevCodePoint(String s, int i)46 public static final int prevCodePoint(String s, int i) { 47 int ch = s.charAt(--i); 48 if (0xdc00 <= ch && ch <= 0xdfff && --i >= 0) { 49 int ch2 = s.charAt(i); 50 if (0xd800 <= ch2 && ch2 <= 0xdbff) { 51 ch = (ch2 << 10) + ch - suppOffset; 52 } 53 } 54 return ch; 55 } 56 57 /** 58 * Method nextCodePoint. Returns the next code point 59 * in a string. 60 * @param s StringBuffer in question 61 * @param i index from which we want a code point 62 * @return int codepoint at index i 63 */ nextCodePoint(StringBuffer s, int i)64 public static final int nextCodePoint(StringBuffer s, int i) { 65 int ch = s.charAt(i); 66 if (0xd800 <= ch && ch <= 0xdbff && ++i < s.length()) { 67 int ch2 = s.charAt(i); 68 if (0xdc00 <= ch2 && ch2 <= 0xdfff) { 69 ch = (ch << 10) + ch2 - suppOffset; 70 } 71 } 72 return ch; 73 } 74 75 /** 76 * Method prevCodePoint. Gets the code point preceding 77 * index i (predecrement). 78 * @param s StringBuffer in question 79 * @param i index in string 80 * @return int codepoint at index --i 81 */ prevCodePoint(StringBuffer s, int i)82 public static final int prevCodePoint(StringBuffer s, int i) { 83 int ch = s.charAt(--i); 84 if (0xdc00 <= ch && ch <= 0xdfff && --i >= 0) { 85 int ch2 = s.charAt(i); 86 if (0xd800 <= ch2 && ch2 <= 0xdbff) { 87 ch = (ch2 << 10) + ch - suppOffset; 88 } 89 } 90 return ch; 91 } 92 93 /** 94 * Method codePointLength. Returns the length 95 * in UTF-16 code units of a given code point 96 * @param c code point in question 97 * @return int length in UTF-16 code units. Can be 1 or 2 98 */ codePointLength(int c)99 public static final int codePointLength(int c) { 100 return c <= 0xffff ? 1 : 2; 101 } 102 103 /** 104 * Method appendCodePoint. Appends a code point 105 * to a StringBuffer 106 * @param buffer StringBuffer in question 107 * @param ch code point to append 108 */ appendCodePoint(StringBuffer buffer, int ch)109 public static final void appendCodePoint(StringBuffer buffer, int ch) { 110 if (ch <= 0xffff) { 111 buffer.append((char)ch); 112 } else { 113 buffer.append((char)(0xd7c0 + (ch >> 10))); 114 buffer.append((char)(0xdc00 + (ch & 0x3ff))); 115 } 116 } 117 118 /** 119 * Method insertCodePoint. Inserts a code point in 120 * a StringBuffer 121 * @param buffer StringBuffer in question 122 * @param i index at which we want code point to be inserted 123 * @param ch code point to be inserted 124 */ insertCodePoint(StringBuffer buffer, int i, int ch)125 public static final void insertCodePoint(StringBuffer buffer, int i, int ch) { 126 if (ch <= 0xffff) { 127 buffer.insert(i, (char)ch); 128 } else { 129 buffer.insert(i, (char)(0xd7c0 + (ch >> 10))).insert(i + 1, (char)(0xdc00 + (ch & 0x3ff))); 130 } 131 } 132 133 /** 134 * Method setCodePointAt. Changes a code point at a 135 * given index. Can change the length of the string. 136 * @param buffer StringBuffer in question 137 * @param i index at which we want to change the contents 138 * @param ch replacement code point 139 * @return int difference in resulting StringBuffer length 140 */ setCodePointAt(StringBuffer buffer, int i, int ch)141 public static final int setCodePointAt(StringBuffer buffer, int i, int ch) { 142 int cp = nextCodePoint(buffer, i); 143 144 if (ch <= 0xffff && cp <= 0xffff) { // Both BMP 145 buffer.setCharAt(i, (char)ch); 146 return 0; 147 } else if (ch > 0xffff && cp > 0xffff) { // Both supplementary 148 buffer.setCharAt(i, (char)(0xd7c0 + (ch >> 10))); 149 buffer.setCharAt(i+1, (char)(0xdc00 + (ch & 0x3ff))); 150 return 0; 151 } else if (ch <= 0xffff && cp > 0xffff) { // putting BMP instead of supplementary, buffer shrinks 152 buffer.setCharAt(i, (char)ch); 153 buffer.deleteCharAt(i+1); 154 return -1; 155 } else { //if (ch > 0xffff && cp <= 0xffff) { // putting supplementary instead of BMP, buffer grows 156 buffer.setCharAt(i, (char)(0xd7c0 + (ch >> 10))); 157 buffer.insert(i+1, (char)(0xdc00 + (ch & 0x3ff))); 158 return 1; 159 } 160 } 161 162 /** 163 * Method countCodePoint. Counts the UTF-32 code points 164 * in a UTF-16 encoded string. 165 * @param source String in question. 166 * @return int number of code points in this string 167 */ countCodePoint(String source)168 public static final int countCodePoint(String source) 169 { 170 int result = 0; 171 char ch; 172 boolean hadLeadSurrogate = false; 173 174 for (int i = 0; i < source.length(); ++ i) 175 { 176 ch = source.charAt(i); 177 if (hadLeadSurrogate && 0xdc00 <= ch && ch <= 0xdfff) { 178 hadLeadSurrogate = false; // count valid trail as zero 179 } 180 else 181 { 182 hadLeadSurrogate = (0xd800 <= ch && ch <= 0xdbff); 183 ++ result; // count others as 1 184 } 185 } 186 187 return result; 188 } 189 190 /** 191 * Method countCodePoint. Counts the UTF-32 code points 192 * in a UTF-16 encoded string. 193 * @param source StringBuffer in question. 194 * @return int number of code points in this string 195 */ countCodePoint(StringBuffer source)196 public static final int countCodePoint(StringBuffer source) 197 { 198 int result = 0; 199 char ch; 200 boolean hadLeadSurrogate = false; 201 202 for (int i = 0; i < source.length(); ++ i) 203 { 204 ch = source.charAt(i); 205 if (hadLeadSurrogate && 0xdc00 <= ch && ch <= 0xdfff) { 206 hadLeadSurrogate = false; // count valid trail as zero 207 } 208 else 209 { 210 hadLeadSurrogate = (0xd800 <= ch && ch <= 0xdbff); 211 ++ result; // count others as 1 212 } 213 } 214 215 return result; 216 } 217 /** 218 * The minimum value for Supplementary code points 219 */ 220 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; 221 /** 222 * Determines how many chars this char32 requires. 223 * If a validity check is required, use <code> 224 * <a href="../UCharacter.html#isLegal(char)">isLegal()</a></code> on 225 * char32 before calling. 226 * @param char32 the input codepoint. 227 * @return 2 if is in supplementary space, otherwise 1. 228 */ getCharCount(int char32)229 public static int getCharCount(int char32) 230 { 231 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 232 return 1; 233 } 234 return 2; 235 } 236 /** 237 * Lead surrogate maximum value 238 * @stable ICU 2.1 239 */ 240 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; 241 /** 242 * Lead surrogate minimum value 243 * @stable ICU 2.1 244 */ 245 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; 246 247 /** 248 * Trail surrogate minimum value 249 * @stable ICU 2.1 250 */ 251 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; 252 /** 253 * Trail surrogate maximum value 254 * @stable ICU 2.1 255 */ 256 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; 257 /** 258 * Determines whether the code value is a surrogate. 259 * @param char16 the input character. 260 * @return true iff the input character is a surrogate. 261 * @stable ICU 2.1 262 */ isSurrogate(char char16)263 public static boolean isSurrogate(char char16) 264 { 265 return LEAD_SURROGATE_MIN_VALUE <= char16 && 266 char16 <= TRAIL_SURROGATE_MAX_VALUE; 267 } 268 269 /** 270 * Determines whether the character is a trail surrogate. 271 * @param char16 the input character. 272 * @return true iff the input character is a trail surrogate. 273 * @stable ICU 2.1 274 */ isTrailSurrogate(char char16)275 public static boolean isTrailSurrogate(char char16) 276 { 277 return (TRAIL_SURROGATE_MIN_VALUE <= char16 && 278 char16 <= TRAIL_SURROGATE_MAX_VALUE); 279 } 280 281 /** 282 * Determines whether the character is a lead surrogate. 283 * @param char16 the input character. 284 * @return true iff the input character is a lead surrogate 285 * @stable ICU 2.1 286 */ isLeadSurrogate(char char16)287 public static boolean isLeadSurrogate(char char16) 288 { 289 return LEAD_SURROGATE_MIN_VALUE <= char16 && 290 char16 <= LEAD_SURROGATE_MAX_VALUE; 291 } 292 /** 293 * Extract a single UTF-32 value from a substring. 294 * Used when iterating forwards or backwards (with 295 * <code>UTF16.getCharCount()</code>, as well as random access. If a 296 * validity check is required, use 297 * <code><a href="../UCharacter.html#isLegal(char)">UCharacter.isLegal() 298 * </a></code> on the return value. 299 * If the char retrieved is part of a surrogate pair, its supplementary 300 * character will be returned. If a complete supplementary character is 301 * not found the incomplete character will be returned 302 * @param source array of UTF-16 chars 303 * @param start offset to substring in the source array for analyzing 304 * @param limit offset to substring in the source array for analyzing 305 * @param offset16 UTF-16 offset relative to start 306 * @return UTF-32 value for the UTF-32 value that contains the char at 307 * offset16. The boundaries of that codepoint are the same as in 308 * <code>bounds32()</code>. 309 * @exception IndexOutOfBoundsException thrown if offset16 is not within 310 * the range of start and limit. 311 * @stable ICU 2.1 312 */ charAt(char source[], int start, int limit, int offset16)313 public static int charAt(char source[], int start, int limit, 314 int offset16) 315 { 316 offset16 += start; 317 if (offset16 < start || offset16 >= limit) { 318 throw new ArrayIndexOutOfBoundsException(offset16); 319 } 320 321 char single = source[offset16]; 322 if (!isSurrogate(single)) { 323 return single; 324 } 325 326 // Convert the UTF-16 surrogate pair if necessary. 327 // For simplicity in usage, and because the frequency of pairs is 328 // low, look both directions. 329 if (single <= LEAD_SURROGATE_MAX_VALUE) { 330 offset16 ++; 331 if (offset16 >= limit) { 332 return single; 333 } 334 char trail = source[offset16]; 335 if (isTrailSurrogate(trail)) { 336 return getRawSupplementary(single, trail); 337 } 338 } 339 else { // isTrailSurrogate(single), so 340 if (offset16 == start) { 341 return single; 342 } 343 offset16 --; 344 char lead = source[offset16]; 345 if (isLeadSurrogate(lead)) 346 return getRawSupplementary(lead, single); 347 } 348 return single; // return unmatched surrogate 349 } 350 /** 351 * Shift value for lead surrogate to form a supplementary character. 352 */ 353 private static final int LEAD_SURROGATE_SHIFT_ = 10; 354 355 /** 356 * Offset to add to combined surrogate pair to avoid msking. 357 */ 358 private static final int SURROGATE_OFFSET_ = 359 SUPPLEMENTARY_MIN_VALUE - 360 (LEAD_SURROGATE_MIN_VALUE << 361 LEAD_SURROGATE_SHIFT_) - 362 TRAIL_SURROGATE_MIN_VALUE; 363 364 365 /** 366 * Forms a supplementary code point from the argument character<br> 367 * Note this is for internal use hence no checks for the validity of the 368 * surrogate characters are done 369 * @param lead lead surrogate character 370 * @param trail trailing surrogate character 371 * @return code point of the supplementary character 372 */ getRawSupplementary(char lead, char trail)373 public static int getRawSupplementary(char lead, char trail) 374 { 375 return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; 376 } 377 378 } 379