1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2003-2014, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import com.ibm.icu.lang.UCharacter; 12 import com.ibm.icu.text.StringPrepParseException; 13 import com.ibm.icu.text.UTF16; 14 15 /** 16 * Ported code from ICU punycode.c 17 * @author ram 18 */ 19 public final class Punycode { 20 21 /* Punycode parameters for Bootstring */ 22 private static final int BASE = 36; 23 private static final int TMIN = 1; 24 private static final int TMAX = 26; 25 private static final int SKEW = 38; 26 private static final int DAMP = 700; 27 private static final int INITIAL_BIAS = 72; 28 private static final int INITIAL_N = 0x80; 29 30 /* "Basic" Unicode/ASCII code points */ 31 private static final char HYPHEN = 0x2d; 32 private static final char DELIMITER = HYPHEN; 33 34 private static final int ZERO = 0x30; 35 //private static final int NINE = 0x39; 36 37 private static final int SMALL_A = 0x61; 38 private static final int SMALL_Z = 0x7a; 39 40 private static final int CAPITAL_A = 0x41; 41 private static final int CAPITAL_Z = 0x5a; 42 adaptBias(int delta, int length, boolean firstTime)43 private static int adaptBias(int delta, int length, boolean firstTime){ 44 if(firstTime){ 45 delta /=DAMP; 46 }else{ 47 delta /= 2; 48 } 49 delta += delta/length; 50 51 int count=0; 52 for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) { 53 delta/=(BASE-TMIN); 54 } 55 56 return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); 57 } 58 59 /** 60 * basicToDigit[] contains the numeric value of a basic code 61 * point (for use in representing integers) in the range 0 to 62 * BASE-1, or -1 if b is does not represent a value. 63 */ 64 static final int[] basicToDigit= new int[]{ 65 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 66 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 67 68 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 69 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, 70 71 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 72 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 73 74 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 75 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 76 77 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 78 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 79 80 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 81 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 82 83 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 84 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 85 86 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 87 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 88 }; 89 90 ///CLOVER:OFF asciiCaseMap(char b, boolean uppercase)91 private static char asciiCaseMap(char b, boolean uppercase) { 92 if(uppercase) { 93 if(SMALL_A<=b && b<=SMALL_Z) { 94 b-=(SMALL_A-CAPITAL_A); 95 } 96 } else { 97 if(CAPITAL_A<=b && b<=CAPITAL_Z) { 98 b+=(SMALL_A-CAPITAL_A); 99 } 100 } 101 return b; 102 } 103 ///CLOVER:ON 104 /** 105 * digitToBasic() returns the basic code point whose value 106 * (when used for representing integers) is d, which must be in the 107 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is 108 * nonzero, in which case the uppercase form is used. 109 */ digitToBasic(int digit, boolean uppercase)110 private static char digitToBasic(int digit, boolean uppercase) { 111 /* 0..25 map to ASCII a..z or A..Z */ 112 /* 26..35 map to ASCII 0..9 */ 113 if(digit<26) { 114 if(uppercase) { 115 return (char)(CAPITAL_A+digit); 116 } else { 117 return (char)(SMALL_A+digit); 118 } 119 } else { 120 return (char)((ZERO-26)+digit); 121 } 122 } 123 /** 124 * Converts Unicode to Punycode. 125 * The input string must not contain single, unpaired surrogates. 126 * The output will be represented as an array of ASCII code points. 127 * 128 * @param src The source of the String Buffer passed. 129 * @param caseFlags The boolean array of case flags. 130 * @return An array of ASCII code points. 131 */ encode(CharSequence src, boolean[] caseFlags)132 public static StringBuilder encode(CharSequence src, boolean[] caseFlags) throws StringPrepParseException{ 133 int n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, srcCPCount; 134 char c, c2; 135 int srcLength = src.length(); 136 int[] cpBuffer = new int[srcLength]; 137 StringBuilder dest = new StringBuilder(srcLength); 138 /* 139 * Handle the basic code points and 140 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): 141 */ 142 srcCPCount=0; 143 144 for(j=0; j<srcLength; ++j) { 145 c=src.charAt(j); 146 if(isBasic(c)) { 147 cpBuffer[srcCPCount++]=0; 148 dest.append(caseFlags!=null ? asciiCaseMap(c, caseFlags[j]) : c); 149 } else { 150 n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L; 151 if(!UTF16.isSurrogate(c)) { 152 n|=c; 153 } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) { 154 ++j; 155 156 n|=UCharacter.getCodePoint(c, c2); 157 } else { 158 /* error: unmatched surrogate */ 159 throw new StringPrepParseException("Illegal char found",StringPrepParseException.ILLEGAL_CHAR_FOUND); 160 } 161 cpBuffer[srcCPCount++]=n; 162 } 163 } 164 165 /* Finish the basic string - if it is not empty - with a delimiter. */ 166 basicLength=dest.length(); 167 if(basicLength>0) { 168 dest.append(DELIMITER); 169 } 170 171 /* 172 * handledCPCount is the number of code points that have been handled 173 * basicLength is the number of basic code points 174 * destLength is the number of chars that have been output 175 */ 176 177 /* Initialize the state: */ 178 n=INITIAL_N; 179 delta=0; 180 bias=INITIAL_BIAS; 181 182 /* Main encoding loop: */ 183 for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) { 184 /* 185 * All non-basic code points < n have been handled already. 186 * Find the next larger one: 187 */ 188 for(m=0x7fffffff, j=0; j<srcCPCount; ++j) { 189 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 190 if(n<=q && q<m) { 191 m=q; 192 } 193 } 194 195 /* 196 * Increase delta enough to advance the decoder's 197 * <n,i> state to <m,0>, but guard against overflow: 198 */ 199 if(m-n>(0x7fffffff-delta)/(handledCPCount+1)) { 200 throw new IllegalStateException("Internal program error"); 201 } 202 delta+=(m-n)*(handledCPCount+1); 203 n=m; 204 205 /* Encode a sequence of same code points n */ 206 for(j=0; j<srcCPCount; ++j) { 207 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 208 if(q<n) { 209 ++delta; 210 } else if(q==n) { 211 /* Represent delta as a generalized variable-length integer: */ 212 for(q=delta, k=BASE; /* no condition */; k+=BASE) { 213 214 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt 215 216 t=k-bias; 217 if(t<TMIN) { 218 t=TMIN; 219 } else if(t>TMAX) { 220 t=TMAX; 221 } 222 */ 223 224 t=k-bias; 225 if(t<TMIN) { 226 t=TMIN; 227 } else if(k>=(bias+TMAX)) { 228 t=TMAX; 229 } 230 231 if(q<t) { 232 break; 233 } 234 235 dest.append(digitToBasic(t+(q-t)%(BASE-t), false)); 236 q=(q-t)/(BASE-t); 237 } 238 239 dest.append(digitToBasic(q, (cpBuffer[j]<0))); 240 bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength)); 241 delta=0; 242 ++handledCPCount; 243 } 244 } 245 246 ++delta; 247 ++n; 248 } 249 250 return dest; 251 } 252 isBasic(int ch)253 private static boolean isBasic(int ch){ 254 return (ch < INITIAL_N); 255 } 256 ///CLOVER:OFF isBasicUpperCase(int ch)257 private static boolean isBasicUpperCase(int ch){ 258 return( CAPITAL_A<=ch && ch >= CAPITAL_Z); 259 } 260 ///CLOVER:ON isSurrogate(int ch)261 private static boolean isSurrogate(int ch){ 262 return (((ch)&0xfffff800)==0xd800); 263 } 264 /** 265 * Converts Punycode to Unicode. 266 * The Unicode string will be at most as long as the Punycode string. 267 * 268 * @param src The source of the string buffer being passed. 269 * @param caseFlags The array of boolean case flags. 270 * @return StringBuilder string. 271 */ decode(CharSequence src, boolean[] caseFlags)272 public static StringBuilder decode(CharSequence src, boolean[] caseFlags) 273 throws StringPrepParseException{ 274 int srcLength = src.length(); 275 StringBuilder dest = new StringBuilder(src.length()); 276 int n, i, bias, basicLength, j, in, oldi, w, k, digit, t, 277 destCPCount, firstSupplementaryIndex, cpLength; 278 char b; 279 280 /* 281 * Handle the basic code points: 282 * Let basicLength be the number of input code points 283 * before the last delimiter, or 0 if there is none, 284 * then copy the first basicLength code points to the output. 285 * 286 * The following loop iterates backward. 287 */ 288 for(j=srcLength; j>0;) { 289 if(src.charAt(--j)==DELIMITER) { 290 break; 291 } 292 } 293 basicLength=destCPCount=j; 294 295 for(j=0; j<basicLength; ++j) { 296 b=src.charAt(j); 297 if(!isBasic(b)) { 298 throw new StringPrepParseException("Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND); 299 } 300 dest.append(b); 301 302 if(caseFlags!=null && j<caseFlags.length) { 303 caseFlags[j]=isBasicUpperCase(b); 304 } 305 } 306 307 /* Initialize the state: */ 308 n=INITIAL_N; 309 i=0; 310 bias=INITIAL_BIAS; 311 firstSupplementaryIndex=1000000000; 312 313 /* 314 * Main decoding loop: 315 * Start just after the last delimiter if any 316 * basic code points were copied; start at the beginning otherwise. 317 */ 318 for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) { 319 /* 320 * in is the index of the next character to be consumed, and 321 * destCPCount is the number of code points in the output array. 322 * 323 * Decode a generalized variable-length integer into delta, 324 * which gets added to i. The overflow checking is easier 325 * if we increase i as we go, then subtract off its starting 326 * value at the end to obtain delta. 327 */ 328 for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) { 329 if(in>=srcLength) { 330 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 331 } 332 333 digit=basicToDigit[src.charAt(in++) & 0xFF]; 334 if(digit<0) { 335 throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND); 336 } 337 if(digit>(0x7fffffff-i)/w) { 338 /* integer overflow */ 339 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 340 } 341 342 i+=digit*w; 343 t=k-bias; 344 if(t<TMIN) { 345 t=TMIN; 346 } else if(k>=(bias+TMAX)) { 347 t=TMAX; 348 } 349 if(digit<t) { 350 break; 351 } 352 353 if(w>0x7fffffff/(BASE-t)) { 354 /* integer overflow */ 355 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 356 } 357 w*=BASE-t; 358 } 359 360 /* 361 * Modification from sample code: 362 * Increments destCPCount here, 363 * where needed instead of in for() loop tail. 364 */ 365 ++destCPCount; 366 bias=adaptBias(i-oldi, destCPCount, (oldi==0)); 367 368 /* 369 * i was supposed to wrap around from (incremented) destCPCount to 0, 370 * incrementing n each time, so we'll fix that now: 371 */ 372 if(i/destCPCount>(0x7fffffff-n)) { 373 /* integer overflow */ 374 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 375 } 376 377 n+=i/destCPCount; 378 i%=destCPCount; 379 /* not needed for Punycode: */ 380 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ 381 382 if(n>0x10ffff || isSurrogate(n)) { 383 /* Unicode code point overflow */ 384 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 385 } 386 387 /* Insert n at position i of the output: */ 388 cpLength=Character.charCount(n); 389 int codeUnitIndex; 390 391 /* 392 * Handle indexes when supplementary code points are present. 393 * 394 * In almost all cases, there will be only BMP code points before i 395 * and even in the entire string. 396 * This is handled with the same efficiency as with UTF-32. 397 * 398 * Only the rare cases with supplementary code points are handled 399 * more slowly - but not too bad since this is an insertion anyway. 400 */ 401 if(i<=firstSupplementaryIndex) { 402 codeUnitIndex=i; 403 if(cpLength>1) { 404 firstSupplementaryIndex=codeUnitIndex; 405 } else { 406 ++firstSupplementaryIndex; 407 } 408 } else { 409 codeUnitIndex=dest.offsetByCodePoints(firstSupplementaryIndex, i-firstSupplementaryIndex); 410 } 411 412 /* use the UChar index codeUnitIndex instead of the code point index i */ 413 if(caseFlags!=null && (dest.length()+cpLength)<=caseFlags.length) { 414 if(codeUnitIndex<dest.length()) { 415 System.arraycopy(caseFlags, codeUnitIndex, 416 caseFlags, codeUnitIndex+cpLength, 417 dest.length()-codeUnitIndex); 418 } 419 /* Case of last character determines uppercase flag: */ 420 caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1)); 421 if(cpLength==2) { 422 caseFlags[codeUnitIndex+1]=false; 423 } 424 } 425 if(cpLength==1) { 426 /* BMP, insert one code unit */ 427 dest.insert(codeUnitIndex, (char)n); 428 } else { 429 /* supplementary character, insert two code units */ 430 dest.insert(codeUnitIndex, UTF16.getLeadSurrogate(n)); 431 dest.insert(codeUnitIndex+1, UTF16.getTrailSurrogate(n)); 432 } 433 ++i; 434 } 435 return dest; 436 } 437 } 438