1 /* 2 * Copyright (C) 2014 The Android Open Source Project 3 * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. Oracle designates this 9 * particular file as subject to the "Classpath" exception as provided 10 * by Oracle in the LICENSE file that accompanied this code. 11 * 12 * This code is distributed in the hope that it will be useful, but WITHOUT 13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 * version 2 for more details (a copy is included in the LICENSE file that 16 * accompanied this code). 17 * 18 * You should have received a copy of the GNU General Public License version 19 * 2 along with this work; if not, write to the Free Software Foundation, 20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 21 * 22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 23 * or visit www.oracle.com if you need additional information or have any 24 * questions. 25 */ 26 package java.net; 27 28 import com.android.icu.text.ExtendedIDNA; 29 30 /** 31 * Provides methods to convert internationalized domain names (IDNs) between 32 * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation. 33 * Internationalized domain names can use characters from the entire range of 34 * Unicode, while traditional domain names are restricted to ASCII characters. 35 * ACE is an encoding of Unicode strings that uses only ASCII characters and 36 * can be used with software (such as the Domain Name System) that only 37 * understands traditional domain names. 38 * 39 * <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. 40 * RFC 3490 defines two operations: ToASCII and ToUnicode. These 2 operations employ 41 * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a 42 * profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and 43 * <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert 44 * domain name string back and forth. 45 * 46 * <p>The behavior of aforementioned conversion process can be adjusted by various flags: 47 * <ul> 48 * <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted 49 * can contain code points that are unassigned in Unicode 3.2, which is the 50 * Unicode version on which IDN conversion is based. If the flag is not used, 51 * the presence of such unassigned code points is treated as an error. 52 * <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>. 53 * It is an error if they don't meet the requirements. 54 * </ul> 55 * These flags can be logically OR'ed together. 56 * 57 * <p>The security consideration is important with respect to internationalization 58 * domain name support. For example, English domain names may be <i>homographed</i> 59 * - maliciously misspelled by substitution of non-Latin letters. 60 * <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a> 61 * discusses security issues of IDN support as well as possible solutions. 62 * Applications are responsible for taking adequate security measures when using 63 * international domain names. 64 * 65 * @author Edward Wang 66 * @since 1.6 67 * 68 */ 69 public final class IDN { 70 /** 71 * Flag to allow processing of unassigned code points 72 */ 73 public static final int ALLOW_UNASSIGNED = 0x01; 74 75 /** 76 * Flag to turn on the check against STD-3 ASCII rules 77 */ 78 public static final int USE_STD3_ASCII_RULES = 0x02; 79 80 81 /** 82 * Translates a string from Unicode to ASCII Compatible Encoding (ACE), 83 * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. 84 * 85 * <p>ToASCII operation can fail. ToASCII fails if any step of it fails. 86 * If ToASCII operation fails, an IllegalArgumentException will be thrown. 87 * In this case, the input string should not be used in an internationalized domain name. 88 * 89 * <p> A label is an individual part of a domain name. The original ToASCII operation, 90 * as defined in RFC 3490, only operates on a single label. This method can handle 91 * both label and entire domain name, by assuming that labels in a domain name are 92 * always separated by dots. The following characters are recognized as dots: 93 * \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop), 94 * and \uFF61 (halfwidth ideographic full stop). if dots are 95 * used as label separators, this method also changes all of them to \u002E (full stop) 96 * in output translated string. 97 * 98 * @param input the string to be processed 99 * @param flag process flag; can be 0 or any logical OR of possible flags 100 * 101 * @return the translated {@code String} 102 * 103 * @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification 104 */ toASCII(String input, int flag)105 public static String toASCII(String input, int flag) { 106 // BEGIN Android-changed: Use ICU4J implementation. 107 try { 108 return ExtendedIDNA.convertIDNToASCII(input, flag).toString(); 109 } catch (android.icu.text.StringPrepParseException e) { 110 // b/113787610: "." is a valid IDN but is rejected by ICU. 111 // Usage is relatively uncommon, so only check for it if ICU throws. 112 if (".".equals(input)) { 113 return input; 114 } 115 throw new IllegalArgumentException("Invalid input to toASCII: " + input, e); 116 } 117 // END Android-changed: Use ICU4J implementation. 118 } 119 120 121 /** 122 * Translates a string from Unicode to ASCII Compatible Encoding (ACE), 123 * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. 124 * 125 * <p> This convenience method works as if by invoking the 126 * two-argument counterpart as follows: 127 * <blockquote> 128 * {@link #toASCII(String, int) toASCII}(input, 0); 129 * </blockquote> 130 * 131 * @param input the string to be processed 132 * 133 * @return the translated {@code String} 134 * 135 * @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification 136 */ toASCII(String input)137 public static String toASCII(String input) { 138 return toASCII(input, 0); 139 } 140 141 142 /** 143 * Translates a string from ASCII Compatible Encoding (ACE) to Unicode, 144 * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. 145 * 146 * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified. 147 * 148 * <p> A label is an individual part of a domain name. The original ToUnicode operation, 149 * as defined in RFC 3490, only operates on a single label. This method can handle 150 * both label and entire domain name, by assuming that labels in a domain name are 151 * always separated by dots. The following characters are recognized as dots: 152 * \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop), 153 * and \uFF61 (halfwidth ideographic full stop). 154 * 155 * @param input the string to be processed 156 * @param flag process flag; can be 0 or any logical OR of possible flags 157 * 158 * @return the translated {@code String} 159 */ toUnicode(String input, int flag)160 public static String toUnicode(String input, int flag) { 161 // BEGIN Android-changed: Use ICU4J implementation. 162 try { 163 // ICU only translates separators to ASCII for toASCII. 164 // Java expects the translation for toUnicode too. 165 return convertFullStop(ExtendedIDNA.convertIDNToUnicode(input, flag)).toString(); 166 } catch (android.icu.text.StringPrepParseException e) { 167 // The RI documentation explicitly states that if the conversion was unsuccessful 168 // the original string is returned. 169 return input; 170 } 171 // END Android-changed: Use ICU4J implementation. 172 } 173 174 // BEGIN Android-added: Use ICU4J implementation. isLabelSeperator(char c)175 private static boolean isLabelSeperator(char c) { 176 return (c == '\u3002' || c == '\uff0e' || c == '\uff61'); 177 } 178 convertFullStop(StringBuffer input)179 private static StringBuffer convertFullStop(StringBuffer input) { 180 for (int i = 0; i < input.length(); i++) { 181 if (isLabelSeperator(input.charAt(i))) { 182 input.setCharAt(i, '.'); 183 } 184 } 185 return input; 186 } 187 // END Android-added: Use ICU4J implementation. 188 189 /** 190 * Translates a string from ASCII Compatible Encoding (ACE) to Unicode, 191 * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. 192 * 193 * <p> This convenience method works as if by invoking the 194 * two-argument counterpart as follows: 195 * <blockquote> 196 * {@link #toUnicode(String, int) toUnicode}(input, 0); 197 * </blockquote> 198 * 199 * @param input the string to be processed 200 * 201 * @return the translated {@code String} 202 */ toUnicode(String input)203 public static String toUnicode(String input) { 204 return toUnicode(input, 0); 205 } 206 207 208 /* ---------------- Private members -------------- */ 209 210 // Android-removed: Private helper methods, unused because we use ICU. 211 /* 212 // ACE Prefix is "xn--" 213 private static final String ACE_PREFIX = "xn--"; 214 private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length(); 215 216 private static final int MAX_LABEL_LENGTH = 63; 217 218 // single instance of nameprep 219 private static StringPrep namePrep = null; 220 221 static { 222 InputStream stream = null; 223 224 try { 225 final String IDN_PROFILE = "uidna.spp"; 226 if (System.getSecurityManager() != null) { 227 stream = AccessController.doPrivileged(new PrivilegedAction<InputStream>() { 228 public InputStream run() { 229 return StringPrep.class.getResourceAsStream(IDN_PROFILE); 230 } 231 }); 232 } else { 233 stream = StringPrep.class.getResourceAsStream(IDN_PROFILE); 234 } 235 236 namePrep = new StringPrep(stream); 237 stream.close(); 238 } catch (IOException e) { 239 // should never reach here 240 assert false; 241 } 242 } 243 */ 244 245 /* ---------------- Private operations -------------- */ 246 247 248 // 249 // to suppress the default zero-argument constructor 250 // IDN()251 private IDN() {} 252 253 // Android-removed: Private helper methods, unused because we use ICU. 254 /* 255 // 256 // toASCII operation; should only apply to a single label 257 // 258 private static String toASCIIInternal(String label, int flag) 259 { 260 // step 1 261 // Check if the string contains code points outside the ASCII range 0..0x7c. 262 boolean isASCII = isAllASCII(label); 263 StringBuffer dest; 264 265 // step 2 266 // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here 267 if (!isASCII) { 268 UCharacterIterator iter = UCharacterIterator.getInstance(label); 269 try { 270 dest = namePrep.prepare(iter, flag); 271 } catch (java.text.ParseException e) { 272 throw new IllegalArgumentException(e); 273 } 274 } else { 275 dest = new StringBuffer(label); 276 } 277 278 // step 8, move forward to check the smallest number of the code points 279 // the length must be inside 1..63 280 if (dest.length() == 0) { 281 throw new IllegalArgumentException( 282 "Empty label is not a legal name"); 283 } 284 285 // step 3 286 // Verify the absence of non-LDH ASCII code points 287 // 0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f 288 // Verify the absence of leading and trailing hyphen 289 boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0); 290 if (useSTD3ASCIIRules) { 291 for (int i = 0; i < dest.length(); i++) { 292 int c = dest.charAt(i); 293 if (isNonLDHAsciiCodePoint(c)) { 294 throw new IllegalArgumentException( 295 "Contains non-LDH ASCII characters"); 296 } 297 } 298 299 if (dest.charAt(0) == '-' || 300 dest.charAt(dest.length() - 1) == '-') { 301 302 throw new IllegalArgumentException( 303 "Has leading or trailing hyphen"); 304 } 305 } 306 307 if (!isASCII) { 308 // step 4 309 // If all code points are inside 0..0x7f, skip to step 8 310 if (!isAllASCII(dest.toString())) { 311 // step 5 312 // verify the sequence does not begin with ACE prefix 313 if(!startsWithACEPrefix(dest)){ 314 315 // step 6 316 // encode the sequence with punycode 317 try { 318 dest = Punycode.encode(dest, null); 319 } catch (java.text.ParseException e) { 320 throw new IllegalArgumentException(e); 321 } 322 323 dest = toASCIILower(dest); 324 325 // step 7 326 // prepend the ACE prefix 327 dest.insert(0, ACE_PREFIX); 328 } else { 329 throw new IllegalArgumentException("The input starts with the ACE Prefix"); 330 } 331 332 } 333 } 334 335 // step 8 336 // the length must be inside 1..63 337 if (dest.length() > MAX_LABEL_LENGTH) { 338 throw new IllegalArgumentException("The label in the input is too long"); 339 } 340 341 return dest.toString(); 342 } 343 344 // 345 // toUnicode operation; should only apply to a single label 346 // 347 private static String toUnicodeInternal(String label, int flag) { 348 boolean[] caseFlags = null; 349 StringBuffer dest; 350 351 // step 1 352 // find out if all the codepoints in input are ASCII 353 boolean isASCII = isAllASCII(label); 354 355 if(!isASCII){ 356 // step 2 357 // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here 358 try { 359 UCharacterIterator iter = UCharacterIterator.getInstance(label); 360 dest = namePrep.prepare(iter, flag); 361 } catch (Exception e) { 362 // toUnicode never fails; if any step fails, return the input string 363 return label; 364 } 365 } else { 366 dest = new StringBuffer(label); 367 } 368 369 // step 3 370 // verify ACE Prefix 371 if(startsWithACEPrefix(dest)) { 372 373 // step 4 374 // Remove the ACE Prefix 375 String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length()); 376 377 try { 378 // step 5 379 // Decode using punycode 380 StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null); 381 382 // step 6 383 // Apply toASCII 384 String toASCIIOut = toASCII(decodeOut.toString(), flag); 385 386 // step 7 387 // verify 388 if (toASCIIOut.equalsIgnoreCase(dest.toString())) { 389 // step 8 390 // return output of step 5 391 return decodeOut.toString(); 392 } 393 } catch (Exception ignored) { 394 // no-op 395 } 396 } 397 398 // just return the input 399 return label; 400 } 401 402 403 // 404 // LDH stands for "letter/digit/hyphen", with characters restricted to the 405 // 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen 406 // <->. 407 // Non LDH refers to characters in the ASCII range, but which are not 408 // letters, digits or the hypen. 409 // 410 // non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x5B..0x60, 0x7B..0x7F 411 // 412 private static boolean isNonLDHAsciiCodePoint(int ch){ 413 return (0x0000 <= ch && ch <= 0x002C) || 414 (0x002E <= ch && ch <= 0x002F) || 415 (0x003A <= ch && ch <= 0x0040) || 416 (0x005B <= ch && ch <= 0x0060) || 417 (0x007B <= ch && ch <= 0x007F); 418 } 419 420 // 421 // search dots in a string and return the index of that character; 422 // or if there is no dots, return the length of input string 423 // dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop), 424 // and \uFF61 (halfwidth ideographic full stop). 425 // 426 private static int searchDots(String s, int start) { 427 int i; 428 for (i = start; i < s.length(); i++) { 429 if (isLabelSeparator(s.charAt(i))) { 430 break; 431 } 432 } 433 434 return i; 435 } 436 437 // 438 // to check if a string is a root label, ".". 439 // 440 private static boolean isRootLabel(String s) { 441 return (s.length() == 1 && isLabelSeparator(s.charAt(0))); 442 } 443 444 // 445 // to check if a character is a label separator, i.e. a dot character. 446 // 447 private static boolean isLabelSeparator(char c) { 448 return (c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61'); 449 } 450 451 // 452 // to check if a string only contains US-ASCII code point 453 // 454 private static boolean isAllASCII(String input) { 455 boolean isASCII = true; 456 for (int i = 0; i < input.length(); i++) { 457 int c = input.charAt(i); 458 if (c > 0x7F) { 459 isASCII = false; 460 break; 461 } 462 } 463 return isASCII; 464 } 465 466 // 467 // to check if a string starts with ACE-prefix 468 // 469 private static boolean startsWithACEPrefix(StringBuffer input){ 470 boolean startsWithPrefix = true; 471 472 if(input.length() < ACE_PREFIX_LENGTH){ 473 return false; 474 } 475 for(int i = 0; i < ACE_PREFIX_LENGTH; i++){ 476 if(toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)){ 477 startsWithPrefix = false; 478 } 479 } 480 return startsWithPrefix; 481 } 482 483 private static char toASCIILower(char ch){ 484 if('A' <= ch && ch <= 'Z'){ 485 return (char)(ch + 'a' - 'A'); 486 } 487 return ch; 488 } 489 490 private static StringBuffer toASCIILower(StringBuffer input){ 491 StringBuffer dest = new StringBuffer(); 492 for(int i = 0; i < input.length();i++){ 493 dest.append(toASCIILower(input.charAt(i))); 494 } 495 return dest; 496 } 497 */ 498 } 499