1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.  Oracle designates this
9  * particular file as subject to the "Classpath" exception as provided
10  * by Oracle in the LICENSE file that accompanied this code.
11  *
12  * This code is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15  * version 2 for more details (a copy is included in the LICENSE file that
16  * accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License version
19  * 2 along with this work; if not, write to the Free Software Foundation,
20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23  * or visit www.oracle.com if you need additional information or have any
24  * questions.
25  */
26 package java.net;
27 
28 import com.android.icu.text.ExtendedIDNA;
29 
30 /**
31  * Provides methods to convert internationalized domain names (IDNs) between
32  * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation.
33  * Internationalized domain names can use characters from the entire range of
34  * Unicode, while traditional domain names are restricted to ASCII characters.
35  * ACE is an encoding of Unicode strings that uses only ASCII characters and
36  * can be used with software (such as the Domain Name System) that only
37  * understands traditional domain names.
38  *
39  * <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
 * RFC 3490 defines two operations: ToASCII and ToUnicode. These two operations employ the
 * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a
 * profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and the
 * <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert
 * domain name strings back and forth.
45  *
46  * <p>The behavior of aforementioned conversion process can be adjusted by various flags:
47  *   <ul>
48  *     <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted
49  *         can contain code points that are unassigned in Unicode 3.2, which is the
50  *         Unicode version on which IDN conversion is based. If the flag is not used,
51  *         the presence of such unassigned code points is treated as an error.
52  *     <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>.
53  *         It is an error if they don't meet the requirements.
54  *   </ul>
55  * These flags can be logically OR'ed together.
56  *
 * <p>Security considerations are important with respect to internationalized
 * domain name support. For example, English domain names may be <i>homographed</i>
 * - maliciously misspelled by substitution of non-Latin letters.
60  * <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a>
61  * discusses security issues of IDN support as well as possible solutions.
62  * Applications are responsible for taking adequate security measures when using
63  * international domain names.
64  *
65  * @author Edward Wang
66  * @since 1.6
67  *
68  */
69 public final class IDN {
70     /**
71      * Flag to allow processing of unassigned code points
72      */
73     public static final int ALLOW_UNASSIGNED = 0x01;
74 
75     /**
76      * Flag to turn on the check against STD-3 ASCII rules
77      */
78     public static final int USE_STD3_ASCII_RULES = 0x02;
79 
80 
81     /**
82      * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
83      * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
84      *
85      * <p>ToASCII operation can fail. ToASCII fails if any step of it fails.
86      * If ToASCII operation fails, an IllegalArgumentException will be thrown.
87      * In this case, the input string should not be used in an internationalized domain name.
88      *
89      * <p> A label is an individual part of a domain name. The original ToASCII operation,
90      * as defined in RFC 3490, only operates on a single label. This method can handle
91      * both label and entire domain name, by assuming that labels in a domain name are
92      * always separated by dots. The following characters are recognized as dots:
93      * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
94      * and &#0092;uFF61 (halfwidth ideographic full stop). if dots are
95      * used as label separators, this method also changes all of them to &#0092;u002E (full stop)
96      * in output translated string.
97      *
98      * @param input     the string to be processed
99      * @param flag      process flag; can be 0 or any logical OR of possible flags
100      *
101      * @return          the translated {@code String}
102      *
103      * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
104      */
toASCII(String input, int flag)105     public static String toASCII(String input, int flag) {
106         // BEGIN Android-changed: Use ICU4J implementation.
107         try {
108             return ExtendedIDNA.convertIDNToASCII(input, flag).toString();
109         } catch (android.icu.text.StringPrepParseException e) {
110             // b/113787610: "." is a valid IDN but is rejected by ICU.
111             // Usage is relatively uncommon, so only check for it if ICU throws.
112             if (".".equals(input)) {
113                 return input;
114             }
115             throw new IllegalArgumentException("Invalid input to toASCII: " + input, e);
116         }
117         // END Android-changed: Use ICU4J implementation.
118     }
119 
120 
121     /**
122      * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
123      * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
124      *
125      * <p> This convenience method works as if by invoking the
126      * two-argument counterpart as follows:
127      * <blockquote>
128      * {@link #toASCII(String, int) toASCII}(input,&nbsp;0);
129      * </blockquote>
130      *
131      * @param input     the string to be processed
132      *
133      * @return          the translated {@code String}
134      *
135      * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
136      */
toASCII(String input)137     public static String toASCII(String input) {
138         return toASCII(input, 0);
139     }
140 
141 
142     /**
143      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
144      * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
145      *
146      * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified.
147      *
148      * <p> A label is an individual part of a domain name. The original ToUnicode operation,
149      * as defined in RFC 3490, only operates on a single label. This method can handle
150      * both label and entire domain name, by assuming that labels in a domain name are
151      * always separated by dots. The following characters are recognized as dots:
152      * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
153      * and &#0092;uFF61 (halfwidth ideographic full stop).
154      *
155      * @param input     the string to be processed
156      * @param flag      process flag; can be 0 or any logical OR of possible flags
157      *
158      * @return          the translated {@code String}
159      */
toUnicode(String input, int flag)160     public static String toUnicode(String input, int flag) {
161         // BEGIN Android-changed: Use ICU4J implementation.
162         try {
163             // ICU only translates separators to ASCII for toASCII.
164             // Java expects the translation for toUnicode too.
165             return convertFullStop(ExtendedIDNA.convertIDNToUnicode(input, flag)).toString();
166         } catch (android.icu.text.StringPrepParseException e) {
167             // The RI documentation explicitly states that if the conversion was unsuccessful
168             // the original string is returned.
169             return input;
170         }
171         // END Android-changed: Use ICU4J implementation.
172     }
173 
174     // BEGIN Android-added: Use ICU4J implementation.
isLabelSeperator(char c)175     private static boolean isLabelSeperator(char c) {
176         return (c == '\u3002' || c == '\uff0e' || c == '\uff61');
177     }
178 
convertFullStop(StringBuffer input)179     private static StringBuffer convertFullStop(StringBuffer input) {
180         for (int i = 0; i < input.length(); i++) {
181             if (isLabelSeperator(input.charAt(i))) {
182                 input.setCharAt(i, '.');
183             }
184         }
185         return input;
186     }
187     // END Android-added: Use ICU4J implementation.
188 
189     /**
190      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
191      * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
192      *
193      * <p> This convenience method works as if by invoking the
194      * two-argument counterpart as follows:
195      * <blockquote>
196      * {@link #toUnicode(String, int) toUnicode}(input,&nbsp;0);
197      * </blockquote>
198      *
199      * @param input     the string to be processed
200      *
201      * @return          the translated {@code String}
202      */
toUnicode(String input)203     public static String toUnicode(String input) {
204         return toUnicode(input, 0);
205     }
206 
207 
208     /* ---------------- Private members -------------- */
209 
210     // Android-removed: Private helper methods, unused because we use ICU.
211     /*
212     // ACE Prefix is "xn--"
213     private static final String ACE_PREFIX = "xn--";
214     private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();
215 
216     private static final int MAX_LABEL_LENGTH   = 63;
217 
218     // single instance of nameprep
219     private static StringPrep namePrep = null;
220 
221     static {
222         InputStream stream = null;
223 
224         try {
225             final String IDN_PROFILE = "uidna.spp";
226             if (System.getSecurityManager() != null) {
227                 stream = AccessController.doPrivileged(new PrivilegedAction<InputStream>() {
228                     public InputStream run() {
229                         return StringPrep.class.getResourceAsStream(IDN_PROFILE);
230                     }
231                 });
232             } else {
233                 stream = StringPrep.class.getResourceAsStream(IDN_PROFILE);
234             }
235 
236             namePrep = new StringPrep(stream);
237             stream.close();
238         } catch (IOException e) {
239             // should never reach here
240             assert false;
241         }
242     }
243     */
244 
245     /* ---------------- Private operations -------------- */
246 
247 
248     //
249     // to suppress the default zero-argument constructor
250     //
IDN()251     private IDN() {}
252 
253     // Android-removed: Private helper methods, unused because we use ICU.
254     /*
255     //
256     // toASCII operation; should only apply to a single label
257     //
258     private static String toASCIIInternal(String label, int flag)
259     {
260         // step 1
        // Check if the string contains code points outside the ASCII range 0..0x7F.
262         boolean isASCII  = isAllASCII(label);
263         StringBuffer dest;
264 
265         // step 2
266         // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
267         if (!isASCII) {
268             UCharacterIterator iter = UCharacterIterator.getInstance(label);
269             try {
270                 dest = namePrep.prepare(iter, flag);
271             } catch (java.text.ParseException e) {
272                 throw new IllegalArgumentException(e);
273             }
274         } else {
275             dest = new StringBuffer(label);
276         }
277 
278         // step 8, move forward to check the smallest number of the code points
279         // the length must be inside 1..63
280         if (dest.length() == 0) {
281             throw new IllegalArgumentException(
282                         "Empty label is not a legal name");
283         }
284 
285         // step 3
286         // Verify the absence of non-LDH ASCII code points
287         //   0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f
288         // Verify the absence of leading and trailing hyphen
289         boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0);
290         if (useSTD3ASCIIRules) {
291             for (int i = 0; i < dest.length(); i++) {
292                 int c = dest.charAt(i);
293                 if (isNonLDHAsciiCodePoint(c)) {
294                     throw new IllegalArgumentException(
295                         "Contains non-LDH ASCII characters");
296                 }
297             }
298 
299             if (dest.charAt(0) == '-' ||
300                 dest.charAt(dest.length() - 1) == '-') {
301 
302                 throw new IllegalArgumentException(
303                         "Has leading or trailing hyphen");
304             }
305         }
306 
307         if (!isASCII) {
308             // step 4
309             // If all code points are inside 0..0x7f, skip to step 8
310             if (!isAllASCII(dest.toString())) {
311                 // step 5
312                 // verify the sequence does not begin with ACE prefix
313                 if(!startsWithACEPrefix(dest)){
314 
315                     // step 6
316                     // encode the sequence with punycode
317                     try {
318                         dest = Punycode.encode(dest, null);
319                     } catch (java.text.ParseException e) {
320                         throw new IllegalArgumentException(e);
321                     }
322 
323                     dest = toASCIILower(dest);
324 
325                     // step 7
326                     // prepend the ACE prefix
327                     dest.insert(0, ACE_PREFIX);
328                 } else {
329                     throw new IllegalArgumentException("The input starts with the ACE Prefix");
330                 }
331 
332             }
333         }
334 
335         // step 8
336         // the length must be inside 1..63
337         if (dest.length() > MAX_LABEL_LENGTH) {
338             throw new IllegalArgumentException("The label in the input is too long");
339         }
340 
341         return dest.toString();
342     }
343 
344     //
345     // toUnicode operation; should only apply to a single label
346     //
347     private static String toUnicodeInternal(String label, int flag) {
348         boolean[] caseFlags = null;
349         StringBuffer dest;
350 
351         // step 1
352         // find out if all the codepoints in input are ASCII
353         boolean isASCII = isAllASCII(label);
354 
355         if(!isASCII){
356             // step 2
357             // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
358             try {
359                 UCharacterIterator iter = UCharacterIterator.getInstance(label);
360                 dest = namePrep.prepare(iter, flag);
361             } catch (Exception e) {
362                 // toUnicode never fails; if any step fails, return the input string
363                 return label;
364             }
365         } else {
366             dest = new StringBuffer(label);
367         }
368 
369         // step 3
370         // verify ACE Prefix
371         if(startsWithACEPrefix(dest)) {
372 
373             // step 4
374             // Remove the ACE Prefix
375             String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length());
376 
377             try {
378                 // step 5
379                 // Decode using punycode
380                 StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null);
381 
382                 // step 6
383                 // Apply toASCII
384                 String toASCIIOut = toASCII(decodeOut.toString(), flag);
385 
386                 // step 7
387                 // verify
388                 if (toASCIIOut.equalsIgnoreCase(dest.toString())) {
389                     // step 8
390                     // return output of step 5
391                     return decodeOut.toString();
392                 }
393             } catch (Exception ignored) {
394                 // no-op
395             }
396         }
397 
398         // just return the input
399         return label;
400     }
401 
402 
403     //
404     // LDH stands for "letter/digit/hyphen", with characters restricted to the
405     // 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen
406     // <->.
407     // Non LDH refers to characters in the ASCII range, but which are not
    // letters, digits or the hyphen.
409     //
410     // non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x5B..0x60, 0x7B..0x7F
411     //
412     private static boolean isNonLDHAsciiCodePoint(int ch){
413         return (0x0000 <= ch && ch <= 0x002C) ||
414                (0x002E <= ch && ch <= 0x002F) ||
415                (0x003A <= ch && ch <= 0x0040) ||
416                (0x005B <= ch && ch <= 0x0060) ||
417                (0x007B <= ch && ch <= 0x007F);
418     }
419 
420     //
421     // search dots in a string and return the index of that character;
    // or if there are no dots, return the length of input string
423     // dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
424     // and \uFF61 (halfwidth ideographic full stop).
425     //
426     private static int searchDots(String s, int start) {
427         int i;
428         for (i = start; i < s.length(); i++) {
429             if (isLabelSeparator(s.charAt(i))) {
430                 break;
431             }
432         }
433 
434         return i;
435     }
436 
437     //
438     // to check if a string is a root label, ".".
439     //
440     private static boolean isRootLabel(String s) {
441         return (s.length() == 1 && isLabelSeparator(s.charAt(0)));
442     }
443 
444     //
445     // to check if a character is a label separator, i.e. a dot character.
446     //
447     private static boolean isLabelSeparator(char c) {
448         return (c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61');
449     }
450 
451     //
452     // to check if a string only contains US-ASCII code point
453     //
454     private static boolean isAllASCII(String input) {
455         boolean isASCII = true;
456         for (int i = 0; i < input.length(); i++) {
457             int c = input.charAt(i);
458             if (c > 0x7F) {
459                 isASCII = false;
460                 break;
461             }
462         }
463         return isASCII;
464     }
465 
466     //
467     // to check if a string starts with ACE-prefix
468     //
469     private static boolean startsWithACEPrefix(StringBuffer input){
470         boolean startsWithPrefix = true;
471 
472         if(input.length() < ACE_PREFIX_LENGTH){
473             return false;
474         }
475         for(int i = 0; i < ACE_PREFIX_LENGTH; i++){
476             if(toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)){
477                 startsWithPrefix = false;
478             }
479         }
480         return startsWithPrefix;
481     }
482 
483     private static char toASCIILower(char ch){
484         if('A' <= ch && ch <= 'Z'){
485             return (char)(ch + 'a' - 'A');
486         }
487         return ch;
488     }
489 
490     private static StringBuffer toASCIILower(StringBuffer input){
491         StringBuffer dest = new StringBuffer();
492         for(int i = 0; i < input.length();i++){
493             dest.append(toASCIILower(input.charAt(i)));
494         }
495         return dest;
496     }
497     */
498 }
499