1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4 *******************************************************************************
5 * Copyright (C) 2003-2010, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 */
9 package com.ibm.icu.impl;
10 
11 import com.ibm.icu.text.IDNA;
12 import com.ibm.icu.text.StringPrep;
13 import com.ibm.icu.text.StringPrepParseException;
14 import com.ibm.icu.text.UCharacterIterator;
15 
16 /**
17  * IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java
18  * while extending that class to support IDNA2008/UTS #46 as well.
19  * @author Ram Viswanadha
20  */
21 public final class IDNA2003 {
22     /* IDNA ACE Prefix is "xn--" */
23     private static char[] ACE_PREFIX                = new char[]{ 0x0078,0x006E,0x002d,0x002d } ;
24     //private static final int ACE_PREFIX_LENGTH      = ACE_PREFIX.length;
25 
26     private static final int MAX_LABEL_LENGTH       = 63;
27     private static final int HYPHEN                 = 0x002D;
28     private static final int CAPITAL_A              = 0x0041;
29     private static final int CAPITAL_Z              = 0x005A;
30     private static final int LOWER_CASE_DELTA       = 0x0020;
31     private static final int FULL_STOP              = 0x002E;
32     private static final int MAX_DOMAIN_NAME_LENGTH = 255;
33 
34     // The NamePrep profile object
35     private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP);
36 
startsWithPrefix(StringBuffer src)37     private static boolean startsWithPrefix(StringBuffer src){
38         boolean startsWithPrefix = true;
39 
40         if(src.length() < ACE_PREFIX.length){
41             return false;
42         }
43         for(int i=0; i<ACE_PREFIX.length;i++){
44             if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){
45                 startsWithPrefix = false;
46             }
47         }
48         return startsWithPrefix;
49     }
50 
toASCIILower(char ch)51     private static char toASCIILower(char ch){
52         if(CAPITAL_A <= ch && ch <= CAPITAL_Z){
53             return (char)(ch + LOWER_CASE_DELTA);
54         }
55         return ch;
56     }
57 
toASCIILower(CharSequence src)58     private static StringBuffer toASCIILower(CharSequence src){
59         StringBuffer dest = new StringBuffer();
60         for(int i=0; i<src.length();i++){
61             dest.append(toASCIILower(src.charAt(i)));
62         }
63         return dest;
64     }
65 
compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2)66     private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){
67         char c1,c2;
68         int rc;
69         for(int i =0;/* no condition */;i++) {
70             /* If we reach the ends of both strings then they match */
71             if(i == s1.length()) {
72                 return 0;
73             }
74 
75             c1 = s1.charAt(i);
76             c2 = s2.charAt(i);
77 
78             /* Case-insensitive comparison */
79             if(c1!=c2) {
80                 rc=toASCIILower(c1)-toASCIILower(c2);
81                 if(rc!=0) {
82                     return rc;
83                 }
84             }
85         }
86     }
87 
getSeparatorIndex(char[] src,int start, int limit)88     private static int getSeparatorIndex(char[] src,int start, int limit){
89         for(; start<limit;start++){
90             if(isLabelSeparator(src[start])){
91                 return start;
92             }
93         }
94         // we have not found the separator just return length
95         return start;
96     }
97 
98     /*
99     private static int getSeparatorIndex(UCharacterIterator iter){
100         int currentIndex = iter.getIndex();
101         int separatorIndex = 0;
102         int ch;
103         while((ch=iter.next())!= UCharacterIterator.DONE){
104             if(isLabelSeparator(ch)){
105                 separatorIndex = iter.getIndex();
106                 iter.setIndex(currentIndex);
107                 return separatorIndex;
108             }
109         }
110         // reset index
111         iter.setIndex(currentIndex);
112         // we have not found the separator just return the length
113 
114     }
115     */
116 
117 
isLDHChar(int ch)118     private static boolean isLDHChar(int ch){
119         // high runner case
120         if(ch>0x007A){
121             return false;
122         }
123         //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
124         if( (ch==0x002D) ||
125             (0x0030 <= ch && ch <= 0x0039) ||
126             (0x0041 <= ch && ch <= 0x005A) ||
127             (0x0061 <= ch && ch <= 0x007A)
128           ){
129             return true;
130         }
131         return false;
132     }
133 
134     /**
135      * Ascertain if the given code point is a label separator as
136      * defined by the IDNA RFC
137      *
138      * @param ch The code point to be ascertained
139      * @return true if the char is a label separator
140      * @stable ICU 2.8
141      */
isLabelSeparator(int ch)142     private static boolean isLabelSeparator(int ch){
143         switch(ch){
144             case 0x002e:
145             case 0x3002:
146             case 0xFF0E:
147             case 0xFF61:
148                 return true;
149             default:
150                 return false;
151         }
152     }
153 
convertToASCII(UCharacterIterator src, int options)154     public static StringBuffer convertToASCII(UCharacterIterator src, int options)
155             throws StringPrepParseException{
156 
157         boolean[] caseFlags = null;
158 
159         // the source contains all ascii codepoints
160         boolean srcIsASCII  = true;
161         // assume the source contains all LDH codepoints
162         boolean srcIsLDH = true;
163 
164         //get the options
165         boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0);
166         int ch;
167         // step 1
168         while((ch = src.next())!= UCharacterIterator.DONE){
169             if(ch> 0x7f){
170                 srcIsASCII = false;
171             }
172         }
173         int failPos = -1;
174         src.setToStart();
175         StringBuffer processOut = null;
176         // step 2 is performed only if the source contains non ASCII
177         if(!srcIsASCII){
178             // step 2
179             processOut = namePrep.prepare(src, options);
180         }else{
181             processOut = new StringBuffer(src.getText());
182         }
183         int poLen = processOut.length();
184 
185         if(poLen==0){
186             throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
187         }
188         StringBuffer dest = new StringBuffer();
189 
190         // reset the variable to verify if output of prepare is ASCII or not
191         srcIsASCII = true;
192 
193         // step 3 & 4
194         for(int j=0;j<poLen;j++ ){
195             ch=processOut.charAt(j);
196             if(ch > 0x7F){
197                 srcIsASCII = false;
198             }else if(isLDHChar(ch)==false){
199                 // here we do not assemble surrogates
200                 // since we know that LDH code points
201                 // are in the ASCII range only
202                 srcIsLDH = false;
203                 failPos = j;
204             }
205         }
206 
207         if(useSTD3ASCIIRules == true){
208             // verify 3a and 3b
209             if( srcIsLDH == false /* source contains some non-LDH characters */
210                 || processOut.charAt(0) ==  HYPHEN
211                 || processOut.charAt(processOut.length()-1) == HYPHEN){
212 
213                 /* populate the parseError struct */
214                 if(srcIsLDH==false){
215                      throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules",
216                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,
217                                               processOut.toString(),
218                                              (failPos>0) ? (failPos-1) : failPos);
219                 }else if(processOut.charAt(0) == HYPHEN){
220                     throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
221                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0);
222 
223                 }else{
224                      throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
225                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,
226                                               processOut.toString(),
227                                               (poLen>0) ? poLen-1 : poLen);
228 
229                 }
230             }
231         }
232         if(srcIsASCII){
233             dest =  processOut;
234         }else{
235             // step 5 : verify the sequence does not begin with ACE prefix
236             if(!startsWithPrefix(processOut)){
237 
238                 //step 6: encode the sequence with punycode
239                 caseFlags = new boolean[poLen];
240 
241                 StringBuilder punyout = Punycode.encode(processOut,caseFlags);
242 
243                 // convert all codepoints to lower case ASCII
244                 StringBuffer lowerOut = toASCIILower(punyout);
245 
246                 //Step 7: prepend the ACE prefix
247                 dest.append(ACE_PREFIX,0,ACE_PREFIX.length);
248                 //Step 6: copy the contents in b2 into dest
249                 dest.append(lowerOut);
250             }else{
251 
252                 throw new StringPrepParseException("The input does not start with the ACE Prefix.",
253                                          StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0);
254             }
255         }
256         if(dest.length() > MAX_LABEL_LENGTH){
257             throw new StringPrepParseException("The labels in the input are too long. Length > 63.",
258                                      StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0);
259         }
260         return dest;
261     }
262 
convertIDNToASCII(String src,int options)263     public static StringBuffer convertIDNToASCII(String src,int options)
264             throws StringPrepParseException{
265 
266         char[] srcArr = src.toCharArray();
267         StringBuffer result = new StringBuffer();
268         int sepIndex=0;
269         int oldSepIndex=0;
270         for(;;){
271             sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
272             String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
273             //make sure this is not a root label separator.
274             if(!(label.length()==0 && sepIndex==srcArr.length)){
275                 UCharacterIterator iter = UCharacterIterator.getInstance(label);
276                 result.append(convertToASCII(iter,options));
277             }
278             if(sepIndex==srcArr.length){
279                 break;
280             }
281 
282             // increment the sepIndex to skip past the separator
283             sepIndex++;
284             oldSepIndex = sepIndex;
285             result.append((char)FULL_STOP);
286         }
287         if(result.length() > MAX_DOMAIN_NAME_LENGTH){
288             throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
289         }
290         return result;
291     }
292 
convertToUnicode(UCharacterIterator src, int options)293     public static StringBuffer convertToUnicode(UCharacterIterator src, int options)
294             throws StringPrepParseException{
295 
296         boolean[] caseFlags = null;
297 
298         // the source contains all ascii codepoints
299         boolean srcIsASCII  = true;
300         // assume the source contains all LDH codepoints
301         //boolean srcIsLDH = true;
302 
303         //get the options
304         //boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
305 
306         //int failPos = -1;
307         int ch;
308         int saveIndex = src.getIndex();
309         // step 1: find out if all the codepoints in src are ASCII
310         while((ch=src.next())!= UCharacterIterator.DONE){
311             if(ch>0x7F){
312                 srcIsASCII = false;
313             }/*else if((srcIsLDH = isLDHChar(ch))==false){
314                 failPos = src.getIndex();
315             }*/
316         }
317         StringBuffer processOut;
318 
319         if(srcIsASCII == false){
320             try {
321                 // step 2: process the string
322                 src.setIndex(saveIndex);
323                 processOut = namePrep.prepare(src,options);
324             } catch (StringPrepParseException ex) {
325                 return new StringBuffer(src.getText());
326             }
327 
328         }else{
329             //just point to source
330             processOut = new StringBuffer(src.getText());
331         }
332         // TODO:
333         // The RFC states that
334         // <quote>
335         // ToUnicode never fails. If any step fails, then the original input
336         // is returned immediately in that step.
337         // </quote>
338 
339         //step 3: verify ACE Prefix
340         if(startsWithPrefix(processOut)){
341             StringBuffer decodeOut = null;
342 
343             //step 4: Remove the ACE Prefix
344             String temp = processOut.substring(ACE_PREFIX.length,processOut.length());
345 
346             //step 5: Decode using punycode
347             try {
348                 decodeOut = new StringBuffer(Punycode.decode(temp,caseFlags));
349             } catch (StringPrepParseException e) {
350                 decodeOut = null;
351             }
352 
353             //step 6:Apply toASCII
354             if (decodeOut != null) {
355                 StringBuffer toASCIIOut = convertToASCII(UCharacterIterator.getInstance(decodeOut), options);
356 
357                 //step 7: verify
358                 if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){
359 //                    throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
360 //                                             StringPrepParseException.VERIFICATION_ERROR);
361                     decodeOut = null;
362                 }
363             }
364 
365             //step 8: return output of step 5
366              if (decodeOut != null) {
367                  return decodeOut;
368              }
369         }
370 
371 //        }else{
372 //            // verify that STD3 ASCII rules are satisfied
373 //            if(useSTD3ASCIIRules == true){
374 //                if( srcIsLDH == false /* source contains some non-LDH characters */
375 //                    || processOut.charAt(0) ==  HYPHEN
376 //                    || processOut.charAt(processOut.length()-1) == HYPHEN){
377 //
378 //                    if(srcIsLDH==false){
379 //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
380 //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
381 //                                                 (failPos>0) ? (failPos-1) : failPos);
382 //                    }else if(processOut.charAt(0) == HYPHEN){
383 //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
384 //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,
385 //                                                 processOut.toString(),0);
386 //
387 //                    }else{
388 //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
389 //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,
390 //                                                 processOut.toString(),
391 //                                                 processOut.length());
392 //
393 //                    }
394 //                }
395 //            }
396 //            // just return the source
397 //            return new StringBuffer(src.getText());
398 //        }
399 
400         return new StringBuffer(src.getText());
401     }
402 
convertIDNToUnicode(String src, int options)403     public static StringBuffer convertIDNToUnicode(String src, int options)
404             throws StringPrepParseException{
405 
406         char[] srcArr = src.toCharArray();
407         StringBuffer result = new StringBuffer();
408         int sepIndex=0;
409         int oldSepIndex=0;
410         for(;;){
411             sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
412             String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
413             if(label.length()==0 && sepIndex!=srcArr.length ){
414                 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
415             }
416             UCharacterIterator iter = UCharacterIterator.getInstance(label);
417             result.append(convertToUnicode(iter,options));
418             if(sepIndex==srcArr.length){
419                 break;
420             }
421             // Unlike the ToASCII operation we don't normalize the label separators
422             result.append(srcArr[sepIndex]);
423             // increment the sepIndex to skip past the separator
424             sepIndex++;
425             oldSepIndex =sepIndex;
426         }
427         if(result.length() > MAX_DOMAIN_NAME_LENGTH){
428             throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
429         }
430         return result;
431     }
432 
compare(String s1, String s2, int options)433     public static int compare(String s1, String s2, int options) throws StringPrepParseException{
434         StringBuffer s1Out = convertIDNToASCII(s1, options);
435         StringBuffer s2Out = convertIDNToASCII(s2, options);
436         return compareCaseInsensitiveASCII(s1Out,s2Out);
437     }
438 }
439