1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2003-2014, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.impl;
10 
11 import com.ibm.icu.lang.UCharacter;
12 import com.ibm.icu.text.StringPrepParseException;
13 import com.ibm.icu.text.UTF16;
14 
15 /**
16  * Ported code from ICU punycode.c
17  * @author ram
18  */
19 public final class Punycode {
20 
21     /* Punycode parameters for Bootstring */
22     private static final int BASE           = 36;
23     private static final int TMIN           = 1;
24     private static final int TMAX           = 26;
25     private static final int SKEW           = 38;
26     private static final int DAMP           = 700;
27     private static final int INITIAL_BIAS   = 72;
28     private static final int INITIAL_N      = 0x80;
29 
30     /* "Basic" Unicode/ASCII code points */
31     private static final char HYPHEN        = 0x2d;
32     private static final char DELIMITER     = HYPHEN;
33 
34     private static final int ZERO           = 0x30;
35     //private static final int NINE           = 0x39;
36 
37     private static final int SMALL_A        = 0x61;
38     private static final int SMALL_Z        = 0x7a;
39 
40     private static final int CAPITAL_A      = 0x41;
41     private static final int CAPITAL_Z      = 0x5a;
42 
adaptBias(int delta, int length, boolean firstTime)43     private static int adaptBias(int delta, int length, boolean firstTime){
44         if(firstTime){
45             delta /=DAMP;
46         }else{
47             delta /=  2;
48         }
49         delta += delta/length;
50 
51         int count=0;
52         for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
53             delta/=(BASE-TMIN);
54         }
55 
56         return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
57     }
58 
59     /**
60      * basicToDigit[] contains the numeric value of a basic code
61      * point (for use in representing integers) in the range 0 to
62      * BASE-1, or -1 if b is does not represent a value.
63      */
64     static final int[]    basicToDigit= new int[]{
65         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
66         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
67 
68         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
69         26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
70 
71         -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
72         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
73 
74         -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
75         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
76 
77         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
78         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
79 
80         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
81         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
82 
83         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
84         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
85 
86         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
87         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
88     };
89 
90     ///CLOVER:OFF
asciiCaseMap(char b, boolean uppercase)91     private static char asciiCaseMap(char b, boolean uppercase) {
92         if(uppercase) {
93             if(SMALL_A<=b && b<=SMALL_Z) {
94                 b-=(SMALL_A-CAPITAL_A);
95             }
96         } else {
97             if(CAPITAL_A<=b && b<=CAPITAL_Z) {
98                 b+=(SMALL_A-CAPITAL_A);
99             }
100         }
101         return b;
102     }
103     ///CLOVER:ON
104     /**
105      * digitToBasic() returns the basic code point whose value
106      * (when used for representing integers) is d, which must be in the
107      * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
108      * nonzero, in which case the uppercase form is used.
109      */
digitToBasic(int digit, boolean uppercase)110     private static char digitToBasic(int digit, boolean uppercase) {
111         /*  0..25 map to ASCII a..z or A..Z */
112         /* 26..35 map to ASCII 0..9         */
113         if(digit<26) {
114             if(uppercase) {
115                 return (char)(CAPITAL_A+digit);
116             } else {
117                 return (char)(SMALL_A+digit);
118             }
119         } else {
120             return (char)((ZERO-26)+digit);
121         }
122     }
123     /**
124      * Converts Unicode to Punycode.
125      * The input string must not contain single, unpaired surrogates.
126      * The output will be represented as an array of ASCII code points.
127      *
128      * @param src The source of the String Buffer passed.
129      * @param caseFlags The boolean array of case flags.
130      * @return An array of ASCII code points.
131      */
encode(CharSequence src, boolean[] caseFlags)132     public static StringBuilder encode(CharSequence src, boolean[] caseFlags) throws StringPrepParseException{
133         int n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, srcCPCount;
134         char c, c2;
135         int srcLength = src.length();
136         int[] cpBuffer = new int[srcLength];
137         StringBuilder dest = new StringBuilder(srcLength);
138         /*
139          * Handle the basic code points and
140          * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
141          */
142         srcCPCount=0;
143 
144         for(j=0; j<srcLength; ++j) {
145             c=src.charAt(j);
146             if(isBasic(c)) {
147                 cpBuffer[srcCPCount++]=0;
148                 dest.append(caseFlags!=null ? asciiCaseMap(c, caseFlags[j]) : c);
149             } else {
150                 n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L;
151                 if(!UTF16.isSurrogate(c)) {
152                     n|=c;
153                 } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
154                     ++j;
155 
156                     n|=UCharacter.getCodePoint(c, c2);
157                 } else {
158                     /* error: unmatched surrogate */
159                     throw new StringPrepParseException("Illegal char found",StringPrepParseException.ILLEGAL_CHAR_FOUND);
160                 }
161                 cpBuffer[srcCPCount++]=n;
162             }
163         }
164 
165         /* Finish the basic string - if it is not empty - with a delimiter. */
166         basicLength=dest.length();
167         if(basicLength>0) {
168             dest.append(DELIMITER);
169         }
170 
171         /*
172          * handledCPCount is the number of code points that have been handled
173          * basicLength is the number of basic code points
174          * destLength is the number of chars that have been output
175          */
176 
177         /* Initialize the state: */
178         n=INITIAL_N;
179         delta=0;
180         bias=INITIAL_BIAS;
181 
182         /* Main encoding loop: */
183         for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
184             /*
185              * All non-basic code points < n have been handled already.
186              * Find the next larger one:
187              */
188             for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
189                 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
190                 if(n<=q && q<m) {
191                     m=q;
192                 }
193             }
194 
195             /*
196              * Increase delta enough to advance the decoder's
197              * <n,i> state to <m,0>, but guard against overflow:
198              */
199             if(m-n>(0x7fffffff-delta)/(handledCPCount+1)) {
200                 throw new IllegalStateException("Internal program error");
201             }
202             delta+=(m-n)*(handledCPCount+1);
203             n=m;
204 
205             /* Encode a sequence of same code points n */
206             for(j=0; j<srcCPCount; ++j) {
207                 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
208                 if(q<n) {
209                     ++delta;
210                 } else if(q==n) {
211                     /* Represent delta as a generalized variable-length integer: */
212                     for(q=delta, k=BASE; /* no condition */; k+=BASE) {
213 
214                         /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
215 
216                         t=k-bias;
217                         if(t<TMIN) {
218                             t=TMIN;
219                         } else if(t>TMAX) {
220                             t=TMAX;
221                         }
222                         */
223 
224                         t=k-bias;
225                         if(t<TMIN) {
226                             t=TMIN;
227                         } else if(k>=(bias+TMAX)) {
228                             t=TMAX;
229                         }
230 
231                         if(q<t) {
232                             break;
233                         }
234 
235                         dest.append(digitToBasic(t+(q-t)%(BASE-t), false));
236                         q=(q-t)/(BASE-t);
237                     }
238 
239                     dest.append(digitToBasic(q, (cpBuffer[j]<0)));
240                     bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
241                     delta=0;
242                     ++handledCPCount;
243                 }
244             }
245 
246             ++delta;
247             ++n;
248         }
249 
250         return dest;
251     }
252 
isBasic(int ch)253     private static boolean isBasic(int ch){
254         return (ch < INITIAL_N);
255     }
256     ///CLOVER:OFF
isBasicUpperCase(int ch)257     private static boolean isBasicUpperCase(int ch){
258         return( CAPITAL_A<=ch && ch >= CAPITAL_Z);
259     }
260     ///CLOVER:ON
isSurrogate(int ch)261     private static boolean isSurrogate(int ch){
262         return (((ch)&0xfffff800)==0xd800);
263     }
264     /**
265      * Converts Punycode to Unicode.
266      * The Unicode string will be at most as long as the Punycode string.
267      *
268      * @param src The source of the string buffer being passed.
269      * @param caseFlags The array of boolean case flags.
270      * @return StringBuilder string.
271      */
decode(CharSequence src, boolean[] caseFlags)272     public static StringBuilder decode(CharSequence src, boolean[] caseFlags)
273                                throws StringPrepParseException{
274         int srcLength = src.length();
275         StringBuilder dest = new StringBuilder(src.length());
276         int n, i, bias, basicLength, j, in, oldi, w, k, digit, t,
277                 destCPCount, firstSupplementaryIndex, cpLength;
278         char b;
279 
280         /*
281          * Handle the basic code points:
282          * Let basicLength be the number of input code points
283          * before the last delimiter, or 0 if there is none,
284          * then copy the first basicLength code points to the output.
285          *
286          * The following loop iterates backward.
287          */
288         for(j=srcLength; j>0;) {
289             if(src.charAt(--j)==DELIMITER) {
290                 break;
291             }
292         }
293         basicLength=destCPCount=j;
294 
295         for(j=0; j<basicLength; ++j) {
296             b=src.charAt(j);
297             if(!isBasic(b)) {
298                 throw new StringPrepParseException("Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND);
299             }
300             dest.append(b);
301 
302             if(caseFlags!=null && j<caseFlags.length) {
303                 caseFlags[j]=isBasicUpperCase(b);
304             }
305         }
306 
307         /* Initialize the state: */
308         n=INITIAL_N;
309         i=0;
310         bias=INITIAL_BIAS;
311         firstSupplementaryIndex=1000000000;
312 
313         /*
314          * Main decoding loop:
315          * Start just after the last delimiter if any
316          * basic code points were copied; start at the beginning otherwise.
317          */
318         for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
319             /*
320              * in is the index of the next character to be consumed, and
321              * destCPCount is the number of code points in the output array.
322              *
323              * Decode a generalized variable-length integer into delta,
324              * which gets added to i.  The overflow checking is easier
325              * if we increase i as we go, then subtract off its starting
326              * value at the end to obtain delta.
327              */
328             for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
329                 if(in>=srcLength) {
330                     throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
331                 }
332 
333                 digit=basicToDigit[src.charAt(in++) & 0xFF];
334                 if(digit<0) {
335                     throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND);
336                 }
337                 if(digit>(0x7fffffff-i)/w) {
338                     /* integer overflow */
339                     throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
340                 }
341 
342                 i+=digit*w;
343                 t=k-bias;
344                 if(t<TMIN) {
345                     t=TMIN;
346                 } else if(k>=(bias+TMAX)) {
347                     t=TMAX;
348                 }
349                 if(digit<t) {
350                     break;
351                 }
352 
353                 if(w>0x7fffffff/(BASE-t)) {
354                     /* integer overflow */
355                     throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
356                 }
357                 w*=BASE-t;
358             }
359 
360             /*
361              * Modification from sample code:
362              * Increments destCPCount here,
363              * where needed instead of in for() loop tail.
364              */
365             ++destCPCount;
366             bias=adaptBias(i-oldi, destCPCount, (oldi==0));
367 
368             /*
369              * i was supposed to wrap around from (incremented) destCPCount to 0,
370              * incrementing n each time, so we'll fix that now:
371              */
372             if(i/destCPCount>(0x7fffffff-n)) {
373                 /* integer overflow */
374                 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
375             }
376 
377             n+=i/destCPCount;
378             i%=destCPCount;
379             /* not needed for Punycode: */
380             /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
381 
382             if(n>0x10ffff || isSurrogate(n)) {
383                 /* Unicode code point overflow */
384                 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
385             }
386 
387             /* Insert n at position i of the output: */
388             cpLength=Character.charCount(n);
389             int codeUnitIndex;
390 
391             /*
392              * Handle indexes when supplementary code points are present.
393              *
394              * In almost all cases, there will be only BMP code points before i
395              * and even in the entire string.
396              * This is handled with the same efficiency as with UTF-32.
397              *
398              * Only the rare cases with supplementary code points are handled
399              * more slowly - but not too bad since this is an insertion anyway.
400              */
401             if(i<=firstSupplementaryIndex) {
402                 codeUnitIndex=i;
403                 if(cpLength>1) {
404                     firstSupplementaryIndex=codeUnitIndex;
405                 } else {
406                     ++firstSupplementaryIndex;
407                 }
408             } else {
409                 codeUnitIndex=dest.offsetByCodePoints(firstSupplementaryIndex, i-firstSupplementaryIndex);
410             }
411 
412             /* use the UChar index codeUnitIndex instead of the code point index i */
413             if(caseFlags!=null && (dest.length()+cpLength)<=caseFlags.length) {
414                 if(codeUnitIndex<dest.length()) {
415                     System.arraycopy(caseFlags, codeUnitIndex,
416                                      caseFlags, codeUnitIndex+cpLength,
417                                      dest.length()-codeUnitIndex);
418                 }
419                 /* Case of last character determines uppercase flag: */
420                 caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
421                 if(cpLength==2) {
422                     caseFlags[codeUnitIndex+1]=false;
423                 }
424             }
425             if(cpLength==1) {
426                 /* BMP, insert one code unit */
427                 dest.insert(codeUnitIndex, (char)n);
428             } else {
429                 /* supplementary character, insert two code units */
430                 dest.insert(codeUnitIndex, UTF16.getLeadSurrogate(n));
431                 dest.insert(codeUnitIndex+1, UTF16.getTrailSurrogate(n));
432             }
433             ++i;
434         }
435         return dest;
436     }
437 }
438