1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2002-2011, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: punycode.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2002jan31
16 * created by: Markus W. Scherer
17 */
18
19
20 /* This ICU code derived from: */
21 /*
22 punycode.c 0.4.0 (2001-Nov-17-Sat)
23 http://www.cs.berkeley.edu/~amc/idn/
24 Adam M. Costello
25 http://www.nicemice.net/amc/
26
27 Disclaimer and license
28
29 Regarding this entire document or any portion of it (including
30 the pseudocode and C code), the author makes no guarantees and
31 is not responsible for any damage resulting from its use. The
32 author grants irrevocable permission to anyone to use, modify,
33 and distribute it in any way that does not diminish the rights
34 of anyone else to use, modify, and distribute it, provided that
35 redistributed derivative works do not contain misleading author or
36 version information. Derivative works need not be licensed under
37 similar terms.
38 */
39 /*
40 * ICU modifications:
41 * - ICU data types and coding conventions
42 * - ICU string buffer handling with implicit source lengths
43 * and destination preflighting
44 * - UTF-16 handling
45 */
46
47 #include "unicode/utypes.h"
48
49 #if !UCONFIG_NO_IDNA
50
51 #include "unicode/ustring.h"
52 #include "unicode/utf.h"
53 #include "unicode/utf16.h"
54 #include "ustr_imp.h"
55 #include "cstring.h"
56 #include "cmemory.h"
57 #include "punycode.h"
58 #include "uassert.h"
59
60
61 /* Punycode ----------------------------------------------------------------- */
62
63 /* Punycode parameters for Bootstring */
64 #define BASE 36
65 #define TMIN 1
66 #define TMAX 26
67 #define SKEW 38
68 #define DAMP 700
69 #define INITIAL_BIAS 72
70 #define INITIAL_N 0x80
71
72 /* "Basic" Unicode/ASCII code points */
73 #define _HYPHEN 0X2d
74 #define DELIMITER _HYPHEN
75
76 #define _ZERO_ 0X30
77 #define _NINE 0x39
78
79 #define _SMALL_A 0X61
80 #define _SMALL_Z 0X7a
81
82 #define _CAPITAL_A 0X41
83 #define _CAPITAL_Z 0X5a
84
85 #define IS_BASIC(c) ((c)<0x80)
86 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
87
88 /**
89 * digitToBasic() returns the basic code point whose value
90 * (when used for representing integers) is d, which must be in the
91 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
92 * nonzero, in which case the uppercase form is used.
93 */
94 static inline char
digitToBasic(int32_t digit,UBool uppercase)95 digitToBasic(int32_t digit, UBool uppercase) {
96 /* 0..25 map to ASCII a..z or A..Z */
97 /* 26..35 map to ASCII 0..9 */
98 if(digit<26) {
99 if(uppercase) {
100 return (char)(_CAPITAL_A+digit);
101 } else {
102 return (char)(_SMALL_A+digit);
103 }
104 } else {
105 return (char)((_ZERO_-26)+digit);
106 }
107 }
108
109 /**
110 * @return the numeric value of a basic code point (for use in representing integers)
111 * in the range 0 to BASE-1, or a negative value if cp is invalid.
112 */
decodeDigit(int32_t cp)113 static int32_t decodeDigit(int32_t cp) {
114 if(cp<=u'Z') {
115 if(cp<=u'9') {
116 if(cp<u'0') {
117 return -1;
118 } else {
119 return cp-u'0'+26; // 0..9 -> 26..35
120 }
121 } else {
122 return cp-u'A'; // A-Z -> 0..25
123 }
124 } else if(cp<=u'z') {
125 return cp-'a'; // a..z -> 0..25
126 } else {
127 return -1;
128 }
129 }
130
131 static inline char
asciiCaseMap(char b,UBool uppercase)132 asciiCaseMap(char b, UBool uppercase) {
133 if(uppercase) {
134 if(_SMALL_A<=b && b<=_SMALL_Z) {
135 b-=(_SMALL_A-_CAPITAL_A);
136 }
137 } else {
138 if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
139 b+=(_SMALL_A-_CAPITAL_A);
140 }
141 }
142 return b;
143 }
144
145 /* Punycode-specific Bootstring code ---------------------------------------- */
146
147 /*
148 * The following code omits the {parts} of the pseudo-algorithm in the spec
149 * that are not used with the Punycode parameter set.
150 */
151
152 /* Bias adaptation function. */
153 static int32_t
adaptBias(int32_t delta,int32_t length,UBool firstTime)154 adaptBias(int32_t delta, int32_t length, UBool firstTime) {
155 int32_t count;
156
157 if(firstTime) {
158 delta/=DAMP;
159 } else {
160 delta/=2;
161 }
162
163 delta+=delta/length;
164 for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
165 delta/=(BASE-TMIN);
166 }
167
168 return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
169 }
170
171 namespace {
172
173 // ICU-13727: Limit input length for n^2 algorithm
174 // where well-formed strings are at most 59 characters long.
175 constexpr int32_t ENCODE_MAX_CODE_UNITS=1000;
176 constexpr int32_t DECODE_MAX_CHARS=2000;
177
178 } // namespace
179
180 // encode
181 U_CAPI int32_t
u_strToPunycode(const UChar * src,int32_t srcLength,UChar * dest,int32_t destCapacity,const UBool * caseFlags,UErrorCode * pErrorCode)182 u_strToPunycode(const UChar *src, int32_t srcLength,
183 UChar *dest, int32_t destCapacity,
184 const UBool *caseFlags,
185 UErrorCode *pErrorCode) {
186
187 int32_t cpBuffer[ENCODE_MAX_CODE_UNITS];
188 int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
189 UChar c, c2;
190
191 /* argument checking */
192 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
193 return 0;
194 }
195
196 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
197 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
198 return 0;
199 }
200 if (srcLength>ENCODE_MAX_CODE_UNITS) {
201 *pErrorCode=U_INPUT_TOO_LONG_ERROR;
202 return 0;
203 }
204
205 /*
206 * Handle the basic code points and
207 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
208 */
209 srcCPCount=destLength=0;
210 if(srcLength==-1) {
211 /* NUL-terminated input */
212 for(j=0; /* no condition */; ++j) {
213 if((c=src[j])==0) {
214 break;
215 }
216 if(j>=ENCODE_MAX_CODE_UNITS) {
217 *pErrorCode=U_INPUT_TOO_LONG_ERROR;
218 return 0;
219 }
220 if(IS_BASIC(c)) {
221 cpBuffer[srcCPCount++]=0;
222 if(destLength<destCapacity) {
223 dest[destLength]=
224 caseFlags!=NULL ?
225 asciiCaseMap((char)c, caseFlags[j]) :
226 (char)c;
227 }
228 ++destLength;
229 } else {
230 n=(caseFlags!=NULL && caseFlags[j])<<31L;
231 if(U16_IS_SINGLE(c)) {
232 n|=c;
233 } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
234 ++j;
235 n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
236 } else {
237 /* error: unmatched surrogate */
238 *pErrorCode=U_INVALID_CHAR_FOUND;
239 return 0;
240 }
241 cpBuffer[srcCPCount++]=n;
242 }
243 }
244 } else {
245 /* length-specified input */
246 for(j=0; j<srcLength; ++j) {
247 c=src[j];
248 if(IS_BASIC(c)) {
249 cpBuffer[srcCPCount++]=0;
250 if(destLength<destCapacity) {
251 dest[destLength]=
252 caseFlags!=NULL ?
253 asciiCaseMap((char)c, caseFlags[j]) :
254 (char)c;
255 }
256 ++destLength;
257 } else {
258 n=(caseFlags!=NULL && caseFlags[j])<<31L;
259 if(U16_IS_SINGLE(c)) {
260 n|=c;
261 } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
262 ++j;
263 n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
264 } else {
265 /* error: unmatched surrogate */
266 *pErrorCode=U_INVALID_CHAR_FOUND;
267 return 0;
268 }
269 cpBuffer[srcCPCount++]=n;
270 }
271 }
272 }
273
274 /* Finish the basic string - if it is not empty - with a delimiter. */
275 basicLength=destLength;
276 if(basicLength>0) {
277 if(destLength<destCapacity) {
278 dest[destLength]=DELIMITER;
279 }
280 ++destLength;
281 }
282
283 /*
284 * handledCPCount is the number of code points that have been handled
285 * basicLength is the number of basic code points
286 * destLength is the number of chars that have been output
287 */
288
289 /* Initialize the state: */
290 n=INITIAL_N;
291 delta=0;
292 bias=INITIAL_BIAS;
293
294 /* Main encoding loop: */
295 for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
296 /*
297 * All non-basic code points < n have been handled already.
298 * Find the next larger one:
299 */
300 for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
301 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
302 if(n<=q && q<m) {
303 m=q;
304 }
305 }
306
307 /*
308 * Increase delta enough to advance the decoder's
309 * <n,i> state to <m,0>, but guard against overflow:
310 */
311 if(m-n>(0x7fffffff-handledCPCount-delta)/(handledCPCount+1)) {
312 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
313 return 0;
314 }
315 delta+=(m-n)*(handledCPCount+1);
316 n=m;
317
318 /* Encode a sequence of same code points n */
319 for(j=0; j<srcCPCount; ++j) {
320 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
321 if(q<n) {
322 ++delta;
323 } else if(q==n) {
324 /* Represent delta as a generalized variable-length integer: */
325 for(q=delta, k=BASE; /* no condition */; k+=BASE) {
326
327 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
328
329 t=k-bias;
330 if(t<TMIN) {
331 t=TMIN;
332 } else if(t>TMAX) {
333 t=TMAX;
334 }
335 */
336
337 t=k-bias;
338 if(t<TMIN) {
339 t=TMIN;
340 } else if(k>=(bias+TMAX)) {
341 t=TMAX;
342 }
343
344 if(q<t) {
345 break;
346 }
347
348 if(destLength<destCapacity) {
349 dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
350 }
351 ++destLength;
352 q=(q-t)/(BASE-t);
353 }
354
355 if(destLength<destCapacity) {
356 dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
357 }
358 ++destLength;
359 bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
360 delta=0;
361 ++handledCPCount;
362 }
363 }
364
365 ++delta;
366 ++n;
367 }
368
369 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
370 }
371
372 // decode
373 U_CAPI int32_t
u_strFromPunycode(const UChar * src,int32_t srcLength,UChar * dest,int32_t destCapacity,UBool * caseFlags,UErrorCode * pErrorCode)374 u_strFromPunycode(const UChar *src, int32_t srcLength,
375 UChar *dest, int32_t destCapacity,
376 UBool *caseFlags,
377 UErrorCode *pErrorCode) {
378 int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
379 destCPCount, firstSupplementaryIndex, cpLength;
380 UChar b;
381
382 /* argument checking */
383 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
384 return 0;
385 }
386
387 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
388 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
389 return 0;
390 }
391
392 if(srcLength==-1) {
393 srcLength=u_strlen(src);
394 }
395 if (srcLength>DECODE_MAX_CHARS) {
396 *pErrorCode=U_INPUT_TOO_LONG_ERROR;
397 return 0;
398 }
399
400 /*
401 * Handle the basic code points:
402 * Let basicLength be the number of input code points
403 * before the last delimiter, or 0 if there is none,
404 * then copy the first basicLength code points to the output.
405 *
406 * The two following loops iterate backward.
407 */
408 for(j=srcLength; j>0;) {
409 if(src[--j]==DELIMITER) {
410 break;
411 }
412 }
413 destLength=basicLength=destCPCount=j;
414 U_ASSERT(destLength>=0);
415
416 while(j>0) {
417 b=src[--j];
418 if(!IS_BASIC(b)) {
419 *pErrorCode=U_INVALID_CHAR_FOUND;
420 return 0;
421 }
422
423 if(j<destCapacity) {
424 dest[j]=(UChar)b;
425
426 if(caseFlags!=NULL) {
427 caseFlags[j]=IS_BASIC_UPPERCASE(b);
428 }
429 }
430 }
431
432 /* Initialize the state: */
433 n=INITIAL_N;
434 i=0;
435 bias=INITIAL_BIAS;
436 firstSupplementaryIndex=1000000000;
437
438 /*
439 * Main decoding loop:
440 * Start just after the last delimiter if any
441 * basic code points were copied; start at the beginning otherwise.
442 */
443 for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
444 /*
445 * in is the index of the next character to be consumed, and
446 * destCPCount is the number of code points in the output array.
447 *
448 * Decode a generalized variable-length integer into delta,
449 * which gets added to i. The overflow checking is easier
450 * if we increase i as we go, then subtract off its starting
451 * value at the end to obtain delta.
452 */
453 for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
454 if(in>=srcLength) {
455 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
456 return 0;
457 }
458
459 digit=decodeDigit(src[in++]);
460 if(digit<0) {
461 *pErrorCode=U_INVALID_CHAR_FOUND;
462 return 0;
463 }
464 if(digit>(0x7fffffff-i)/w) {
465 /* integer overflow */
466 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
467 return 0;
468 }
469
470 i+=digit*w;
471 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
472 t=k-bias;
473 if(t<TMIN) {
474 t=TMIN;
475 } else if(t>TMAX) {
476 t=TMAX;
477 }
478 */
479 t=k-bias;
480 if(t<TMIN) {
481 t=TMIN;
482 } else if(k>=(bias+TMAX)) {
483 t=TMAX;
484 }
485 if(digit<t) {
486 break;
487 }
488
489 if(w>0x7fffffff/(BASE-t)) {
490 /* integer overflow */
491 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
492 return 0;
493 }
494 w*=BASE-t;
495 }
496
497 /*
498 * Modification from sample code:
499 * Increments destCPCount here,
500 * where needed instead of in for() loop tail.
501 */
502 ++destCPCount;
503 bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
504
505 /*
506 * i was supposed to wrap around from (incremented) destCPCount to 0,
507 * incrementing n each time, so we'll fix that now:
508 */
509 if(i/destCPCount>(0x7fffffff-n)) {
510 /* integer overflow */
511 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
512 return 0;
513 }
514
515 n+=i/destCPCount;
516 i%=destCPCount;
517 /* not needed for Punycode: */
518 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
519
520 if(n>0x10ffff || U_IS_SURROGATE(n)) {
521 /* Unicode code point overflow */
522 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
523 return 0;
524 }
525
526 /* Insert n at position i of the output: */
527 cpLength=U16_LENGTH(n);
528 if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
529 int32_t codeUnitIndex;
530
531 /*
532 * Handle indexes when supplementary code points are present.
533 *
534 * In almost all cases, there will be only BMP code points before i
535 * and even in the entire string.
536 * This is handled with the same efficiency as with UTF-32.
537 *
538 * Only the rare cases with supplementary code points are handled
539 * more slowly - but not too bad since this is an insertion anyway.
540 */
541 if(i<=firstSupplementaryIndex) {
542 codeUnitIndex=i;
543 if(cpLength>1) {
544 firstSupplementaryIndex=codeUnitIndex;
545 } else {
546 ++firstSupplementaryIndex;
547 }
548 } else {
549 codeUnitIndex=firstSupplementaryIndex;
550 U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
551 }
552
553 /* use the UChar index codeUnitIndex instead of the code point index i */
554 if(codeUnitIndex<destLength) {
555 uprv_memmove(dest+codeUnitIndex+cpLength,
556 dest+codeUnitIndex,
557 (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
558 if(caseFlags!=NULL) {
559 uprv_memmove(caseFlags+codeUnitIndex+cpLength,
560 caseFlags+codeUnitIndex,
561 destLength-codeUnitIndex);
562 }
563 }
564 if(cpLength==1) {
565 /* BMP, insert one code unit */
566 dest[codeUnitIndex]=(UChar)n;
567 } else {
568 /* supplementary character, insert two code units */
569 dest[codeUnitIndex]=U16_LEAD(n);
570 dest[codeUnitIndex+1]=U16_TRAIL(n);
571 }
572 if(caseFlags!=NULL) {
573 /* Case of last character determines uppercase flag: */
574 caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
575 if(cpLength==2) {
576 caseFlags[codeUnitIndex+1]=FALSE;
577 }
578 }
579 }
580 destLength+=cpLength;
581 U_ASSERT(destLength>=0);
582 ++i;
583 }
584
585 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
586 }
587
588 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
589
590 #endif /* #if !UCONFIG_NO_IDNA */
591