1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2010-2014, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   file name:  uts46test.cpp
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2010may05
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_IDNA
20 
21 #include <string.h>
22 #include "unicode/bytestream.h"
23 #include "unicode/idna.h"
24 #include "unicode/localpointer.h"
25 #include "unicode/std_string.h"
26 #include "unicode/stringpiece.h"
27 #include "unicode/uidna.h"
28 #include "unicode/unistr.h"
29 #include "charstr.h"
30 #include "cmemory.h"
31 #include "intltest.h"
32 #include "punycode.h"
33 #include "uparse.h"
34 
35 class UTS46Test : public IntlTest {
36 public:
UTS46Test()37     UTS46Test() : trans(NULL), nontrans(NULL) {}
38     virtual ~UTS46Test();
39 
40     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
41     void TestAPI();
42     void TestNotSTD3();
43     void TestInvalidPunycodeDigits();
44     void TestACELabelEdgeCases();
45     void TestTooLong();
46     void TestSomeCases();
47     void IdnaTest();
48 
49     void checkIdnaTestResult(const char *line, const char *type,
50                              const UnicodeString &expected, const UnicodeString &result,
51                              const char *status, const IDNAInfo &info);
52     void idnaTestOneLine(char *fields[][2], UErrorCode &errorCode);
53 
54 private:
55     IDNA *trans, *nontrans;
56 };
57 
createUTS46Test()58 extern IntlTest *createUTS46Test() {
59     return new UTS46Test();
60 }
61 
~UTS46Test()62 UTS46Test::~UTS46Test() {
63     delete trans;
64     delete nontrans;
65 }
66 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)67 void UTS46Test::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
68     if(exec) {
69         logln("TestSuite UTS46Test: ");
70         if(trans==NULL) {
71             IcuTestErrorCode errorCode(*this, "init/createUTS46Instance()");
72             uint32_t commonOptions=
73                 UIDNA_USE_STD3_RULES|UIDNA_CHECK_BIDI|
74                 UIDNA_CHECK_CONTEXTJ|UIDNA_CHECK_CONTEXTO;
75             trans=IDNA::createUTS46Instance(commonOptions, errorCode);
76             nontrans=IDNA::createUTS46Instance(
77                 commonOptions|
78                 UIDNA_NONTRANSITIONAL_TO_ASCII|UIDNA_NONTRANSITIONAL_TO_UNICODE,
79                 errorCode);
80             if(errorCode.errDataIfFailureAndReset("createUTS46Instance()")) {
81                 name="";
82                 return;
83             }
84         }
85     }
86     TESTCASE_AUTO_BEGIN;
87     TESTCASE_AUTO(TestAPI);
88     TESTCASE_AUTO(TestNotSTD3);
89     TESTCASE_AUTO(TestInvalidPunycodeDigits);
90     TESTCASE_AUTO(TestACELabelEdgeCases);
91     TESTCASE_AUTO(TestTooLong);
92     TESTCASE_AUTO(TestSomeCases);
93     TESTCASE_AUTO(IdnaTest);
94     TESTCASE_AUTO_END;
95 }
96 
97 const uint32_t severeErrors=
98     UIDNA_ERROR_LEADING_COMBINING_MARK|
99     UIDNA_ERROR_DISALLOWED|
100     UIDNA_ERROR_PUNYCODE|
101     UIDNA_ERROR_LABEL_HAS_DOT|
102     UIDNA_ERROR_INVALID_ACE_LABEL;
103 
isASCII(const UnicodeString & str)104 static UBool isASCII(const UnicodeString &str) {
105     const UChar *s=str.getBuffer();
106     int32_t length=str.length();
107     for(int32_t i=0; i<length; ++i) {
108         if(s[i]>=0x80) {
109             return FALSE;
110         }
111     }
112     return TRUE;
113 }
114 
115 class TestCheckedArrayByteSink : public CheckedArrayByteSink {
116 public:
TestCheckedArrayByteSink(char * outbuf,int32_t capacity)117     TestCheckedArrayByteSink(char* outbuf, int32_t capacity)
118             : CheckedArrayByteSink(outbuf, capacity), calledFlush(FALSE) {}
Reset()119     virtual CheckedArrayByteSink& Reset() {
120         CheckedArrayByteSink::Reset();
121         calledFlush = FALSE;
122         return *this;
123     }
Flush()124     virtual void Flush() { calledFlush = TRUE; }
125     UBool calledFlush;
126 };
127 
TestAPI()128 void UTS46Test::TestAPI() {
129     UErrorCode errorCode=U_ZERO_ERROR;
130     UnicodeString result;
131     IDNAInfo info;
132     UnicodeString input=UNICODE_STRING_SIMPLE("www.eXample.cOm");
133     UnicodeString expected=UNICODE_STRING_SIMPLE("www.example.com");
134     trans->nameToASCII(input, result, info, errorCode);
135     if(U_FAILURE(errorCode) || info.hasErrors() || result!=expected) {
136         errln("T.nameToASCII(www.example.com) info.errors=%04lx result matches=%d %s",
137               (long)info.getErrors(), result==expected, u_errorName(errorCode));
138     }
139     errorCode=U_USELESS_COLLATOR_ERROR;
140     trans->nameToUnicode(input, result, info, errorCode);
141     if(errorCode!=U_USELESS_COLLATOR_ERROR || !result.isBogus()) {
142         errln("T.nameToUnicode(U_FAILURE) did not preserve the errorCode "
143               "or not result.setToBogus() - %s",
144               u_errorName(errorCode));
145     }
146     errorCode=U_ZERO_ERROR;
147     input.setToBogus();
148     result=UNICODE_STRING_SIMPLE("quatsch");
149     nontrans->labelToASCII(input, result, info, errorCode);
150     if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || !result.isBogus()) {
151         errln("N.labelToASCII(bogus) did not set illegal-argument-error "
152               "or not result.setToBogus() - %s",
153               u_errorName(errorCode));
154     }
155     errorCode=U_ZERO_ERROR;
156     input=UNICODE_STRING_SIMPLE("xn--bcher.de-65a");
157     expected=UNICODE_STRING_SIMPLE("xn--bcher\\uFFFDde-65a").unescape();
158     nontrans->labelToASCII(input, result, info, errorCode);
159     if( U_FAILURE(errorCode) ||
160         info.getErrors()!=(UIDNA_ERROR_LABEL_HAS_DOT|UIDNA_ERROR_INVALID_ACE_LABEL) ||
161         result!=expected
162     ) {
163         errln("N.labelToASCII(label-with-dot) failed with errors %04lx - %s",
164               info.getErrors(), u_errorName(errorCode));
165     }
166     // UTF-8
167     char buffer[100];
168     TestCheckedArrayByteSink sink(buffer, UPRV_LENGTHOF(buffer));
169     errorCode=U_ZERO_ERROR;
170     nontrans->labelToUnicodeUTF8(StringPiece((const char *)NULL, 5), sink, info, errorCode);
171     if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || sink.NumberOfBytesWritten()!=0) {
172         errln("N.labelToUnicodeUTF8(StringPiece(NULL, 5)) did not set illegal-argument-error ",
173               "or did output something - %s",
174               u_errorName(errorCode));
175     }
176 
177     sink.Reset();
178     errorCode=U_ZERO_ERROR;
179     nontrans->nameToASCII_UTF8(StringPiece(), sink, info, errorCode);
180     if(U_FAILURE(errorCode) || sink.NumberOfBytesWritten()!=0 || !sink.calledFlush) {
181         errln("N.nameToASCII_UTF8(empty) failed - %s",
182               u_errorName(errorCode));
183     }
184 
185     static const char s[]={ 0x61, (char)0xc3, (char)0x9f };
186     sink.Reset();
187     errorCode=U_USELESS_COLLATOR_ERROR;
188     nontrans->nameToUnicodeUTF8(StringPiece(s, 3), sink, info, errorCode);
189     if(errorCode!=U_USELESS_COLLATOR_ERROR || sink.NumberOfBytesWritten()!=0) {
190         errln("N.nameToUnicode_UTF8(U_FAILURE) did not preserve the errorCode "
191               "or did output something - %s",
192               u_errorName(errorCode));
193     }
194 
195     sink.Reset();
196     errorCode=U_ZERO_ERROR;
197     trans->labelToUnicodeUTF8(StringPiece(s, 3), sink, info, errorCode);
198     if( U_FAILURE(errorCode) || sink.NumberOfBytesWritten()!=3 ||
199         buffer[0]!=0x61 || buffer[1]!=0x73 || buffer[2]!=0x73 ||
200         !sink.calledFlush
201     ) {
202         errln("T.labelToUnicodeUTF8(a sharp-s) failed - %s",
203               u_errorName(errorCode));
204     }
205 
206     sink.Reset();
207     errorCode=U_ZERO_ERROR;
208     // "eXampLe.cOm"
209     static const char eX[]={ 0x65, 0x58, 0x61, 0x6d, 0x70, 0x4c, 0x65, 0x2e, 0x63, 0x4f, 0x6d, 0 };
210     // "example.com"
211     static const char ex[]={ 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x6f, 0x6d };
212     trans->nameToUnicodeUTF8(eX, sink, info, errorCode);
213     if( U_FAILURE(errorCode) || sink.NumberOfBytesWritten()!=11 ||
214         0!=memcmp(ex, buffer, 11) || !sink.calledFlush
215     ) {
216         errln("T.nameToUnicodeUTF8(eXampLe.cOm) failed - %s",
217               u_errorName(errorCode));
218     }
219 }
220 
TestNotSTD3()221 void UTS46Test::TestNotSTD3() {
222     IcuTestErrorCode errorCode(*this, "TestNotSTD3()");
223     char buffer[400];
224     LocalPointer<IDNA> not3(IDNA::createUTS46Instance(UIDNA_CHECK_BIDI, errorCode));
225     if(errorCode.isFailure()) {
226         return;
227     }
228     UnicodeString input=UNICODE_STRING_SIMPLE("\\u0000A_2+2=4\\u000A.e\\u00DFen.net").unescape();
229     UnicodeString result;
230     IDNAInfo info;
231     if( not3->nameToUnicode(input, result, info, errorCode)!=
232             UNICODE_STRING_SIMPLE("\\u0000a_2+2=4\\u000A.essen.net").unescape() ||
233         info.hasErrors()
234     ) {
235         prettify(result).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
236         errln("notSTD3.nameToUnicode(non-LDH ASCII) unexpected errors %04lx string %s",
237               (long)info.getErrors(), buffer);
238     }
239     // A space (BiDi class WS) is not allowed in a BiDi domain name.
240     input=UNICODE_STRING_SIMPLE("a z.xn--4db.edu");
241     not3->nameToASCII(input, result, info, errorCode);
242     if(result!=input || info.getErrors()!=UIDNA_ERROR_BIDI) {
243         errln("notSTD3.nameToASCII(ASCII-with-space.alef.edu) failed");
244     }
245     // Characters that are canonically equivalent to sequences with non-LDH ASCII.
246     input=UNICODE_STRING_SIMPLE("a\\u2260b\\u226Ec\\u226Fd").unescape();
247     not3->nameToUnicode(input, result, info, errorCode);
248     if(result!=input || info.hasErrors()) {
249         prettify(result).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
250         errln("notSTD3.nameToUnicode(equiv to non-LDH ASCII) unexpected errors %04lx string %s",
251               (long)info.getErrors(), buffer);
252     }
253 }
254 
TestInvalidPunycodeDigits()255 void UTS46Test::TestInvalidPunycodeDigits() {
256     IcuTestErrorCode errorCode(*this, "TestInvalidPunycodeDigits()");
257     LocalPointer<IDNA> idna(IDNA::createUTS46Instance(0, errorCode));
258     if(errorCode.isFailure()) {
259         return;
260     }
261     UnicodeString result;
262     {
263         IDNAInfo info;
264         idna->nameToUnicode(u"xn--pleP", result, info, errorCode);  // P=U+0050
265         assertFalse("nameToUnicode() should succeed",
266                     (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
267         assertEquals("normal result", u"ᔼᔴ", result);
268     }
269     {
270         IDNAInfo info;
271         idna->nameToUnicode(u"xn--pleѐ", result, info, errorCode);  // ends with non-ASCII U+0450
272         assertTrue("nameToUnicode() should detect non-ASCII",
273                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
274     }
275 
276     // Test with ASCII characters adjacent to LDH.
277     {
278         IDNAInfo info;
279         idna->nameToUnicode(u"xn--ple/", result, info, errorCode);
280         assertTrue("nameToUnicode() should detect '/'",
281                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
282     }
283 
284     {
285         IDNAInfo info;
286         idna->nameToUnicode(u"xn--ple:", result, info, errorCode);
287         assertTrue("nameToUnicode() should detect ':'",
288                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
289     }
290 
291     {
292         IDNAInfo info;
293         idna->nameToUnicode(u"xn--ple@", result, info, errorCode);
294         assertTrue("nameToUnicode() should detect '@'",
295                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
296     }
297 
298     {
299         IDNAInfo info;
300         idna->nameToUnicode(u"xn--ple[", result, info, errorCode);
301         assertTrue("nameToUnicode() should detect '['",
302                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
303     }
304 
305     {
306         IDNAInfo info;
307         idna->nameToUnicode(u"xn--ple`", result, info, errorCode);
308         assertTrue("nameToUnicode() should detect '`'",
309                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
310     }
311 
312     {
313         IDNAInfo info;
314         idna->nameToUnicode(u"xn--ple{", result, info, errorCode);
315         assertTrue("nameToUnicode() should detect '{'",
316                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
317     }
318 }
319 
TestACELabelEdgeCases()320 void UTS46Test::TestACELabelEdgeCases() {
321     // In IDNA2008, these labels fail the round-trip validation from comparing
322     // the ToUnicode input with the back-to-ToASCII output.
323     IcuTestErrorCode errorCode(*this, "TestACELabelEdgeCases()");
324     LocalPointer<IDNA> idna(IDNA::createUTS46Instance(0, errorCode));
325     if(errorCode.isFailure()) {
326         return;
327     }
328     UnicodeString result;
329     {
330         IDNAInfo info;
331         idna->labelToUnicode(u"xn--", result, info, errorCode);
332         assertTrue("empty xn--", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
333     }
334     {
335         IDNAInfo info;
336         idna->labelToUnicode(u"xN--ASCII-", result, info, errorCode);
337         assertTrue("nothing but ASCII", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
338     }
339     {
340         // Different error: The Punycode decoding procedure does not consume the last delimiter
341         // if it is right after the xn-- so the main decoding loop fails because the hyphen
342         // is not a valid Punycode digit.
343         IDNAInfo info;
344         idna->labelToUnicode(u"Xn---", result, info, errorCode);
345         assertTrue("empty Xn---", (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
346     }
347 }
348 
TestTooLong()349 void UTS46Test::TestTooLong() {
350     // ICU-13727: Limit input length for n^2 algorithm
351     // where well-formed strings are at most 59 characters long.
352     int32_t count = 50000;
353     UnicodeString s(count, u'a', count);  // capacity, code point, count
354     char16_t dest[60000];
355     UErrorCode errorCode = U_ZERO_ERROR;
356     u_strToPunycode(s.getBuffer(), s.length(), dest, UPRV_LENGTHOF(dest), nullptr, &errorCode);
357     assertEquals("encode: expected an error for too-long input", U_INPUT_TOO_LONG_ERROR, errorCode);
358     errorCode = U_ZERO_ERROR;
359     u_strFromPunycode(s.getBuffer(), s.length(), dest, UPRV_LENGTHOF(dest), nullptr, &errorCode);
360     assertEquals("decode: expected an error for too-long input", U_INPUT_TOO_LONG_ERROR, errorCode);
361 }
362 
// One row of the testCases[] table.
// The member order matters: the table uses positional aggregate initialization.
struct TestCase {
    // Input string.
    const char *s;
    // Options string (Nontransitional/Transitional/Both).
    const char *o;
    // Expected Unicode result string.
    const char *u;
    // Expected set of error bits.
    uint32_t errors;
};
370 
371 static const TestCase testCases[]={
372     { "www.eXample.cOm", "B",  // all ASCII
373       "www.example.com", 0 },
374     { "B\\u00FCcher.de", "B",  // u-umlaut
375       "b\\u00FCcher.de", 0 },
376     { "\\u00D6BB", "B",  // O-umlaut
377       "\\u00F6bb", 0 },
378     { "fa\\u00DF.de", "N",  // sharp s
379       "fa\\u00DF.de", 0 },
380     { "fa\\u00DF.de", "T",  // sharp s
381       "fass.de", 0 },
382     { "XN--fA-hia.dE", "B",  // sharp s in Punycode
383       "fa\\u00DF.de", 0 },
384     { "\\u03B2\\u03CC\\u03BB\\u03BF\\u03C2.com", "N",  // Greek with final sigma
385       "\\u03B2\\u03CC\\u03BB\\u03BF\\u03C2.com", 0 },
386     { "\\u03B2\\u03CC\\u03BB\\u03BF\\u03C2.com", "T",  // Greek with final sigma
387       "\\u03B2\\u03CC\\u03BB\\u03BF\\u03C3.com", 0 },
388     { "xn--nxasmm1c", "B",  // Greek with final sigma in Punycode
389       "\\u03B2\\u03CC\\u03BB\\u03BF\\u03C2", 0 },
390     { "www.\\u0DC1\\u0DCA\\u200D\\u0DBB\\u0DD3.com", "N",  // "Sri" in "Sri Lanka" has a ZWJ
391       "www.\\u0DC1\\u0DCA\\u200D\\u0DBB\\u0DD3.com", 0 },
392     { "www.\\u0DC1\\u0DCA\\u200D\\u0DBB\\u0DD3.com", "T",  // "Sri" in "Sri Lanka" has a ZWJ
393       "www.\\u0DC1\\u0DCA\\u0DBB\\u0DD3.com", 0 },
394     { "www.xn--10cl1a0b660p.com", "B",  // "Sri" in Punycode
395       "www.\\u0DC1\\u0DCA\\u200D\\u0DBB\\u0DD3.com", 0 },
396     { "\\u0646\\u0627\\u0645\\u0647\\u200C\\u0627\\u06CC", "N",  // ZWNJ
397       "\\u0646\\u0627\\u0645\\u0647\\u200C\\u0627\\u06CC", 0 },
398     { "\\u0646\\u0627\\u0645\\u0647\\u200C\\u0627\\u06CC", "T",  // ZWNJ
399       "\\u0646\\u0627\\u0645\\u0647\\u0627\\u06CC", 0 },
400     { "xn--mgba3gch31f060k.com", "B",  // ZWNJ in Punycode
401       "\\u0646\\u0627\\u0645\\u0647\\u200C\\u0627\\u06CC.com", 0 },
402     { "a.b\\uFF0Ec\\u3002d\\uFF61", "B",
403       "a.b.c.d.", 0 },
404     { "U\\u0308.xn--tda", "B",  // U+umlaut.u-umlaut
405       "\\u00FC.\\u00FC", 0 },
406     { "xn--u-ccb", "B",  // u+umlaut in Punycode
407       "xn--u-ccb\\uFFFD", UIDNA_ERROR_INVALID_ACE_LABEL },
408     { "a\\u2488com", "B",  // contains 1-dot
409       "a\\uFFFDcom", UIDNA_ERROR_DISALLOWED },
410     { "xn--a-ecp.ru", "B",  // contains 1-dot in Punycode
411       "xn--a-ecp\\uFFFD.ru", UIDNA_ERROR_INVALID_ACE_LABEL },
412     { "xn--0.pt", "B",  // invalid Punycode
413       "xn--0\\uFFFD.pt", UIDNA_ERROR_PUNYCODE },
414     { "xn--a.pt", "B",  // U+0080
415       "xn--a\\uFFFD.pt", UIDNA_ERROR_INVALID_ACE_LABEL },
416     { "xn--a-\\u00C4.pt", "B",  // invalid Punycode
417       "xn--a-\\u00E4.pt", UIDNA_ERROR_PUNYCODE },
418     { "\\u65E5\\u672C\\u8A9E\\u3002\\uFF2A\\uFF30", "B",  // Japanese with fullwidth ".jp"
419       "\\u65E5\\u672C\\u8A9E.jp", 0 },
420     { "\\u2615", "B", "\\u2615", 0 },  // Unicode 4.0 HOT BEVERAGE
421     // some characters are disallowed because they are canonically equivalent
422     // to sequences with non-LDH ASCII
423     { "a\\u2260b\\u226Ec\\u226Fd", "B",
424       "a\\uFFFDb\\uFFFDc\\uFFFDd", UIDNA_ERROR_DISALLOWED },
425     // many deviation characters, test the special mapping code
426     { "1.a\\u00DF\\u200C\\u200Db\\u200C\\u200Dc\\u00DF\\u00DF\\u00DF\\u00DFd"
427       "\\u03C2\\u03C3\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFe"
428       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFx"
429       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFy"
430       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u0302\\u00DFz", "N",
431       "1.a\\u00DF\\u200C\\u200Db\\u200C\\u200Dc\\u00DF\\u00DF\\u00DF\\u00DFd"
432       "\\u03C2\\u03C3\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFe"
433       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFx"
434       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFy"
435       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u0302\\u00DFz",
436       UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_CONTEXTJ },
437     { "1.a\\u00DF\\u200C\\u200Db\\u200C\\u200Dc\\u00DF\\u00DF\\u00DF\\u00DFd"
438       "\\u03C2\\u03C3\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFe"
439       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFx"
440       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFy"
441       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u0302\\u00DFz", "T",
442       "1.assbcssssssssd"
443       "\\u03C3\\u03C3sssssssssssssssse"
444       "ssssssssssssssssssssx"
445       "ssssssssssssssssssssy"
446       "sssssssssssssss\\u015Dssz", UIDNA_ERROR_LABEL_TOO_LONG },
447     // "xn--bss" with deviation characters
448     { "\\u200Cx\\u200Dn\\u200C-\\u200D-b\\u00DF", "N",
449       "\\u200Cx\\u200Dn\\u200C-\\u200D-b\\u00DF", UIDNA_ERROR_CONTEXTJ },
450     { "\\u200Cx\\u200Dn\\u200C-\\u200D-b\\u00DF", "T",
451       "\\u5919", 0 },
452     // "xn--bssffl" written as:
453     // 02E3 MODIFIER LETTER SMALL X
454     // 034F COMBINING GRAPHEME JOINER (ignored)
455     // 2115 DOUBLE-STRUCK CAPITAL N
456     // 200B ZERO WIDTH SPACE (ignored)
457     // FE63 SMALL HYPHEN-MINUS
458     // 00AD SOFT HYPHEN (ignored)
459     // FF0D FULLWIDTH HYPHEN-MINUS
460     // 180C MONGOLIAN FREE VARIATION SELECTOR TWO (ignored)
461     // 212C SCRIPT CAPITAL B
462     // FE00 VARIATION SELECTOR-1 (ignored)
463     // 017F LATIN SMALL LETTER LONG S
464     // 2064 INVISIBLE PLUS (ignored)
465     // 1D530 MATHEMATICAL FRAKTUR SMALL S
466     // E01EF VARIATION SELECTOR-256 (ignored)
467     // FB04 LATIN SMALL LIGATURE FFL
468     { "\\u02E3\\u034F\\u2115\\u200B\\uFE63\\u00AD\\uFF0D\\u180C"
469       "\\u212C\\uFE00\\u017F\\u2064\\U0001D530\\U000E01EF\\uFB04", "B",
470       "\\u5921\\u591E\\u591C\\u5919", 0 },
471     { "123456789012345678901234567890123456789012345678901234567890123."
472       "123456789012345678901234567890123456789012345678901234567890123."
473       "123456789012345678901234567890123456789012345678901234567890123."
474       "1234567890123456789012345678901234567890123456789012345678901", "B",
475       "123456789012345678901234567890123456789012345678901234567890123."
476       "123456789012345678901234567890123456789012345678901234567890123."
477       "123456789012345678901234567890123456789012345678901234567890123."
478       "1234567890123456789012345678901234567890123456789012345678901", 0 },
479     { "123456789012345678901234567890123456789012345678901234567890123."
480       "123456789012345678901234567890123456789012345678901234567890123."
481       "123456789012345678901234567890123456789012345678901234567890123."
482       "1234567890123456789012345678901234567890123456789012345678901.", "B",
483       "123456789012345678901234567890123456789012345678901234567890123."
484       "123456789012345678901234567890123456789012345678901234567890123."
485       "123456789012345678901234567890123456789012345678901234567890123."
486       "1234567890123456789012345678901234567890123456789012345678901.", 0 },
487     // Domain name >256 characters, forces slow path in UTF-8 processing.
488     { "123456789012345678901234567890123456789012345678901234567890123."
489       "123456789012345678901234567890123456789012345678901234567890123."
490       "123456789012345678901234567890123456789012345678901234567890123."
491       "123456789012345678901234567890123456789012345678901234567890123."
492       "12345678901234567890123456789012345678901234567890123456789012", "B",
493       "123456789012345678901234567890123456789012345678901234567890123."
494       "123456789012345678901234567890123456789012345678901234567890123."
495       "123456789012345678901234567890123456789012345678901234567890123."
496       "123456789012345678901234567890123456789012345678901234567890123."
497       "12345678901234567890123456789012345678901234567890123456789012",
498       UIDNA_ERROR_DOMAIN_NAME_TOO_LONG },
499     { "123456789012345678901234567890123456789012345678901234567890123."
500       "123456789012345678901234567890123456789012345678901234567890123."
501       "123456789012345678901234567890123456789012345678901234567890123."
502       "123456789012345678901234567890123456789012345678901234567890123."
503       "1234567890123456789012345678901234567890123456789\\u05D0", "B",
504       "123456789012345678901234567890123456789012345678901234567890123."
505       "123456789012345678901234567890123456789012345678901234567890123."
506       "123456789012345678901234567890123456789012345678901234567890123."
507       "123456789012345678901234567890123456789012345678901234567890123."
508       "1234567890123456789012345678901234567890123456789\\u05D0",
509       UIDNA_ERROR_DOMAIN_NAME_TOO_LONG|UIDNA_ERROR_BIDI },
510     { "123456789012345678901234567890123456789012345678901234567890123."
511       "1234567890123456789012345678901234567890123456789012345678901234."
512       "123456789012345678901234567890123456789012345678901234567890123."
513       "123456789012345678901234567890123456789012345678901234567890", "B",
514       "123456789012345678901234567890123456789012345678901234567890123."
515       "1234567890123456789012345678901234567890123456789012345678901234."
516       "123456789012345678901234567890123456789012345678901234567890123."
517       "123456789012345678901234567890123456789012345678901234567890",
518       UIDNA_ERROR_LABEL_TOO_LONG },
519     { "123456789012345678901234567890123456789012345678901234567890123."
520       "1234567890123456789012345678901234567890123456789012345678901234."
521       "123456789012345678901234567890123456789012345678901234567890123."
522       "123456789012345678901234567890123456789012345678901234567890.", "B",
523       "123456789012345678901234567890123456789012345678901234567890123."
524       "1234567890123456789012345678901234567890123456789012345678901234."
525       "123456789012345678901234567890123456789012345678901234567890123."
526       "123456789012345678901234567890123456789012345678901234567890.",
527       UIDNA_ERROR_LABEL_TOO_LONG },
528     { "123456789012345678901234567890123456789012345678901234567890123."
529       "1234567890123456789012345678901234567890123456789012345678901234."
530       "123456789012345678901234567890123456789012345678901234567890123."
531       "1234567890123456789012345678901234567890123456789012345678901", "B",
532       "123456789012345678901234567890123456789012345678901234567890123."
533       "1234567890123456789012345678901234567890123456789012345678901234."
534       "123456789012345678901234567890123456789012345678901234567890123."
535       "1234567890123456789012345678901234567890123456789012345678901",
536       UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG },
537     // label length 63: xn--1234567890123456789012345678901234567890123456789012345-9te
538     { "\\u00E41234567890123456789012345678901234567890123456789012345", "B",
539       "\\u00E41234567890123456789012345678901234567890123456789012345", 0 },
540     { "1234567890\\u00E41234567890123456789012345678901234567890123456", "B",
541       "1234567890\\u00E41234567890123456789012345678901234567890123456", UIDNA_ERROR_LABEL_TOO_LONG },
542     { "123456789012345678901234567890123456789012345678901234567890123."
543       "1234567890\\u00E4123456789012345678901234567890123456789012345."
544       "123456789012345678901234567890123456789012345678901234567890123."
545       "1234567890123456789012345678901234567890123456789012345678901", "B",
546       "123456789012345678901234567890123456789012345678901234567890123."
547       "1234567890\\u00E4123456789012345678901234567890123456789012345."
548       "123456789012345678901234567890123456789012345678901234567890123."
549       "1234567890123456789012345678901234567890123456789012345678901", 0 },
550     { "123456789012345678901234567890123456789012345678901234567890123."
551       "1234567890\\u00E4123456789012345678901234567890123456789012345."
552       "123456789012345678901234567890123456789012345678901234567890123."
553       "1234567890123456789012345678901234567890123456789012345678901.", "B",
554       "123456789012345678901234567890123456789012345678901234567890123."
555       "1234567890\\u00E4123456789012345678901234567890123456789012345."
556       "123456789012345678901234567890123456789012345678901234567890123."
557       "1234567890123456789012345678901234567890123456789012345678901.", 0 },
558     { "123456789012345678901234567890123456789012345678901234567890123."
559       "1234567890\\u00E4123456789012345678901234567890123456789012345."
560       "123456789012345678901234567890123456789012345678901234567890123."
561       "12345678901234567890123456789012345678901234567890123456789012", "B",
562       "123456789012345678901234567890123456789012345678901234567890123."
563       "1234567890\\u00E4123456789012345678901234567890123456789012345."
564       "123456789012345678901234567890123456789012345678901234567890123."
565       "12345678901234567890123456789012345678901234567890123456789012",
566       UIDNA_ERROR_DOMAIN_NAME_TOO_LONG },
567     { "123456789012345678901234567890123456789012345678901234567890123."
568       "1234567890\\u00E41234567890123456789012345678901234567890123456."
569       "123456789012345678901234567890123456789012345678901234567890123."
570       "123456789012345678901234567890123456789012345678901234567890", "B",
571       "123456789012345678901234567890123456789012345678901234567890123."
572       "1234567890\\u00E41234567890123456789012345678901234567890123456."
573       "123456789012345678901234567890123456789012345678901234567890123."
574       "123456789012345678901234567890123456789012345678901234567890",
575       UIDNA_ERROR_LABEL_TOO_LONG },
576     { "123456789012345678901234567890123456789012345678901234567890123."
577       "1234567890\\u00E41234567890123456789012345678901234567890123456."
578       "123456789012345678901234567890123456789012345678901234567890123."
579       "123456789012345678901234567890123456789012345678901234567890.", "B",
580       "123456789012345678901234567890123456789012345678901234567890123."
581       "1234567890\\u00E41234567890123456789012345678901234567890123456."
582       "123456789012345678901234567890123456789012345678901234567890123."
583       "123456789012345678901234567890123456789012345678901234567890.",
584       UIDNA_ERROR_LABEL_TOO_LONG },
585     { "123456789012345678901234567890123456789012345678901234567890123."
586       "1234567890\\u00E41234567890123456789012345678901234567890123456."
587       "123456789012345678901234567890123456789012345678901234567890123."
588       "1234567890123456789012345678901234567890123456789012345678901", "B",
589       "123456789012345678901234567890123456789012345678901234567890123."
590       "1234567890\\u00E41234567890123456789012345678901234567890123456."
591       "123456789012345678901234567890123456789012345678901234567890123."
592       "1234567890123456789012345678901234567890123456789012345678901",
593       UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG },
594     // hyphen errors and empty-label errors
595     // Ticket #10883: ToUnicode also checks for empty labels.
596     { ".", "B", ".", UIDNA_ERROR_EMPTY_LABEL },
597     { "\\uFF0E", "B", ".", UIDNA_ERROR_EMPTY_LABEL },
598     // "xn---q----jra"=="-q--a-umlaut-"
599     { "a.b..-q--a-.e", "B", "a.b..-q--a-.e",
600       UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|
601       UIDNA_ERROR_HYPHEN_3_4 },
602     { "a.b..-q--\\u00E4-.e", "B", "a.b..-q--\\u00E4-.e",
603       UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|
604       UIDNA_ERROR_HYPHEN_3_4 },
605     { "a.b..xn---q----jra.e", "B", "a.b..-q--\\u00E4-.e",
606       UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|
607       UIDNA_ERROR_HYPHEN_3_4 },
608     { "a..c", "B", "a..c", UIDNA_ERROR_EMPTY_LABEL },
609     { "a.xn--.c", "B", "a.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL },
610     { "a.-b.", "B", "a.-b.", UIDNA_ERROR_LEADING_HYPHEN },
611     { "a.b-.c", "B", "a.b-.c", UIDNA_ERROR_TRAILING_HYPHEN },
612     { "a.-.c", "B", "a.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN },
613     { "a.bc--de.f", "B", "a.bc--de.f", UIDNA_ERROR_HYPHEN_3_4 },
614     { "\\u00E4.\\u00AD.c", "B", "\\u00E4..c", UIDNA_ERROR_EMPTY_LABEL },
615     { "\\u00E4.xn--.c", "B", "\\u00E4.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL },
616     { "\\u00E4.-b.", "B", "\\u00E4.-b.", UIDNA_ERROR_LEADING_HYPHEN },
617     { "\\u00E4.b-.c", "B", "\\u00E4.b-.c", UIDNA_ERROR_TRAILING_HYPHEN },
618     { "\\u00E4.-.c", "B", "\\u00E4.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN },
619     { "\\u00E4.bc--de.f", "B", "\\u00E4.bc--de.f", UIDNA_ERROR_HYPHEN_3_4 },
620     { "a.b.\\u0308c.d", "B", "a.b.\\uFFFDc.d", UIDNA_ERROR_LEADING_COMBINING_MARK },
621     { "a.b.xn--c-bcb.d", "B",
622       "a.b.xn--c-bcb\\uFFFD.d", UIDNA_ERROR_LEADING_COMBINING_MARK|UIDNA_ERROR_INVALID_ACE_LABEL },
623     // BiDi
624     { "A0", "B", "a0", 0 },
625     { "0A", "B", "0a", 0 },  // all-LTR is ok to start with a digit (EN)
626     { "0A.\\u05D0", "B",  // ASCII label does not start with L/R/AL
627       "0a.\\u05D0", UIDNA_ERROR_BIDI },
628     { "c.xn--0-eha.xn--4db", "B",  // 2nd label does not start with L/R/AL
629       "c.0\\u00FC.\\u05D0", UIDNA_ERROR_BIDI },
630     { "b-.\\u05D0", "B",  // label does not end with L/EN
631       "b-.\\u05D0", UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI },
632     { "d.xn----dha.xn--4db", "B",  // 2nd label does not end with L/EN
633       "d.\\u00FC-.\\u05D0", UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI },
634     { "a\\u05D0", "B", "a\\u05D0", UIDNA_ERROR_BIDI },  // first dir != last dir
635     { "\\u05D0\\u05C7", "B", "\\u05D0\\u05C7", 0 },
636     { "\\u05D09\\u05C7", "B", "\\u05D09\\u05C7", 0 },
637     { "\\u05D0a\\u05C7", "B", "\\u05D0a\\u05C7", UIDNA_ERROR_BIDI },  // first dir != last dir
638     { "\\u05D0\\u05EA", "B", "\\u05D0\\u05EA", 0 },
639     { "\\u05D0\\u05F3\\u05EA", "B", "\\u05D0\\u05F3\\u05EA", 0 },
640     { "a\\u05D0Tz", "B", "a\\u05D0tz", UIDNA_ERROR_BIDI },  // mixed dir
641     { "\\u05D0T\\u05EA", "B", "\\u05D0t\\u05EA", UIDNA_ERROR_BIDI },  // mixed dir
642     { "\\u05D07\\u05EA", "B", "\\u05D07\\u05EA", 0 },
643     { "\\u05D0\\u0667\\u05EA", "B", "\\u05D0\\u0667\\u05EA", 0 },  // Arabic 7 in the middle
644     { "a7\\u0667z", "B", "a7\\u0667z", UIDNA_ERROR_BIDI },  // AN digit in LTR
645     { "a7\\u0667", "B", "a7\\u0667", UIDNA_ERROR_BIDI },  // AN digit in LTR
646     { "\\u05D07\\u0667\\u05EA", "B",  // mixed EN/AN digits in RTL
647       "\\u05D07\\u0667\\u05EA", UIDNA_ERROR_BIDI },
648     { "\\u05D07\\u0667", "B",  // mixed EN/AN digits in RTL
649       "\\u05D07\\u0667", UIDNA_ERROR_BIDI },
650     // ZWJ
651     { "\\u0BB9\\u0BCD\\u200D", "N", "\\u0BB9\\u0BCD\\u200D", 0 },  // Virama+ZWJ
652     { "\\u0BB9\\u200D", "N", "\\u0BB9\\u200D", UIDNA_ERROR_CONTEXTJ },  // no Virama
653     { "\\u200D", "N", "\\u200D", UIDNA_ERROR_CONTEXTJ },  // no Virama
654     // ZWNJ
655     { "\\u0BB9\\u0BCD\\u200C", "N", "\\u0BB9\\u0BCD\\u200C", 0 },  // Virama+ZWNJ
656     { "\\u0BB9\\u200C", "N", "\\u0BB9\\u200C", UIDNA_ERROR_CONTEXTJ },  // no Virama
657     { "\\u200C", "N", "\\u200C", UIDNA_ERROR_CONTEXTJ },  // no Virama
658     { "\\u0644\\u0670\\u200C\\u06ED\\u06EF", "N",  // Joining types D T ZWNJ T R
659       "\\u0644\\u0670\\u200C\\u06ED\\u06EF", 0 },
660     { "\\u0644\\u0670\\u200C\\u06EF", "N",  // D T ZWNJ R
661       "\\u0644\\u0670\\u200C\\u06EF", 0 },
662     { "\\u0644\\u200C\\u06ED\\u06EF", "N",  // D ZWNJ T R
663       "\\u0644\\u200C\\u06ED\\u06EF", 0 },
664     { "\\u0644\\u200C\\u06EF", "N",  // D ZWNJ R
665       "\\u0644\\u200C\\u06EF", 0 },
666     { "\\u0644\\u0670\\u200C\\u06ED", "N",  // D T ZWNJ T
667       "\\u0644\\u0670\\u200C\\u06ED", UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ },
668     { "\\u06EF\\u200C\\u06EF", "N",  // R ZWNJ R
669       "\\u06EF\\u200C\\u06EF", UIDNA_ERROR_CONTEXTJ },
670     { "\\u0644\\u200C", "N",  // D ZWNJ
671       "\\u0644\\u200C", UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ },
672     { "\\u0660\\u0661", "B",  // Arabic-Indic Digits alone
673       "\\u0660\\u0661", UIDNA_ERROR_BIDI },
674     { "\\u06F0\\u06F1", "B",  // Extended Arabic-Indic Digits alone
675       "\\u06F0\\u06F1", 0 },
676     { "\\u0660\\u06F1", "B",  // Mixed Arabic-Indic Digits
677       "\\u0660\\u06F1", UIDNA_ERROR_CONTEXTO_DIGITS|UIDNA_ERROR_BIDI },
678     // All of the CONTEXTO "Would otherwise have been DISALLOWED" characters
679     // in their correct contexts,
680     // then each in incorrect context.
681     { "l\\u00B7l\\u4E00\\u0375\\u03B1\\u05D0\\u05F3\\u05F4\\u30FB", "B",
682       "l\\u00B7l\\u4E00\\u0375\\u03B1\\u05D0\\u05F3\\u05F4\\u30FB", UIDNA_ERROR_BIDI },
683     { "l\\u00B7", "B",
684       "l\\u00B7", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
685     { "\\u00B7l", "B",
686       "\\u00B7l", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
687     { "\\u0375", "B",
688       "\\u0375", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
689     { "\\u03B1\\u05F3", "B",
690       "\\u03B1\\u05F3", UIDNA_ERROR_CONTEXTO_PUNCTUATION|UIDNA_ERROR_BIDI },
691     { "\\u05F4", "B",
692       "\\u05F4", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
693     { "l\\u30FB", "B",
694       "l\\u30FB", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
695     // Ticket #8137: UTS #46 toUnicode() fails with non-ASCII labels that turn
696     // into 15 characters (UChars).
697     // The bug was in u_strFromPunycode() which did not write the last character
698     // if it just so fit into the end of the destination buffer.
699     // The UTS #46 code gives a default-capacity UnicodeString as the destination buffer,
700     // and the internal UnicodeString capacity is currently 15 UChars on 64-bit machines
701     // but 13 on 32-bit machines.
702     // Label with 15 UChars, for 64-bit-machine testing:
703     { "aaaaaaaaaaaaa\\u00FCa.de", "B", "aaaaaaaaaaaaa\\u00FCa.de", 0 },
704     { "xn--aaaaaaaaaaaaaa-ssb.de", "B", "aaaaaaaaaaaaa\\u00FCa.de", 0 },
705     { "abschlu\\u00DFpr\\u00FCfung.de", "N", "abschlu\\u00DFpr\\u00FCfung.de", 0 },
706     { "xn--abschluprfung-hdb15b.de", "B", "abschlu\\u00DFpr\\u00FCfung.de", 0 },
707     // Label with 13 UChars, for 32-bit-machine testing:
708     { "xn--aaaaaaaaaaaa-nlb.de", "B", "aaaaaaaaaaa\\u00FCa.de", 0 },
709     { "xn--schluprfung-z6a39a.de", "B", "schlu\\u00DFpr\\u00FCfung.de", 0 },
710     // { "", "B",
711     //   "", 0 },
712 };
713 
TestSomeCases()714 void UTS46Test::TestSomeCases() {
715     IcuTestErrorCode errorCode(*this, "TestSomeCases");
716     char buffer[400], buffer2[400];
717     int32_t i;
718     for(i=0; i<UPRV_LENGTHOF(testCases); ++i) {
719         const TestCase &testCase=testCases[i];
720         UnicodeString input(ctou(testCase.s));
721         UnicodeString expected(ctou(testCase.u));
722         // ToASCII/ToUnicode, transitional/nontransitional
723         UnicodeString aT, uT, aN, uN;
724         IDNAInfo aTInfo, uTInfo, aNInfo, uNInfo;
725         trans->nameToASCII(input, aT, aTInfo, errorCode);
726         trans->nameToUnicode(input, uT, uTInfo, errorCode);
727         nontrans->nameToASCII(input, aN, aNInfo, errorCode);
728         nontrans->nameToUnicode(input, uN, uNInfo, errorCode);
729         if(errorCode.errIfFailureAndReset("first-level processing [%d/%s] %s",
730                                           (int)i, testCase.o, testCase.s)
731         ) {
732             continue;
733         }
734         // ToUnicode does not set length-overflow errors.
735         uint32_t uniErrors=testCase.errors&~
736             (UIDNA_ERROR_LABEL_TOO_LONG|
737              UIDNA_ERROR_DOMAIN_NAME_TOO_LONG);
738         char mode=testCase.o[0];
739         if(mode=='B' || mode=='N') {
740             if(uNInfo.getErrors()!=uniErrors) {
741                 errln("N.nameToUnicode([%d] %s) unexpected errors %04lx",
742                       (int)i, testCase.s, (long)uNInfo.getErrors());
743                 continue;
744             }
745             if(uN!=expected) {
746                 prettify(uN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
747                 errln("N.nameToUnicode([%d] %s) unexpected string %s",
748                       (int)i, testCase.s, buffer);
749                 continue;
750             }
751             if(aNInfo.getErrors()!=testCase.errors) {
752                 errln("N.nameToASCII([%d] %s) unexpected errors %04lx",
753                       (int)i, testCase.s, (long)aNInfo.getErrors());
754                 continue;
755             }
756         }
757         if(mode=='B' || mode=='T') {
758             if(uTInfo.getErrors()!=uniErrors) {
759                 errln("T.nameToUnicode([%d] %s) unexpected errors %04lx",
760                       (int)i, testCase.s, (long)uTInfo.getErrors());
761                 continue;
762             }
763             if(uT!=expected) {
764                 prettify(uT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
765                 errln("T.nameToUnicode([%d] %s) unexpected string %s",
766                       (int)i, testCase.s, buffer);
767                 continue;
768             }
769             if(aTInfo.getErrors()!=testCase.errors) {
770                 errln("T.nameToASCII([%d] %s) unexpected errors %04lx",
771                       (int)i, testCase.s, (long)aTInfo.getErrors());
772                 continue;
773             }
774         }
775         // ToASCII is all-ASCII if no severe errors
776         if((aNInfo.getErrors()&severeErrors)==0 && !isASCII(aN)) {
777             prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
778             errln("N.nameToASCII([%d] %s) (errors %04lx) result is not ASCII %s",
779                   (int)i, testCase.s, aNInfo.getErrors(), buffer);
780             continue;
781         }
782         if((aTInfo.getErrors()&severeErrors)==0 && !isASCII(aT)) {
783             prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
784             errln("T.nameToASCII([%d] %s) (errors %04lx) result is not ASCII %s",
785                   (int)i, testCase.s, aTInfo.getErrors(), buffer);
786             continue;
787         }
788         if(verbose) {
789             char m= mode=='B' ? mode : 'N';
790             prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
791             logln("%c.nameToASCII([%d] %s) (errors %04lx) result string: %s",
792                   m, (int)i, testCase.s, aNInfo.getErrors(), buffer);
793             if(mode!='B') {
794                 prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
795                 logln("T.nameToASCII([%d] %s) (errors %04lx) result string: %s",
796                       (int)i, testCase.s, aTInfo.getErrors(), buffer);
797             }
798         }
799         // second-level processing
800         UnicodeString aTuN, uTaN, aNuN, uNaN;
801         IDNAInfo aTuNInfo, uTaNInfo, aNuNInfo, uNaNInfo;
802         nontrans->nameToUnicode(aT, aTuN, aTuNInfo, errorCode);
803         nontrans->nameToASCII(uT, uTaN, uTaNInfo, errorCode);
804         nontrans->nameToUnicode(aN, aNuN, aNuNInfo, errorCode);
805         nontrans->nameToASCII(uN, uNaN, uNaNInfo, errorCode);
806         if(errorCode.errIfFailureAndReset("second-level processing [%d/%s] %s",
807                                           (int)i, testCase.o, testCase.s)
808         ) {
809             continue;
810         }
811         if(aN!=uNaN) {
812             prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
813             prettify(uNaN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
814             errln("N.nameToASCII([%d] %s)!=N.nameToUnicode().N.nameToASCII() "
815                   "(errors %04lx) %s vs. %s",
816                   (int)i, testCase.s, aNInfo.getErrors(), buffer, buffer2);
817             continue;
818         }
819         if(aT!=uTaN) {
820             prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
821             prettify(uTaN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
822             errln("T.nameToASCII([%d] %s)!=T.nameToUnicode().N.nameToASCII() "
823                   "(errors %04lx) %s vs. %s",
824                   (int)i, testCase.s, aNInfo.getErrors(), buffer, buffer2);
825             continue;
826         }
827         if(uN!=aNuN) {
828             prettify(uN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
829             prettify(aNuN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
830             errln("N.nameToUnicode([%d] %s)!=N.nameToASCII().N.nameToUnicode() "
831                   "(errors %04lx) %s vs. %s",
832                   (int)i, testCase.s, uNInfo.getErrors(), buffer, buffer2);
833             continue;
834         }
835         if(uT!=aTuN) {
836             prettify(uT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
837             prettify(aTuN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
838             errln("T.nameToUnicode([%d] %s)!=T.nameToASCII().N.nameToUnicode() "
839                   "(errors %04lx) %s vs. %s",
840                   (int)i, testCase.s, uNInfo.getErrors(), buffer, buffer2);
841             continue;
842         }
843         // labelToUnicode
844         UnicodeString aTL, uTL, aNL, uNL;
845         IDNAInfo aTLInfo, uTLInfo, aNLInfo, uNLInfo;
846         trans->labelToASCII(input, aTL, aTLInfo, errorCode);
847         trans->labelToUnicode(input, uTL, uTLInfo, errorCode);
848         nontrans->labelToASCII(input, aNL, aNLInfo, errorCode);
849         nontrans->labelToUnicode(input, uNL, uNLInfo, errorCode);
850         if(errorCode.errIfFailureAndReset("labelToXYZ processing [%d/%s] %s",
851                                           (int)i, testCase.o, testCase.s)
852         ) {
853             continue;
854         }
855         if(aN.indexOf((UChar)0x2e)<0) {
856             if(aN!=aNL || aNInfo.getErrors()!=aNLInfo.getErrors()) {
857                 prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
858                 prettify(aNL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
859                 errln("N.nameToASCII([%d] %s)!=N.labelToASCII() "
860                       "(errors %04lx vs %04lx) %s vs. %s",
861                       (int)i, testCase.s, aNInfo.getErrors(), aNLInfo.getErrors(), buffer, buffer2);
862                 continue;
863             }
864         } else {
865             if((aNLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) {
866                 errln("N.labelToASCII([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT",
867                       (int)i, testCase.s, (long)aNLInfo.getErrors());
868                 continue;
869             }
870         }
871         if(aT.indexOf((UChar)0x2e)<0) {
872             if(aT!=aTL || aTInfo.getErrors()!=aTLInfo.getErrors()) {
873                 prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
874                 prettify(aTL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
875                 errln("T.nameToASCII([%d] %s)!=T.labelToASCII() "
876                       "(errors %04lx vs %04lx) %s vs. %s",
877                       (int)i, testCase.s, aTInfo.getErrors(), aTLInfo.getErrors(), buffer, buffer2);
878                 continue;
879             }
880         } else {
881             if((aTLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) {
882                 errln("T.labelToASCII([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT",
883                       (int)i, testCase.s, (long)aTLInfo.getErrors());
884                 continue;
885             }
886         }
887         if(uN.indexOf((UChar)0x2e)<0) {
888             if(uN!=uNL || uNInfo.getErrors()!=uNLInfo.getErrors()) {
889                 prettify(uN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
890                 prettify(uNL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
891                 errln("N.nameToUnicode([%d] %s)!=N.labelToUnicode() "
892                       "(errors %04lx vs %04lx) %s vs. %s",
893                       (int)i, testCase.s, uNInfo.getErrors(), uNLInfo.getErrors(), buffer, buffer2);
894                 continue;
895             }
896         } else {
897             if((uNLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) {
898                 errln("N.labelToUnicode([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT",
899                       (int)i, testCase.s, (long)uNLInfo.getErrors());
900                 continue;
901             }
902         }
903         if(uT.indexOf((UChar)0x2e)<0) {
904             if(uT!=uTL || uTInfo.getErrors()!=uTLInfo.getErrors()) {
905                 prettify(uT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
906                 prettify(uTL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
907                 errln("T.nameToUnicode([%d] %s)!=T.labelToUnicode() "
908                       "(errors %04lx vs %04lx) %s vs. %s",
909                       (int)i, testCase.s, uTInfo.getErrors(), uTLInfo.getErrors(), buffer, buffer2);
910                 continue;
911             }
912         } else {
913             if((uTLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) {
914                 errln("T.labelToUnicode([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT",
915                       (int)i, testCase.s, (long)uTLInfo.getErrors());
916                 continue;
917             }
918         }
919         // Differences between transitional and nontransitional processing
920         if(mode=='B') {
921             if( aNInfo.isTransitionalDifferent() ||
922                 aTInfo.isTransitionalDifferent() ||
923                 uNInfo.isTransitionalDifferent() ||
924                 uTInfo.isTransitionalDifferent() ||
925                 aNLInfo.isTransitionalDifferent() ||
926                 aTLInfo.isTransitionalDifferent() ||
927                 uNLInfo.isTransitionalDifferent() ||
928                 uTLInfo.isTransitionalDifferent()
929             ) {
930                 errln("B.process([%d] %s) isTransitionalDifferent()", (int)i, testCase.s);
931                 continue;
932             }
933             if( aN!=aT || uN!=uT || aNL!=aTL || uNL!=uTL ||
934                 aNInfo.getErrors()!=aTInfo.getErrors() || uNInfo.getErrors()!=uTInfo.getErrors() ||
935                 aNLInfo.getErrors()!=aTLInfo.getErrors() || uNLInfo.getErrors()!=uTLInfo.getErrors()
936             ) {
937                 errln("N.process([%d] %s) vs. T.process() different errors or result strings",
938                       (int)i, testCase.s);
939                 continue;
940             }
941         } else {
942             if( !aNInfo.isTransitionalDifferent() ||
943                 !aTInfo.isTransitionalDifferent() ||
944                 !uNInfo.isTransitionalDifferent() ||
945                 !uTInfo.isTransitionalDifferent() ||
946                 !aNLInfo.isTransitionalDifferent() ||
947                 !aTLInfo.isTransitionalDifferent() ||
948                 !uNLInfo.isTransitionalDifferent() ||
949                 !uTLInfo.isTransitionalDifferent()
950             ) {
951                 errln("%s.process([%d] %s) !isTransitionalDifferent()",
952                       testCase.o, (int)i, testCase.s);
953                 continue;
954             }
955             if(aN==aT || uN==uT || aNL==aTL || uNL==uTL) {
956                 errln("N.process([%d] %s) vs. T.process() same result strings",
957                       (int)i, testCase.s);
958                 continue;
959             }
960         }
961         // UTF-8
962         std::string input8, aT8, uT8, aN8, uN8;
963         StringByteSink<std::string> aT8Sink(&aT8), uT8Sink(&uT8), aN8Sink(&aN8), uN8Sink(&uN8);
964         IDNAInfo aT8Info, uT8Info, aN8Info, uN8Info;
965         input.toUTF8String(input8);
966         trans->nameToASCII_UTF8(input8, aT8Sink, aT8Info, errorCode);
967         trans->nameToUnicodeUTF8(input8, uT8Sink, uT8Info, errorCode);
968         nontrans->nameToASCII_UTF8(input8, aN8Sink, aN8Info, errorCode);
969         nontrans->nameToUnicodeUTF8(input8, uN8Sink, uN8Info, errorCode);
970         if(errorCode.errIfFailureAndReset("UTF-8 processing [%d/%s] %s",
971                                           (int)i, testCase.o, testCase.s)
972         ) {
973             continue;
974         }
975         UnicodeString aT16(UnicodeString::fromUTF8(aT8));
976         UnicodeString uT16(UnicodeString::fromUTF8(uT8));
977         UnicodeString aN16(UnicodeString::fromUTF8(aN8));
978         UnicodeString uN16(UnicodeString::fromUTF8(uN8));
979         if( aN8Info.getErrors()!=aNInfo.getErrors() ||
980             uN8Info.getErrors()!=uNInfo.getErrors()
981         ) {
982             errln("N.xyzUTF8([%d] %s) vs. UTF-16 processing different errors %04lx vs. %04lx",
983                   (int)i, testCase.s,
984                   (long)aN8Info.getErrors(), (long)aNInfo.getErrors());
985             continue;
986         }
987         if( aT8Info.getErrors()!=aTInfo.getErrors() ||
988             uT8Info.getErrors()!=uTInfo.getErrors()
989         ) {
990             errln("T.xyzUTF8([%d] %s) vs. UTF-16 processing different errors %04lx vs. %04lx",
991                   (int)i, testCase.s,
992                   (long)aT8Info.getErrors(), (long)aTInfo.getErrors());
993             continue;
994         }
995         if(aT16!=aT || uT16!=uT || aN16!=aN || uN16!=uN) {
996             errln("%s.xyzUTF8([%d] %s) vs. UTF-16 processing different string results",
997                   testCase.o, (int)i, testCase.s, (long)aTInfo.getErrors());
998             continue;
999         }
1000         if( aT8Info.isTransitionalDifferent()!=aTInfo.isTransitionalDifferent() ||
1001             uT8Info.isTransitionalDifferent()!=uTInfo.isTransitionalDifferent() ||
1002             aN8Info.isTransitionalDifferent()!=aNInfo.isTransitionalDifferent() ||
1003             uN8Info.isTransitionalDifferent()!=uNInfo.isTransitionalDifferent()
1004         ) {
1005             errln("%s.xyzUTF8([%d] %s) vs. UTF-16 processing different isTransitionalDifferent()",
1006                   testCase.o, (int)i, testCase.s);
1007             continue;
1008         }
1009     }
1010 }
1011 
1012 namespace {
1013 
// Number of semicolon-separated columns parsed from each test data line.
constexpr int32_t kNumFields = 7;
1015 
1016 void U_CALLCONV
idnaTestLineFn(void * context,char * fields[][2],int32_t,UErrorCode * pErrorCode)1017 idnaTestLineFn(void *context,
1018                char *fields[][2], int32_t /* fieldCount */,
1019                UErrorCode *pErrorCode) {
1020     reinterpret_cast<UTS46Test *>(context)->idnaTestOneLine(fields, *pErrorCode);
1021 }
1022 
s16FromField(char * (& field)[2])1023 UnicodeString s16FromField(char *(&field)[2]) {
1024     int32_t length = (int32_t)(field[1] - field[0]);
1025     return UnicodeString::fromUTF8(StringPiece(field[0], length)).trim().unescape();
1026 }
1027 
statusFromField(char * (& field)[2])1028 std::string statusFromField(char *(&field)[2]) {
1029     const char *start = u_skipWhitespace(field[0]);
1030     std::string status;
1031     if (start != field[1]) {
1032         int32_t length = (int32_t)(field[1] - start);
1033         while (length > 0 && (start[length - 1] == u' ' || start[length - 1] == u'\t')) {
1034             --length;
1035         }
1036         status.assign(start, length);
1037     }
1038     return status;
1039 }
1040 
1041 }  // namespace
1042 
checkIdnaTestResult(const char * line,const char * type,const UnicodeString & expected,const UnicodeString & result,const char * status,const IDNAInfo & info)1043 void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
1044                                     const UnicodeString &expected, const UnicodeString &result,
1045                                     const char *status, const IDNAInfo &info) {
1046     // An error in toUnicode or toASCII is indicated by a value in square brackets,
1047     // such as "[B5 B6]".
1048     UBool expectedHasErrors = FALSE;
1049     if (*status != 0) {
1050         if (*status != u'[') {
1051             errln("%s  status field does not start with '[': %s\n    %s", type, status, line);
1052         }
1053         if (strcmp(status, reinterpret_cast<const char*>(u8"[]")) != 0) {
1054             expectedHasErrors = TRUE;
1055         }
1056     }
1057     if (expectedHasErrors != info.hasErrors()) {
1058         errln("%s  expected errors %s %d != %d = actual has errors: %04lx\n    %s",
1059               type, status, expectedHasErrors, info.hasErrors(), (long)info.getErrors(), line);
1060     }
1061     if (!expectedHasErrors && expected != result) {
1062         errln("%s  expected != actual\n    %s", type, line);
1063         errln(UnicodeString(u"    ") + expected);
1064         errln(UnicodeString(u"    ") + result);
1065     }
1066 }
1067 
idnaTestOneLine(char * fields[][2],UErrorCode & errorCode)1068 void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
1069     // IdnaTestV2.txt (since Unicode 11)
1070     // Column 1: source
1071     // The source string to be tested
1072     UnicodeString source = s16FromField(fields[0]);
1073 
1074     // Column 2: toUnicode
1075     // The result of applying toUnicode to the source, with Transitional_Processing=false.
1076     // A blank value means the same as the source value.
1077     UnicodeString toUnicode = s16FromField(fields[1]);
1078     if (toUnicode.isEmpty()) {
1079         toUnicode = source;
1080     }
1081 
1082     // Column 3: toUnicodeStatus
1083     // A set of status codes, each corresponding to a particular test.
1084     // A blank value means [].
1085     std::string toUnicodeStatus = statusFromField(fields[2]);
1086 
1087     // Column 4: toAsciiN
1088     // The result of applying toASCII to the source, with Transitional_Processing=false.
1089     // A blank value means the same as the toUnicode value.
1090     UnicodeString toAsciiN = s16FromField(fields[3]);
1091     if (toAsciiN.isEmpty()) {
1092         toAsciiN = toUnicode;
1093     }
1094 
1095     // Column 5: toAsciiNStatus
1096     // A set of status codes, each corresponding to a particular test.
1097     // A blank value means the same as the toUnicodeStatus value.
1098     std::string toAsciiNStatus = statusFromField(fields[4]);
1099     if (toAsciiNStatus.empty()) {
1100         toAsciiNStatus = toUnicodeStatus;
1101     }
1102 
1103     // Column 6: toAsciiT
1104     // The result of applying toASCII to the source, with Transitional_Processing=true.
1105     // A blank value means the same as the toAsciiN value.
1106     UnicodeString toAsciiT = s16FromField(fields[5]);
1107     if (toAsciiT.isEmpty()) {
1108         toAsciiT = toAsciiN;
1109     }
1110 
1111     // Column 7: toAsciiTStatus
1112     // A set of status codes, each corresponding to a particular test.
1113     // A blank value means the same as the toAsciiNStatus value.
1114     std::string toAsciiTStatus = statusFromField(fields[6]);
1115     if (toAsciiTStatus.empty()) {
1116         toAsciiTStatus = toAsciiNStatus;
1117     }
1118 
1119     // ToASCII/ToUnicode, transitional/nontransitional
1120     UnicodeString uN, aN, aT;
1121     IDNAInfo uNInfo, aNInfo, aTInfo;
1122     nontrans->nameToUnicode(source, uN, uNInfo, errorCode);
1123     checkIdnaTestResult(fields[0][0], "toUnicodeNontrans", toUnicode, uN,
1124                         toUnicodeStatus.c_str(), uNInfo);
1125     nontrans->nameToASCII(source, aN, aNInfo, errorCode);
1126     checkIdnaTestResult(fields[0][0], "toASCIINontrans", toAsciiN, aN,
1127                         toAsciiNStatus.c_str(), aNInfo);
1128     trans->nameToASCII(source, aT, aTInfo, errorCode);
1129     checkIdnaTestResult(fields[0][0], "toASCIITrans", toAsciiT, aT,
1130                         toAsciiTStatus.c_str(), aTInfo);
1131 }
1132 
// Defines LocalStdioFilePointer, a local-pointer class for a FILE *
// which uses fclose as its close function.
// TODO: de-duplicate
// NOTE(review): presumably the same definition exists in other test files -- confirm.
U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
1135 
1136 // http://www.unicode.org/Public/idna/latest/IdnaTest.txt
IdnaTest()1137 void UTS46Test::IdnaTest() {
1138     IcuTestErrorCode errorCode(*this, "IdnaTest");
1139     const char *sourceTestDataPath = getSourceTestData(errorCode);
1140     if (errorCode.errIfFailureAndReset("unable to find the source/test/testdata "
1141                                        "folder (getSourceTestData())")) {
1142         return;
1143     }
1144     CharString path(sourceTestDataPath, errorCode);
1145     path.appendPathPart("IdnaTestV2.txt", errorCode);
1146     LocalStdioFilePointer idnaTestFile(fopen(path.data(), "r"));
1147     if (idnaTestFile.isNull()) {
1148         errln("unable to open %s", path.data());
1149         return;
1150     }
1151 
1152     // Columns (c1, c2,...) are separated by semicolons.
1153     // Leading and trailing spaces and tabs in each column are ignored.
1154     // Comments are indicated with hash marks.
1155     char *fields[kNumFields][2];
1156     u_parseDelimitedFile(path.data(), ';', fields, kNumFields, idnaTestLineFn, this, errorCode);
1157     if (errorCode.errIfFailureAndReset("error parsing IdnaTest.txt")) {
1158         return;
1159     }
1160 }
1161 
1162 #endif  // UCONFIG_NO_IDNA
1163