1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2003-2014, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  convtest.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2003jul15
14 *   created by: Markus W. Scherer
15 *
16 *   Test file for data-driven conversion tests.
17 */
18 
19 #include "unicode/utypes.h"
20 
21 #if !UCONFIG_NO_LEGACY_CONVERSION
/*
 * Note: Turning off all of convtest.cpp if UCONFIG_NO_LEGACY_CONVERSION is set
 * is slightly unnecessary - it removes tests for Unicode charsets
 * like UTF-8 that should work.
 * However, there is no easy way for the test to detect whether a test case
 * is for a Unicode charset, so it would be difficult to only exclude those.
 * Also, regular testing of ICU is done with all modules on, therefore
 * not testing conversion for a custom configuration like this should be ok.
 */
31 
32 #include "unicode/ucnv.h"
33 #include "unicode/unistr.h"
34 #include "unicode/parsepos.h"
35 #include "unicode/uniset.h"
36 #include "unicode/ustring.h"
37 #include "unicode/ures.h"
38 #include "convtest.h"
39 #include "cmemory.h"
40 #include "unicode/tstdtmod.h"
41 #include <string.h>
42 #include <stdlib.h>
43 
// Characters used in the test data to select a converter callback.
// TestToUnicode() and TestFromUnicode() switch on the first character of a
// test case's "callback" string and compare it against these values.
enum {
    STOP_CB='.',  // selects the UCNV_*_CALLBACK_STOP callback
    SKIP_CB='0',  // selects the UCNV_*_CALLBACK_SKIP callback
    ESC_CB='&',   // selects the UCNV_*_CALLBACK_ESCAPE callback
    SUB_CB='?'    // selects the UCNV_*_CALLBACK_SUBSTITUTE callback
};
51 
ConversionTest()52 ConversionTest::ConversionTest() {
53     UErrorCode errorCode=U_ZERO_ERROR;
54     utf8Cnv=ucnv_open("UTF-8", &errorCode);
55     ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
56     if(U_FAILURE(errorCode)) {
57         errln("unable to open UTF-8 converter");
58     }
59 }
60 
~ConversionTest()61 ConversionTest::~ConversionTest() {
62     ucnv_close(utf8Cnv);
63 }
64 
65 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)66 ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
67     if (exec) logln("TestSuite ConversionTest: ");
68     switch (index) {
69 #if !UCONFIG_NO_FILE_IO
70         case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
71         case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
72         case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
73         case 3: name="TestDefaultIgnorableCallback"; if (exec) TestDefaultIgnorableCallback(); break;
74 #else
75         case 0:
76         case 1:
77         case 2:
78         case 3: name="skip"; break;
79 #endif
80         case 4: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
81         default: name=""; break; //needed to end loop
82     }
83 }
84 
85 // test data interface ----------------------------------------------------- ***
86 
87 void
TestToUnicode()88 ConversionTest::TestToUnicode() {
89     ConversionCase cc;
90     char charset[100], cbopt[4];
91     const char *option;
92     UnicodeString s, unicode;
93     int32_t offsetsLength;
94     UConverterToUCallback callback;
95 
96     TestDataModule *dataModule;
97     TestData *testData;
98     const DataMap *testCase;
99     UErrorCode errorCode;
100     int32_t i;
101 
102     errorCode=U_ZERO_ERROR;
103     dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
104     if(U_SUCCESS(errorCode)) {
105         testData=dataModule->createTestData("toUnicode", errorCode);
106         if(U_SUCCESS(errorCode)) {
107             for(i=0; testData->nextCase(testCase, errorCode); ++i) {
108                 if(U_FAILURE(errorCode)) {
109                     errln("error retrieving conversion/toUnicode test case %d - %s",
110                             i, u_errorName(errorCode));
111                     errorCode=U_ZERO_ERROR;
112                     continue;
113                 }
114 
115                 cc.caseNr=i;
116 
117                 s=testCase->getString("charset", errorCode);
118                 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
119                 cc.charset=charset;
120 
121                 // BEGIN android-added
122                 // To save space, Android does not build full ISO-2022-CN tables.
123                 // We skip the TestGetKeywordValuesForLocale for counting available collations.
124                 if (strlen(charset) >= 8 &&
125                     strncmp(charset+4, "2022-CN", 4) == 0) {
126                     continue;
127                 }
128                 // END android-added
129 
130                 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
131                 unicode=testCase->getString("unicode", errorCode);
132                 cc.unicode=unicode.getBuffer();
133                 cc.unicodeLength=unicode.length();
134 
135                 offsetsLength=0;
136                 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
137                 if(offsetsLength==0) {
138                     cc.offsets=NULL;
139                 } else if(offsetsLength!=unicode.length()) {
140                     errln("toUnicode[%d] unicode[%d] and offsets[%d] must have the same length",
141                             i, unicode.length(), offsetsLength);
142                     errorCode=U_ILLEGAL_ARGUMENT_ERROR;
143                 }
144 
145                 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
146                 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
147 
148                 s=testCase->getString("errorCode", errorCode);
149                 if(s==UNICODE_STRING("invalid", 7)) {
150                     cc.outErrorCode=U_INVALID_CHAR_FOUND;
151                 } else if(s==UNICODE_STRING("illegal", 7)) {
152                     cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
153                 } else if(s==UNICODE_STRING("truncated", 9)) {
154                     cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
155                 } else if(s==UNICODE_STRING("illesc", 6)) {
156                     cc.outErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
157                 } else if(s==UNICODE_STRING("unsuppesc", 9)) {
158                     cc.outErrorCode=U_UNSUPPORTED_ESCAPE_SEQUENCE;
159                 } else {
160                     cc.outErrorCode=U_ZERO_ERROR;
161                 }
162 
163                 s=testCase->getString("callback", errorCode);
164                 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
165                 cc.cbopt=cbopt;
166                 switch(cbopt[0]) {
167                 case SUB_CB:
168                     callback=UCNV_TO_U_CALLBACK_SUBSTITUTE;
169                     break;
170                 case SKIP_CB:
171                     callback=UCNV_TO_U_CALLBACK_SKIP;
172                     break;
173                 case STOP_CB:
174                     callback=UCNV_TO_U_CALLBACK_STOP;
175                     break;
176                 case ESC_CB:
177                     callback=UCNV_TO_U_CALLBACK_ESCAPE;
178                     break;
179                 default:
180                     callback=NULL;
181                     break;
182                 }
183                 option=callback==NULL ? cbopt : cbopt+1;
184                 if(*option==0) {
185                     option=NULL;
186                 }
187 
188                 cc.invalidChars=testCase->getBinary(cc.invalidLength, "invalidChars", errorCode);
189 
190                 if(U_FAILURE(errorCode)) {
191                     errln("error parsing conversion/toUnicode test case %d - %s",
192                             i, u_errorName(errorCode));
193                     errorCode=U_ZERO_ERROR;
194                 } else {
195                     logln("TestToUnicode[%d] %s", i, charset);
196                     ToUnicodeCase(cc, callback, option);
197                 }
198             }
199             delete testData;
200         }
201         delete dataModule;
202     }
203     else {
204         dataerrln("Could not load test conversion data");
205     }
206 }
207 
208 void
TestFromUnicode()209 ConversionTest::TestFromUnicode() {
210     ConversionCase cc;
211     char charset[100], cbopt[4];
212     const char *option;
213     UnicodeString s, unicode, invalidUChars;
214     int32_t offsetsLength, index;
215     UConverterFromUCallback callback;
216 
217     TestDataModule *dataModule;
218     TestData *testData;
219     const DataMap *testCase;
220     const UChar *p;
221     UErrorCode errorCode;
222     int32_t i, length;
223 
224     errorCode=U_ZERO_ERROR;
225     dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
226     if(U_SUCCESS(errorCode)) {
227         testData=dataModule->createTestData("fromUnicode", errorCode);
228         if(U_SUCCESS(errorCode)) {
229             for(i=0; testData->nextCase(testCase, errorCode); ++i) {
230                 if(U_FAILURE(errorCode)) {
231                     errln("error retrieving conversion/fromUnicode test case %d - %s",
232                             i, u_errorName(errorCode));
233                     errorCode=U_ZERO_ERROR;
234                     continue;
235                 }
236 
237                 cc.caseNr=i;
238 
239                 s=testCase->getString("charset", errorCode);
240                 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
241                 cc.charset=charset;
242 
243                 // BEGIN android-added
244                 // To save space, Android does not build full ISO-2022-CN tables.
245                 // We skip the TestGetKeywordValuesForLocale for counting available collations.
246                 if (strlen(charset) >= 8 &&
247                     strncmp(charset+4, "2022-CN", 4) == 0) {
248                     continue;
249                 }
250                 // END android-added
251 
252                 unicode=testCase->getString("unicode", errorCode);
253                 cc.unicode=unicode.getBuffer();
254                 cc.unicodeLength=unicode.length();
255                 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
256 
257                 offsetsLength=0;
258                 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
259                 if(offsetsLength==0) {
260                     cc.offsets=NULL;
261                 } else if(offsetsLength!=cc.bytesLength) {
262                     errln("fromUnicode[%d] bytes[%d] and offsets[%d] must have the same length",
263                             i, cc.bytesLength, offsetsLength);
264                     errorCode=U_ILLEGAL_ARGUMENT_ERROR;
265                 }
266 
267                 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
268                 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
269 
270                 s=testCase->getString("errorCode", errorCode);
271                 if(s==UNICODE_STRING("invalid", 7)) {
272                     cc.outErrorCode=U_INVALID_CHAR_FOUND;
273                 } else if(s==UNICODE_STRING("illegal", 7)) {
274                     cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
275                 } else if(s==UNICODE_STRING("truncated", 9)) {
276                     cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
277                 } else {
278                     cc.outErrorCode=U_ZERO_ERROR;
279                 }
280 
281                 s=testCase->getString("callback", errorCode);
282                 cc.setSub=0; // default: no subchar
283 
284                 if((index=s.indexOf((UChar)0))>0) {
285                     // read NUL-separated subchar first, if any
286                     // copy the subchar from Latin-1 characters
287                     // start after the NUL
288                     p=s.getTerminatedBuffer();
289                     length=index+1;
290                     p+=length;
291                     length=s.length()-length;
292                     if(length<=0 || length>=(int32_t)sizeof(cc.subchar)) {
293                         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
294                     } else {
295                         int32_t j;
296 
297                         for(j=0; j<length; ++j) {
298                             cc.subchar[j]=(char)p[j];
299                         }
300                         // NUL-terminate the subchar
301                         cc.subchar[j]=0;
302                         cc.setSub=1;
303                     }
304 
305                     // remove the NUL and subchar from s
306                     s.truncate(index);
307                 } else if((index=s.indexOf((UChar)0x3d))>0) /* '=' */ {
308                     // read a substitution string, separated by an equal sign
309                     p=s.getBuffer()+index+1;
310                     length=s.length()-(index+1);
311                     if(length<0 || length>=UPRV_LENGTHOF(cc.subString)) {
312                         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
313                     } else {
314                         u_memcpy(cc.subString, p, length);
315                         // NUL-terminate the subString
316                         cc.subString[length]=0;
317                         cc.setSub=-1;
318                     }
319 
320                     // remove the equal sign and subString from s
321                     s.truncate(index);
322                 }
323 
324                 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
325                 cc.cbopt=cbopt;
326                 switch(cbopt[0]) {
327                 case SUB_CB:
328                     callback=UCNV_FROM_U_CALLBACK_SUBSTITUTE;
329                     break;
330                 case SKIP_CB:
331                     callback=UCNV_FROM_U_CALLBACK_SKIP;
332                     break;
333                 case STOP_CB:
334                     callback=UCNV_FROM_U_CALLBACK_STOP;
335                     break;
336                 case ESC_CB:
337                     callback=UCNV_FROM_U_CALLBACK_ESCAPE;
338                     break;
339                 default:
340                     callback=NULL;
341                     break;
342                 }
343                 option=callback==NULL ? cbopt : cbopt+1;
344                 if(*option==0) {
345                     option=NULL;
346                 }
347 
348                 invalidUChars=testCase->getString("invalidUChars", errorCode);
349                 cc.invalidUChars=invalidUChars.getBuffer();
350                 cc.invalidLength=invalidUChars.length();
351 
352                 if(U_FAILURE(errorCode)) {
353                     errln("error parsing conversion/fromUnicode test case %d - %s",
354                             i, u_errorName(errorCode));
355                     errorCode=U_ZERO_ERROR;
356                 } else {
357                     logln("TestFromUnicode[%d] %s", i, charset);
358                     FromUnicodeCase(cc, callback, option);
359                 }
360             }
361             delete testData;
362         }
363         delete dataModule;
364     }
365     else {
366         dataerrln("Could not load test conversion data");
367     }
368 }
369 
370 static const UChar ellipsis[]={ 0x2e, 0x2e, 0x2e };
371 
372 void
TestGetUnicodeSet()373 ConversionTest::TestGetUnicodeSet() {
374     char charset[100];
375     UnicodeString s, map, mapnot;
376     int32_t which;
377 
378     ParsePosition pos;
379     UnicodeSet cnvSet, mapSet, mapnotSet, diffSet;
380     UnicodeSet *cnvSetPtr = &cnvSet;
381     LocalUConverterPointer cnv;
382 
383     TestDataModule *dataModule;
384     TestData *testData;
385     const DataMap *testCase;
386     UErrorCode errorCode;
387     int32_t i;
388 
389     errorCode=U_ZERO_ERROR;
390     dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
391     if(U_SUCCESS(errorCode)) {
392         testData=dataModule->createTestData("getUnicodeSet", errorCode);
393         if(U_SUCCESS(errorCode)) {
394             for(i=0; testData->nextCase(testCase, errorCode); ++i) {
395                 if(U_FAILURE(errorCode)) {
396                     errln("error retrieving conversion/getUnicodeSet test case %d - %s",
397                             i, u_errorName(errorCode));
398                     errorCode=U_ZERO_ERROR;
399                     continue;
400                 }
401 
402                 s=testCase->getString("charset", errorCode);
403                 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
404 
405                 // BEGIN android-added
406                 // To save space, Android does not build full ISO-2022-CN tables.
407                 // We skip the TestGetKeywordValuesForLocale for counting available collations.
408                 if (strlen(charset) >= 8 &&
409                     strncmp(charset+4, "2022-CN", 4) == 0) {
410                     continue;
411                 }
412                 // END android-added
413 
414                 map=testCase->getString("map", errorCode);
415                 mapnot=testCase->getString("mapnot", errorCode);
416 
417                 which=testCase->getInt28("which", errorCode);
418 
419                 if(U_FAILURE(errorCode)) {
420                     errln("error parsing conversion/getUnicodeSet test case %d - %s",
421                             i, u_errorName(errorCode));
422                     errorCode=U_ZERO_ERROR;
423                     continue;
424                 }
425 
426                 // test this test case
427                 mapSet.clear();
428                 mapnotSet.clear();
429 
430                 pos.setIndex(0);
431                 mapSet.applyPattern(map, pos, 0, NULL, errorCode);
432                 if(U_FAILURE(errorCode) || pos.getIndex()!=map.length()) {
433                     errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n"
434                           "    error index %d  index %d  U+%04x",
435                             i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex()));
436                     errorCode=U_ZERO_ERROR;
437                     continue;
438                 }
439 
440                 pos.setIndex(0);
441                 mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode);
442                 if(U_FAILURE(errorCode) || pos.getIndex()!=mapnot.length()) {
443                     errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n"
444                           "    error index %d  index %d  U+%04x",
445                             i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex()));
446                     errorCode=U_ZERO_ERROR;
447                     continue;
448                 }
449 
450                 logln("TestGetUnicodeSet[%d] %s", i, charset);
451 
452                 cnv.adoptInstead(cnv_open(charset, errorCode));
453                 if(U_FAILURE(errorCode)) {
454                     errcheckln(errorCode, "error opening \"%s\" for conversion/getUnicodeSet test case %d - %s",
455                             charset, i, u_errorName(errorCode));
456                     errorCode=U_ZERO_ERROR;
457                     continue;
458                 }
459 
460                 ucnv_getUnicodeSet(cnv.getAlias(), cnvSetPtr->toUSet(), (UConverterUnicodeSet)which, &errorCode);
461 
462                 if(U_FAILURE(errorCode)) {
463                     errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s",
464                             charset, i, u_errorName(errorCode));
465                     errorCode=U_ZERO_ERROR;
466                     continue;
467                 }
468 
469                 // are there items that must be in cnvSet but are not?
470                 (diffSet=mapSet).removeAll(cnvSet);
471                 if(!diffSet.isEmpty()) {
472                     diffSet.toPattern(s, TRUE);
473                     if(s.length()>100) {
474                         s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
475                     }
476                     errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d",
477                             charset, i);
478                     errln(s);
479                 }
480 
481                 // are there items that must not be in cnvSet but are?
482                 (diffSet=mapnotSet).retainAll(cnvSet);
483                 if(!diffSet.isEmpty()) {
484                     diffSet.toPattern(s, TRUE);
485                     if(s.length()>100) {
486                         s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
487                     }
488                     errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d",
489                             charset, i);
490                     errln(s);
491                 }
492             }
493             delete testData;
494         }
495         delete dataModule;
496     }
497     else {
498         dataerrln("Could not load test conversion data");
499     }
500 }
501 
502 U_CDECL_BEGIN
503 static void U_CALLCONV
getUnicodeSetCallback(const void * context,UConverterFromUnicodeArgs *,const UChar *,int32_t,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * pErrorCode)504 getUnicodeSetCallback(const void *context,
505                       UConverterFromUnicodeArgs * /*fromUArgs*/,
506                       const UChar* /*codeUnits*/,
507                       int32_t /*length*/,
508                       UChar32 codePoint,
509                       UConverterCallbackReason reason,
510                       UErrorCode *pErrorCode) {
511     if(reason<=UCNV_IRREGULAR) {
512         ((UnicodeSet *)context)->remove(codePoint);  // the converter cannot convert this code point
513         *pErrorCode=U_ZERO_ERROR;                    // skip
514     }  // else ignore the reset, close and clone calls.
515 }
516 U_CDECL_END
517 
518 // Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
519 void
TestGetUnicodeSet2()520 ConversionTest::TestGetUnicodeSet2() {
521     // Build a string with all code points.
522     UChar32 cpLimit;
523     int32_t s0Length;
524     if(quick) {
525         cpLimit=s0Length=0x10000;  // BMP only
526     } else {
527         cpLimit=0x110000;
528         s0Length=0x10000+0x200000;  // BMP + surrogate pairs
529     }
530     UChar *s0=new UChar[s0Length];
531     if(s0==NULL) {
532         return;
533     }
534     UChar *s=s0;
535     UChar32 c;
536     UChar c2;
537     // low BMP
538     for(c=0; c<=0xd7ff; ++c) {
539         *s++=(UChar)c;
540     }
541     // trail surrogates
542     for(c=0xdc00; c<=0xdfff; ++c) {
543         *s++=(UChar)c;
544     }
545     // lead surrogates
546     // (after trails so that there is not even one surrogate pair in between)
547     for(c=0xd800; c<=0xdbff; ++c) {
548         *s++=(UChar)c;
549     }
550     // high BMP
551     for(c=0xe000; c<=0xffff; ++c) {
552         *s++=(UChar)c;
553     }
554     // supplementary code points = surrogate pairs
555     if(cpLimit==0x110000) {
556         for(c=0xd800; c<=0xdbff; ++c) {
557             for(c2=0xdc00; c2<=0xdfff; ++c2) {
558                 *s++=(UChar)c;
559                 *s++=c2;
560             }
561         }
562     }
563 
564     static const char *const cnvNames[]={
565         "UTF-8",
566         "UTF-7",
567         "UTF-16",
568         "US-ASCII",
569         "ISO-8859-1",
570         "windows-1252",
571         "Shift-JIS",
572         "ibm-1390",  // EBCDIC_STATEFUL table
573         "ibm-16684",  // DBCS-only extension table based on EBCDIC_STATEFUL table
574         "HZ",
575         "ISO-2022-JP",
576         "JIS7",
577         "ISO-2022-CN",
578         "ISO-2022-CN-EXT",
579         "LMBCS"
580     };
581     LocalUConverterPointer cnv;
582     char buffer[1024];
583     int32_t i;
584     for(i=0; i<UPRV_LENGTHOF(cnvNames); ++i) {
585         UErrorCode errorCode=U_ZERO_ERROR;
586         cnv.adoptInstead(cnv_open(cnvNames[i], errorCode));
587         if(U_FAILURE(errorCode)) {
588             errcheckln(errorCode, "failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
589             continue;
590         }
591         UnicodeSet expected;
592         ucnv_setFromUCallBack(cnv.getAlias(), getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
593         if(U_FAILURE(errorCode)) {
594             errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
595             continue;
596         }
597         UConverterUnicodeSet which;
598         for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
599             if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
600                 ucnv_setFallback(cnv.getAlias(), TRUE);
601             }
602             expected.add(0, cpLimit-1);
603             s=s0;
604             UBool flush;
605             do {
606                 char *t=buffer;
607                 flush=(UBool)(s==s0+s0Length);
608                 ucnv_fromUnicode(cnv.getAlias(), &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
609                 if(U_FAILURE(errorCode)) {
610                     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
611                         errorCode=U_ZERO_ERROR;
612                         continue;
613                     } else {
614                         break;  // unexpected error, should not occur
615                     }
616                 }
617             } while(!flush);
618             UnicodeSet set;
619             ucnv_getUnicodeSet(cnv.getAlias(), set.toUSet(), which, &errorCode);
620             if(cpLimit<0x110000) {
621                 set.remove(cpLimit, 0x10ffff);
622             }
623             if(which==UCNV_ROUNDTRIP_SET) {
624                 // ignore PUA code points because they will be converted even if they
625                 // are fallbacks and when other fallbacks are turned off,
626                 // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
627                 expected.remove(0xe000, 0xf8ff);
628                 expected.remove(0xf0000, 0xffffd);
629                 expected.remove(0x100000, 0x10fffd);
630                 set.remove(0xe000, 0xf8ff);
631                 set.remove(0xf0000, 0xffffd);
632                 set.remove(0x100000, 0x10fffd);
633             }
634             if(set!=expected) {
635                 // First try to see if we have different sets because ucnv_getUnicodeSet()
636                 // added strings: The above conversion method does not tell us what strings might be convertible.
637                 // Remove strings from the set and compare again.
638                 // Unfortunately, there are no good, direct set methods for finding out whether there are strings
639                 // in the set, nor for enumerating or removing just them.
640                 // Intersect all code points with the set. The intersection will not contain strings.
641                 UnicodeSet temp(0, 0x10ffff);
642                 temp.retainAll(set);
643                 set=temp;
644             }
645             if(set!=expected) {
646                 UnicodeSet diffSet;
647                 UnicodeString out;
648 
649                 // are there items that must be in the set but are not?
650                 (diffSet=expected).removeAll(set);
651                 if(!diffSet.isEmpty()) {
652                     diffSet.toPattern(out, TRUE);
653                     if(out.length()>100) {
654                         out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
655                     }
656                     errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
657                             cnvNames[i], which);
658                     errln(out);
659                 }
660 
661                 // are there items that must not be in the set but are?
662                 (diffSet=set).removeAll(expected);
663                 if(!diffSet.isEmpty()) {
664                     diffSet.toPattern(out, TRUE);
665                     if(out.length()>100) {
666                         out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
667                     }
668                     errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
669                             cnvNames[i], which);
670                     errln(out);
671                 }
672             }
673         }
674     }
675 
676     delete [] s0;
677 }
678 
679 // Test all codepoints which has the default ignorable Unicode property are ignored if they have no mapping
680 // If there are any failures, the hard coded list (IS_DEFAULT_IGNORABLE_CODE_POINT) in ucnv_err.c should be updated
681 void
TestDefaultIgnorableCallback()682 ConversionTest::TestDefaultIgnorableCallback() {
683     UErrorCode status = U_ZERO_ERROR;
684     const char *cnv_name = "euc-jp-2007";
685     const char *pattern_ignorable = "[:Default_Ignorable_Code_Point:]";
686     const char *pattern_not_ignorable = "[:^Default_Ignorable_Code_Point:]";
687 
688     UnicodeSet *set_ignorable = new UnicodeSet(pattern_ignorable, status);
689     if (U_FAILURE(status)) {
690         dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_ignorable, u_errorName(status));
691         return;
692     }
693 
694     UnicodeSet *set_not_ignorable = new UnicodeSet(pattern_not_ignorable, status);
695     if (U_FAILURE(status)) {
696         dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_not_ignorable, u_errorName(status));
697         return;
698     }
699 
700     UConverter *cnv = cnv_open(cnv_name, status);
701     if (U_FAILURE(status)) {
702         dataerrln("Unable to open converter: %s - %s\n", cnv_name, u_errorName(status));
703         return;
704     }
705 
706     // set callback for the converter
707     ucnv_setFromUCallBack(cnv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status);
708 
709     UChar32 input[1];
710     char output[10];
711     int32_t outputLength;
712 
713     // test default ignorables are ignored
714     int size = set_ignorable->size();
715     for (int i = 0; i < size; i++) {
716         status = U_ZERO_ERROR;
717         outputLength= 0;
718 
719         input[0] = set_ignorable->charAt(i);
720 
721         outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
722         if (U_FAILURE(status) || outputLength != 0) {
723             errln("Ignorable code point: U+%04X not skipped as expected - %s", input[0], u_errorName(status));
724         }
725     }
726 
727     // test non-ignorables are not ignored
728     size = set_not_ignorable->size();
729     for (int i = 0; i < size; i++) {
730         status = U_ZERO_ERROR;
731         outputLength= 0;
732 
733         input[0] = set_not_ignorable->charAt(i);
734 
735         if (input[0] == 0) {
736             continue;
737         }
738 
739         outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
740         if (U_FAILURE(status) || outputLength <= 0) {
741             errln("Non-ignorable code point: U+%04X skipped unexpectedly - %s", input[0], u_errorName(status));
742         }
743     }
744 
745     ucnv_close(cnv);
746     delete set_not_ignorable;
747     delete set_ignorable;
748 }
749 
750 // open testdata or ICU data converter ------------------------------------- ***
751 
752 UConverter *
cnv_open(const char * name,UErrorCode & errorCode)753 ConversionTest::cnv_open(const char *name, UErrorCode &errorCode) {
754     if(name!=NULL && *name=='+') {
755         // Converter names that start with '+' are ignored in ICU4J tests.
756         ++name;
757     }
758     if(name!=NULL && *name=='*') {
759         /* loadTestData(): set the data directory */
760         return ucnv_openPackage(loadTestData(errorCode), name+1, &errorCode);
761     } else {
762         return ucnv_open(name, &errorCode);
763     }
764 }
765 
766 // output helpers ---------------------------------------------------------- ***
767 
// Returns the lowercase hexadecimal digit character for a value 0..15
// ('0'..'9' for 0..9, 'a'..'f' for 10..15).
static inline char
hexDigit(uint8_t digit) {
    if(digit>9) {
        return (char)('a'+(digit-10));
    }
    return (char)('0'+digit);
}
772 
773 static char *
printBytes(const uint8_t * bytes,int32_t length,char * out)774 printBytes(const uint8_t *bytes, int32_t length, char *out) {
775     uint8_t b;
776 
777     if(length>0) {
778         b=*bytes++;
779         --length;
780         *out++=hexDigit((uint8_t)(b>>4));
781         *out++=hexDigit((uint8_t)(b&0xf));
782     }
783 
784     while(length>0) {
785         b=*bytes++;
786         --length;
787         *out++=' ';
788         *out++=hexDigit((uint8_t)(b>>4));
789         *out++=hexDigit((uint8_t)(b&0xf));
790     }
791     *out++=0;
792     return out;
793 }
794 
795 static char *
printUnicode(const UChar * unicode,int32_t length,char * out)796 printUnicode(const UChar *unicode, int32_t length, char *out) {
797     UChar32 c;
798     int32_t i;
799 
800     for(i=0; i<length;) {
801         if(i>0) {
802             *out++=' ';
803         }
804         U16_NEXT(unicode, i, length, c);
805         // write 4..6 digits
806         if(c>=0x100000) {
807             *out++='1';
808         }
809         if(c>=0x10000) {
810             *out++=hexDigit((uint8_t)((c>>16)&0xf));
811         }
812         *out++=hexDigit((uint8_t)((c>>12)&0xf));
813         *out++=hexDigit((uint8_t)((c>>8)&0xf));
814         *out++=hexDigit((uint8_t)((c>>4)&0xf));
815         *out++=hexDigit((uint8_t)(c&0xf));
816     }
817     *out++=0;
818     return out;
819 }
820 
/*
 * Writes each offset as exactly two characters, separated by single spaces,
 * followed by a terminating NUL:
 *   below -9  "-x"
 *   -9..-1    '-' and the digit
 *   0..9      ' ' and the digit
 *   10..99    the two digits
 *   above 99  "xx"
 * A NULL offsets pointer is treated like an empty array.
 * The caller must provide enough room at out; no capacity is checked here.
 *
 * @return pointer just past the terminating NUL that was written
 */
static char *
printOffsets(const int32_t *offsets, int32_t length, char *out) {
    const int32_t count= offsets!=NULL ? length : 0;

    for(int32_t idx=0; idx<count; ++idx) {
        if(idx!=0) {
            *out++=' ';
        }

        const int32_t offset=offsets[idx];
        char first, second;

        if(offset>99) {
            first='x';
            second='x';
        } else if(offset>=10) {
            first=(char)('0'+offset/10);
            second=(char)('0'+offset%10);
        } else if(offset>=0) {
            first=' ';
            second=(char)('0'+offset);
        } else if(offset>=-9) {
            first='-';
            second=(char)('0'-offset);
        } else /* offset<-9 */ {
            first='-';
            second='x';
        }

        *out++=first;
        *out++=second;
    }
    *out++=0;
    return out;
}
853 
854 // toUnicode test worker functions ----------------------------------------- ***
855 
856 static int32_t
stepToUnicode(ConversionCase & cc,UConverter * cnv,UChar * result,int32_t resultCapacity,int32_t * resultOffsets,int32_t step,UErrorCode * pErrorCode)857 stepToUnicode(ConversionCase &cc, UConverter *cnv,
858               UChar *result, int32_t resultCapacity,
859               int32_t *resultOffsets, /* also resultCapacity */
860               int32_t step,
861               UErrorCode *pErrorCode) {
862     const char *source, *sourceLimit, *bytesLimit;
863     UChar *target, *targetLimit, *resultLimit;
864     UBool flush;
865 
866     source=(const char *)cc.bytes;
867     target=result;
868     bytesLimit=source+cc.bytesLength;
869     resultLimit=result+resultCapacity;
870 
871     if(step>=0) {
872         // call ucnv_toUnicode() with in/out buffers no larger than (step) at a time
873         // move only one buffer (in vs. out) at a time to be extra mean
874         // step==0 performs bulk conversion and generates offsets
875 
876         // initialize the partial limits for the loop
877         if(step==0) {
878             // use the entire buffers
879             sourceLimit=bytesLimit;
880             targetLimit=resultLimit;
881             flush=cc.finalFlush;
882         } else {
883             // start with empty partial buffers
884             sourceLimit=source;
885             targetLimit=target;
886             flush=FALSE;
887 
888             // output offsets only for bulk conversion
889             resultOffsets=NULL;
890         }
891 
892         for(;;) {
893             // resetting the opposite conversion direction must not affect this one
894             ucnv_resetFromUnicode(cnv);
895 
896             // convert
897             ucnv_toUnicode(cnv,
898                 &target, targetLimit,
899                 &source, sourceLimit,
900                 resultOffsets,
901                 flush, pErrorCode);
902 
903             // check pointers and errors
904             if(source>sourceLimit || target>targetLimit) {
905                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
906                 break;
907             } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
908                 if(target!=targetLimit) {
909                     // buffer overflow must only be set when the target is filled
910                     *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
911                     break;
912                 } else if(targetLimit==resultLimit) {
913                     // not just a partial overflow
914                     break;
915                 }
916 
917                 // the partial target is filled, set a new limit, reset the error and continue
918                 targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
919                 *pErrorCode=U_ZERO_ERROR;
920             } else if(U_FAILURE(*pErrorCode)) {
921                 // some other error occurred, done
922                 break;
923             } else {
924                 if(source!=sourceLimit) {
925                     // when no error occurs, then the input must be consumed
926                     *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
927                     break;
928                 }
929 
930                 if(sourceLimit==bytesLimit) {
931                     // we are done
932                     break;
933                 }
934 
935                 // the partial conversion succeeded, set a new limit and continue
936                 sourceLimit=(bytesLimit-source)>=step ? source+step : bytesLimit;
937                 flush=(UBool)(cc.finalFlush && sourceLimit==bytesLimit);
938             }
939         }
940     } else /* step<0 */ {
941         /*
942          * step==-1: call only ucnv_getNextUChar()
943          * otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar()
944          *   if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input,
945          *   else give it at most (-step-2)/2 bytes
946          */
947         UChar32 c;
948 
949         // end the loop by getting an index out of bounds error
950         for(;;) {
951             // resetting the opposite conversion direction must not affect this one
952             ucnv_resetFromUnicode(cnv);
953 
954             // convert
955             if((step&1)!=0 /* odd: -1, -3, -5, ... */) {
956                 sourceLimit=source; // use sourceLimit not as a real limit
957                                     // but to remember the pre-getNextUChar source pointer
958                 c=ucnv_getNextUChar(cnv, &source, bytesLimit, pErrorCode);
959 
960                 // check pointers and errors
961                 if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
962                     if(source!=bytesLimit) {
963                         *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
964                     } else {
965                         *pErrorCode=U_ZERO_ERROR;
966                     }
967                     break;
968                 } else if(U_FAILURE(*pErrorCode)) {
969                     break;
970                 }
971                 // source may not move if c is from previous overflow
972 
973                 if(target==resultLimit) {
974                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
975                     break;
976                 }
977                 if(c<=0xffff) {
978                     *target++=(UChar)c;
979                 } else {
980                     *target++=U16_LEAD(c);
981                     if(target==resultLimit) {
982                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
983                         break;
984                     }
985                     *target++=U16_TRAIL(c);
986                 }
987 
988                 // alternate between -n-1 and -n but leave -1 alone
989                 if(step<-1) {
990                     ++step;
991                 }
992             } else /* step is even */ {
993                 // allow only one UChar output
994                 targetLimit=target<resultLimit ? target+1 : resultLimit;
995 
996                 // as with ucnv_getNextUChar(), we always flush (if we go to bytesLimit)
997                 // and never output offsets
998                 if(step==-2) {
999                     sourceLimit=bytesLimit;
1000                 } else {
1001                     sourceLimit=source+(-step-2)/2;
1002                     if(sourceLimit>bytesLimit) {
1003                         sourceLimit=bytesLimit;
1004                     }
1005                 }
1006 
1007                 ucnv_toUnicode(cnv,
1008                     &target, targetLimit,
1009                     &source, sourceLimit,
1010                     NULL, (UBool)(sourceLimit==bytesLimit), pErrorCode);
1011 
1012                 // check pointers and errors
1013                 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1014                     if(target!=targetLimit) {
1015                         // buffer overflow must only be set when the target is filled
1016                         *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1017                         break;
1018                     } else if(targetLimit==resultLimit) {
1019                         // not just a partial overflow
1020                         break;
1021                     }
1022 
1023                     // the partial target is filled, set a new limit and continue
1024                     *pErrorCode=U_ZERO_ERROR;
1025                 } else if(U_FAILURE(*pErrorCode)) {
1026                     // some other error occurred, done
1027                     break;
1028                 } else {
1029                     if(source!=sourceLimit) {
1030                         // when no error occurs, then the input must be consumed
1031                         *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1032                         break;
1033                     }
1034 
1035                     // we are done (flush==TRUE) but we continue, to get the index out of bounds error above
1036                 }
1037 
1038                 --step;
1039             }
1040         }
1041     }
1042 
1043     return (int32_t)(target-result);
1044 }
1045 
1046 UBool
ToUnicodeCase(ConversionCase & cc,UConverterToUCallback callback,const char * option)1047 ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback, const char *option) {
1048     // open the converter
1049     IcuTestErrorCode errorCode(*this, "ToUnicodeCase");
1050     LocalUConverterPointer cnv(cnv_open(cc.charset, errorCode));
1051     if(errorCode.isFailure()) {
1052         errcheckln(errorCode, "toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
1053                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, errorCode.errorName());
1054         errorCode.reset();
1055         return FALSE;
1056     }
1057 
1058     // set the callback
1059     if(callback!=NULL) {
1060         ucnv_setToUCallBack(cnv.getAlias(), callback, option, NULL, NULL, errorCode);
1061         if(U_FAILURE(errorCode)) {
1062             errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setToUCallBack() failed - %s",
1063                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1064             return FALSE;
1065         }
1066     }
1067 
1068     int32_t resultOffsets[256];
1069     UChar result[256];
1070     int32_t resultLength;
1071     UBool ok;
1072 
1073     static const struct {
1074         int32_t step;
1075         const char *name;
1076     } steps[]={
1077         { 0, "bulk" }, // must be first for offsets to be checked
1078         { 1, "step=1" },
1079         { 3, "step=3" },
1080         { 7, "step=7" },
1081         { -1, "getNext" },
1082         { -2, "toU(bulk)+getNext" },
1083         { -3, "getNext+toU(bulk)" },
1084         { -4, "toU(1)+getNext" },
1085         { -5, "getNext+toU(1)" },
1086         { -12, "toU(5)+getNext" },
1087         { -13, "getNext+toU(5)" },
1088     };
1089     int32_t i, step;
1090 
1091     ok=TRUE;
1092     for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
1093         step=steps[i].step;
1094         if(step<0 && !cc.finalFlush) {
1095             // skip ucnv_getNextUChar() if !finalFlush because
1096             // ucnv_getNextUChar() always implies flush
1097             continue;
1098         }
1099         if(step!=0) {
1100             // bulk test is first, then offsets are not checked any more
1101             cc.offsets=NULL;
1102         }
1103         else {
1104             memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
1105         }
1106         memset(result, -1, UPRV_LENGTHOF(result));
1107         errorCode.reset();
1108         resultLength=stepToUnicode(cc, cnv.getAlias(),
1109                                 result, UPRV_LENGTHOF(result),
1110                                 step==0 ? resultOffsets : NULL,
1111                                 step, errorCode);
1112         ok=checkToUnicode(
1113                 cc, cnv.getAlias(), steps[i].name,
1114                 result, resultLength,
1115                 cc.offsets!=NULL ? resultOffsets : NULL,
1116                 errorCode);
1117         if(errorCode.isFailure() || !cc.finalFlush) {
1118             // reset if an error occurred or we did not flush
1119             // otherwise do nothing to make sure that flushing resets
1120             ucnv_resetToUnicode(cnv.getAlias());
1121         }
1122         if (cc.offsets != NULL && resultOffsets[resultLength] != -1) {
1123             errln("toUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
1124                 cc.caseNr, cc.charset, resultLength);
1125         }
1126         if (result[resultLength] != (UChar)-1) {
1127             errln("toUnicode[%d](%s) Conversion wrote too much to result at index %d",
1128                 cc.caseNr, cc.charset, resultLength);
1129         }
1130     }
1131 
1132     // not a real loop, just a convenience for breaking out of the block
1133     while(ok && cc.finalFlush) {
1134         // test ucnv_toUChars()
1135         memset(result, 0, sizeof(result));
1136 
1137         errorCode.reset();
1138         resultLength=ucnv_toUChars(cnv.getAlias(),
1139                         result, UPRV_LENGTHOF(result),
1140                         (const char *)cc.bytes, cc.bytesLength,
1141                         errorCode);
1142         ok=checkToUnicode(
1143                 cc, cnv.getAlias(), "toUChars",
1144                 result, resultLength,
1145                 NULL,
1146                 errorCode);
1147         if(!ok) {
1148             break;
1149         }
1150 
1151         // test preflighting
1152         // keep the correct result for simple checking
1153         errorCode.reset();
1154         resultLength=ucnv_toUChars(cnv.getAlias(),
1155                         NULL, 0,
1156                         (const char *)cc.bytes, cc.bytesLength,
1157                         errorCode);
1158         if(errorCode.get()==U_STRING_NOT_TERMINATED_WARNING || errorCode.get()==U_BUFFER_OVERFLOW_ERROR) {
1159             errorCode.reset();
1160         }
1161         ok=checkToUnicode(
1162                 cc, cnv.getAlias(), "preflight toUChars",
1163                 result, resultLength,
1164                 NULL,
1165                 errorCode);
1166         break;
1167     }
1168 
1169     errorCode.reset();  // all errors have already been reported
1170     return ok;
1171 }
1172 
1173 UBool
checkToUnicode(ConversionCase & cc,UConverter * cnv,const char * name,const UChar * result,int32_t resultLength,const int32_t * resultOffsets,UErrorCode resultErrorCode)1174 ConversionTest::checkToUnicode(ConversionCase &cc, UConverter *cnv, const char *name,
1175                                const UChar *result, int32_t resultLength,
1176                                const int32_t *resultOffsets,
1177                                UErrorCode resultErrorCode) {
1178     char resultInvalidChars[8];
1179     int8_t resultInvalidLength;
1180     UErrorCode errorCode;
1181 
1182     const char *msg;
1183 
1184     // reset the message; NULL will mean "ok"
1185     msg=NULL;
1186 
1187     errorCode=U_ZERO_ERROR;
1188     resultInvalidLength=sizeof(resultInvalidChars);
1189     ucnv_getInvalidChars(cnv, resultInvalidChars, &resultInvalidLength, &errorCode);
1190     if(U_FAILURE(errorCode)) {
1191         errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidChars() failed - %s",
1192                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
1193         return FALSE;
1194     }
1195 
1196     // check everything that might have gone wrong
1197     if(cc.unicodeLength!=resultLength) {
1198         msg="wrong result length";
1199     } else if(0!=u_memcmp(cc.unicode, result, cc.unicodeLength)) {
1200         msg="wrong result string";
1201     } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.unicodeLength*sizeof(*cc.offsets))) {
1202         msg="wrong offsets";
1203     } else if(cc.outErrorCode!=resultErrorCode) {
1204         msg="wrong error code";
1205     } else if(cc.invalidLength!=resultInvalidLength) {
1206         msg="wrong length of last invalid input";
1207     } else if(0!=memcmp(cc.invalidChars, resultInvalidChars, cc.invalidLength)) {
1208         msg="wrong last invalid input";
1209     }
1210 
1211     if(msg==NULL) {
1212         return TRUE;
1213     } else {
1214         char buffer[2000]; // one buffer for all strings
1215         char *s, *bytesString, *unicodeString, *resultString,
1216             *offsetsString, *resultOffsetsString,
1217             *invalidCharsString, *resultInvalidCharsString;
1218 
1219         bytesString=s=buffer;
1220         s=printBytes(cc.bytes, cc.bytesLength, bytesString);
1221         s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString=s);
1222         s=printUnicode(result, resultLength, resultString=s);
1223         s=printOffsets(cc.offsets, cc.unicodeLength, offsetsString=s);
1224         s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
1225         s=printBytes(cc.invalidChars, cc.invalidLength, invalidCharsString=s);
1226         s=printBytes((uint8_t *)resultInvalidChars, resultInvalidLength, resultInvalidCharsString=s);
1227 
1228         if((s-buffer)>(int32_t)sizeof(buffer)) {
1229             errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkToUnicode() test output buffer overflow writing %d chars\n",
1230                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
1231             exit(1);
1232         }
1233 
1234         errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
1235               "  bytes <%s>[%d]\n"
1236               " expected <%s>[%d]\n"
1237               "  result  <%s>[%d]\n"
1238               " offsets         <%s>\n"
1239               "  result offsets <%s>\n"
1240               " error code expected %s got %s\n"
1241               "  invalidChars expected <%s> got <%s>\n",
1242               cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
1243               bytesString, cc.bytesLength,
1244               unicodeString, cc.unicodeLength,
1245               resultString, resultLength,
1246               offsetsString,
1247               resultOffsetsString,
1248               u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
1249               invalidCharsString, resultInvalidCharsString);
1250 
1251         return FALSE;
1252     }
1253 }
1254 
1255 // fromUnicode test worker functions --------------------------------------- ***
1256 
1257 static int32_t
stepFromUTF8(ConversionCase & cc,UConverter * utf8Cnv,UConverter * cnv,char * result,int32_t resultCapacity,int32_t step,UErrorCode * pErrorCode)1258 stepFromUTF8(ConversionCase &cc,
1259              UConverter *utf8Cnv, UConverter *cnv,
1260              char *result, int32_t resultCapacity,
1261              int32_t step,
1262              UErrorCode *pErrorCode) {
1263     const char *source, *sourceLimit, *utf8Limit;
1264     UChar pivotBuffer[32];
1265     UChar *pivotSource, *pivotTarget, *pivotLimit;
1266     char *target, *targetLimit, *resultLimit;
1267     UBool flush;
1268 
1269     source=cc.utf8;
1270     pivotSource=pivotTarget=pivotBuffer;
1271     target=result;
1272     utf8Limit=source+cc.utf8Length;
1273     resultLimit=result+resultCapacity;
1274 
1275     // call ucnv_convertEx() with in/out buffers no larger than (step) at a time
1276     // move only one buffer (in vs. out) at a time to be extra mean
1277     // step==0 performs bulk conversion
1278 
1279     // initialize the partial limits for the loop
1280     if(step==0) {
1281         // use the entire buffers
1282         sourceLimit=utf8Limit;
1283         targetLimit=resultLimit;
1284         flush=cc.finalFlush;
1285 
1286         pivotLimit=pivotBuffer+UPRV_LENGTHOF(pivotBuffer);
1287     } else {
1288         // start with empty partial buffers
1289         sourceLimit=source;
1290         targetLimit=target;
1291         flush=FALSE;
1292 
1293         // empty pivot is not allowed, make it of length step
1294         pivotLimit=pivotBuffer+step;
1295     }
1296 
1297     for(;;) {
1298         // resetting the opposite conversion direction must not affect this one
1299         ucnv_resetFromUnicode(utf8Cnv);
1300         ucnv_resetToUnicode(cnv);
1301 
1302         // convert
1303         ucnv_convertEx(cnv, utf8Cnv,
1304             &target, targetLimit,
1305             &source, sourceLimit,
1306             pivotBuffer, &pivotSource, &pivotTarget, pivotLimit,
1307             FALSE, flush, pErrorCode);
1308 
1309         // check pointers and errors
1310         if(source>sourceLimit || target>targetLimit) {
1311             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1312             break;
1313         } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1314             if(target!=targetLimit) {
1315                 // buffer overflow must only be set when the target is filled
1316                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1317                 break;
1318             } else if(targetLimit==resultLimit) {
1319                 // not just a partial overflow
1320                 break;
1321             }
1322 
1323             // the partial target is filled, set a new limit, reset the error and continue
1324             targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
1325             *pErrorCode=U_ZERO_ERROR;
1326         } else if(U_FAILURE(*pErrorCode)) {
1327             if(pivotSource==pivotBuffer) {
1328                 // toUnicode error, should not occur
1329                 // toUnicode errors are tested in cintltst TestConvertExFromUTF8()
1330                 break;
1331             } else {
1332                 // fromUnicode error
1333                 // some other error occurred, done
1334                 break;
1335             }
1336         } else {
1337             if(source!=sourceLimit) {
1338                 // when no error occurs, then the input must be consumed
1339                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1340                 break;
1341             }
1342 
1343             if(sourceLimit==utf8Limit) {
1344                 // we are done
1345                 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {
1346                     // ucnv_convertEx() warns about not terminating the output
1347                     // but ucnv_fromUnicode() does not and so
1348                     // checkFromUnicode() does not expect it
1349                     *pErrorCode=U_ZERO_ERROR;
1350                 }
1351                 break;
1352             }
1353 
1354             // the partial conversion succeeded, set a new limit and continue
1355             sourceLimit=(utf8Limit-source)>=step ? source+step : utf8Limit;
1356             flush=(UBool)(cc.finalFlush && sourceLimit==utf8Limit);
1357         }
1358     }
1359 
1360     return (int32_t)(target-result);
1361 }
1362 
1363 static int32_t
stepFromUnicode(ConversionCase & cc,UConverter * cnv,char * result,int32_t resultCapacity,int32_t * resultOffsets,int32_t step,UErrorCode * pErrorCode)1364 stepFromUnicode(ConversionCase &cc, UConverter *cnv,
1365                 char *result, int32_t resultCapacity,
1366                 int32_t *resultOffsets, /* also resultCapacity */
1367                 int32_t step,
1368                 UErrorCode *pErrorCode) {
1369     const UChar *source, *sourceLimit, *unicodeLimit;
1370     char *target, *targetLimit, *resultLimit;
1371     UBool flush;
1372 
1373     source=cc.unicode;
1374     target=result;
1375     unicodeLimit=source+cc.unicodeLength;
1376     resultLimit=result+resultCapacity;
1377 
1378     // call ucnv_fromUnicode() with in/out buffers no larger than (step) at a time
1379     // move only one buffer (in vs. out) at a time to be extra mean
1380     // step==0 performs bulk conversion and generates offsets
1381 
1382     // initialize the partial limits for the loop
1383     if(step==0) {
1384         // use the entire buffers
1385         sourceLimit=unicodeLimit;
1386         targetLimit=resultLimit;
1387         flush=cc.finalFlush;
1388     } else {
1389         // start with empty partial buffers
1390         sourceLimit=source;
1391         targetLimit=target;
1392         flush=FALSE;
1393 
1394         // output offsets only for bulk conversion
1395         resultOffsets=NULL;
1396     }
1397 
1398     for(;;) {
1399         // resetting the opposite conversion direction must not affect this one
1400         ucnv_resetToUnicode(cnv);
1401 
1402         // convert
1403         ucnv_fromUnicode(cnv,
1404             &target, targetLimit,
1405             &source, sourceLimit,
1406             resultOffsets,
1407             flush, pErrorCode);
1408 
1409         // check pointers and errors
1410         if(source>sourceLimit || target>targetLimit) {
1411             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1412             break;
1413         } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1414             if(target!=targetLimit) {
1415                 // buffer overflow must only be set when the target is filled
1416                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1417                 break;
1418             } else if(targetLimit==resultLimit) {
1419                 // not just a partial overflow
1420                 break;
1421             }
1422 
1423             // the partial target is filled, set a new limit, reset the error and continue
1424             targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
1425             *pErrorCode=U_ZERO_ERROR;
1426         } else if(U_FAILURE(*pErrorCode)) {
1427             // some other error occurred, done
1428             break;
1429         } else {
1430             if(source!=sourceLimit) {
1431                 // when no error occurs, then the input must be consumed
1432                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1433                 break;
1434             }
1435 
1436             if(sourceLimit==unicodeLimit) {
1437                 // we are done
1438                 break;
1439             }
1440 
1441             // the partial conversion succeeded, set a new limit and continue
1442             sourceLimit=(unicodeLimit-source)>=step ? source+step : unicodeLimit;
1443             flush=(UBool)(cc.finalFlush && sourceLimit==unicodeLimit);
1444         }
1445     }
1446 
1447     return (int32_t)(target-result);
1448 }
1449 
1450 UBool
FromUnicodeCase(ConversionCase & cc,UConverterFromUCallback callback,const char * option)1451 ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback callback, const char *option) {
1452     UConverter *cnv;
1453     UErrorCode errorCode;
1454 
1455     // open the converter
1456     errorCode=U_ZERO_ERROR;
1457     cnv=cnv_open(cc.charset, errorCode);
1458     if(U_FAILURE(errorCode)) {
1459         errcheckln(errorCode, "fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
1460                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1461         return FALSE;
1462     }
1463     ucnv_resetToUnicode(utf8Cnv);
1464 
1465     // set the callback
1466     if(callback!=NULL) {
1467         ucnv_setFromUCallBack(cnv, callback, option, NULL, NULL, &errorCode);
1468         if(U_FAILURE(errorCode)) {
1469             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setFromUCallBack() failed - %s",
1470                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1471             ucnv_close(cnv);
1472             return FALSE;
1473         }
1474     }
1475 
1476     // set the fallbacks flag
1477     // TODO change with Jitterbug 2401, then add a similar call for toUnicode too
1478     ucnv_setFallback(cnv, cc.fallbacks);
1479 
1480     // set the subchar
1481     int32_t length;
1482 
1483     if(cc.setSub>0) {
1484         length=(int32_t)strlen(cc.subchar);
1485         ucnv_setSubstChars(cnv, cc.subchar, (int8_t)length, &errorCode);
1486         if(U_FAILURE(errorCode)) {
1487             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstChars() failed - %s",
1488                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1489             ucnv_close(cnv);
1490             return FALSE;
1491         }
1492     } else if(cc.setSub<0) {
1493         ucnv_setSubstString(cnv, cc.subString, -1, &errorCode);
1494         if(U_FAILURE(errorCode)) {
1495             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstString() failed - %s",
1496                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1497             ucnv_close(cnv);
1498             return FALSE;
1499         }
1500     }
1501 
1502     // convert unicode to utf8
1503     char utf8[256];
1504     cc.utf8=utf8;
1505     u_strToUTF8(utf8, UPRV_LENGTHOF(utf8), &cc.utf8Length,
1506                 cc.unicode, cc.unicodeLength,
1507                 &errorCode);
1508     if(U_FAILURE(errorCode)) {
1509         // skip UTF-8 testing of a string with an unpaired surrogate,
1510         // or of one that's too long
1511         // toUnicode errors are tested in cintltst TestConvertExFromUTF8()
1512         cc.utf8Length=-1;
1513     }
1514 
1515     int32_t resultOffsets[256];
1516     char result[256];
1517     int32_t resultLength;
1518     UBool ok;
1519 
1520     static const struct {
1521         int32_t step;
1522         const char *name, *utf8Name;
1523     } steps[]={
1524         { 0, "bulk",   "utf8" }, // must be first for offsets to be checked
1525         { 1, "step=1", "utf8 step=1" },
1526         { 3, "step=3", "utf8 step=3" },
1527         { 7, "step=7", "utf8 step=7" }
1528     };
1529     int32_t i, step;
1530 
1531     ok=TRUE;
1532     for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
1533         step=steps[i].step;
1534         memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
1535         memset(result, -1, UPRV_LENGTHOF(result));
1536         errorCode=U_ZERO_ERROR;
1537         resultLength=stepFromUnicode(cc, cnv,
1538                                 result, UPRV_LENGTHOF(result),
1539                                 step==0 ? resultOffsets : NULL,
1540                                 step, &errorCode);
1541         ok=checkFromUnicode(
1542                 cc, cnv, steps[i].name,
1543                 (uint8_t *)result, resultLength,
1544                 cc.offsets!=NULL ? resultOffsets : NULL,
1545                 errorCode);
1546         if(U_FAILURE(errorCode) || !cc.finalFlush) {
1547             // reset if an error occurred or we did not flush
1548             // otherwise do nothing to make sure that flushing resets
1549             ucnv_resetFromUnicode(cnv);
1550         }
1551         if (resultOffsets[resultLength] != -1) {
1552             errln("fromUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
1553                 cc.caseNr, cc.charset, resultLength);
1554         }
1555         if (result[resultLength] != (char)-1) {
1556             errln("fromUnicode[%d](%s) Conversion wrote too much to result at index %d",
1557                 cc.caseNr, cc.charset, resultLength);
1558         }
1559 
1560         // bulk test is first, then offsets are not checked any more
1561         cc.offsets=NULL;
1562 
1563         // test direct conversion from UTF-8
1564         if(cc.utf8Length>=0) {
1565             errorCode=U_ZERO_ERROR;
1566             resultLength=stepFromUTF8(cc, utf8Cnv, cnv,
1567                                     result, UPRV_LENGTHOF(result),
1568                                     step, &errorCode);
1569             ok=checkFromUnicode(
1570                     cc, cnv, steps[i].utf8Name,
1571                     (uint8_t *)result, resultLength,
1572                     NULL,
1573                     errorCode);
1574             if(U_FAILURE(errorCode) || !cc.finalFlush) {
1575                 // reset if an error occurred or we did not flush
1576                 // otherwise do nothing to make sure that flushing resets
1577                 ucnv_resetToUnicode(utf8Cnv);
1578                 ucnv_resetFromUnicode(cnv);
1579             }
1580         }
1581     }
1582 
1583     // not a real loop, just a convenience for breaking out of the block
1584     while(ok && cc.finalFlush) {
1585         // test ucnv_fromUChars()
1586         memset(result, 0, sizeof(result));
1587 
1588         errorCode=U_ZERO_ERROR;
1589         resultLength=ucnv_fromUChars(cnv,
1590                         result, UPRV_LENGTHOF(result),
1591                         cc.unicode, cc.unicodeLength,
1592                         &errorCode);
1593         ok=checkFromUnicode(
1594                 cc, cnv, "fromUChars",
1595                 (uint8_t *)result, resultLength,
1596                 NULL,
1597                 errorCode);
1598         if(!ok) {
1599             break;
1600         }
1601 
1602         // test preflighting
1603         // keep the correct result for simple checking
1604         errorCode=U_ZERO_ERROR;
1605         resultLength=ucnv_fromUChars(cnv,
1606                         NULL, 0,
1607                         cc.unicode, cc.unicodeLength,
1608                         &errorCode);
1609         if(errorCode==U_STRING_NOT_TERMINATED_WARNING || errorCode==U_BUFFER_OVERFLOW_ERROR) {
1610             errorCode=U_ZERO_ERROR;
1611         }
1612         ok=checkFromUnicode(
1613                 cc, cnv, "preflight fromUChars",
1614                 (uint8_t *)result, resultLength,
1615                 NULL,
1616                 errorCode);
1617         break;
1618     }
1619 
1620     ucnv_close(cnv);
1621     return ok;
1622 }
1623 
1624 UBool
checkFromUnicode(ConversionCase & cc,UConverter * cnv,const char * name,const uint8_t * result,int32_t resultLength,const int32_t * resultOffsets,UErrorCode resultErrorCode)1625 ConversionTest::checkFromUnicode(ConversionCase &cc, UConverter *cnv, const char *name,
1626                                  const uint8_t *result, int32_t resultLength,
1627                                  const int32_t *resultOffsets,
1628                                  UErrorCode resultErrorCode) {
1629     UChar resultInvalidUChars[8];
1630     int8_t resultInvalidLength;
1631     UErrorCode errorCode;
1632 
1633     const char *msg;
1634 
1635     // reset the message; NULL will mean "ok"
1636     msg=NULL;
1637 
1638     errorCode=U_ZERO_ERROR;
1639     resultInvalidLength=UPRV_LENGTHOF(resultInvalidUChars);
1640     ucnv_getInvalidUChars(cnv, resultInvalidUChars, &resultInvalidLength, &errorCode);
1641     if(U_FAILURE(errorCode)) {
1642         errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidUChars() failed - %s",
1643                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
1644         return FALSE;
1645     }
1646 
1647     // check everything that might have gone wrong
1648     if(cc.bytesLength!=resultLength) {
1649         msg="wrong result length";
1650     } else if(0!=memcmp(cc.bytes, result, cc.bytesLength)) {
1651         msg="wrong result string";
1652     } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.bytesLength*sizeof(*cc.offsets))) {
1653         msg="wrong offsets";
1654     } else if(cc.outErrorCode!=resultErrorCode) {
1655         msg="wrong error code";
1656     } else if(cc.invalidLength!=resultInvalidLength) {
1657         msg="wrong length of last invalid input";
1658     } else if(0!=u_memcmp(cc.invalidUChars, resultInvalidUChars, cc.invalidLength)) {
1659         msg="wrong last invalid input";
1660     }
1661 
1662     if(msg==NULL) {
1663         return TRUE;
1664     } else {
1665         char buffer[2000]; // one buffer for all strings
1666         char *s, *unicodeString, *bytesString, *resultString,
1667             *offsetsString, *resultOffsetsString,
1668             *invalidCharsString, *resultInvalidUCharsString;
1669 
1670         unicodeString=s=buffer;
1671         s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString);
1672         s=printBytes(cc.bytes, cc.bytesLength, bytesString=s);
1673         s=printBytes(result, resultLength, resultString=s);
1674         s=printOffsets(cc.offsets, cc.bytesLength, offsetsString=s);
1675         s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
1676         s=printUnicode(cc.invalidUChars, cc.invalidLength, invalidCharsString=s);
1677         s=printUnicode(resultInvalidUChars, resultInvalidLength, resultInvalidUCharsString=s);
1678 
1679         if((s-buffer)>(int32_t)sizeof(buffer)) {
1680             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkFromUnicode() test output buffer overflow writing %d chars\n",
1681                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
1682             exit(1);
1683         }
1684 
1685         errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
1686               "  unicode <%s>[%d]\n"
1687               " expected <%s>[%d]\n"
1688               "  result  <%s>[%d]\n"
1689               " offsets         <%s>\n"
1690               "  result offsets <%s>\n"
1691               " error code expected %s got %s\n"
1692               "  invalidChars expected <%s> got <%s>\n",
1693               cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
1694               unicodeString, cc.unicodeLength,
1695               bytesString, cc.bytesLength,
1696               resultString, resultLength,
1697               offsetsString,
1698               resultOffsetsString,
1699               u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
1700               invalidCharsString, resultInvalidUCharsString);
1701 
1702         return FALSE;
1703     }
1704 }
1705 
1706 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
1707