// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2002-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  strcase.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002mar12
*   created by: Markus W. Scherer
*
*   Test file for string casing C++ API functions.
*/
20 
21 #include "unicode/std_string.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ures.h"
24 #include "unicode/uloc.h"
25 #include "unicode/locid.h"
26 #include "unicode/ubrk.h"
27 #include "unicode/unistr.h"
28 #include "unicode/ucasemap.h"
29 #include "ucase.h"
30 #include "ustrtest.h"
31 #include "unicode/tstdtmod.h"
32 #include "cmemory.h"
33 
StringCaseTest()34 StringCaseTest::StringCaseTest() : GREEK_LOCALE_("el") {}
35 
~StringCaseTest()36 StringCaseTest::~StringCaseTest() {}
37 
38 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)39 StringCaseTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
40     if(exec) {
41         logln("TestSuite StringCaseTest: ");
42     }
43     TESTCASE_AUTO_BEGIN;
44     TESTCASE_AUTO(TestCaseConversion);
45 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION
46     TESTCASE_AUTO(TestCasing);
47 #endif
48     TESTCASE_AUTO(TestFullCaseFoldingIterator);
49     TESTCASE_AUTO(TestGreekUpper);
50     TESTCASE_AUTO(TestLongUpper);
51     TESTCASE_AUTO(TestMalformedUTF8);
52     TESTCASE_AUTO(TestBufferOverflow);
53     TESTCASE_AUTO_END;
54 }
55 
56 void
TestCaseConversion()57 StringCaseTest::TestCaseConversion()
58 {
59     static const UChar uppercaseGreek[] =
60         { 0x399, 0x395, 0x3a3, 0x3a5, 0x3a3, 0x20, 0x03a7, 0x3a1, 0x399, 0x3a3, 0x3a4,
61         0x39f, 0x3a3, 0 };
62         // "IESUS CHRISTOS"
63 
64     static const UChar lowercaseGreek[] =
65         { 0x3b9, 0x3b5, 0x3c3, 0x3c5, 0x3c2, 0x20, 0x03c7, 0x3c1, 0x3b9, 0x3c3, 0x3c4,
66         0x3bf, 0x3c2, 0 };
67         // "iesus christos"
68 
69     static const UChar lowercaseTurkish[] =
70         { 0x69, 0x73, 0x74, 0x61, 0x6e, 0x62, 0x75, 0x6c, 0x2c, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x63, 0x6f,
71         0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x0131, 0x6e, 0x6f, 0x70, 0x6c, 0x65, 0x21, 0 };
72 
73     static const UChar uppercaseTurkish[] =
74         { 0x54, 0x4f, 0x50, 0x4b, 0x41, 0x50, 0x49, 0x20, 0x50, 0x41, 0x4c, 0x41, 0x43, 0x45, 0x2c, 0x20,
75         0x0130, 0x53, 0x54, 0x41, 0x4e, 0x42, 0x55, 0x4c, 0 };
76 
77     UnicodeString expectedResult;
78     UnicodeString   test3;
79 
80     test3 += (UChar32)0x0130;
81     test3 += "STANBUL, NOT CONSTANTINOPLE!";
82 
83     UnicodeString   test4(test3);
84     test4.toLower(Locale(""));
85     expectedResult = UnicodeString("i\\u0307stanbul, not constantinople!", "").unescape();
86     if (test4 != expectedResult)
87         errln("1. toLower failed: expected \"" + expectedResult + "\", got \"" + test4 + "\".");
88 
89     test4 = test3;
90     test4.toLower(Locale("tr", "TR"));
91     expectedResult = lowercaseTurkish;
92     if (test4 != expectedResult)
93         errln("2. toLower failed: expected \"" + expectedResult + "\", got \"" + test4 + "\".");
94 
95     test3 = "topkap";
96     test3 += (UChar32)0x0131;
97     test3 += " palace, istanbul";
98     test4 = test3;
99 
100     test4.toUpper(Locale(""));
101     expectedResult = "TOPKAPI PALACE, ISTANBUL";
102     if (test4 != expectedResult)
103         errln("toUpper failed: expected \"" + expectedResult + "\", got \"" + test4 + "\".");
104 
105     test4 = test3;
106     test4.toUpper(Locale("tr", "TR"));
107     expectedResult = uppercaseTurkish;
108     if (test4 != expectedResult)
109         errln("toUpper failed: expected \"" + expectedResult + "\", got \"" + test4 + "\".");
110 
111     test3 = CharsToUnicodeString("S\\u00FC\\u00DFmayrstra\\u00DFe");
112 
113     test3.toUpper(Locale("de", "DE"));
114     expectedResult = CharsToUnicodeString("S\\u00DCSSMAYRSTRASSE");
115     if (test3 != expectedResult)
116         errln("toUpper failed: expected \"" + expectedResult + "\", got \"" + test3 + "\".");
117 
118     test4.replace(0, test4.length(), uppercaseGreek);
119 
120     test4.toLower(Locale("el", "GR"));
121     expectedResult = lowercaseGreek;
122     if (test4 != expectedResult)
123         errln("toLower failed: expected \"" + expectedResult + "\", got \"" + test4 + "\".");
124 
125     test4.replace(0, test4.length(), lowercaseGreek);
126 
127     test4.toUpper();
128     expectedResult = uppercaseGreek;
129     if (test4 != expectedResult)
130         errln("toUpper failed: expected \"" + expectedResult + "\", got \"" + test4 + "\".");
131 
132     // more string case mapping tests with the new implementation
133     {
134         static const UChar
135 
136         beforeLower[]= { 0x61, 0x42, 0x49,  0x3a3, 0xdf, 0x3a3, 0x2f, 0xd93f, 0xdfff },
137         lowerRoot[]=   { 0x61, 0x62, 0x69,  0x3c3, 0xdf, 0x3c2, 0x2f, 0xd93f, 0xdfff },
138         lowerTurkish[]={ 0x61, 0x62, 0x131, 0x3c3, 0xdf, 0x3c2, 0x2f, 0xd93f, 0xdfff },
139 
140         beforeUpper[]= { 0x61, 0x42, 0x69,  0x3c2, 0xdf,       0x3c3, 0x2f, 0xfb03,           0xfb03,           0xfb03,           0xd93f, 0xdfff },
141         upperRoot[]=   { 0x41, 0x42, 0x49,  0x3a3, 0x53, 0x53, 0x3a3, 0x2f, 0x46, 0x46, 0x49, 0x46, 0x46, 0x49, 0x46, 0x46, 0x49, 0xd93f, 0xdfff },
142         upperTurkish[]={ 0x41, 0x42, 0x130, 0x3a3, 0x53, 0x53, 0x3a3, 0x2f, 0x46, 0x46, 0x49, 0x46, 0x46, 0x49, 0x46, 0x46, 0x49, 0xd93f, 0xdfff },
143 
144         beforeMiniUpper[]=  { 0xdf, 0x61 },
145         miniUpper[]=        { 0x53, 0x53, 0x41 };
146 
147         UnicodeString s;
148 
149         /* lowercase with root locale */
150         s=UnicodeString(FALSE, beforeLower, UPRV_LENGTHOF(beforeLower));
151         s.toLower("");
152         if( s.length()!=UPRV_LENGTHOF(lowerRoot) ||
153             s!=UnicodeString(FALSE, lowerRoot, s.length())
154         ) {
155             errln("error in toLower(root locale)=\"" + s + "\" expected \"" + UnicodeString(FALSE, lowerRoot, UPRV_LENGTHOF(lowerRoot)) + "\"");
156         }
157 
158         /* lowercase with turkish locale */
159         s=UnicodeString(FALSE, beforeLower, UPRV_LENGTHOF(beforeLower));
160         s.setCharAt(0, beforeLower[0]).toLower(Locale("tr"));
161         if( s.length()!=UPRV_LENGTHOF(lowerTurkish) ||
162             s!=UnicodeString(FALSE, lowerTurkish, s.length())
163         ) {
164             errln("error in toLower(turkish locale)=\"" + s + "\" expected \"" + UnicodeString(FALSE, lowerTurkish, UPRV_LENGTHOF(lowerTurkish)) + "\"");
165         }
166 
167         /* uppercase with root locale */
168         s=UnicodeString(FALSE, beforeUpper, UPRV_LENGTHOF(beforeUpper));
169         s.setCharAt(0, beforeUpper[0]).toUpper(Locale(""));
170         if( s.length()!=UPRV_LENGTHOF(upperRoot) ||
171             s!=UnicodeString(FALSE, upperRoot, s.length())
172         ) {
173             errln("error in toUpper(root locale)=\"" + s + "\" expected \"" + UnicodeString(FALSE, upperRoot, UPRV_LENGTHOF(upperRoot)) + "\"");
174         }
175 
176         /* uppercase with turkish locale */
177         s=UnicodeString(FALSE, beforeUpper, UPRV_LENGTHOF(beforeUpper));
178         s.toUpper(Locale("tr"));
179         if( s.length()!=UPRV_LENGTHOF(upperTurkish) ||
180             s!=UnicodeString(FALSE, upperTurkish, s.length())
181         ) {
182             errln("error in toUpper(turkish locale)=\"" + s + "\" expected \"" + UnicodeString(FALSE, upperTurkish, UPRV_LENGTHOF(upperTurkish)) + "\"");
183         }
184 
185         /* uppercase a short string with root locale */
186         s=UnicodeString(FALSE, beforeMiniUpper, UPRV_LENGTHOF(beforeMiniUpper));
187         s.setCharAt(0, beforeMiniUpper[0]).toUpper("");
188         if( s.length()!=UPRV_LENGTHOF(miniUpper) ||
189             s!=UnicodeString(FALSE, miniUpper, s.length())
190         ) {
191             errln("error in toUpper(root locale)=\"" + s + "\" expected \"" + UnicodeString(FALSE, miniUpper, UPRV_LENGTHOF(miniUpper)) + "\"");
192         }
193     }
194 
195     // test some supplementary characters (>= Unicode 3.1)
196     {
197         UnicodeString t;
198 
199         UnicodeString
200             deseretInput=UnicodeString("\\U0001043C\\U00010414", "").unescape(),
201             deseretLower=UnicodeString("\\U0001043C\\U0001043C", "").unescape(),
202             deseretUpper=UnicodeString("\\U00010414\\U00010414", "").unescape();
203         (t=deseretInput).toLower();
204         if(t!=deseretLower) {
205             errln("error lowercasing Deseret (plane 1) characters");
206         }
207         (t=deseretInput).toUpper();
208         if(t!=deseretUpper) {
209             errln("error uppercasing Deseret (plane 1) characters");
210         }
211     }
212 
213     // test some more cases that looked like problems
214     {
215         UnicodeString t;
216 
217         UnicodeString
218             ljInput=UnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 \\U0001043C\\U00010414", "").unescape(),
219             ljLower=UnicodeString("ab'cd \\uFB00i\\u0131ii\\u0307 \\u01C9\\u01C9\\u01C9 \\U0001043C\\U0001043C", "").unescape(),
220             ljUpper=UnicodeString("AB'CD FFIII\\u0130 \\u01C7\\u01C7\\u01C7 \\U00010414\\U00010414", "").unescape();
221         (t=ljInput).toLower("en");
222         if(t!=ljLower) {
223             errln("error lowercasing LJ characters");
224         }
225         (t=ljInput).toUpper("en");
226         if(t!=ljUpper) {
227             errln("error uppercasing LJ characters");
228         }
229     }
230 
231 #if !UCONFIG_NO_NORMALIZATION
232     // some context-sensitive casing depends on normalization data being present
233 
234     // Unicode 3.1.1 SpecialCasing tests
235     {
236         UnicodeString t;
237 
238         // sigmas preceded and/or followed by cased letters
239         UnicodeString
240             sigmas=UnicodeString("i\\u0307\\u03a3\\u0308j \\u0307\\u03a3\\u0308j i\\u00ad\\u03a3\\u0308 \\u0307\\u03a3\\u0308 ", "").unescape(),
241             sigmasLower=UnicodeString("i\\u0307\\u03c3\\u0308j \\u0307\\u03c3\\u0308j i\\u00ad\\u03c2\\u0308 \\u0307\\u03c3\\u0308 ", "").unescape(),
242             sigmasUpper=UnicodeString("I\\u0307\\u03a3\\u0308J \\u0307\\u03a3\\u0308J I\\u00ad\\u03a3\\u0308 \\u0307\\u03a3\\u0308 ", "").unescape();
243 
244         (t=sigmas).toLower();
245         if(t!=sigmasLower) {
246             errln("error in sigmas.toLower()=\"" + t + "\" expected \"" + sigmasLower + "\"");
247         }
248 
249         (t=sigmas).toUpper(Locale(""));
250         if(t!=sigmasUpper) {
251             errln("error in sigmas.toUpper()=\"" + t + "\" expected \"" + sigmasUpper + "\"");
252         }
253 
254         // turkish & azerbaijani dotless i & dotted I
255         // remove dot above if there was a capital I before and there are no more accents above
256         UnicodeString
257             dots=UnicodeString("I \\u0130 I\\u0307 I\\u0327\\u0307 I\\u0301\\u0307 I\\u0327\\u0307\\u0301", "").unescape(),
258             dotsTurkish=UnicodeString("\\u0131 i i i\\u0327 \\u0131\\u0301\\u0307 i\\u0327\\u0301", "").unescape(),
259             dotsDefault=UnicodeString("i i\\u0307 i\\u0307 i\\u0327\\u0307 i\\u0301\\u0307 i\\u0327\\u0307\\u0301", "").unescape();
260 
261         (t=dots).toLower("tr");
262         if(t!=dotsTurkish) {
263             errln("error in dots.toLower(tr)=\"" + t + "\" expected \"" + dotsTurkish + "\"");
264         }
265 
266         (t=dots).toLower("de");
267         if(t!=dotsDefault) {
268             errln("error in dots.toLower(de)=\"" + t + "\" expected \"" + dotsDefault + "\"");
269         }
270     }
271 
272     // more Unicode 3.1.1 tests
273     {
274         UnicodeString t;
275 
276         // lithuanian dot above in uppercasing
277         UnicodeString
278             dots=UnicodeString("a\\u0307 \\u0307 i\\u0307 j\\u0327\\u0307 j\\u0301\\u0307", "").unescape(),
279             dotsLithuanian=UnicodeString("A\\u0307 \\u0307 I J\\u0327 J\\u0301\\u0307", "").unescape(),
280             dotsDefault=UnicodeString("A\\u0307 \\u0307 I\\u0307 J\\u0327\\u0307 J\\u0301\\u0307", "").unescape();
281 
282         (t=dots).toUpper("lt");
283         if(t!=dotsLithuanian) {
284             errln("error in dots.toUpper(lt)=\"" + t + "\" expected \"" + dotsLithuanian + "\"");
285         }
286 
287         (t=dots).toUpper("de");
288         if(t!=dotsDefault) {
289             errln("error in dots.toUpper(de)=\"" + t + "\" expected \"" + dotsDefault + "\"");
290         }
291 
292         // lithuanian adds dot above to i in lowercasing if there are more above accents
293         UnicodeString
294             i=UnicodeString("I I\\u0301 J J\\u0301 \\u012e \\u012e\\u0301 \\u00cc\\u00cd\\u0128", "").unescape(),
295             iLithuanian=UnicodeString("i i\\u0307\\u0301 j j\\u0307\\u0301 \\u012f \\u012f\\u0307\\u0301 i\\u0307\\u0300i\\u0307\\u0301i\\u0307\\u0303", "").unescape(),
296             iDefault=UnicodeString("i i\\u0301 j j\\u0301 \\u012f \\u012f\\u0301 \\u00ec\\u00ed\\u0129", "").unescape();
297 
298         (t=i).toLower("lt");
299         if(t!=iLithuanian) {
300             errln("error in i.toLower(lt)=\"" + t + "\" expected \"" + iLithuanian + "\"");
301         }
302 
303         (t=i).toLower("de");
304         if(t!=iDefault) {
305             errln("error in i.toLower(de)=\"" + t + "\" expected \"" + iDefault + "\"");
306         }
307     }
308 
309 #endif
310 
311     // test case folding
312     {
313         UnicodeString
314             s=UnicodeString("A\\u00df\\u00b5\\ufb03\\U0001040c\\u0130\\u0131", "").unescape(),
315             f=UnicodeString("ass\\u03bcffi\\U00010434i\\u0307\\u0131", "").unescape(),
316             g=UnicodeString("ass\\u03bcffi\\U00010434i\\u0131", "").unescape(),
317             t;
318 
319         (t=s).foldCase();
320         if(f!=t) {
321             errln("error in foldCase(\"" + s + "\", default)=\"" + t + "\" but expected \"" + f + "\"");
322         }
323 
324         // alternate handling for dotted I/dotless i (U+0130, U+0131)
325         (t=s).foldCase(U_FOLD_CASE_EXCLUDE_SPECIAL_I);
326         if(g!=t) {
327             errln("error in foldCase(\"" + s + "\", U_FOLD_CASE_EXCLUDE_SPECIAL_I)=\"" + t + "\" but expected \"" + g + "\"");
328         }
329     }
330 }
331 
// data-driven case mapping tests ------------------------------------------ ***

// Selectors for the kind of case mapping under test. The values are used as
// indexes into dataNames[] and as the whichCase argument of TestCasingImpl();
// TEST_COUNT is the number of real selectors.
enum {
    TEST_LOWER,
    TEST_UPPER,
    TEST_TITLE,
    TEST_FOLD,
    TEST_COUNT
};
341 
342 // names of TestData children in casing.txt
343 static const char *const dataNames[TEST_COUNT+1]={
344     "lowercasing",
345     "uppercasing",
346     "titlecasing",
347     "casefolding",
348     ""
349 };
350 
351 void
TestCasingImpl(const UnicodeString & input,const UnicodeString & output,int32_t whichCase,void * iter,const char * localeID,uint32_t options)352 StringCaseTest::TestCasingImpl(const UnicodeString &input,
353                                const UnicodeString &output,
354                                int32_t whichCase,
355                                void *iter, const char *localeID, uint32_t options) {
356     // UnicodeString
357     UnicodeString result;
358     const char *name;
359     Locale locale(localeID);
360 
361     result=input;
362     switch(whichCase) {
363     case TEST_LOWER:
364         name="toLower";
365         result.toLower(locale);
366         break;
367     case TEST_UPPER:
368         name="toUpper";
369         result.toUpper(locale);
370         break;
371 #if !UCONFIG_NO_BREAK_ITERATION
372     case TEST_TITLE:
373         name="toTitle";
374         result.toTitle((BreakIterator *)iter, locale, options);
375         break;
376 #endif
377     case TEST_FOLD:
378         name="foldCase";
379         result.foldCase(options);
380         break;
381     default:
382         name="";
383         break; // won't happen
384     }
385     if(result!=output) {
386         dataerrln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name);
387     }
388 #if !UCONFIG_NO_BREAK_ITERATION
389     if(whichCase==TEST_TITLE && options==0) {
390         result=input;
391         result.toTitle((BreakIterator *)iter, locale);
392         if(result!=output) {
393             dataerrln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res");
394         }
395     }
396 #endif
397 
398     // UTF-8
399     char utf8In[100], utf8Out[100];
400     int32_t utf8InLength, utf8OutLength, resultLength;
401     UChar *buffer;
402 
403     IcuTestErrorCode errorCode(*this, "TestCasingImpl");
404     LocalUCaseMapPointer csm(ucasemap_open(localeID, options, errorCode));
405 #if !UCONFIG_NO_BREAK_ITERATION
406     if(iter!=NULL) {
407         // Clone the break iterator so that the UCaseMap can safely adopt it.
408         UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, NULL, errorCode);
409         ucasemap_setBreakIterator(csm.getAlias(), clone, errorCode);
410     }
411 #endif
412 
413     u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), errorCode);
414     switch(whichCase) {
415     case TEST_LOWER:
416         name="ucasemap_utf8ToLower";
417         utf8OutLength=ucasemap_utf8ToLower(csm.getAlias(),
418                     utf8Out, (int32_t)sizeof(utf8Out),
419                     utf8In, utf8InLength, errorCode);
420         break;
421     case TEST_UPPER:
422         name="ucasemap_utf8ToUpper";
423         utf8OutLength=ucasemap_utf8ToUpper(csm.getAlias(),
424                     utf8Out, (int32_t)sizeof(utf8Out),
425                     utf8In, utf8InLength, errorCode);
426         break;
427 #if !UCONFIG_NO_BREAK_ITERATION
428     case TEST_TITLE:
429         name="ucasemap_utf8ToTitle";
430         utf8OutLength=ucasemap_utf8ToTitle(csm.getAlias(),
431                     utf8Out, (int32_t)sizeof(utf8Out),
432                     utf8In, utf8InLength, errorCode);
433         break;
434 #endif
435     case TEST_FOLD:
436         name="ucasemap_utf8FoldCase";
437         utf8OutLength=ucasemap_utf8FoldCase(csm.getAlias(),
438                     utf8Out, (int32_t)sizeof(utf8Out),
439                     utf8In, utf8InLength, errorCode);
440         break;
441     default:
442         name="";
443         utf8OutLength=0;
444         break; // won't happen
445     }
446     buffer=result.getBuffer(utf8OutLength);
447     u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, errorCode);
448     result.releaseBuffer(errorCode.isSuccess() ? resultLength : 0);
449 
450     if(errorCode.isFailure()) {
451         errcheckln(errorCode, "error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode));
452         errorCode.reset();
453     } else if(result!=output) {
454         errln("error: %s() got a wrong result for a test case from casing.res", name);
455         errln("expected \"" + output + "\" got \"" + result + "\"" );
456     }
457 }
458 
459 void
TestCasing()460 StringCaseTest::TestCasing() {
461     UErrorCode status = U_ZERO_ERROR;
462 #if !UCONFIG_NO_BREAK_ITERATION
463     LocalUBreakIteratorPointer iter;
464 #endif
465     char cLocaleID[100];
466     UnicodeString locale, input, output, optionsString, result;
467     uint32_t options;
468     int32_t whichCase, type;
469     LocalPointer<TestDataModule> driver(TestDataModule::getTestDataModule("casing", *this, status));
470     if(U_SUCCESS(status)) {
471         for(whichCase=0; whichCase<TEST_COUNT; ++whichCase) {
472 #if UCONFIG_NO_BREAK_ITERATION
473             if(whichCase==TEST_TITLE) {
474                 continue;
475             }
476 #endif
477             LocalPointer<TestData> casingTest(driver->createTestData(dataNames[whichCase], status));
478             if(U_FAILURE(status)) {
479                 errln("TestCasing failed to createTestData(%s) - %s", dataNames[whichCase], u_errorName(status));
480                 break;
481             }
482             const DataMap *myCase = NULL;
483             while(casingTest->nextCase(myCase, status)) {
484                 input = myCase->getString("Input", status);
485                 output = myCase->getString("Output", status);
486 
487                 if(whichCase!=TEST_FOLD) {
488                     locale = myCase->getString("Locale", status);
489                 }
490                 locale.extract(0, 0x7fffffff, cLocaleID, sizeof(cLocaleID), "");
491 
492 #if !UCONFIG_NO_BREAK_ITERATION
493                 if(whichCase==TEST_TITLE) {
494                     type = myCase->getInt("Type", status);
495                     if(type>=0) {
496                         iter.adoptInstead(ubrk_open((UBreakIteratorType)type, cLocaleID, NULL, 0, &status));
497                     } else if(type==-2) {
498                         // Open a trivial break iterator that only delivers { 0, length }
499                         // or even just { 0 } as boundaries.
500                         static const UChar rules[] = { 0x2e, 0x2a, 0x3b };  // ".*;"
501                         UParseError parseError;
502                         iter.adoptInstead(ubrk_openRules(rules, UPRV_LENGTHOF(rules), NULL, 0, &parseError, &status));
503                     }
504                 }
505 #endif
506                 options = 0;
507                 if(whichCase==TEST_TITLE || whichCase==TEST_FOLD) {
508                     optionsString = myCase->getString("Options", status);
509                     if(optionsString.indexOf((UChar)0x54)>=0) {  // T
510                         options|=U_FOLD_CASE_EXCLUDE_SPECIAL_I;
511                     }
512                     if(optionsString.indexOf((UChar)0x4c)>=0) {  // L
513                         options|=U_TITLECASE_NO_LOWERCASE;
514                     }
515                     if(optionsString.indexOf((UChar)0x41)>=0) {  // A
516                         options|=U_TITLECASE_NO_BREAK_ADJUSTMENT;
517                     }
518                 }
519 
520                 if(U_FAILURE(status)) {
521                     dataerrln("error: TestCasing() setup failed for %s test case from casing.res: %s", dataNames[whichCase],  u_errorName(status));
522                     status = U_ZERO_ERROR;
523                 } else {
524 #if UCONFIG_NO_BREAK_ITERATION
525                     LocalPointer<UMemory> iter;
526 #endif
527                     TestCasingImpl(input, output, whichCase, iter.getAlias(), cLocaleID, options);
528                 }
529 
530 #if !UCONFIG_NO_BREAK_ITERATION
531                 iter.adoptInstead(NULL);
532 #endif
533             }
534         }
535     }
536 
537 #if !UCONFIG_NO_BREAK_ITERATION
538     // more tests for API coverage
539     status=U_ZERO_ERROR;
540     input=UNICODE_STRING_SIMPLE("sTrA\\u00dfE").unescape();
541     (result=input).toTitle(NULL);
542     if(result!=UNICODE_STRING_SIMPLE("Stra\\u00dfe").unescape()) {
543         dataerrln("UnicodeString::toTitle(NULL) failed.");
544     }
545 #endif
546 }
547 
548 void
TestFullCaseFoldingIterator()549 StringCaseTest::TestFullCaseFoldingIterator() {
550     UnicodeString ffi=UNICODE_STRING_SIMPLE("ffi");
551     UnicodeString ss=UNICODE_STRING_SIMPLE("ss");
552     FullCaseFoldingIterator iter;
553     int32_t count=0;
554     int32_t countSpecific=0;
555     UChar32 c;
556     UnicodeString full;
557     while((c=iter.next(full))>=0) {
558         ++count;
559         // Check that the full Case_Folding has more than 1 code point.
560         if(!full.hasMoreChar32Than(0, 0x7fffffff, 1)) {
561             errln("error: FullCaseFoldingIterator.next()=U+%04lX full Case_Folding has at most 1 code point", (long)c);
562             continue;
563         }
564         // Check that full == Case_Folding(c).
565         UnicodeString cf(c);
566         cf.foldCase();
567         if(full!=cf) {
568             errln("error: FullCaseFoldingIterator.next()=U+%04lX full Case_Folding != cf(c)", (long)c);
569             continue;
570         }
571         // Spot-check a couple of specific cases.
572         if((full==ffi && c==0xfb03) || (full==ss && (c==0xdf || c==0x1e9e))) {
573             ++countSpecific;
574         }
575     }
576     if(countSpecific!=3) {
577         errln("error: FullCaseFoldingIterator did not yield exactly the expected specific cases");
578     }
579     if(count<70) {
580         errln("error: FullCaseFoldingIterator yielded only %d (cp, full) pairs", (int)count);
581     }
582 }
583 
584 void
assertGreekUpper(const char * s,const char * expected)585 StringCaseTest::assertGreekUpper(const char *s, const char *expected) {
586     UnicodeString s16 = UnicodeString(s).unescape();
587     UnicodeString expected16 = UnicodeString(expected).unescape();
588     UnicodeString msg = UnicodeString("UnicodeString::toUpper/Greek(\"") + s16 + "\")";
589     UnicodeString result16(s16);
590     result16.toUpper(GREEK_LOCALE_);
591     assertEquals(msg, expected16, result16);
592 
593     msg = UnicodeString("u_strToUpper/Greek(\"") + s16 + "\") cap=";
594     int32_t length = expected16.length();
595     int32_t capacities[] = {
596         // Keep in sync with the UTF-8 capacities near the bottom of this function.
597         0, length / 2, length - 1, length, length + 1
598     };
599     for (int32_t i = 0; i < UPRV_LENGTHOF(capacities); ++i) {
600         int32_t cap = capacities[i];
601         UChar *dest16 = result16.getBuffer(expected16.length() + 1);
602         u_memset(dest16, 0x55AA, result16.getCapacity());
603         UErrorCode errorCode = U_ZERO_ERROR;
604         length = u_strToUpper(dest16, cap, s16.getBuffer(), s16.length(), "el", &errorCode);
605         assertEquals(msg + cap, expected16.length(), length);
606         UErrorCode expectedErrorCode;
607         if (cap < expected16.length()) {
608             expectedErrorCode = U_BUFFER_OVERFLOW_ERROR;
609         } else if (cap == expected16.length()) {
610             expectedErrorCode = U_STRING_NOT_TERMINATED_WARNING;
611         } else {
612             expectedErrorCode = U_ZERO_ERROR;
613             assertEquals(msg + cap + " NUL", 0, dest16[length]);
614         }
615         assertEquals(msg + cap + " errorCode", expectedErrorCode, errorCode);
616         result16.releaseBuffer(length);
617         if (cap >= expected16.length()) {
618             assertEquals(msg + cap, expected16, result16);
619         }
620     }
621 
622 #if U_HAVE_STD_STRING
623     UErrorCode errorCode = U_ZERO_ERROR;
624     LocalUCaseMapPointer csm(ucasemap_open("el", 0, &errorCode));
625     assertSuccess("ucasemap_open", errorCode);
626     std::string s8;
627     s16.toUTF8String(s8);
628     msg = UnicodeString("ucasemap_utf8ToUpper/Greek(\"") + s16 + "\")";
629     char dest8[1000];
630     length = ucasemap_utf8ToUpper(csm.getAlias(), dest8, UPRV_LENGTHOF(dest8),
631                                   s8.data(), s8.length(), &errorCode);
632     assertSuccess("ucasemap_utf8ToUpper", errorCode);
633     StringPiece result8(dest8, length);
634     UnicodeString result16From8 = UnicodeString::fromUTF8(result8);
635     assertEquals(msg, expected16, result16From8);
636 
637     msg += " cap=";
638     capacities[1] = length / 2;
639     capacities[2] = length - 1;
640     capacities[3] = length;
641     capacities[4] = length + 1;
642     char dest8b[1000];
643     int32_t expected8Length = length;  // Assuming the previous call worked.
644     for (int32_t i = 0; i < UPRV_LENGTHOF(capacities); ++i) {
645         int32_t cap = capacities[i];
646         memset(dest8b, 0x5A, UPRV_LENGTHOF(dest8b));
647         UErrorCode errorCode = U_ZERO_ERROR;
648         length = ucasemap_utf8ToUpper(csm.getAlias(), dest8b, cap,
649                                       s8.data(), s8.length(), &errorCode);
650         assertEquals(msg + cap, expected8Length, length);
651         UErrorCode expectedErrorCode;
652         if (cap < expected8Length) {
653             expectedErrorCode = U_BUFFER_OVERFLOW_ERROR;
654         } else if (cap == expected8Length) {
655             expectedErrorCode = U_STRING_NOT_TERMINATED_WARNING;
656         } else {
657             expectedErrorCode = U_ZERO_ERROR;
658             assertEquals(msg + cap + " NUL", 0, dest8b[length]);
659         }
660         assertEquals(msg + cap + " errorCode", expectedErrorCode, errorCode);
661         if (cap >= expected8Length) {
662             assertEquals(msg + cap + " (memcmp)", 0, memcmp(dest8, dest8b, expected8Length));
663         }
664     }
665 #endif
666 }
667 
668 void
TestGreekUpper()669 StringCaseTest::TestGreekUpper() {
670     // See UCharacterCaseTest.java for human-readable strings.
671 
672     // http://bugs.icu-project.org/trac/ticket/5456
673     assertGreekUpper("\\u03AC\\u03B4\\u03B9\\u03BA\\u03BF\\u03C2, "
674                      "\\u03BA\\u03B5\\u03AF\\u03BC\\u03B5\\u03BD\\u03BF, "
675                      "\\u03AF\\u03C1\\u03B9\\u03B4\\u03B1",
676                      "\\u0391\\u0394\\u0399\\u039A\\u039F\\u03A3, "
677                      "\\u039A\\u0395\\u0399\\u039C\\u0395\\u039D\\u039F, "
678                      "\\u0399\\u03A1\\u0399\\u0394\\u0391");
679     // https://bugzilla.mozilla.org/show_bug.cgi?id=307039
680     // https://bug307039.bmoattachments.org/attachment.cgi?id=194893
681     assertGreekUpper("\\u03A0\\u03B1\\u03C4\\u03AC\\u03C4\\u03B1",
682                      "\\u03A0\\u0391\\u03A4\\u0391\\u03A4\\u0391");
683     assertGreekUpper("\\u0391\\u03AD\\u03C1\\u03B1\\u03C2, "
684                      "\\u039C\\u03C5\\u03C3\\u03C4\\u03AE\\u03C1\\u03B9\\u03BF, "
685                      "\\u03A9\\u03C1\\u03B1\\u03AF\\u03BF",
686                      "\\u0391\\u0395\\u03A1\\u0391\\u03A3, "
687                      "\\u039C\\u03A5\\u03A3\\u03A4\\u0397\\u03A1\\u0399\\u039F, "
688                      "\\u03A9\\u03A1\\u0391\\u0399\\u039F");
689     assertGreekUpper("\\u039C\\u03B1\\u0390\\u03BF\\u03C5, \\u03A0\\u03CC\\u03C1\\u03BF\\u03C2, "
690                      "\\u03A1\\u03CD\\u03B8\\u03BC\\u03B9\\u03C3\\u03B7",
691                      "\\u039C\\u0391\\u03AA\\u039F\\u03A5, \\u03A0\\u039F\\u03A1\\u039F\\u03A3, "
692                      "\\u03A1\\u03A5\\u0398\\u039C\\u0399\\u03A3\\u0397");
693     assertGreekUpper("\\u03B0, \\u03A4\\u03B7\\u03C1\\u03CE, \\u039C\\u03AC\\u03B9\\u03BF\\u03C2",
694                      "\\u03AB, \\u03A4\\u0397\\u03A1\\u03A9, \\u039C\\u0391\\u03AA\\u039F\\u03A3");
695     assertGreekUpper("\\u03AC\\u03C5\\u03BB\\u03BF\\u03C2",
696                      "\\u0391\\u03AB\\u039B\\u039F\\u03A3");
697     assertGreekUpper("\\u0391\\u03AB\\u039B\\u039F\\u03A3",
698                      "\\u0391\\u03AB\\u039B\\u039F\\u03A3");
699     assertGreekUpper("\\u0386\\u03BA\\u03BB\\u03B9\\u03C4\\u03B1 "
700                      "\\u03C1\\u03AE\\u03BC\\u03B1\\u03C4\\u03B1 \\u03AE "
701                      "\\u03AC\\u03BA\\u03BB\\u03B9\\u03C4\\u03B5\\u03C2 "
702                      "\\u03BC\\u03B5\\u03C4\\u03BF\\u03C7\\u03AD\\u03C2",
703                      "\\u0391\\u039A\\u039B\\u0399\\u03A4\\u0391 "
704                      "\\u03A1\\u0397\\u039C\\u0391\\u03A4\\u0391 \\u0397\\u0301 "
705                      "\\u0391\\u039A\\u039B\\u0399\\u03A4\\u0395\\u03A3 "
706                      "\\u039C\\u0395\\u03A4\\u039F\\u03A7\\u0395\\u03A3");
707     // http://www.unicode.org/udhr/d/udhr_ell_monotonic.html
708     assertGreekUpper("\\u0395\\u03C0\\u03B5\\u03B9\\u03B4\\u03AE \\u03B7 "
709                      "\\u03B1\\u03BD\\u03B1\\u03B3\\u03BD\\u03CE\\u03C1\\u03B9\\u03C3\\u03B7 "
710                      "\\u03C4\\u03B7\\u03C2 \\u03B1\\u03BE\\u03B9\\u03BF\\u03C0\\u03C1\\u03AD"
711                      "\\u03C0\\u03B5\\u03B9\\u03B1\\u03C2",
712                      "\\u0395\\u03A0\\u0395\\u0399\\u0394\\u0397 \\u0397 "
713                      "\\u0391\\u039D\\u0391\\u0393\\u039D\\u03A9\\u03A1\\u0399\\u03A3\\u0397 "
714                      "\\u03A4\\u0397\\u03A3 \\u0391\\u039E\\u0399\\u039F\\u03A0\\u03A1\\u0395"
715                      "\\u03A0\\u0395\\u0399\\u0391\\u03A3");
716     assertGreekUpper("\\u03BD\\u03BF\\u03BC\\u03B9\\u03BA\\u03BF\\u03CD \\u03AE "
717                      "\\u03B4\\u03B9\\u03B5\\u03B8\\u03BD\\u03BF\\u03CD\\u03C2",
718                      "\\u039D\\u039F\\u039C\\u0399\\u039A\\u039F\\u03A5 \\u0397\\u0301 "
719                      "\\u0394\\u0399\\u0395\\u0398\\u039D\\u039F\\u03A5\\u03A3");
720     // http://unicode.org/udhr/d/udhr_ell_polytonic.html
721     assertGreekUpper("\\u1F18\\u03C0\\u03B5\\u03B9\\u03B4\\u1F74 \\u1F21 "
722                      "\\u1F00\\u03BD\\u03B1\\u03B3\\u03BD\\u1F7D\\u03C1\\u03B9\\u03C3\\u03B7",
723                      "\\u0395\\u03A0\\u0395\\u0399\\u0394\\u0397 \\u0397 "
724                      "\\u0391\\u039D\\u0391\\u0393\\u039D\\u03A9\\u03A1\\u0399\\u03A3\\u0397");
725     assertGreekUpper("\\u03BD\\u03BF\\u03BC\\u03B9\\u03BA\\u03BF\\u1FE6 \\u1F22 "
726                      "\\u03B4\\u03B9\\u03B5\\u03B8\\u03BD\\u03BF\\u1FE6\\u03C2",
727                      "\\u039D\\u039F\\u039C\\u0399\\u039A\\u039F\\u03A5 \\u0397\\u0301 "
728                      "\\u0394\\u0399\\u0395\\u0398\\u039D\\u039F\\u03A5\\u03A3");
729     // From Google bug report
730     assertGreekUpper("\\u039D\\u03AD\\u03BF, "
731                      "\\u0394\\u03B7\\u03BC\\u03B9\\u03BF\\u03C5\\u03C1\\u03B3\\u03AF\\u03B1",
732                      "\\u039D\\u0395\\u039F, "
733                      "\\u0394\\u0397\\u039C\\u0399\\u039F\\u03A5\\u03A1\\u0393\\u0399\\u0391");
734     // http://crbug.com/234797
735     assertGreekUpper("\\u0395\\u03BB\\u03AC\\u03C4\\u03B5 \\u03BD\\u03B1 \\u03C6\\u03AC\\u03C4\\u03B5 "
736                      "\\u03C4\\u03B1 \\u03BA\\u03B1\\u03BB\\u03CD\\u03C4\\u03B5\\u03C1\\u03B1 "
737                      "\\u03C0\\u03B1\\u03CA\\u03B4\\u03AC\\u03BA\\u03B9\\u03B1!",
738                      "\\u0395\\u039B\\u0391\\u03A4\\u0395 \\u039D\\u0391 \\u03A6\\u0391\\u03A4\\u0395 "
739                      "\\u03A4\\u0391 \\u039A\\u0391\\u039B\\u03A5\\u03A4\\u0395\\u03A1\\u0391 "
740                      "\\u03A0\\u0391\\u03AA\\u0394\\u0391\\u039A\\u0399\\u0391!");
741     assertGreekUpper("\\u039C\\u03B1\\u0390\\u03BF\\u03C5, \\u03C4\\u03C1\\u03CC\\u03BB\\u03B5\\u03CA",
742                      "\\u039C\\u0391\\u03AA\\u039F\\u03A5, \\u03A4\\u03A1\\u039F\\u039B\\u0395\\u03AA");
743     assertGreekUpper("\\u03A4\\u03BF \\u03AD\\u03BD\\u03B1 \\u03AE \\u03C4\\u03BF "
744                      "\\u03AC\\u03BB\\u03BB\\u03BF.",
745                      "\\u03A4\\u039F \\u0395\\u039D\\u0391 \\u0397\\u0301 \\u03A4\\u039F "
746                      "\\u0391\\u039B\\u039B\\u039F.");
747     // http://multilingualtypesetting.co.uk/blog/greek-typesetting-tips/
748     assertGreekUpper("\\u03C1\\u03C9\\u03BC\\u03AD\\u03B9\\u03BA\\u03B1",
749                      "\\u03A1\\u03A9\\u039C\\u0395\\u03AA\\u039A\\u0391");
750 }
751 
752 void
TestLongUpper()753 StringCaseTest::TestLongUpper() {
754     if (quick) {
755         logln("not exhaustive mode: skipping this test");
756         return;
757     }
758     // Ticket #12663, crash with an extremely long string where
759     // U+0390 maps to 0399 0308 0301 so that the result is three times as long
760     // and overflows an int32_t.
761     int32_t length = 0x40000004;  // more than 1G UChars
762     UnicodeString s(length, (UChar32)0x390, length);
763     UnicodeString result;
764     UChar *dest = result.getBuffer(length + 1);
765     if (s.isBogus() || dest == NULL) {
766         logln("Out of memory, unable to run this test on this machine.");
767         return;
768     }
769     IcuTestErrorCode errorCode(*this, "TestLongUpper");
770     int32_t destLength = u_strToUpper(dest, result.getCapacity(),
771                                       s.getBuffer(), s.length(), "", errorCode);
772     result.releaseBuffer(destLength);
773     if (errorCode.reset() != U_INDEX_OUTOFBOUNDS_ERROR) {
774         errln("expected U_INDEX_OUTOFBOUNDS_ERROR, got %s (destLength is undefined, got %ld)",
775               errorCode.errorName(), (long)destLength);
776     }
777 }
778 
TestMalformedUTF8()779 void StringCaseTest::TestMalformedUTF8() {
780     // ticket #12639
781     IcuTestErrorCode errorCode(*this, "TestMalformedUTF8");
782     LocalUCaseMapPointer csm(ucasemap_open("en", U_TITLECASE_NO_BREAK_ADJUSTMENT, errorCode));
783     if (errorCode.isFailure()) {
784         errln("ucasemap_open(English) failed - %s", errorCode.errorName());
785         return;
786     }
787     char src[1] = { (char)0x85 };  // malformed UTF-8
788     char dest[3] = { 0, 0, 0 };
789     int32_t destLength = ucasemap_utf8ToTitle(csm.getAlias(), dest, 3, src, 1, errorCode);
790     if (errorCode.isFailure() || destLength != 1 || dest[0] != src[0]) {
791         errln("ucasemap_utf8ToTitle(\\x85) failed: %s destLength=%d dest[0]=0x%02x",
792               errorCode.errorName(), (int)destLength, dest[0]);
793     }
794 
795     errorCode.reset();
796     dest[0] = 0;
797     destLength = ucasemap_utf8ToLower(csm.getAlias(), dest, 3, src, 1, errorCode);
798     if (errorCode.isFailure() || destLength != 1 || dest[0] != src[0]) {
799         errln("ucasemap_utf8ToLower(\\x85) failed: %s destLength=%d dest[0]=0x%02x",
800               errorCode.errorName(), (int)destLength, dest[0]);
801     }
802 
803     errorCode.reset();
804     dest[0] = 0;
805     destLength = ucasemap_utf8ToUpper(csm.getAlias(), dest, 3, src, 1, errorCode);
806     if (errorCode.isFailure() || destLength != 1 || dest[0] != src[0]) {
807         errln("ucasemap_utf8ToUpper(\\x85) failed: %s destLength=%d dest[0]=0x%02x",
808               errorCode.errorName(), (int)destLength, dest[0]);
809     }
810 
811     errorCode.reset();
812     dest[0] = 0;
813     destLength = ucasemap_utf8FoldCase(csm.getAlias(), dest, 3, src, 1, errorCode);
814     if (errorCode.isFailure() || destLength != 1 || dest[0] != src[0]) {
815         errln("ucasemap_utf8FoldCase(\\x85) failed: %s destLength=%d dest[0]=0x%02x",
816               errorCode.errorName(), (int)destLength, dest[0]);
817     }
818 }
819 
TestBufferOverflow()820 void StringCaseTest::TestBufferOverflow() {
821     // Ticket #12849, incorrect result from Title Case preflight operation,
822     // when buffer overflow error is expected.
823     IcuTestErrorCode errorCode(*this, "TestBufferOverflow");
824     LocalUCaseMapPointer csm(ucasemap_open("en", 0, errorCode));
825     if (errorCode.isFailure()) {
826         errln("ucasemap_open(English) failed - %s", errorCode.errorName());
827         return;
828     }
829 
830     UnicodeString data("hello world");
831     int32_t result = ucasemap_toTitle(csm.getAlias(), NULL, 0, data.getBuffer(), data.length(), errorCode);
832     if (errorCode.get() != U_BUFFER_OVERFLOW_ERROR || result != data.length()) {
833         errln("%s:%d ucasemap_toTitle(\"hello world\") failed: "
834               "expected (U_BUFFER_OVERFLOW_ERROR, %d), got (%s, %d)",
835               __FILE__, __LINE__, data.length(), errorCode.errorName(), result);
836     }
837     errorCode.reset();
838 
839 #if U_HAVE_STD_STRING
840     std::string data_utf8;
841     data.toUTF8String(data_utf8);
842     result = ucasemap_utf8ToTitle(csm.getAlias(), NULL, 0, data_utf8.c_str(), data_utf8.length(), errorCode);
843     if (errorCode.get() != U_BUFFER_OVERFLOW_ERROR || result != (int32_t)data_utf8.length()) {
844         errln("%s:%d ucasemap_toTitle(\"hello world\") failed: "
845               "expected (U_BUFFER_OVERFLOW_ERROR, %d), got (%s, %d)",
846               __FILE__, __LINE__, data_utf8.length(), errorCode.errorName(), result);
847     }
848     errorCode.reset();
849 #endif  // U_HAVE_STD_STRING
850 }
851