1 /*
2 ********************************************************************************
3 *   Copyright (C) 1999-2015 International Business Machines Corporation and
4 *   others. All Rights Reserved.
5 ********************************************************************************
6 *   Date        Name        Description
7 *   10/20/99    alan        Creation.
8 *   03/22/2000  Madhu       Added additional tests
9 ********************************************************************************
10 */
11 
12 #include <stdio.h>
13 
14 #include <string.h>
15 #include "unicode/utypes.h"
16 #include "usettest.h"
17 #include "unicode/ucnv.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/usetiter.h"
21 #include "unicode/ustring.h"
22 #include "unicode/parsepos.h"
23 #include "unicode/symtable.h"
24 #include "unicode/uversion.h"
25 #include "hash.h"
26 
// Reports a failure through dataerrln() when `status` holds a failing
// UErrorCode. The message contains the source file, the line of the expansion
// site and the u_errorName() text of `status`. Nothing happens when
// U_FAILURE(status) is false.
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
    dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
    u_errorName(status));}}
30 
// Reports a failure through dataerrln() when `expr` evaluates to false. The
// message contains the source file and the line of the expansion site.
#define TEST_ASSERT(expr) {if (!(expr)) { \
    dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
33 
/**
 * Appends the escaped pattern of a UnicodeSet to a string, so that sets can
 * be concatenated directly into log and error messages.
 *
 * @param left text that precedes the pattern
 * @param set  set whose toPattern() text is appended
 * @return left followed by UnicodeSetTest::escape() of the set's pattern
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    // toPattern() fills `pattern` and hands back a reference to it.
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
39 
// Expands to one `case` label of the switch in runIndexedTest(): it stores the
// stringized test name in `name` and, when `exec` is set, logs a header line
// and calls the member function `test`. The macro relies on `name` and `exec`
// being in scope at the expansion site. The final `break` has no semicolon;
// the expansion site supplies it.
#define CASE(id,test) case id:                          \
                          name = #test;                 \
                          if (exec) {                   \
                              logln(#test "---");       \
                              logln();                  \
                              test();                   \
                          }                             \
                          break
48 
UnicodeSetTest()49 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
50 }
51 
openUTF8Converter()52 UConverter *UnicodeSetTest::openUTF8Converter() {
53     if(utf8Cnv==NULL) {
54         UErrorCode errorCode=U_ZERO_ERROR;
55         utf8Cnv=ucnv_open("UTF-8", &errorCode);
56     }
57     return utf8Cnv;
58 }
59 
~UnicodeSetTest()60 UnicodeSetTest::~UnicodeSetTest() {
61     ucnv_close(utf8Cnv);
62 }
63 
/**
 * Test dispatcher: maps a test index to a test method.
 *
 * Each CASE line stores the test's name in `name` and, when `exec` is set,
 * logs a header and runs the test. An index without a CASE line sets `name`
 * to the empty string.
 *
 * @param index zero-based index of the requested test
 * @param exec  when set, the selected test is executed; otherwise only its
 *              name is reported
 * @param name  receives the name of the selected test, or "" if none
 */
void
UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
                               const char* &name, char* /*par*/) {
    // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
    switch (index) {
        CASE(0,TestPatterns);
        CASE(1,TestAddRemove);
        CASE(2,TestCategories);
        CASE(3,TestCloneEqualHash);
        CASE(4,TestMinimalRep);
        CASE(5,TestAPI);
        CASE(6,TestScriptSet);
        CASE(7,TestPropertySet);
        CASE(8,TestClone);
        CASE(9,TestExhaustive);
        CASE(10,TestToPattern);
        CASE(11,TestIndexOf);
        CASE(12,TestStrings);
        CASE(13,Testj2268);
        CASE(14,TestCloseOver);
        CASE(15,TestEscapePattern);
        CASE(16,TestInvalidCodePoint);
        CASE(17,TestSymbolTable);
        CASE(18,TestSurrogate);
        CASE(19,TestPosixClasses);
        CASE(20,TestIteration);
        CASE(21,TestFreezable);
        CASE(22,TestSpan);
        CASE(23,TestStringSpan);
        CASE(24,TestUCAUnsafeBackwards);
        // Unknown index: report an empty name.
        default: name = ""; break;
    }
}
97 
// Marker string placed inside the expected-string lists that TestToPattern()
// passes to expectToPattern().
// NOTE(review): presumably the entries after this marker are strings the set
// must NOT contain -- confirm against expectToPattern().
static const char NOT[] = "%%%%";
99 
100 /**
101  * UVector was improperly copying contents
102  * This code will crash this is still true
103  */
Testj2268()104 void UnicodeSetTest::Testj2268() {
105   UnicodeSet t;
106   t.add(UnicodeString("abc"));
107   UnicodeSet test(t);
108   UnicodeString ustrPat;
109   test.toPattern(ustrPat, TRUE);
110 }
111 
112 /**
113  * Test toPattern().
114  */
TestToPattern()115 void UnicodeSetTest::TestToPattern() {
116     UErrorCode ec = U_ZERO_ERROR;
117 
118     // Test that toPattern() round trips with syntax characters and
119     // whitespace.
120     {
121         static const char* OTHER_TOPATTERN_TESTS[] = {
122             "[[:latin:]&[:greek:]]",
123             "[[:latin:]-[:greek:]]",
124             "[:nonspacing mark:]",
125             NULL
126         };
127 
128         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
129             ec = U_ZERO_ERROR;
130             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
131             if (U_FAILURE(ec)) {
132                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
133                 continue;
134             }
135             checkPat(OTHER_TOPATTERN_TESTS[j], s);
136         }
137 
138         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
139             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
140 
141                 // check various combinations to make sure they all work.
142                 if (i != 0 && !toPatternAux(i, i)){
143                     continue;
144                 }
145                 if (!toPatternAux(0, i)){
146                     continue;
147                 }
148                 if (!toPatternAux(i, 0xFFFF)){
149                     continue;
150                 }
151             }
152         }
153     }
154 
155     // Test pattern behavior of multicharacter strings.
156     {
157         ec = U_ZERO_ERROR;
158         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
159 
160         // This loop isn't a loop.  It's here to make the compiler happy.
161         // If you're curious, try removing it and changing the 'break'
162         // statements (except for the last) to goto's.
163         for (;;) {
164             if (U_FAILURE(ec)) break;
165             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
166             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
167 
168             s->add("ac");
169             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
170             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
171 
172             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
173             if (U_FAILURE(ec)) break;
174             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
175             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
176 
177             s->add("[]");
178             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
179             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
180 
181             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
182             if (U_FAILURE(ec)) break;
183             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
184             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
185 
186             // j2189
187             s->clear();
188             s->add(UnicodeString("abc", ""));
189             s->add(UnicodeString("abc", ""));
190             const char* exp6[] = {"abc", NOT, "ab", NULL};
191             expectToPattern(*s, "[{abc}]", exp6);
192 
193             break;
194         }
195 
196         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
197         delete s;
198     }
199 
200     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
201     UnicodeSet s;
202     s.add((UChar)97, (UChar)98); // 'a', 'b'
203     expectToPattern(s, "[ab]", NULL);
204 }
205 
toPatternAux(UChar32 start,UChar32 end)206 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
207 
208     // use Integer.toString because Utility.hex doesn't handle ints
209     UnicodeString pat = "";
210     // TODO do these in hex
211     //String source = "0x" + Integer.toString(start,16).toUpperCase();
212     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
213     UnicodeString source;
214     source = source + (uint32_t)start;
215     if (start != end)
216         source = source + ".." + (uint32_t)end;
217     UnicodeSet testSet;
218     testSet.add(start, end);
219     return checkPat(source, testSet);
220 }
221 
checkPat(const UnicodeString & source,const UnicodeSet & testSet)222 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
223                                const UnicodeSet& testSet) {
224     // What we want to make sure of is that a pattern generated
225     // by toPattern(), with or without escaped unprintables, can
226     // be passed back into the UnicodeSet constructor.
227     UnicodeString pat0;
228 
229     testSet.toPattern(pat0, TRUE);
230 
231     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
232 
233     //String pat1 = unescapeLeniently(pat0);
234     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
235 
236     UnicodeString pat2;
237     testSet.toPattern(pat2, FALSE);
238     if (!checkPat(source, testSet, pat2)) return FALSE;
239 
240     //String pat3 = unescapeLeniently(pat2);
241     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
242 
243     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
244     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
245     return TRUE;
246 }
247 
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)248 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
249                                const UnicodeSet& testSet,
250                                const UnicodeString& pat) {
251     UErrorCode ec = U_ZERO_ERROR;
252     UnicodeSet testSet2(pat, ec);
253     if (testSet2 != testSet) {
254         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
255         return FALSE;
256     }
257     return TRUE;
258 }
259 
260 void
TestPatterns(void)261 UnicodeSetTest::TestPatterns(void) {
262     UnicodeSet set;
263     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
264     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
265     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
266     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
267     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
268     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
269 
270     // Throw in a test of complement
271     set.complement();
272     UnicodeString exp;
273     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
274     expectPairs(set, exp);
275 }
276 
277 void
TestCategories(void)278 UnicodeSetTest::TestCategories(void) {
279     UErrorCode status = U_ZERO_ERROR;
280     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
281     UnicodeSet set(pat, status);
282     if (U_FAILURE(status)) {
283         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
284         return;
285     } else {
286         expectContainment(set, pat, "ABC", "abc");
287     }
288 
289     UChar32 i;
290     int32_t failures = 0;
291     // Make sure generation of L doesn't pollute cached Lu set
292     // First generate L, then Lu
293     set.applyPattern("[:L:]", status);
294     if (U_FAILURE(status)) { errln("FAIL"); return; }
295     for (i=0; i<0x200; ++i) {
296         UBool l = u_isalpha((UChar)i);
297         if (l != set.contains(i)) {
298             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
299                   set.contains(i));
300             if (++failures == 10) break;
301         }
302     }
303 
304     set.applyPattern("[:Lu:]", status);
305     if (U_FAILURE(status)) { errln("FAIL"); return; }
306     for (i=0; i<0x200; ++i) {
307         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
308         if (lu != set.contains(i)) {
309             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
310                   set.contains(i));
311             if (++failures == 20) break;
312         }
313     }
314 }
315 void
TestCloneEqualHash(void)316 UnicodeSetTest::TestCloneEqualHash(void) {
317     UErrorCode status = U_ZERO_ERROR;
318     // set1 and set2 used to be built with the obsolete constructor taking
319     // UCharCategory values; replaced with pattern constructors
320     // markus 20030502
321     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
322     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
323     if (U_FAILURE(status)){
324         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
325         return;
326     }
327     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
328     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
329     if (U_FAILURE(status)){
330         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
331         return;
332     }
333 
334     if (*set1 != *set1a) {
335         errln("FAIL: category constructor for Ll broken");
336     }
337     if (*set2 != *set2a) {
338         errln("FAIL: category constructor for Nd broken");
339     }
340     delete set1a;
341     delete set2a;
342 
343     logln("Testing copy construction");
344     UnicodeSet *set1copy=new UnicodeSet(*set1);
345     if(*set1 != *set1copy || *set1 == *set2 ||
346         getPairs(*set1) != getPairs(*set1copy) ||
347         set1->hashCode() != set1copy->hashCode()){
348         errln("FAIL : Error in copy construction");
349         return;
350     }
351 
352     logln("Testing =operator");
353     UnicodeSet set1equal=*set1;
354     UnicodeSet set2equal=*set2;
355     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
356         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
357         errln("FAIL: Error in =operator");
358     }
359 
360     logln("Testing clone()");
361     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
362     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
363     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
364         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
365         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
366         errln("FAIL: Error in clone");
367     }
368 
369     logln("Testing hashcode");
370     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
371         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
372         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
373         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
374         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
375         errln("FAIL: Error in hashCode()");
376     }
377 
378     delete set1;
379     delete set1copy;
380     delete set2;
381     delete set1clone;
382     delete set2clone;
383 
384 
385 }
386 void
TestAddRemove(void)387 UnicodeSetTest::TestAddRemove(void) {
388     UnicodeSet set; // Construct empty set
389     doAssert(set.isEmpty() == TRUE, "set should be empty");
390     doAssert(set.size() == 0, "size should be 0");
391     set.complement();
392     doAssert(set.size() == 0x110000, "size should be 0x110000");
393     set.clear();
394     set.add(0x0061, 0x007a);
395     expectPairs(set, "az");
396     doAssert(set.isEmpty() == FALSE, "set should not be empty");
397     doAssert(set.size() != 0, "size should not be equal to 0");
398     doAssert(set.size() == 26, "size should be equal to 26");
399     set.remove(0x006d, 0x0070);
400     expectPairs(set, "alqz");
401     doAssert(set.size() == 22, "size should be equal to 22");
402     set.remove(0x0065, 0x0067);
403     expectPairs(set, "adhlqz");
404     doAssert(set.size() == 19, "size should be equal to 19");
405     set.remove(0x0064, 0x0069);
406     expectPairs(set, "acjlqz");
407     doAssert(set.size() == 16, "size should be equal to 16");
408     set.remove(0x0063, 0x0072);
409     expectPairs(set, "absz");
410     doAssert(set.size() == 10, "size should be equal to 10");
411     set.add(0x0066, 0x0071);
412     expectPairs(set, "abfqsz");
413     doAssert(set.size() == 22, "size should be equal to 22");
414     set.remove(0x0061, 0x0067);
415     expectPairs(set, "hqsz");
416     set.remove(0x0061, 0x007a);
417     expectPairs(set, "");
418     doAssert(set.isEmpty() == TRUE, "set should be empty");
419     doAssert(set.size() == 0, "size should be 0");
420     set.add(0x0061);
421     doAssert(set.isEmpty() == FALSE, "set should not be empty");
422     doAssert(set.size() == 1, "size should not be equal to 1");
423     set.add(0x0062);
424     set.add(0x0063);
425     expectPairs(set, "ac");
426     doAssert(set.size() == 3, "size should not be equal to 3");
427     set.add(0x0070);
428     set.add(0x0071);
429     expectPairs(set, "acpq");
430     doAssert(set.size() == 5, "size should not be equal to 5");
431     set.clear();
432     expectPairs(set, "");
433     doAssert(set.isEmpty() == TRUE, "set should be empty");
434     doAssert(set.size() == 0, "size should be 0");
435 
436     // Try removing an entire set from another set
437     expectPattern(set, "[c-x]", "cx");
438     UnicodeSet set2;
439     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
440     set.removeAll(set2);
441     expectPairs(set, "deluxx");
442 
443     // Try adding an entire set to another set
444     expectPattern(set, "[jackiemclean]", "aacceein");
445     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
446     set.addAll(set2);
447     expectPairs(set, "aacehort");
448     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
449 
450     // Try retaining an set of elements contained in another set (intersection)
451     UnicodeSet set3;
452     expectPattern(set3, "[a-c]", "ac");
453     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
454     set3.remove(0x0062);
455     expectPairs(set3, "aacc");
456     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
457     set.retainAll(set3);
458     expectPairs(set, "aacc");
459     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
460     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
461     set.clear();
462     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
463 
464     // Test commutativity
465     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
466     expectPattern(set2, "[jackiemclean]", "aacceein");
467     set.addAll(set2);
468     expectPairs(set, "aacehort");
469     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
470 
471 
472 
473 
474 }
475 
476 /**
477  * Make sure minimal representation is maintained.
478  */
TestMinimalRep()479 void UnicodeSetTest::TestMinimalRep() {
480     UErrorCode status = U_ZERO_ERROR;
481     // This is pretty thoroughly tested by checkCanonicalRep()
482     // run against the exhaustive operation results.  Use the code
483     // here for debugging specific spot problems.
484 
485     // 1 overlap against 2
486     UnicodeSet set("[h-km-q]", status);
487     if (U_FAILURE(status)) { errln("FAIL"); return; }
488     UnicodeSet set2("[i-o]", status);
489     if (U_FAILURE(status)) { errln("FAIL"); return; }
490     set.addAll(set2);
491     expectPairs(set, "hq");
492     // right
493     set.applyPattern("[a-m]", status);
494     if (U_FAILURE(status)) { errln("FAIL"); return; }
495     set2.applyPattern("[e-o]", status);
496     if (U_FAILURE(status)) { errln("FAIL"); return; }
497     set.addAll(set2);
498     expectPairs(set, "ao");
499     // left
500     set.applyPattern("[e-o]", status);
501     if (U_FAILURE(status)) { errln("FAIL"); return; }
502     set2.applyPattern("[a-m]", status);
503     if (U_FAILURE(status)) { errln("FAIL"); return; }
504     set.addAll(set2);
505     expectPairs(set, "ao");
506     // 1 overlap against 3
507     set.applyPattern("[a-eg-mo-w]", status);
508     if (U_FAILURE(status)) { errln("FAIL"); return; }
509     set2.applyPattern("[d-q]", status);
510     if (U_FAILURE(status)) { errln("FAIL"); return; }
511     set.addAll(set2);
512     expectPairs(set, "aw");
513 }
514 
/**
 * Exercises a broad slice of the UnicodeSet API on small hand-built sets:
 * emptiness and size, range containment, set algebra (addAll, complement,
 * complementAll), the string-based operations, serialize(), the USet
 * conversions, and the span()/spanBack() overloads taking a UnicodeString.
 *
 * The sections share the locals `set` and `exp`, so each one depends on the
 * state left behind by the section before it.
 */
void UnicodeSetTest::TestAPI() {
    UErrorCode status = U_ZERO_ERROR;
    // default constructor
    UnicodeSet set;
    if (!set.isEmpty() || set.getRangeCount() != 0) {
        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
              set);
    }

    // clear(), isEmpty()
    set.add(0x0061);
    if (set.isEmpty()) {
        errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
              set);
    }
    set.clear();
    if (!set.isEmpty()) {
        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
              set);
    }

    // size()
    set.clear();
    if (set.size() != 0) {
        errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
              ": " + set);
    }
    set.add(0x0061);
    if (set.size() != 1) {
        errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
              ": " + set);
    }
    set.add(0x0031, 0x0039);
    if (set.size() != 10) {
        errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
              ": " + set);
    }

    // contains(first, last)
    set.clear();
    set.applyPattern("[A-Y 1-8 b-d l-y]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    // Every range of the set must be contained as a whole, while the same
    // range widened by one code point on either side must not be.
    for (int32_t i = 0; i<set.getRangeCount(); ++i) {
        UChar32 a = set.getRangeStart(i);
        UChar32 b = set.getRangeEnd(i);
        if (!set.contains(a, b)) {
            errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
                  " but doesn't: " + set);
        }
        if (set.contains((UChar32)(a-1), b)) {
            errln((UnicodeString)"FAIL, shouldn't contain " +
                  (unsigned short)(a-1) + '-' + (unsigned short)b +
                  " but does: " + set);
        }
        if (set.contains(a, (UChar32)(b+1))) {
            errln((UnicodeString)"FAIL, shouldn't contain " +
                  (unsigned short)a + '-' + (unsigned short)(b+1) +
                  " but does: " + set);
        }
    }

    // Ported InversionList test.
    UnicodeSet a((UChar32)3,(UChar32)10);
    UnicodeSet b((UChar32)7,(UChar32)15);
    UnicodeSet c;

    logln((UnicodeString)"a [3-10]: " + a);
    logln((UnicodeString)"b [7-15]: " + b);
    // Union: [3-10] + [7-15] = [3-15]
    c = a;
    c.addAll(b);
    UnicodeSet exp((UChar32)3,(UChar32)15);
    if (c == exp) {
        logln((UnicodeString)"c.set(a).add(b): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
    }
    // Complement of [3-15]: [0-2] plus [16-MAX_VALUE]
    c.complement();
    exp.set((UChar32)0, (UChar32)2);
    exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
    if (c == exp) {
        logln((UnicodeString)"c.complement(): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    }
    // Complementing a second time must restore [3-15].
    c.complement();
    exp.set((UChar32)3, (UChar32)15);
    if (c == exp) {
        logln((UnicodeString)"c.complement(): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    }
    // Symmetric difference of [3-10] and [7-15]: [3-6] plus [11-15]
    c = a;
    c.complementAll(b);
    exp.set((UChar32)3,(UChar32)6);
    exp.add((UChar32)11,(UChar32) 15);
    if (c == exp) {
        logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
    }

    // Round trip through the test's setToBits()/bitsToSet() helpers must
    // leave c unchanged.
    exp = c;
    bitsToSet(setToBits(c), c);
    if (c == exp) {
        logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
    } else {
        errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
    }

    // Additional tests for coverage JB#2118
    //UnicodeSet::complement(class UnicodeString const &)
    //UnicodeSet::complementAll(class UnicodeString const &)
    //UnicodeSet::containsNone(class UnicodeSet const &)
    //UnicodeSet::containsNone(long,long)
    //UnicodeSet::containsSome(class UnicodeSet const &)
    //UnicodeSet::containsSome(long,long)
    //UnicodeSet::removeAll(class UnicodeString const &)
    //UnicodeSet::retain(long)
    //UnicodeSet::retainAll(class UnicodeString const &)
    //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
    //UnicodeSetIterator::getString(void)
    // complement(string) on an empty set is expected to yield the set
    // holding just that string.
    set.clear();
    set.complement("ab");
    exp.applyPattern("[{ab}]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }

    // The only element of the set is the string "ab".
    UnicodeSetIterator iset(set);
    if (!iset.next() || !iset.isString()) {
        errln("FAIL: UnicodeSetIterator::next/isString");
    } else if (iset.getString() != "ab") {
        errln("FAIL: UnicodeSetIterator::getString");
    }

    // complementAll("alan") is expected to drop a, l and n from a-z and to
    // leave the string {ab} in place.
    set.add((UChar32)0x61, (UChar32)0x7A);
    set.complementAll("alan");
    exp.applyPattern("[{ab}b-kmo-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }

    // containsNone/containsSome against an overlapping set ([a-z]) and a
    // disjoint one ([aln])
    exp.applyPattern("[a-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
    exp.applyPattern("[aln]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }

    // The same checks with ranges: a-z overlaps the set, A-Z does not.
    if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
        errln("FAIL: containsNone(UChar32, UChar32)");
    }
    if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
        errln("FAIL: containsSome(UChar32, UChar32)");
    }
    if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
        errln("FAIL: containsNone(UChar32, UChar32)");
    }
    if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
        errln("FAIL: containsSome(UChar32, UChar32)");
    }

    set.removeAll("liu");
    exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }

    // retainAll("star") is expected to leave only r, s and t; the string
    // {ab} is gone as well.
    set.retainAll("star");
    exp.applyPattern("[rst]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }

    set.retain((UChar32)0x73);
    exp.applyPattern("[s]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: retain('s')"); return; }

    // The set now holds just 's'; its serialized form is expected to be the
    // three units 2, 0x73, 0x74.
    uint16_t buf[32];
    int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
    if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
    if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
        errln("FAIL: serialize");
        return;
    }

    // Conversions to and from USet
    // Each conversion must hand back the very same address it was given.
    UnicodeSet *uniset = &set;
    USet *uset = uniset->toUSet();
    TEST_ASSERT((void *)uset == (void *)uniset);
    UnicodeSet *setx = UnicodeSet::fromUSet(uset);
    TEST_ASSERT((void *)setx == (void *)uset);
    const UnicodeSet *constSet = uniset;
    const USet *constUSet = constSet->toUSet();
    TEST_ASSERT((void *)constUSet == (void *)constSet);
    const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
    TEST_ASSERT((void *)constSetx == (void *)constUSet);

    // span(UnicodeString) and spanBack(UnicodeString) convenience methods
    // The string is ten a's, ten b's, ten c's; the set holds a and c only.
    // The start indexes include values below 0 and beyond the string length.
    UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
    UnicodeSet ac(0x61, 0x63);
    ac.remove(0x62).freeze();
    if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
        ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
    ) {
        errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
    }
    if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
        ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
    ) {
        errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
    }
}
743 
TestIteration()744 void UnicodeSetTest::TestIteration() {
745     UErrorCode ec = U_ZERO_ERROR;
746     int i = 0;
747     int outerLoop;
748 
749     // 6 code points, 3 ranges, 2 strings, 8 total elements
750     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
751     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
752     TEST_ASSERT_SUCCESS(ec);
753     UnicodeSetIterator it(set);
754 
755     for (outerLoop=0; outerLoop<3; outerLoop++) {
756         // Run the test multiple times, to check that iterator.reset() is working.
757         for (i=0; i<10; i++) {
758             UBool         nextv        = it.next();
759             UBool         isString     = it.isString();
760             int32_t       codePoint    = it.getCodepoint();
761             //int32_t       codePointEnd = it.getCodepointEnd();
762             UnicodeString s   = it.getString();
763             switch (i) {
764             case 0:
765                 TEST_ASSERT(nextv == TRUE);
766                 TEST_ASSERT(isString == FALSE);
767                 TEST_ASSERT(codePoint==0x61);
768                 TEST_ASSERT(s == "a");
769                 break;
770             case 1:
771                 TEST_ASSERT(nextv == TRUE);
772                 TEST_ASSERT(isString == FALSE);
773                 TEST_ASSERT(codePoint==0x62);
774                 TEST_ASSERT(s == "b");
775                 break;
776             case 2:
777                 TEST_ASSERT(nextv == TRUE);
778                 TEST_ASSERT(isString == FALSE);
779                 TEST_ASSERT(codePoint==0x63);
780                 TEST_ASSERT(s == "c");
781                 break;
782             case 3:
783                 TEST_ASSERT(nextv == TRUE);
784                 TEST_ASSERT(isString == FALSE);
785                 TEST_ASSERT(codePoint==0x79);
786                 TEST_ASSERT(s == "y");
787                 break;
788             case 4:
789                 TEST_ASSERT(nextv == TRUE);
790                 TEST_ASSERT(isString == FALSE);
791                 TEST_ASSERT(codePoint==0x7a);
792                 TEST_ASSERT(s == "z");
793                 break;
794             case 5:
795                 TEST_ASSERT(nextv == TRUE);
796                 TEST_ASSERT(isString == FALSE);
797                 TEST_ASSERT(codePoint==0x1abcd);
798                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
799                 break;
800             case 6:
801                 TEST_ASSERT(nextv == TRUE);
802                 TEST_ASSERT(isString == TRUE);
803                 TEST_ASSERT(s == "str1");
804                 break;
805             case 7:
806                 TEST_ASSERT(nextv == TRUE);
807                 TEST_ASSERT(isString == TRUE);
808                 TEST_ASSERT(s == "str2");
809                 break;
810             case 8:
811                 TEST_ASSERT(nextv == FALSE);
812                 break;
813             case 9:
814                 TEST_ASSERT(nextv == FALSE);
815                 break;
816             }
817         }
818         it.reset();  // prepare to run the iteration again.
819     }
820 }
821 
822 
823 
824 
TestStrings()825 void UnicodeSetTest::TestStrings() {
826     UErrorCode ec = U_ZERO_ERROR;
827 
828     UnicodeSet* testList[] = {
829         UnicodeSet::createFromAll("abc"),
830         new UnicodeSet("[a-c]", ec),
831 
832         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
833         new UnicodeSet("[{ll}{ch}a-z]", ec),
834 
835         UnicodeSet::createFrom("ab}c"),
836         new UnicodeSet("[{ab\\}c}]", ec),
837 
838         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
839         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
840 
841         NULL
842     };
843 
844     if (U_FAILURE(ec)) {
845         errln("FAIL: couldn't construct test sets");
846     }
847 
848     for (int32_t i = 0; testList[i] != NULL; i+=2) {
849         if (U_SUCCESS(ec)) {
850             UnicodeString pat0, pat1;
851             testList[i]->toPattern(pat0, TRUE);
852             testList[i+1]->toPattern(pat1, TRUE);
853             if (*testList[i] == *testList[i+1]) {
854                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
855             } else {
856                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
857             }
858         }
859         delete testList[i];
860         delete testList[i+1];
861     }
862 }
863 
864 /**
865  * Test the [:Latin:] syntax.
866  */
TestScriptSet()867 void UnicodeSetTest::TestScriptSet() {
868     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
869 
870     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
871 
872     /* Jitterbug 1423 */
873     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
874 
875 }
876 
877 /**
878  * Test the [:Latin:] syntax.
879  */
TestPropertySet()880 void UnicodeSetTest::TestPropertySet() {
881     static const char* const DATA[] = {
882         // Pattern, Chars IN, Chars NOT in
883 
884         "[:Latin:]",
885         "aA",
886         "\\u0391\\u03B1",
887 
888         "[\\p{Greek}]",
889         "\\u0391\\u03B1",
890         "aA",
891 
892         "\\P{ GENERAL Category = upper case letter }",
893         "abc",
894         "ABC",
895 
896 #if !UCONFIG_NO_NORMALIZATION
897         // Combining class: @since ICU 2.2
898         // Check both symbolic and numeric
899         "\\p{ccc=Nukta}",
900         "\\u0ABC",
901         "abc",
902 
903         "\\p{Canonical Combining Class = 11}",
904         "\\u05B1",
905         "\\u05B2",
906 
907         "[:c c c = iota subscript :]",
908         "\\u0345",
909         "xyz",
910 #endif
911 
912         // Bidi class: @since ICU 2.2
913         "\\p{bidiclass=lefttoright}",
914         "abc",
915         "\\u0671\\u0672",
916 
917         // Binary properties: @since ICU 2.2
918         "\\p{ideographic}",
919         "\\u4E0A",
920         "x",
921 
922         "[:math=false:]",
923         "q)*(",
924         // weiv: )(and * were removed from math in Unicode 4.0.1
925         //"(*+)",
926         "+<>^",
927 
928         // JB#1767 \N{}, \p{ASCII}
929         "[:Ascii:]",
930         "abc\\u0000\\u007F",
931         "\\u0080\\u4E00",
932 
933         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
934         "az",
935         "qrs",
936 
937         // JB#2015
938         "[:any:]",
939         "a\\U0010FFFF",
940         "",
941 
942         "[:nv=0.5:]",
943         "\\u00BD\\u0F2A",
944         "\\u00BC",
945 
946         // JB#2653: Age
947         "[:Age=1.1:]",
948         "\\u03D6", // 1.1
949         "\\u03D8\\u03D9", // 3.2
950 
951         "[:Age=3.1:]",
952         "\\u1800\\u3400\\U0002f800",
953         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
954 
955         // JB#2350: Case_Sensitive
956         "[:Case Sensitive:]",
957         "A\\u1FFC\\U00010410",
958         ";\\u00B4\\U00010500",
959 
960         // JB#2832: C99-compatibility props
961         "[:blank:]",
962         " \\u0009",
963         "1-9A-Z",
964 
965         "[:graph:]",
966         "19AZ",
967         " \\u0003\\u0007\\u0009\\u000A\\u000D",
968 
969         "[:punct:]",
970         "!@#%&*()[]{}-_\\/;:,.?'\"",
971         "09azAZ",
972 
973         "[:xdigit:]",
974         "09afAF",
975         "gG!",
976 
977         // Regex compatibility test
978         "[-b]", // leading '-' is literal
979         "-b",
980         "ac",
981 
982         "[^-b]", // leading '-' is literal
983         "ac",
984         "-b",
985 
986         "[b-]", // trailing '-' is literal
987         "-b",
988         "ac",
989 
990         "[^b-]", // trailing '-' is literal
991         "ac",
992         "-b",
993 
994         "[a-b-]", // trailing '-' is literal
995         "ab-",
996         "c=",
997 
998         "[[a-q]&[p-z]-]", // trailing '-' is literal
999         "pq-",
1000         "or=",
1001 
1002         "[\\s|\\)|:|$|\\>]", // from regex tests
1003         "s|):$>",
1004         "abc",
1005 
1006         "[\\uDC00cd]", // JB#2906: isolated trail at start
1007         "cd\\uDC00",
1008         "ab\\uD800\\U00010000",
1009 
1010         "[ab\\uD800]", // JB#2906: isolated trail at start
1011         "ab\\uD800",
1012         "cd\\uDC00\\U00010000",
1013 
1014         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1015         "abcd\\uD800",
1016         "ef\\uDC00\\U00010000",
1017 
1018         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1019         "abcd\\uDC00",
1020         "ef\\uD800\\U00010000",
1021 
1022 #if !UCONFIG_NO_NORMALIZATION
1023         "[:^lccc=0:]", // Lead canonical class
1024         "\\u0300\\u0301",
1025         "abcd\\u00c0\\u00c5",
1026 
1027         "[:^tccc=0:]", // Trail canonical class
1028         "\\u0300\\u0301\\u00c0\\u00c5",
1029         "abcd",
1030 
1031         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1032         "\\u0300\\u0301\\u00c0\\u00c5",
1033         "abcd",
1034 
1035         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1036         "",
1037         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1038 
1039         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1040         "\\u0F73\\u0F75\\u0F81",
1041         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1042 #endif /* !UCONFIG_NO_NORMALIZATION */
1043 
1044         "[:Assigned:]",
1045         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1046         "\\u0888\\uFDD3\\uFFFE\\U00050005",
1047 
1048         // Script_Extensions, new in Unicode 6.0
1049         "[:scx=Arab:]",
1050         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1051         "\\u061D\\uFDEF\\uFDFE",
1052 
1053         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1054         // so scx-sc is missing U+FDF2.
1055         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1056         "\\u0640\\u064B\\u0650\\u0655",
1057         "\\uFDF2"
1058     };
1059 
1060     static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1061 
1062     for (int32_t i=0; i<DATA_LEN; i+=3) {
1063         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1064                           CharsToUnicodeString(DATA[i+2]));
1065     }
1066 }
1067 
1068 /**
1069   * Test that Posix style character classes [:digit:], etc.
1070   *   have the Unicode definitions from TR 18.
1071   */
TestPosixClasses()1072 void UnicodeSetTest::TestPosixClasses() {
1073     {
1074         UErrorCode status = U_ZERO_ERROR;
1075         UnicodeSet s1("[:alpha:]", status);
1076         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1077         TEST_ASSERT_SUCCESS(status);
1078         TEST_ASSERT(s1==s2);
1079     }
1080     {
1081         UErrorCode status = U_ZERO_ERROR;
1082         UnicodeSet s1("[:lower:]", status);
1083         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1084         TEST_ASSERT_SUCCESS(status);
1085         TEST_ASSERT(s1==s2);
1086     }
1087     {
1088         UErrorCode status = U_ZERO_ERROR;
1089         UnicodeSet s1("[:upper:]", status);
1090         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1091         TEST_ASSERT_SUCCESS(status);
1092         TEST_ASSERT(s1==s2);
1093     }
1094     {
1095         UErrorCode status = U_ZERO_ERROR;
1096         UnicodeSet s1("[:punct:]", status);
1097         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1098         TEST_ASSERT_SUCCESS(status);
1099         TEST_ASSERT(s1==s2);
1100     }
1101     {
1102         UErrorCode status = U_ZERO_ERROR;
1103         UnicodeSet s1("[:digit:]", status);
1104         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1105         TEST_ASSERT_SUCCESS(status);
1106         TEST_ASSERT(s1==s2);
1107     }
1108     {
1109         UErrorCode status = U_ZERO_ERROR;
1110         UnicodeSet s1("[:xdigit:]", status);
1111         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1112         TEST_ASSERT_SUCCESS(status);
1113         TEST_ASSERT(s1==s2);
1114     }
1115     {
1116         UErrorCode status = U_ZERO_ERROR;
1117         UnicodeSet s1("[:alnum:]", status);
1118         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1119         TEST_ASSERT_SUCCESS(status);
1120         TEST_ASSERT(s1==s2);
1121     }
1122     {
1123         UErrorCode status = U_ZERO_ERROR;
1124         UnicodeSet s1("[:space:]", status);
1125         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1126         TEST_ASSERT_SUCCESS(status);
1127         TEST_ASSERT(s1==s2);
1128     }
1129     {
1130         UErrorCode status = U_ZERO_ERROR;
1131         UnicodeSet s1("[:blank:]", status);
1132         TEST_ASSERT_SUCCESS(status);
1133         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1134             status);
1135         TEST_ASSERT_SUCCESS(status);
1136         TEST_ASSERT(s1==s2);
1137     }
1138     {
1139         UErrorCode status = U_ZERO_ERROR;
1140         UnicodeSet s1("[:cntrl:]", status);
1141         TEST_ASSERT_SUCCESS(status);
1142         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1143         TEST_ASSERT_SUCCESS(status);
1144         TEST_ASSERT(s1==s2);
1145     }
1146     {
1147         UErrorCode status = U_ZERO_ERROR;
1148         UnicodeSet s1("[:graph:]", status);
1149         TEST_ASSERT_SUCCESS(status);
1150         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1151         TEST_ASSERT_SUCCESS(status);
1152         TEST_ASSERT(s1==s2);
1153     }
1154     {
1155         UErrorCode status = U_ZERO_ERROR;
1156         UnicodeSet s1("[:print:]", status);
1157         TEST_ASSERT_SUCCESS(status);
1158         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1159         TEST_ASSERT_SUCCESS(status);
1160         TEST_ASSERT(s1==s2);
1161     }
1162 }
1163 /**
1164  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1165  */
TestClone()1166 void UnicodeSetTest::TestClone() {
1167     UErrorCode ec = U_ZERO_ERROR;
1168     UnicodeSet s("[abcxyz]", ec);
1169     UnicodeSet t(s);
1170     expectContainment(t, "abc", "def");
1171 }
1172 
1173 /**
1174  * Test the indexOf() and charAt() methods.
1175  */
TestIndexOf()1176 void UnicodeSetTest::TestIndexOf() {
1177     UErrorCode ec = U_ZERO_ERROR;
1178     UnicodeSet set("[a-cx-y3578]", ec);
1179     if (U_FAILURE(ec)) {
1180         errln("FAIL: UnicodeSet constructor");
1181         return;
1182     }
1183     for (int32_t i=0; i<set.size(); ++i) {
1184         UChar32 c = set.charAt(i);
1185         if (set.indexOf(c) != i) {
1186             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1187                 i, c, set.indexOf(c));
1188         }
1189     }
1190     UChar32 c = set.charAt(set.size());
1191     if (c != -1) {
1192         errln("FAIL: charAt(<out of range>) = %X", c);
1193     }
1194     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1195     if (j != -1) {
1196         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1197     }
1198 }
1199 
1200 /**
1201  * Test closure API.
1202  */
TestCloseOver()1203 void UnicodeSetTest::TestCloseOver() {
1204     UErrorCode ec = U_ZERO_ERROR;
1205 
1206     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1207     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1208     const char* DATA[] = {
1209         // selector, input, output
1210         CASE,
1211         "[aq\\u00DF{Bc}{bC}{Fi}]",
1212         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1213 
1214         CASE,
1215         "[\\u01F1]", // 'DZ'
1216         "[\\u01F1\\u01F2\\u01F3]",
1217 
1218         CASE,
1219         "[\\u1FB4]",
1220         "[\\u1FB4{\\u03AC\\u03B9}]",
1221 
1222         CASE,
1223         "[{F\\uFB01}]",
1224         "[\\uFB03{ffi}]",
1225 
1226         CASE, // make sure binary search finds limits
1227         "[a\\uFF3A]",
1228         "[aA\\uFF3A\\uFF5A]",
1229 
1230         CASE,
1231         "[a-z]","[A-Za-z\\u017F\\u212A]",
1232         CASE,
1233         "[abc]","[A-Ca-c]",
1234         CASE,
1235         "[ABC]","[A-Ca-c]",
1236 
1237         CASE, "[i]", "[iI]",
1238 
1239         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1240         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1241 
1242         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1243 
1244         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1245 
1246         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1247 
1248         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1249 
1250         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1251 
1252         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1253 
1254         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1255         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1256 
1257         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1258 
1259         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1260 
1261         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1262 
1263 #if !UCONFIG_NO_FILE_IO
1264         CASE_MAPPINGS,
1265         "[aq\\u00DF{Bc}{bC}{Fi}]",
1266         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1267 #endif
1268 
1269         CASE_MAPPINGS,
1270         "[\\u01F1]", // 'DZ'
1271         "[\\u01F1\\u01F2\\u01F3]",
1272 
1273         CASE_MAPPINGS,
1274         "[a-z]",
1275         "[A-Za-z]",
1276 
1277         NULL
1278     };
1279 
1280     UnicodeSet s;
1281     UnicodeSet t;
1282     UnicodeString buf;
1283     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1284         int32_t selector = DATA[i][0];
1285         UnicodeString pat(DATA[i+1], -1, US_INV);
1286         UnicodeString exp(DATA[i+2], -1, US_INV);
1287         s.applyPattern(pat, ec);
1288         s.closeOver(selector);
1289         t.applyPattern(exp, ec);
1290         if (U_FAILURE(ec)) {
1291             errln("FAIL: applyPattern failed");
1292             continue;
1293         }
1294         if (s == t) {
1295             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1296         } else {
1297             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1298                   s.toPattern(buf, TRUE) + ", expected " + exp);
1299         }
1300     }
1301 
1302 #if 0
1303     /*
1304      * Unused test code.
1305      * This was used to compare the old implementation (using USET_CASE)
1306      * with the new one (using 0x100 temporarily)
1307      * while transitioning from hardcoded case closure tables in uniset.cpp
1308      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1309      * and using ucase.c functions for closure.
1310      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1311      *
1312      * Note: The old and new implementation never fully matched because
1313      * the old implementation turned out to not map U+0130 and U+0131 correctly
1314      * (dotted I and dotless i) and because the old implementation's data tables
1315      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1316      * new implementation. (So sigmas and some other characters were not handled
1317      * according to the newer Unicode version.)
1318      */
1319     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1320     UnicodeSetIterator si(sens);
1321     UnicodeString str, buf2;
1322     const UnicodeString *pStr;
1323     UChar32 c;
1324     while(si.next()) {
1325         if(!si.isString()) {
1326             c=si.getCodepoint();
1327             s.clear();
1328             s.add(c);
1329 
1330             str.setTo(c);
1331             str.foldCase();
1332             sens2.add(str);
1333 
1334             t=s;
1335             s.closeOver(USET_CASE);
1336             t.closeOver(0x100);
1337             if(s!=t) {
1338                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1339                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1340             }
1341         }
1342     }
1343     // remove all code points
1344     // should contain all full case folding mapping strings
1345     sens2.remove(0, 0x10ffff);
1346     si.reset(sens2);
1347     while(si.next()) {
1348         if(si.isString()) {
1349             pStr=&si.getString();
1350             s.clear();
1351             s.add(*pStr);
1352             t=s2=s;
1353             s.closeOver(USET_CASE);
1354             t.closeOver(0x100);
1355             if(s!=t) {
1356                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1357                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1358             }
1359         }
1360     }
1361 #endif
1362 
1363     // Test the pattern API
1364     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1365     if (U_FAILURE(ec)) {
1366         errln("FAIL: applyPattern failed");
1367     } else {
1368         expectContainment(s, "abcABC", "defDEF");
1369     }
1370     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1371     if (U_FAILURE(ec)) {
1372         errln("FAIL: constructor failed");
1373     } else {
1374         expectContainment(v, "defDEF", "abcABC");
1375     }
1376     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1377     if (U_FAILURE(ec)) {
1378         errln("FAIL: construct w/case mappings failed");
1379     } else {
1380         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1381     }
1382 }
1383 
TestEscapePattern()1384 void UnicodeSetTest::TestEscapePattern() {
1385     const char pattern[] =
1386         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1387     const char exp[] =
1388         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1389     // We test this with two passes; in the second pass we
1390     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1391     // this fails -- which is what we expect.
1392     for (int32_t pass=1; pass<=2; ++pass) {
1393         UErrorCode ec = U_ZERO_ERROR;
1394         UnicodeString pat(pattern, -1, US_INV);
1395         if (pass==2) {
1396             pat = pat.unescape();
1397         }
1398         // Pattern is only good for pass 1
1399         UBool isPatternValid = (pass==1);
1400 
1401         UnicodeSet set(pat, ec);
1402         if (U_SUCCESS(ec) != isPatternValid){
1403             errln((UnicodeString)"FAIL: applyPattern(" +
1404                   escape(pat) + ") => " +
1405                   u_errorName(ec));
1406             continue;
1407         }
1408         if (U_FAILURE(ec)) {
1409             continue;
1410         }
1411         if (set.contains((UChar)0x0644)){
1412             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1413         }
1414 
1415         UnicodeString newpat;
1416         set.toPattern(newpat, TRUE);
1417         if (newpat == UnicodeString(exp, -1, US_INV)) {
1418             logln(escape(pat) + " => " + newpat);
1419         } else {
1420             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1421         }
1422 
1423         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1424             UnicodeString str("Range ");
1425             str.append((UChar)(0x30 + i))
1426                 .append(": ")
1427                 .append((UChar32)set.getRangeStart(i))
1428                 .append(" - ")
1429                 .append((UChar32)set.getRangeEnd(i));
1430             str = str + " (" + set.getRangeStart(i) + " - " +
1431                 set.getRangeEnd(i) + ")";
1432             if (set.getRangeStart(i) < 0) {
1433                 errln((UnicodeString)"FAIL: " + escape(str));
1434             } else {
1435                 logln(escape(str));
1436             }
1437         }
1438     }
1439 }
1440 
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1441 void UnicodeSetTest::expectRange(const UnicodeString& label,
1442                                  const UnicodeSet& set,
1443                                  UChar32 start, UChar32 end) {
1444     UnicodeSet exp(start, end);
1445     UnicodeString pat;
1446     if (set == exp) {
1447         logln(label + " => " + set.toPattern(pat, TRUE));
1448     } else {
1449         UnicodeString xpat;
1450         errln((UnicodeString)"FAIL: " + label + " => " +
1451               set.toPattern(pat, TRUE) +
1452               ", expected " + exp.toPattern(xpat, TRUE));
1453     }
1454 }
1455 
TestInvalidCodePoint()1456 void UnicodeSetTest::TestInvalidCodePoint() {
1457 
1458     const UChar32 DATA[] = {
1459         // Test range             Expected range
1460         0, 0x10FFFF,              0, 0x10FFFF,
1461         (UChar32)-1, 8,           0, 8,
1462         8, 0x110000,              8, 0x10FFFF
1463     };
1464     const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1465 
1466     UnicodeString pat;
1467     int32_t i;
1468 
1469     for (i=0; i<DATA_LENGTH; i+=4) {
1470         UChar32 start  = DATA[i];
1471         UChar32 end    = DATA[i+1];
1472         UChar32 xstart = DATA[i+2];
1473         UChar32 xend   = DATA[i+3];
1474 
1475         // Try various API using the test code points
1476 
1477         UnicodeSet set(start, end);
1478         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1479                     set, xstart, xend);
1480 
1481         set.clear();
1482         set.set(start, end);
1483         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1484                     set, xstart, xend);
1485 
1486         UBool b = set.contains(start);
1487         b = set.contains(start, end);
1488         b = set.containsNone(start, end);
1489         b = set.containsSome(start, end);
1490         (void)b;   // Suppress set but not used warning.
1491 
1492         /*int32_t index = set.indexOf(start);*/
1493 
1494         set.clear();
1495         set.add(start);
1496         set.add(start, end);
1497         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1498                     set, xstart, xend);
1499 
1500         set.set(0, 0x10FFFF);
1501         set.retain(start, end);
1502         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1503                     set, xstart, xend);
1504         set.retain(start);
1505 
1506         set.set(0, 0x10FFFF);
1507         set.remove(start);
1508         set.remove(start, end);
1509         set.complement();
1510         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1511                     set, xstart, xend);
1512 
1513         set.set(0, 0x10FFFF);
1514         set.complement(start, end);
1515         set.complement();
1516         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1517                     set, xstart, xend);
1518         set.complement(start);
1519     }
1520 
1521     const UChar32 DATA2[] = {
1522         0,
1523         0x10FFFF,
1524         (UChar32)-1,
1525         0x110000
1526     };
1527     const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1528 
1529     for (i=0; i<DATA2_LENGTH; ++i) {
1530         UChar32 c = DATA2[i], end = 0x10FFFF;
1531         UBool valid = (c >= 0 && c <= 0x10FFFF);
1532 
1533         UnicodeSet set(0, 0x10FFFF);
1534 
1535         // For single-codepoint contains, invalid codepoints are NOT contained
1536         UBool b = set.contains(c);
1537         if (b == valid) {
1538             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1539                   ") = " + b);
1540         } else {
1541             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1542                   ") = " + b);
1543         }
1544 
1545         // For codepoint range contains, containsNone, and containsSome,
1546         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1547         b = set.contains(c, end);
1548         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1549               "," + end + ") = " + b);
1550 
1551         b = set.containsNone(c, end);
1552         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1553               "," + end + ") = " + b);
1554 
1555         b = set.containsSome(c, end);
1556         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1557               "," + end + ") = " + b);
1558 
1559         int32_t index = set.indexOf(c);
1560         if ((index >= 0) == valid) {
1561             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1562                   ") = " + index);
1563         } else {
1564             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1565                   ") = " + index);
1566         }
1567     }
1568 }
1569 
1570 // Used by TestSymbolTable
1571 class TokenSymbolTable : public SymbolTable {
1572 public:
1573     Hashtable contents;
1574 
TokenSymbolTable(UErrorCode & ec)1575     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1576         contents.setValueDeleter(uprv_deleteUObject);
1577     }
1578 
~TokenSymbolTable()1579     ~TokenSymbolTable() {}
1580 
1581     /**
1582      * (Non-SymbolTable API) Add the given variable and value to
1583      * the table.  Variable should NOT contain leading '$'.
1584      */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1585     void add(const UnicodeString& var, const UnicodeString& value,
1586              UErrorCode& ec) {
1587         if (U_SUCCESS(ec)) {
1588             contents.put(var, new UnicodeString(value), ec);
1589         }
1590     }
1591 
1592     /**
1593      * SymbolTable API
1594      */
lookup(const UnicodeString & s) const1595     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1596         return (const UnicodeString*) contents.get(s);
1597     }
1598 
1599     /**
1600      * SymbolTable API
1601      */
lookupMatcher(UChar32) const1602     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1603         return NULL;
1604     }
1605 
1606     /**
1607      * SymbolTable API
1608      */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1609     virtual UnicodeString parseReference(const UnicodeString& text,
1610                                          ParsePosition& pos, int32_t limit) const {
1611         int32_t start = pos.getIndex();
1612         int32_t i = start;
1613         UnicodeString result;
1614         while (i < limit) {
1615             UChar c = text.charAt(i);
1616             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1617                 break;
1618             }
1619             ++i;
1620         }
1621         if (i == start) { // No valid name chars
1622             return result; // Indicate failure with empty string
1623         }
1624         pos.setIndex(i);
1625         text.extractBetween(start, i, result);
1626         return result;
1627     }
1628 };
1629 
TestSymbolTable()1630 void UnicodeSetTest::TestSymbolTable() {
1631     // Multiple test cases can be set up here.  Each test case
1632     // is terminated by null:
1633     // var, value, var, value,..., input pat., exp. output pat., null
1634     const char* DATA[] = {
1635         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1636         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1637         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1638         NULL
1639     };
1640 
1641     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1642         UErrorCode ec = U_ZERO_ERROR;
1643         TokenSymbolTable sym(ec);
1644         if (U_FAILURE(ec)) {
1645             errln("FAIL: couldn't construct TokenSymbolTable");
1646             continue;
1647         }
1648 
1649         // Set up variables
1650         while (DATA[i+2] != NULL) {
1651             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1652             if (U_FAILURE(ec)) {
1653                 errln("FAIL: couldn't add to TokenSymbolTable");
1654                 continue;
1655             }
1656             i += 2;
1657         }
1658 
1659         // Input pattern and expected output pattern
1660         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1661         i += 2;
1662 
1663         ParsePosition pos(0);
1664         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1665         if (U_FAILURE(ec)) {
1666             errln("FAIL: couldn't construct UnicodeSet");
1667             continue;
1668         }
1669 
1670         // results
1671         if (pos.getIndex() != inpat.length()) {
1672             errln((UnicodeString)"Failed to read to end of string \""
1673                   + inpat + "\": read to "
1674                   + pos.getIndex() + ", length is "
1675                   + inpat.length());
1676         }
1677 
1678         UnicodeSet us2(exppat, ec);
1679         if (U_FAILURE(ec)) {
1680             errln("FAIL: couldn't construct expected UnicodeSet");
1681             continue;
1682         }
1683 
1684         UnicodeString a, b;
1685         if (us != us2) {
1686             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1687                   ", expected " + us2.toPattern(b, TRUE));
1688         } else {
1689             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1690         }
1691     }
1692 }
1693 
TestSurrogate()1694 void UnicodeSetTest::TestSurrogate() {
1695     const char* DATA[] = {
1696         // These should all behave identically
1697         "[abc\\uD800\\uDC00]",
1698         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1699         "[abc\\U00010000]",
1700         0
1701     };
1702     for (int i=0; DATA[i] != 0; ++i) {
1703         UErrorCode ec = U_ZERO_ERROR;
1704         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1705         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1706         UnicodeSet set(str, ec);
1707         if (U_FAILURE(ec)) {
1708             errln("FAIL: UnicodeSet constructor");
1709             continue;
1710         }
1711         expectContainment(set,
1712                           CharsToUnicodeString("abc\\U00010000"),
1713                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1714         if (set.size() != 4) {
1715             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1716                   set.size() + ", expected 4");
1717         }
1718 
1719         {
1720           UErrorCode subErr = U_ZERO_ERROR;
1721           checkRoundTrip(set);
1722           checkSerializeRoundTrip(set, subErr);
1723         }
1724     }
1725 }
1726 
TestExhaustive()1727 void UnicodeSetTest::TestExhaustive() {
1728     // exhaustive tests. Simulate UnicodeSets with integers.
1729     // That gives us very solid tests (except for large memory tests).
1730 
1731     int32_t limit = 128;
1732 
1733     UnicodeSet x, y, z, aa;
1734 
1735     for (int32_t i = 0; i < limit; ++i) {
1736         bitsToSet(i, x);
1737         logln((UnicodeString)"Testing " + i + ", " + x);
1738         _testComplement(i, x, y);
1739 
1740         UnicodeSet &toTest = bitsToSet(i, aa);
1741 
1742         // AS LONG AS WE ARE HERE, check roundtrip
1743         checkRoundTrip(toTest);
1744         UErrorCode ec = U_ZERO_ERROR;
1745         checkSerializeRoundTrip(toTest, ec);
1746 
1747         for (int32_t j = 0; j < limit; ++j) {
1748             _testAdd(i,j,  x,y,z);
1749             _testXor(i,j,  x,y,z);
1750             _testRetain(i,j,  x,y,z);
1751             _testRemove(i,j,  x,y,z);
1752         }
1753     }
1754 }
1755 
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1756 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1757     bitsToSet(a, x);
1758     z = x;
1759     z.complement();
1760     int32_t c = setToBits(z);
1761     if (c != (~a)) {
1762         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1763         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1764     }
1765     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1766 }
1767 
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1768 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1769     bitsToSet(a, x);
1770     bitsToSet(b, y);
1771     z = x;
1772     z.addAll(y);
1773     int32_t c = setToBits(z);
1774     if (c != (a | b)) {
1775         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1776         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1777     }
1778     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1779 }
1780 
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1781 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1782     bitsToSet(a, x);
1783     bitsToSet(b, y);
1784     z = x;
1785     z.retainAll(y);
1786     int32_t c = setToBits(z);
1787     if (c != (a & b)) {
1788         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1789         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1790     }
1791     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1792 }
1793 
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1794 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1795     bitsToSet(a, x);
1796     bitsToSet(b, y);
1797     z = x;
1798     z.removeAll(y);
1799     int32_t c = setToBits(z);
1800     if (c != (a &~ b)) {
1801         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1802         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1803     }
1804     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1805 }
1806 
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1807 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1808     bitsToSet(a, x);
1809     bitsToSet(b, y);
1810     z = x;
1811     z.complementAll(y);
1812     int32_t c = setToBits(z);
1813     if (c != (a ^ b)) {
1814         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1815         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1816     }
1817     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1818 }
1819 
1820 /**
1821  * Check that ranges are monotonically increasing and non-
1822  * overlapping.
1823  */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1824 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1825     int32_t n = set.getRangeCount();
1826     if (n < 0) {
1827         errln((UnicodeString)"FAIL result of " + msg +
1828               ": range count should be >= 0 but is " +
1829               n /*+ " for " + set.toPattern())*/);
1830         return;
1831     }
1832     UChar32 last = 0;
1833     for (int32_t i=0; i<n; ++i) {
1834         UChar32 start = set.getRangeStart(i);
1835         UChar32 end = set.getRangeEnd(i);
1836         if (start > end) {
1837             errln((UnicodeString)"FAIL result of " + msg +
1838                   ": range " + (i+1) +
1839                   " start > end: " + (int)start + ", " + (int)end +
1840                   " for " + set);
1841         }
1842         if (i > 0 && start <= last) {
1843             errln((UnicodeString)"FAIL result of " + msg +
1844                   ": range " + (i+1) +
1845                   " overlaps previous range: " + (int)start + ", " + (int)end +
1846                   " for " + set);
1847         }
1848         last = end;
1849     }
1850 }
1851 
1852 /**
1853  * Convert a bitmask to a UnicodeSet.
1854  */
bitsToSet(int32_t a,UnicodeSet & result)1855 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1856     result.clear();
1857     for (UChar32 i = 0; i < 32; ++i) {
1858         if ((a & (1<<i)) != 0) {
1859             result.add(i);
1860         }
1861     }
1862     return result;
1863 }
1864 
1865 /**
1866  * Convert a UnicodeSet to a bitmask.  Only the characters
1867  * U+0000 to U+0020 are represented in the bitmask.
1868  */
setToBits(const UnicodeSet & x)1869 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1870     int32_t result = 0;
1871     for (int32_t i = 0; i < 32; ++i) {
1872         if (x.contains((UChar32)i)) {
1873             result |= (1<<i);
1874         }
1875     }
1876     return result;
1877 }
1878 
1879 /**
1880  * Return the representation of an inversion list based UnicodeSet
1881  * as a pairs list.  Ranges are listed in ascending Unicode order.
1882  * For example, the set [a-zA-M3] is represented as "33AMaz".
1883  */
getPairs(const UnicodeSet & set)1884 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1885     UnicodeString pairs;
1886     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1887         UChar32 start = set.getRangeStart(i);
1888         UChar32 end = set.getRangeEnd(i);
1889         if (end > 0xFFFF) {
1890             end = 0xFFFF;
1891             i = set.getRangeCount(); // Should be unnecessary
1892         }
1893         pairs.append((UChar)start).append((UChar)end);
1894     }
1895     return pairs;
1896 }
1897 
1898 /**
1899  * Basic consistency check for a few items.
1900  * That the iterator works, and that we can create a pattern and
1901  * get the same thing back
1902  */
checkRoundTrip(const UnicodeSet & s)1903 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1904     {
1905         UnicodeSet t(s);
1906         checkEqual(s, t, "copy ct");
1907     }
1908 
1909     {
1910         UnicodeSet t(0xabcd, 0xdef0);  // dummy contents should be overwritten
1911         t = s;
1912         checkEqual(s, t, "operator=");
1913     }
1914 
1915     {
1916         UnicodeSet t;
1917         copyWithIterator(t, s, FALSE);
1918         checkEqual(s, t, "iterator roundtrip");
1919     }
1920 
1921     {
1922         UnicodeSet t;
1923         copyWithIterator(t, s, TRUE); // try range
1924         checkEqual(s, t, "iterator roundtrip");
1925     }
1926 
1927     {
1928         UnicodeSet t;
1929         UnicodeString pat;
1930         UErrorCode ec = U_ZERO_ERROR;
1931         s.toPattern(pat, FALSE);
1932         t.applyPattern(pat, ec);
1933         if (U_FAILURE(ec)) {
1934             errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
1935             return;
1936         } else {
1937             checkEqual(s, t, "toPattern(false)");
1938         }
1939     }
1940 
1941     {
1942         UnicodeSet t;
1943         UnicodeString pat;
1944         UErrorCode ec = U_ZERO_ERROR;
1945         s.toPattern(pat, TRUE);
1946         t.applyPattern(pat, ec);
1947         if (U_FAILURE(ec)) {
1948             errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
1949             return;
1950         } else {
1951             checkEqual(s, t, "toPattern(true)");
1952         }
1953     }
1954 }
1955 
checkSerializeRoundTrip(const UnicodeSet & t,UErrorCode & status)1956 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
1957   if(U_FAILURE(status)) return;
1958   int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1959   if(status == U_BUFFER_OVERFLOW_ERROR) {
1960     status = U_ZERO_ERROR;
1961     serializeBuffer.resize(len);
1962     len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1963     // let 2nd error stand
1964   }
1965   if(U_FAILURE(status)) {
1966     errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
1967     return;
1968   }
1969   UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
1970   if(U_FAILURE(status)) {
1971     errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
1972     return;
1973   }
1974 
1975   checkEqual(t, deserialized, "Set was unequal when deserialized");
1976 }
1977 
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)1978 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1979     t.clear();
1980     UnicodeSetIterator it(s);
1981     if (withRange) {
1982         while (it.nextRange()) {
1983             if (it.isString()) {
1984                 t.add(it.getString());
1985             } else {
1986                 t.add(it.getCodepoint(), it.getCodepointEnd());
1987             }
1988         }
1989     } else {
1990         while (it.next()) {
1991             if (it.isString()) {
1992                 t.add(it.getString());
1993             } else {
1994                 t.add(it.getCodepoint());
1995             }
1996         }
1997     }
1998 }
1999 
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)2000 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2001   assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2002   assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2003     UnicodeString source; s.toPattern(source, TRUE);
2004     UnicodeString result; t.toPattern(result, TRUE);
2005     if (s != t) {
2006         errln((UnicodeString)"FAIL: " + message
2007               + "; source = " + source
2008               + "; result = " + result
2009               );
2010         return FALSE;
2011     } else {
2012         logln((UnicodeString)"Ok: " + message
2013               + "; source = " + source
2014               + "; result = " + result
2015               );
2016     }
2017     return TRUE;
2018 }
2019 
2020 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)2021 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2022                                   const UnicodeString& charsIn,
2023                                   const UnicodeString& charsOut) {
2024     UErrorCode ec = U_ZERO_ERROR;
2025     UnicodeSet set(pat, ec);
2026     if (U_FAILURE(ec)) {
2027         dataerrln((UnicodeString)"FAIL: pattern \"" +
2028               pat + "\" => " + u_errorName(ec));
2029         return;
2030     }
2031     expectContainment(set, pat, charsIn, charsOut);
2032 }
2033 
2034 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)2035 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2036                                   const UnicodeString& charsIn,
2037                                   const UnicodeString& charsOut) {
2038     UnicodeString pat;
2039     set.toPattern(pat);
2040     expectContainment(set, pat, charsIn, charsOut);
2041 }
2042 
2043 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)2044 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2045                                   const UnicodeString& setName,
2046                                   const UnicodeString& charsIn,
2047                                   const UnicodeString& charsOut) {
2048     UnicodeString bad;
2049     UChar32 c;
2050     int32_t i;
2051 
2052     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2053         c = charsIn.char32At(i);
2054         if (!set.contains(c)) {
2055             bad.append(c);
2056         }
2057     }
2058     if (bad.length() > 0) {
2059         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2060               ", expected containment of " + prettify(charsIn));
2061     } else {
2062         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2063     }
2064 
2065     bad.truncate(0);
2066     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2067         c = charsOut.char32At(i);
2068         if (set.contains(c)) {
2069             bad.append(c);
2070         }
2071     }
2072     if (bad.length() > 0) {
2073         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2074               ", expected non-containment of " + prettify(charsOut));
2075     } else {
2076         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2077     }
2078 }
2079 
2080 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2081 UnicodeSetTest::expectPattern(UnicodeSet& set,
2082                               const UnicodeString& pattern,
2083                               const UnicodeString& expectedPairs){
2084     UErrorCode status = U_ZERO_ERROR;
2085     set.applyPattern(pattern, status);
2086     if (U_FAILURE(status)) {
2087         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2088               "\") failed");
2089         return;
2090     } else {
2091         if (getPairs(set) != expectedPairs ) {
2092             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2093                   "\") => pairs \"" +
2094                   escape(getPairs(set)) + "\", expected \"" +
2095                   escape(expectedPairs) + "\"");
2096         } else {
2097             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2098                   "\") => pairs \"" +
2099                   escape(getPairs(set)) + "\"");
2100         }
2101     }
2102     // the result of calling set.toPattern(), which is the string representation of
2103     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2104     // will produce another set that is equal to this one.
2105     UnicodeString temppattern;
2106     set.toPattern(temppattern);
2107     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2108     if (U_FAILURE(status)) {
2109         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2110         return;
2111     }
2112     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2113         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2114             escape(getPairs(set)) + "\""));
2115     } else{
2116         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2117     }
2118 
2119     delete tempset;
2120 
2121 }
2122 
2123 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2124 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2125     if (getPairs(set) != expectedPairs) {
2126         errln(UnicodeString("FAIL: Expected pair list \"") +
2127               escape(expectedPairs) + "\", got \"" +
2128               escape(getPairs(set)) + "\"");
2129     }
2130 }
2131 
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2132 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2133                                      const UnicodeString& expPat,
2134                                      const char** expStrings) {
2135     UnicodeString pat;
2136     set.toPattern(pat, TRUE);
2137     if (pat == expPat) {
2138         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2139     } else {
2140         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2141         return;
2142     }
2143     if (expStrings == NULL) {
2144         return;
2145     }
2146     UBool in = TRUE;
2147     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2148         if (expStrings[i] == NOT) { // sic; pointer comparison
2149             in = FALSE;
2150             continue;
2151         }
2152         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2153         UBool contained = set.contains(s);
2154         if (contained == in) {
2155             logln((UnicodeString)"Ok: " + expPat +
2156                   (contained ? " contains {" : " does not contain {") +
2157                   escape(expStrings[i]) + "}");
2158         } else {
2159             errln((UnicodeString)"FAIL: " + expPat +
2160                   (contained ? " contains {" : " does not contain {") +
2161                   escape(expStrings[i]) + "}");
2162         }
2163     }
2164 }
2165 
toHexString(int32_t i)2166 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2167 
2168 void
doAssert(UBool condition,const char * message)2169 UnicodeSetTest::doAssert(UBool condition, const char *message)
2170 {
2171     if (!condition) {
2172         errln(UnicodeString("ERROR : ") + message);
2173     }
2174 }
2175 
2176 UnicodeString
escape(const UnicodeString & s)2177 UnicodeSetTest::escape(const UnicodeString& s) {
2178     UnicodeString buf;
2179     for (int32_t i=0; i<s.length(); )
2180     {
2181         UChar32 c = s.char32At(i);
2182         if (0x0020 <= c && c <= 0x007F) {
2183             buf += c;
2184         } else {
2185             if (c <= 0xFFFF) {
2186                 buf += (UChar)0x5c; buf += (UChar)0x75;
2187             } else {
2188                 buf += (UChar)0x5c; buf += (UChar)0x55;
2189                 buf += toHexString((c & 0xF0000000) >> 28);
2190                 buf += toHexString((c & 0x0F000000) >> 24);
2191                 buf += toHexString((c & 0x00F00000) >> 20);
2192                 buf += toHexString((c & 0x000F0000) >> 16);
2193             }
2194             buf += toHexString((c & 0xF000) >> 12);
2195             buf += toHexString((c & 0x0F00) >> 8);
2196             buf += toHexString((c & 0x00F0) >> 4);
2197             buf += toHexString(c & 0x000F);
2198         }
2199         i += U16_LENGTH(c);
2200     }
2201     return buf;
2202 }
2203 
/**
 * Exercise the freeze support of UnicodeSet.
 *
 * A reference set (idSet) is copied and the copy is frozen.  The test then
 * checks isFrozen(), copy construction, assignment, clone() and
 * cloneAsThawed(), and finally calls the mutating functions on the frozen
 * set, requiring after each group that the frozen set still equals idSet.
 *
 * The statements are order-dependent: every later check relies on "frozen"
 * still being equal to idSet.
 */
void UnicodeSetTest::TestFreezable() {
    UErrorCode errorCode=U_ZERO_ERROR;
    UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
    UnicodeSet idSet(idPattern, errorCode);
    if(U_FAILURE(errorCode)) {
        // Reported with dataerrln(), as are the file's other pattern-construction failures.
        dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
        return;
    }

    // A second, different set, used below as the source of attempted modifications.
    UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
    UnicodeSet wsSet(wsPattern, errorCode);
    if(U_FAILURE(errorCode)) {
        dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
        return;
    }

    // Add the pattern text itself to idSet, then copy idSet and freeze the copy.
    idSet.add(idPattern);
    UnicodeSet frozen(idSet);
    frozen.freeze();

    // Freezing the copy must not freeze the original, and must not change contents.
    if(idSet.isFrozen() || !frozen.isFrozen()) {
        errln("FAIL: isFrozen() is wrong");
    }
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: a copy-constructed frozen set differs from its original");
    }

    // Assigning to a frozen set is expected to leave it unchanged.
    frozen=wsSet;
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: a frozen set was modified by operator=");
    }

    // Copy construction from a frozen set: same contents, and still frozen.
    UnicodeSet frozen2(frozen);
    if(frozen2!=frozen || frozen2!=idSet) {
        errln("FAIL: a copied frozen set differs from its frozen original");
    }
    if(!frozen2.isFrozen()) {
        errln("FAIL: copy-constructing a frozen set results in a thawed one");
    }
    UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
    if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
        errln("FAIL: UnicodeSet(5, 55) failed");
    }
    // Assignment from a frozen set into a not-frozen one: the target is expected to become frozen.
    frozen3=frozen;
    if(!frozen3.isFrozen()) {
        errln("FAIL: copying a frozen set results in a thawed one");
    }

    // clone(): expected to be frozen, equal, and to ignore add().
    UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
    if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
        errln("FAIL: clone() failed");
    }
    cloned->add(0xd802, 0xd805);
    if(cloned->containsSome(0xd802, 0xd805)) {
        // NOTE(review): this branch fires when the add() took effect on the frozen
        // clone, so the message text reads inverted relative to the condition.
        errln("FAIL: unable to modify clone");
    }
    delete cloned;

    // cloneAsThawed(): expected to be not frozen, equal, and to accept add().
    UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
    if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
        errln("FAIL: cloneAsThawed() failed");
    }
    thawed->add(0xd802, 0xd805);
    if(!thawed->contains(0xd802, 0xd805)) {
        errln("FAIL: unable to modify thawed clone");
    }
    delete thawed;

    // From here on: call each group of mutators on the frozen set and
    // require that it still equals idSet afterwards.
    frozen.set(5, 55);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::set() modified a frozen set");
    }

    frozen.clear();
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::clear() modified a frozen set");
    }

    frozen.closeOver(USET_CASE_INSENSITIVE);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
    }

    frozen.compact();
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::compact() modified a frozen set");
    }

    // The applyXYZ() family, chained; only the final contents are checked
    // (errorCode is not inspected after these calls).
    ParsePosition pos;
    frozen.
        applyPattern(wsPattern, errorCode).
        applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
        applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
        applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
        applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
    }

    // The add family: code point, range, string, and set overloads.
    frozen.
        add(0xd800).
        add(0xd802, 0xd805).
        add(wsPattern).
        addAll(idPattern).
        addAll(wsSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
    }

    // The retain family.
    frozen.
        retain(0x62).
        retain(0x64, 0x69).
        retainAll(wsPattern).
        retainAll(wsSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
    }

    // The remove family.
    frozen.
        remove(0x62).
        remove(0x64, 0x69).
        remove(idPattern).
        removeAll(idPattern).
        removeAll(idSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
    }

    // The complement family.
    frozen.
        complement().
        complement(0x62).
        complement(0x64, 0x69).
        complement(idPattern).
        complementAll(idPattern).
        complementAll(idSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
    }
}
2343 
2344 // Test span() etc. -------------------------------------------------------- ***
2345 
2346 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2347 static int32_t
appendUTF8(const UChar * s,int32_t length,char * t,int32_t capacity)2348 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2349     UErrorCode errorCode=U_ZERO_ERROR;
2350     int32_t length8=0;
2351     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2352     if(U_SUCCESS(errorCode)) {
2353         return length8;
2354     } else {
2355         // The string contains an unpaired surrogate.
2356         // Ignore this string.
2357         return 0;
2358     }
2359 }
2360 
2361 class UnicodeSetWithStringsIterator;
2362 
2363 // Make the strings in a UnicodeSet easily accessible.
2364 class UnicodeSetWithStrings {
2365 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2366     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2367             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2368         int32_t size=set.size();
2369         if(size>0 && set.charAt(size-1)<0) {
2370             // If a set's last element is not a code point, then it must contain strings.
2371             // Iterate over the set, skip all code point ranges, and cache the strings.
2372             // Convert them to UTF-8 for spanUTF8().
2373             UnicodeSetIterator iter(set);
2374             const UnicodeString *s;
2375             char *s8=utf8;
2376             int32_t length8, utf8Count=0;
2377             while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2378                 if(iter.isString()) {
2379                     // Store the pointer to the set's string element
2380                     // which we happen to know is a stable pointer.
2381                     strings[stringsLength]=s=&iter.getString();
2382                     utf8Count+=
2383                         utf8Lengths[stringsLength]=length8=
2384                         appendUTF8(s->getBuffer(), s->length(),
2385                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2386                     if(length8==0) {
2387                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
2388                     }
2389                     s8+=length8;
2390                     ++stringsLength;
2391                 }
2392             }
2393         }
2394     }
2395 
getSet() const2396     const UnicodeSet &getSet() const {
2397         return set;
2398     }
2399 
hasStrings() const2400     UBool hasStrings() const {
2401         return (UBool)(stringsLength>0);
2402     }
2403 
hasStringsWithSurrogates() const2404     UBool hasStringsWithSurrogates() const {
2405         return hasSurrogates;
2406     }
2407 
2408 private:
2409     friend class UnicodeSetWithStringsIterator;
2410 
2411     const UnicodeSet &set;
2412 
2413     const UnicodeString *strings[20];
2414     int32_t stringsLength;
2415     UBool hasSurrogates;
2416 
2417     char utf8[1024];
2418     int32_t utf8Lengths[20];
2419 };
2420 
2421 class UnicodeSetWithStringsIterator {
2422 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2423     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2424             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2425     }
2426 
reset()2427     void reset() {
2428         nextStringIndex=nextUTF8Start=0;
2429     }
2430 
nextString()2431     const UnicodeString *nextString() {
2432         if(nextStringIndex<fSet.stringsLength) {
2433             return fSet.strings[nextStringIndex++];
2434         } else {
2435             return NULL;
2436         }
2437     }
2438 
2439     // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2440     const char *nextUTF8(int32_t &length) {
2441         if(nextStringIndex<fSet.stringsLength) {
2442             const char *s8=fSet.utf8+nextUTF8Start;
2443             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2444             return s8;
2445         } else {
2446             length=0;
2447             return NULL;
2448         }
2449     }
2450 
2451 private:
2452     const UnicodeSetWithStrings &fSet;
2453     int32_t nextStringIndex;
2454     int32_t nextUTF8Start;
2455 };
2456 
2457 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2458 // at code point boundaries.
2459 // That is, each edge of a match must not be in the middle of a surrogate pair.
2460 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UnicodeString & t)2461 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2462     s+=start;
2463     limit-=start;
2464     int32_t length=t.length();
2465     return 0==t.compare(s, length) &&
2466            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2467            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2468 }
2469 
2470 // Implement span() with contains() for comparison.
containsSpanUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2471 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2472                                  USetSpanCondition spanCondition) {
2473     const UnicodeSet &realSet(set.getSet());
2474     if(!set.hasStrings()) {
2475         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2476             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2477         }
2478 
2479         UChar32 c;
2480         int32_t start=0, prev;
2481         while((prev=start)<length) {
2482             U16_NEXT(s, start, length, c);
2483             if(realSet.contains(c)!=spanCondition) {
2484                 break;
2485             }
2486         }
2487         return prev;
2488     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2489         UnicodeSetWithStringsIterator iter(set);
2490         UChar32 c;
2491         int32_t start, next;
2492         for(start=next=0; start<length;) {
2493             U16_NEXT(s, next, length, c);
2494             if(realSet.contains(c)) {
2495                 break;
2496             }
2497             const UnicodeString *str;
2498             iter.reset();
2499             while((str=iter.nextString())!=NULL) {
2500                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2501                     // spanNeedsStrings=TRUE;
2502                     return start;
2503                 }
2504             }
2505             start=next;
2506         }
2507         return start;
2508     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2509         UnicodeSetWithStringsIterator iter(set);
2510         UChar32 c;
2511         int32_t start, next, maxSpanLimit=0;
2512         for(start=next=0; start<length;) {
2513             U16_NEXT(s, next, length, c);
2514             if(!realSet.contains(c)) {
2515                 next=start;  // Do not span this single, not-contained code point.
2516             }
2517             const UnicodeString *str;
2518             iter.reset();
2519             while((str=iter.nextString())!=NULL) {
2520                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2521                     // spanNeedsStrings=TRUE;
2522                     int32_t matchLimit=start+str->length();
2523                     if(matchLimit==length) {
2524                         return length;
2525                     }
2526                     if(spanCondition==USET_SPAN_CONTAINED) {
2527                         // Iterate for the shortest match at each position.
2528                         // Recurse for each but the shortest match.
2529                         if(next==start) {
2530                             next=matchLimit;  // First match from start.
2531                         } else {
2532                             if(matchLimit<next) {
2533                                 // Remember shortest match from start for iteration.
2534                                 int32_t temp=next;
2535                                 next=matchLimit;
2536                                 matchLimit=temp;
2537                             }
2538                             // Recurse for non-shortest match from start.
2539                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2540                                                                  USET_SPAN_CONTAINED);
2541                             if((matchLimit+spanLength)>maxSpanLimit) {
2542                                 maxSpanLimit=matchLimit+spanLength;
2543                                 if(maxSpanLimit==length) {
2544                                     return length;
2545                                 }
2546                             }
2547                         }
2548                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2549                         if(matchLimit>next) {
2550                             // Remember longest match from start.
2551                             next=matchLimit;
2552                         }
2553                     }
2554                 }
2555             }
2556             if(next==start) {
2557                 break;  // No match from start.
2558             }
2559             start=next;
2560         }
2561         if(start>maxSpanLimit) {
2562             return start;
2563         } else {
2564             return maxSpanLimit;
2565         }
2566     }
2567 }
2568 
containsSpanBackUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2569 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2570                                      USetSpanCondition spanCondition) {
2571     if(length==0) {
2572         return 0;
2573     }
2574     const UnicodeSet &realSet(set.getSet());
2575     if(!set.hasStrings()) {
2576         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2577             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2578         }
2579 
2580         UChar32 c;
2581         int32_t prev=length;
2582         do {
2583             U16_PREV(s, 0, length, c);
2584             if(realSet.contains(c)!=spanCondition) {
2585                 break;
2586             }
2587         } while((prev=length)>0);
2588         return prev;
2589     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2590         UnicodeSetWithStringsIterator iter(set);
2591         UChar32 c;
2592         int32_t prev=length, length0=length;
2593         do {
2594             U16_PREV(s, 0, length, c);
2595             if(realSet.contains(c)) {
2596                 break;
2597             }
2598             const UnicodeString *str;
2599             iter.reset();
2600             while((str=iter.nextString())!=NULL) {
2601                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2602                     // spanNeedsStrings=TRUE;
2603                     return prev;
2604                 }
2605             }
2606         } while((prev=length)>0);
2607         return prev;
2608     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2609         UnicodeSetWithStringsIterator iter(set);
2610         UChar32 c;
2611         int32_t prev=length, minSpanStart=length, length0=length;
2612         do {
2613             U16_PREV(s, 0, length, c);
2614             if(!realSet.contains(c)) {
2615                 length=prev;  // Do not span this single, not-contained code point.
2616             }
2617             const UnicodeString *str;
2618             iter.reset();
2619             while((str=iter.nextString())!=NULL) {
2620                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2621                     // spanNeedsStrings=TRUE;
2622                     int32_t matchStart=prev-str->length();
2623                     if(matchStart==0) {
2624                         return 0;
2625                     }
2626                     if(spanCondition==USET_SPAN_CONTAINED) {
2627                         // Iterate for the shortest match at each position.
2628                         // Recurse for each but the shortest match.
2629                         if(length==prev) {
2630                             length=matchStart;  // First match from prev.
2631                         } else {
2632                             if(matchStart>length) {
2633                                 // Remember shortest match from prev for iteration.
2634                                 int32_t temp=length;
2635                                 length=matchStart;
2636                                 matchStart=temp;
2637                             }
2638                             // Recurse for non-shortest match from prev.
2639                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2640                                                                     USET_SPAN_CONTAINED);
2641                             if(spanStart<minSpanStart) {
2642                                 minSpanStart=spanStart;
2643                                 if(minSpanStart==0) {
2644                                     return 0;
2645                                 }
2646                             }
2647                         }
2648                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2649                         if(matchStart<length) {
2650                             // Remember longest match from prev.
2651                             length=matchStart;
2652                         }
2653                     }
2654                 }
2655             }
2656             if(length==prev) {
2657                 break;  // No match from prev.
2658             }
2659         } while((prev=length)>0);
2660         if(prev<minSpanStart) {
2661             return prev;
2662         } else {
2663             return minSpanStart;
2664         }
2665     }
2666 }
2667 
containsSpanUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2668 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2669                                 USetSpanCondition spanCondition) {
2670     const UnicodeSet &realSet(set.getSet());
2671     if(!set.hasStrings()) {
2672         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2673             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2674         }
2675 
2676         UChar32 c;
2677         int32_t start=0, prev;
2678         while((prev=start)<length) {
2679             U8_NEXT_OR_FFFD(s, start, length, c);
2680             if(realSet.contains(c)!=spanCondition) {
2681                 break;
2682             }
2683         }
2684         return prev;
2685     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2686         UnicodeSetWithStringsIterator iter(set);
2687         UChar32 c;
2688         int32_t start, next;
2689         for(start=next=0; start<length;) {
2690             U8_NEXT_OR_FFFD(s, next, length, c);
2691             if(realSet.contains(c)) {
2692                 break;
2693             }
2694             const char *s8;
2695             int32_t length8;
2696             iter.reset();
2697             while((s8=iter.nextUTF8(length8))!=NULL) {
2698                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2699                     // spanNeedsStrings=TRUE;
2700                     return start;
2701                 }
2702             }
2703             start=next;
2704         }
2705         return start;
2706     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2707         UnicodeSetWithStringsIterator iter(set);
2708         UChar32 c;
2709         int32_t start, next, maxSpanLimit=0;
2710         for(start=next=0; start<length;) {
2711             U8_NEXT_OR_FFFD(s, next, length, c);
2712             if(!realSet.contains(c)) {
2713                 next=start;  // Do not span this single, not-contained code point.
2714             }
2715             const char *s8;
2716             int32_t length8;
2717             iter.reset();
2718             while((s8=iter.nextUTF8(length8))!=NULL) {
2719                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2720                     // spanNeedsStrings=TRUE;
2721                     int32_t matchLimit=start+length8;
2722                     if(matchLimit==length) {
2723                         return length;
2724                     }
2725                     if(spanCondition==USET_SPAN_CONTAINED) {
2726                         // Iterate for the shortest match at each position.
2727                         // Recurse for each but the shortest match.
2728                         if(next==start) {
2729                             next=matchLimit;  // First match from start.
2730                         } else {
2731                             if(matchLimit<next) {
2732                                 // Remember shortest match from start for iteration.
2733                                 int32_t temp=next;
2734                                 next=matchLimit;
2735                                 matchLimit=temp;
2736                             }
2737                             // Recurse for non-shortest match from start.
2738                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2739                                                                 USET_SPAN_CONTAINED);
2740                             if((matchLimit+spanLength)>maxSpanLimit) {
2741                                 maxSpanLimit=matchLimit+spanLength;
2742                                 if(maxSpanLimit==length) {
2743                                     return length;
2744                                 }
2745                             }
2746                         }
2747                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2748                         if(matchLimit>next) {
2749                             // Remember longest match from start.
2750                             next=matchLimit;
2751                         }
2752                     }
2753                 }
2754             }
2755             if(next==start) {
2756                 break;  // No match from start.
2757             }
2758             start=next;
2759         }
2760         if(start>maxSpanLimit) {
2761             return start;
2762         } else {
2763             return maxSpanLimit;
2764         }
2765     }
2766 }
2767 
containsSpanBackUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2768 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2769                                     USetSpanCondition spanCondition) {
2770     if(length==0) {
2771         return 0;
2772     }
2773     const UnicodeSet &realSet(set.getSet());
2774     if(!set.hasStrings()) {
2775         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2776             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2777         }
2778 
2779         UChar32 c;
2780         int32_t prev=length;
2781         do {
2782             U8_PREV_OR_FFFD(s, 0, length, c);
2783             if(realSet.contains(c)!=spanCondition) {
2784                 break;
2785             }
2786         } while((prev=length)>0);
2787         return prev;
2788     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2789         UnicodeSetWithStringsIterator iter(set);
2790         UChar32 c;
2791         int32_t prev=length;
2792         do {
2793             U8_PREV_OR_FFFD(s, 0, length, c);
2794             if(realSet.contains(c)) {
2795                 break;
2796             }
2797             const char *s8;
2798             int32_t length8;
2799             iter.reset();
2800             while((s8=iter.nextUTF8(length8))!=NULL) {
2801                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2802                     // spanNeedsStrings=TRUE;
2803                     return prev;
2804                 }
2805             }
2806         } while((prev=length)>0);
2807         return prev;
2808     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2809         UnicodeSetWithStringsIterator iter(set);
2810         UChar32 c;
2811         int32_t prev=length, minSpanStart=length;
2812         do {
2813             U8_PREV_OR_FFFD(s, 0, length, c);
2814             if(!realSet.contains(c)) {
2815                 length=prev;  // Do not span this single, not-contained code point.
2816             }
2817             const char *s8;
2818             int32_t length8;
2819             iter.reset();
2820             while((s8=iter.nextUTF8(length8))!=NULL) {
2821                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2822                     // spanNeedsStrings=TRUE;
2823                     int32_t matchStart=prev-length8;
2824                     if(matchStart==0) {
2825                         return 0;
2826                     }
2827                     if(spanCondition==USET_SPAN_CONTAINED) {
2828                         // Iterate for the shortest match at each position.
2829                         // Recurse for each but the shortest match.
2830                         if(length==prev) {
2831                             length=matchStart;  // First match from prev.
2832                         } else {
2833                             if(matchStart>length) {
2834                                 // Remember shortest match from prev for iteration.
2835                                 int32_t temp=length;
2836                                 length=matchStart;
2837                                 matchStart=temp;
2838                             }
2839                             // Recurse for non-shortest match from prev.
2840                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2841                                                                    USET_SPAN_CONTAINED);
2842                             if(spanStart<minSpanStart) {
2843                                 minSpanStart=spanStart;
2844                                 if(minSpanStart==0) {
2845                                     return 0;
2846                                 }
2847                             }
2848                         }
2849                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2850                         if(matchStart<length) {
2851                             // Remember longest match from prev.
2852                             length=matchStart;
2853                         }
2854                     }
2855                 }
2856             }
2857             if(length==prev) {
2858                 break;  // No match from prev.
2859             }
2860         } while((prev=length)>0);
2861         if(prev<minSpanStart) {
2862             return prev;
2863         } else {
2864             return minSpanStart;
2865         }
2866     }
2867 }
2868 
// spans to be performed and compared
// Bit flags for the whichSpans argument of getSpans() and testSpan().
// The flags come in groups; the last constant of each group is the mask
// that ORs together the flags of that group.
enum {
    // Text encodings.
    // NOTE(review): getSpans() and testSpan() do not look at these bits themselves;
    // presumably their callers use them to choose UTF-16 and/or UTF-8 -- confirm.
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =3,

    // Which sets testSpan() exercises: the original (even-indexed) sets
    // and/or the complemented (odd-indexed) sets.
    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =0xc,

    // Directions: getSpans() filters out the forward types (0..3) without SPAN_FWD
    // and the backward types (4..7) without SPAN_BACK.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =0x30,

    // Span conditions: getSpans() uses USET_SPAN_CONTAINED for even types
    // and USET_SPAN_SIMPLE for odd types, and filters out a type whose bit is not set.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =0x300,

    // All of the flags above.
    SPAN_ALL            =0x33f
};
2889 
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)2890 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2891     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2892 }
2893 
slen(const void * s,UBool isUTF16)2894 static inline int32_t slen(const void *s, UBool isUTF16) {
2895     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2896 }
2897 
2898 /*
2899  * Count spans on a string with the method according to type and set the span limits.
2900  * The set may be the complement of the original.
2901  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2902  * according to the expected number of spans.
2903  * Sets typeName to an empty string if there is no such type.
2904  * Returns -1 if the span option is filtered out.
2905  */
getSpans(const UnicodeSetWithStrings & set,UBool isComplement,const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int type,const char * & typeName,int32_t limits[],int32_t limitsCapacity,int32_t expectCount)2906 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2907                         const void *s, int32_t length, UBool isUTF16,
2908                         uint32_t whichSpans,
2909                         int type, const char *&typeName,
2910                         int32_t limits[], int32_t limitsCapacity,
2911                         int32_t expectCount) {
2912     const UnicodeSet &realSet(set.getSet());
2913     int32_t start, count;
2914     USetSpanCondition spanCondition, firstSpanCondition, contained;
2915     UBool isForward;
2916 
2917     if(type<0 || 7<type) {
2918         typeName="";
2919         return 0;
2920     }
2921 
2922     static const char *const typeNames16[]={
2923         "contains", "contains(LM)",
2924         "span", "span(LM)",
2925         "containsBack", "containsBack(LM)",
2926         "spanBack", "spanBack(LM)"
2927     };
2928 
2929     static const char *const typeNames8[]={
2930         "containsUTF8", "containsUTF8(LM)",
2931         "spanUTF8", "spanUTF8(LM)",
2932         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2933         "spanBackUTF8", "spanBackUTF8(LM)"
2934     };
2935 
2936     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2937 
2938     // filter span options
2939     if(type<=3) {
2940         // span forward
2941         if((whichSpans&SPAN_FWD)==0) {
2942             return -1;
2943         }
2944         isForward=TRUE;
2945     } else {
2946         // span backward
2947         if((whichSpans&SPAN_BACK)==0) {
2948             return -1;
2949         }
2950         isForward=FALSE;
2951     }
2952     if((type&1)==0) {
2953         // use USET_SPAN_CONTAINED
2954         if((whichSpans&SPAN_CONTAINED)==0) {
2955             return -1;
2956         }
2957         contained=USET_SPAN_CONTAINED;
2958     } else {
2959         // use USET_SPAN_SIMPLE
2960         if((whichSpans&SPAN_SIMPLE)==0) {
2961             return -1;
2962         }
2963         contained=USET_SPAN_SIMPLE;
2964     }
2965 
2966     // Default first span condition for going forward with an uncomplemented set.
2967     spanCondition=USET_SPAN_NOT_CONTAINED;
2968     if(isComplement) {
2969         spanCondition=invertSpanCondition(spanCondition, contained);
2970     }
2971 
2972     // First span condition for span(), used to terminate the spanBack() iteration.
2973     firstSpanCondition=spanCondition;
2974 
2975     // spanBack(): Its initial span condition is span()'s last span condition,
2976     // which is the opposite of span()'s first span condition
2977     // if we expect an even number of spans.
2978     // (The loop inverts spanCondition (expectCount-1) times
2979     // before the expectCount'th span() call.)
2980     // If we do not compare forward and backward directions, then we do not have an
2981     // expectCount and just start with firstSpanCondition.
2982     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2983         spanCondition=invertSpanCondition(spanCondition, contained);
2984     }
2985 
2986     count=0;
2987     switch(type) {
2988     case 0:
2989     case 1:
2990         start=0;
2991         if(length<0) {
2992             length=slen(s, isUTF16);
2993         }
2994         for(;;) {
2995             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2996                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2997             if(count<limitsCapacity) {
2998                 limits[count]=start;
2999             }
3000             ++count;
3001             if(start>=length) {
3002                 break;
3003             }
3004             spanCondition=invertSpanCondition(spanCondition, contained);
3005         }
3006         break;
3007     case 2:
3008     case 3:
3009         start=0;
3010         for(;;) {
3011             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
3012                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
3013             if(count<limitsCapacity) {
3014                 limits[count]=start;
3015             }
3016             ++count;
3017             if(length>=0 ? start>=length :
3018                            isUTF16 ? ((const UChar *)s)[start]==0 :
3019                                      ((const char *)s)[start]==0
3020             ) {
3021                 break;
3022             }
3023             spanCondition=invertSpanCondition(spanCondition, contained);
3024         }
3025         break;
3026     case 4:
3027     case 5:
3028         if(length<0) {
3029             length=slen(s, isUTF16);
3030         }
3031         for(;;) {
3032             ++count;
3033             if(count<=limitsCapacity) {
3034                 limits[limitsCapacity-count]=length;
3035             }
3036             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3037                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3038             if(length==0 && spanCondition==firstSpanCondition) {
3039                 break;
3040             }
3041             spanCondition=invertSpanCondition(spanCondition, contained);
3042         }
3043         if(count<limitsCapacity) {
3044             memmove(limits, limits+(limitsCapacity-count), count*4);
3045         }
3046         break;
3047     case 6:
3048     case 7:
3049         for(;;) {
3050             ++count;
3051             if(count<=limitsCapacity) {
3052                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3053             }
3054             // Note: Length<0 is tested only for the first spanBack().
3055             // If we wanted to keep length<0 for all spanBack()s, we would have to
3056             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3057             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3058                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
3059             if(length==0 && spanCondition==firstSpanCondition) {
3060                 break;
3061             }
3062             spanCondition=invertSpanCondition(spanCondition, contained);
3063         }
3064         if(count<limitsCapacity) {
3065             memmove(limits, limits+(limitsCapacity-count), count*4);
3066         }
3067         break;
3068     default:
3069         typeName="";
3070         return -1;
3071     }
3072 
3073     return count;
3074 }
3075 
// sets to be tested; odd index=isComplement
// These values index the sets[] array passed to testSpan() and the setNames[] table.
// NOTE(review): SLOW/FAST presumably stand for the unfrozen/frozen versions
// of the set that the testSpan() comment mentions -- confirm where sets[] is built.
enum {
    SLOW,
    SLOW_NOT,
    FAST,
    FAST_NOT,
    SET_COUNT  // number of sets, not a set index
};
3084 
// Names of the sets for failure messages, indexed by SLOW..FAST_NOT.
static const char *const setNames[SET_COUNT]={
    "slow",
    "slow.not",
    "fast",
    "fast.not"
};
3091 
3092 /*
3093  * Verify that we get the same results whether we look at text with contains(),
3094  * span() or spanBack(), using unfrozen or frozen versions of the set,
3095  * and using the set or its complement (switching the spanConditions accordingly).
3096  * The latter verifies that
3097  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3098  *
3099  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3100  * or returned to the caller (with an input expectCount<0).
3101  */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3102 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3103                               const void *s, int32_t length, UBool isUTF16,
3104                               uint32_t whichSpans,
3105                               int32_t expectLimits[], int32_t &expectCount,
3106                               const char *testName, int32_t index) {
3107     int32_t limits[500];
3108     int32_t limitsCount;
3109     int i, j;
3110 
3111     const char *typeName;
3112     int type;
3113 
3114     for(i=0; i<SET_COUNT; ++i) {
3115         if((i&1)==0) {
3116             // Even-numbered sets are original, uncomplemented sets.
3117             if((whichSpans&SPAN_SET)==0) {
3118                 continue;
3119             }
3120         } else {
3121             // Odd-numbered sets are complemented.
3122             if((whichSpans&SPAN_COMPLEMENT)==0) {
3123                 continue;
3124             }
3125         }
3126         for(type=0;; ++type) {
3127             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3128                                  s, length, isUTF16,
3129                                  whichSpans,
3130                                  type, typeName,
3131                                  limits, UPRV_LENGTHOF(limits), expectCount);
3132             if(typeName[0]==0) {
3133                 break; // All types tried.
3134             }
3135             if(limitsCount<0) {
3136                 continue; // Span option filtered out.
3137             }
3138             if(expectCount<0) {
3139                 expectCount=limitsCount;
3140                 if(limitsCount>UPRV_LENGTHOF(limits)) {
3141                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3142                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3143                     return;
3144                 }
3145                 memcpy(expectLimits, limits, limitsCount*4);
3146             } else if(limitsCount!=expectCount) {
3147                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3148                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3149             } else {
3150                 for(j=0; j<limitsCount; ++j) {
3151                     if(limits[j]!=expectLimits[j]) {
3152                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3153                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3154                               j, (long)limits[j], (long)expectLimits[j]);
3155                         break;
3156                     }
3157                 }
3158             }
3159         }
3160     }
3161 
3162     // Compare span() with containsAll()/containsNone(),
3163     // but only if we have expectLimits[] from the uncomplemented set.
3164     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3165         const UChar *s16=(const UChar *)s;
3166         UnicodeString string;
3167         int32_t prev=0, limit, length;
3168         for(i=0; i<expectCount; ++i) {
3169             limit=expectLimits[i];
3170             length=limit-prev;
3171             if(length>0) {
3172                 string.setTo(FALSE, s16+prev, length);  // read-only alias
3173                 if(i&1) {
3174                     if(!sets[SLOW]->getSet().containsAll(string)) {
3175                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3176                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3177                         return;
3178                     }
3179                     if(!sets[FAST]->getSet().containsAll(string)) {
3180                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3181                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3182                         return;
3183                     }
3184                 } else {
3185                     if(!sets[SLOW]->getSet().containsNone(string)) {
3186                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3187                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3188                         return;
3189                     }
3190                     if(!sets[FAST]->getSet().containsNone(string)) {
3191                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3192                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3193                         return;
3194                     }
3195                 }
3196             }
3197             prev=limit;
3198         }
3199     }
3200 }
3201 
3202 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3203 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3204                               const void *s, int32_t length, UBool isUTF16,
3205                               uint32_t whichSpans,
3206                               const char *testName, int32_t index) {
3207     int32_t expectLimits[500];
3208     int32_t expectCount=-1;
3209     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3210 }
3211 
stringContainsUnpairedSurrogate(const UChar * s,int32_t length)3212 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3213     UChar c, c2;
3214 
3215     if(length>=0) {
3216         while(length>0) {
3217             c=*s++;
3218             --length;
3219             if(0xd800<=c && c<0xe000) {
3220                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3221                     return TRUE;
3222                 }
3223                 --length;
3224             }
3225         }
3226     } else {
3227         while((c=*s++)!=0) {
3228             if(0xd800<=c && c<0xe000) {
3229                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3230                     return TRUE;
3231                 }
3232             }
3233         }
3234     }
3235     return FALSE;
3236 }
3237 
3238 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3239 // unless either UTF is turned off in whichSpans.
3240 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3241 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const UChar * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3242 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3243                                       const UChar *s16, int32_t length16,
3244                                       uint32_t whichSpans,
3245                                       const char *testName, int32_t index) {
3246     int32_t expectLimits[500];
3247     int32_t expectCount;
3248 
3249     expectCount=-1;  // Get expectLimits[] from testSpan().
3250 
3251     if((whichSpans&SPAN_UTF16)!=0) {
3252         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3253     }
3254     if((whichSpans&SPAN_UTF8)==0) {
3255         return;
3256     }
3257 
3258     // Convert s16[] and expectLimits[] to UTF-8.
3259     uint8_t s8[3000];
3260     int32_t offsets[3000];
3261 
3262     const UChar *s16Limit=s16+length16;
3263     char *t=(char *)s8;
3264     char *tLimit=t+sizeof(s8);
3265     int32_t *o=offsets;
3266     UErrorCode errorCode=U_ZERO_ERROR;
3267 
3268     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3269     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3270     if(U_FAILURE(errorCode)) {
3271         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3272               testName, (long)index, u_errorName(errorCode));
3273         ucnv_resetFromUnicode(utf8Cnv);
3274         return;
3275     }
3276     int32_t length8=(int32_t)(t-(char *)s8);
3277 
3278     // Convert expectLimits[].
3279     int32_t i, j, expect;
3280     for(i=j=0; i<expectCount; ++i) {
3281         expect=expectLimits[i];
3282         if(expect==length16) {
3283             expectLimits[i]=length8;
3284         } else {
3285             while(offsets[j]<expect) {
3286                 ++j;
3287             }
3288             expectLimits[i]=j;
3289         }
3290     }
3291 
3292     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3293 }
3294 
nextCodePoint(UChar32 c)3295 static UChar32 nextCodePoint(UChar32 c) {
3296     // Skip some large and boring ranges.
3297     switch(c) {
3298     case 0x3441:
3299         return 0x4d7f;
3300     case 0x5100:
3301         return 0x9f00;
3302     case 0xb040:
3303         return 0xd780;
3304     case 0xe041:
3305         return 0xf8fe;
3306     case 0x10100:
3307         return 0x20000;
3308     case 0x20041:
3309         return 0xe0000;
3310     case 0xe0101:
3311         return 0x10fffd;
3312     default:
3313         return c+1;
3314     }
3315 }
3316 
3317 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3318 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3319     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3320     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3321     // Skip the UTF-8 part of the test - if the string contains surrogates -
3322     // because it is likely to produce a different result.
3323     UBool inconsistentSurrogates=
3324             (!(sets[0]->getSet().contains(0xfffd) ?
3325                sets[0]->getSet().contains(0xd800, 0xdfff) :
3326                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3327              sets[0]->hasStringsWithSurrogates());
3328 
3329     UChar s[1000];
3330     int32_t length=0;
3331     uint32_t localWhichSpans;
3332 
3333     UChar32 c, first;
3334     for(first=c=0;; c=nextCodePoint(c)) {
3335         if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3336             localWhichSpans=whichSpans;
3337             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3338                 localWhichSpans&=~SPAN_UTF8;
3339             }
3340             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3341             if(c>0x10ffff) {
3342                 break;
3343             }
3344             length=0;
3345             first=c;
3346         }
3347         U16_APPEND_UNSAFE(s, length, c);
3348     }
3349 }
3350 
3351 // Test with a particular, interesting string.
3352 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3353 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3354     static const UChar s[]={
3355         0x61, 0x62, 0x20,                       // Latin, space
3356         0x3b1, 0x3b2, 0x3b3,                    // Greek
3357         0xd900,                                 // lead surrogate
3358         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3359         0xdc05,                                 // trail surrogate
3360         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3361         0xd900, 0xdc05,                         // unassigned supplementary
3362         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3363         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3364         0                                       // NUL
3365     };
3366 
3367     if((whichSpans&SPAN_UTF16)==0) {
3368         return;
3369     }
3370     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3371     testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3372 }
3373 
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3374 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3375     static const char s[]={
3376         "abc"                                   // Latin
3377 
3378         /* trail byte in lead position */
3379         "\x80"
3380 
3381         " "                                     // space
3382 
3383         /* truncated multi-byte sequences */
3384         "\xd0"
3385         "\xe0"
3386         "\xe1"
3387         "\xed"
3388         "\xee"
3389         "\xf0"
3390         "\xf1"
3391         "\xf4"
3392         "\xf8"
3393         "\xfc"
3394 
3395         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3396 
3397         /* trail byte in lead position */
3398         "\x80"
3399 
3400         "\xe0\x80"
3401         "\xe0\xa0"
3402         "\xe1\x80"
3403         "\xed\x80"
3404         "\xed\xa0"
3405         "\xee\x80"
3406         "\xf0\x80"
3407         "\xf0\x90"
3408         "\xf1\x80"
3409         "\xf4\x80"
3410         "\xf4\x90"
3411         "\xf8\x80"
3412         "\xfc\x80"
3413 
3414         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3415 
3416         /* trail byte in lead position */
3417         "\x80"
3418 
3419         "\xf0\x80\x80"
3420         "\xf0\x90\x80"
3421         "\xf1\x80\x80"
3422         "\xf4\x80\x80"
3423         "\xf4\x90\x80"
3424         "\xf8\x80\x80"
3425         "\xfc\x80\x80"
3426 
3427         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3428 
3429         /* trail byte in lead position */
3430         "\x80"
3431 
3432         "\xf8\x80\x80\x80"
3433         "\xfc\x80\x80\x80"
3434 
3435         "\xF1\x90\x80\x85"                      // unassigned supplementary
3436 
3437         /* trail byte in lead position */
3438         "\x80"
3439 
3440         "\xfc\x80\x80\x80\x80"
3441 
3442         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3443 
3444         /* trail byte in lead position */
3445         "\x80"
3446 
3447         /* complete sequences but non-shortest forms or out of range etc. */
3448         "\xc0\x80"
3449         "\xe0\x80\x80"
3450         "\xed\xa0\x80"
3451         "\xf0\x80\x80\x80"
3452         "\xf4\x90\x80\x80"
3453         "\xf8\x80\x80\x80\x80"
3454         "\xfc\x80\x80\x80\x80\x80"
3455         "\xfe"
3456         "\xff"
3457 
3458         /* trail byte in lead position */
3459         "\x80"
3460 
3461         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3462     };
3463 
3464     if((whichSpans&SPAN_UTF8)==0) {
3465         return;
3466     }
3467     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3468     testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3469 }
3470 
// Multiply a list of span option words so that each portion of the
// resulting list carries exactly one of the alternatives a, b and c.
//
// Each of the first whichSpansCount entries is reduced to its bits in mask.
// The entry itself then gets a added; if b!=0 a copy with b is stored
// whichSpansCount entries further on; if also c!=0 a copy with c is stored
// 2*whichSpansCount entries further on.
// c is ignored when b==0.
//
// The caller must provide room in whichSpans[] for the additional portions.
// Returns the new number of entries: 1, 2 or 3 times whichSpansCount.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // How many portions the list has after this call.
    int32_t portions;
    if(b==0) {
        portions=1;
    } else if(c==0) {
        portions=2;
    } else {
        portions=3;
    }

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(portions>=2) {
            whichSpans[whichSpansCount+k]=kept|b;
        }
        if(portions==3) {
            whichSpans[2*whichSpansCount+k]=kept|c;
        }
    }
    return portions*whichSpansCount;
}
3493 
3494 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3495 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3496 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3497 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3498 
TestSpan()3499 void UnicodeSetTest::TestSpan() {
3500     // "[...]" is a UnicodeSet pattern.
3501     // "*" performs tests on all Unicode code points and on a selection of
3502     //   malformed UTF-8/16 strings.
3503     // "-options" limits the scope of testing for the current set.
3504     //   By default, the test verifies that equivalent boundaries are found
3505     //   for UTF-16 and UTF-8, going forward and backward,
3506     //   alternating USET_SPAN_NOT_CONTAINED with
3507     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3508     //   Single-character options:
3509     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3510     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3511     //          or the set contains strings with unpaired surrogates
3512     //          which do not translate to valid UTF-8.
3513     //     c -- set.span() and set.complement().span() boundaries may differ.
3514     //          Cause: Set strings are not complemented.
3515     //     b -- span() and spanBack() boundaries may differ.
3516     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3517     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3518     //          match with non-overlapping substrings.
3519     //          For example, with a set containing "ab" and "ba",
3520     //          span() of "aba" yields boundaries { 0, 2, 3 }
3521     //          because the initial "ab" matches from 0 to 2,
3522     //          while spanBack() yields boundaries { 0, 1, 3 }
3523     //          because the final "ba" matches from 1 to 3.
3524     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3525     //          Cause: Strings in the set overlap, and a longer match may
3526     //          require a sequence including non-longest substrings.
3527     //          For example, with a set containing "ab", "abc" and "cd",
3528     //          span(contained) of "abcd" spans the entire string
3529     //          but span(longest match) only spans the first 3 characters.
3530     //   Each "-options" first resets all options and then applies the specified options.
3531     //   A "-" without options resets the options.
3532     //   The options are also reset for each new set.
3533     // Other strings will be spanned.
3534     static const char *const testdata[]={
3535         "[:ID_Continue:]",
3536         "*",
3537         "[:White_Space:]",
3538         "*",
3539         "[]",
3540         "*",
3541         "[\\u0000-\\U0010FFFF]",
3542         "*",
3543         "[\\u0000\\u0080\\u0800\\U00010000]",
3544         "*",
3545         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3546         "*",
3547         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3548         "-c",
3549         "*",
3550         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3551         "-c",
3552         "*",
3553 
3554         // Overlapping strings cause overlapping attempts to match.
3555         "[x{xy}{xya}{axy}{ax}]",
3556         "-cl",
3557 
3558         // More repetitions of "xya" would take too long with the recursive
3559         // reference implementation.
3560         // containsAll()=FALSE
3561         // test_string 0x14
3562         "xx"
3563         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3564         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3565         "xyaxyaxyaxya"
3566         "xx"
3567         "xyaxyaxyaxya"  // span() ends here.
3568         "aaa",
3569 
3570         // containsAll()=TRUE
3571         // test_string 0x15
3572         "xx"
3573         "xyaxyaxyaxya"
3574         "xx"
3575         "xyaxyaxyaxya"
3576         "xx"
3577         "xyaxyaxyaxy",
3578 
3579         "-bc",
3580         // test_string 0x17
3581         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3582         "-c",
3583         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3584         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3585         "-",
3586         "byaya",     // span() -> { 5 }
3587         "byay",      // span() -> { 4 }
3588         "bya",       // span() -> { 3 }
3589 
3590         // span(longest match) will not span the whole string.
3591         "[a{ab}{bc}]",
3592         "-cl",
3593         // test_string 0x21
3594         "abc",
3595 
3596         "[a{ab}{abc}{cd}]",
3597         "-cl",
3598         "acdabcdabccd",
3599 
3600         // spanBack(longest match) will not span the whole string.
3601         "[c{ab}{bc}]",
3602         "-cl",
3603         "abc",
3604 
3605         "[d{cd}{bcd}{ab}]",
3606         "-cl",
3607         "abbcdabcdabd",
3608 
3609         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3610         // and UTF-8 trail bytes.
3611         // Copies of above test sets and strings, but transliterated to have
3612         // different code points with similar trail units.
3613         // Previous: a      b         c            d
3614         // Unicode:  042B   30AB      200AB        204AB
3615         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3616         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3617         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3618         "-cl",
3619         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3620 
3621         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3622         "-cl",
3623         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3624 
3625         // Stress bookkeeping and recursion.
3626         // The following strings are barely doable with the recursive
3627         // reference implementation.
3628         // The not-contained character at the end prevents an early exit from the span().
3629         "[b{bb}]",
3630         "-c",
3631         // test_string 0x33
3632         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3633         // On complement sets, span() and spanBack() get different results
3634         // because b is not in the complement set and there is an odd number of b's
3635         // in the test string.
3636         "-bc",
3637         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3638 
3639         // Test with set strings with an initial or final code point span
3640         // longer than 254.
3641         "[a{" _64_a _64_a _64_a _64_a "b}"
3642           "{a" _64_b _64_b _64_b _64_b "}]",
3643         "-c",
3644         _64_a _64_a _64_a _63_a "b",
3645         _64_a _64_a _64_a _64_a "b",
3646         _64_a _64_a _64_a _64_a "aaaabbbb",
3647         "a" _64_b _64_b _64_b _63_b,
3648         "a" _64_b _64_b _64_b _64_b,
3649         "aaaabbbb" _64_b _64_b _64_b _64_b,
3650 
3651         // Test with strings containing unpaired surrogates.
3652         // They are not representable in UTF-8, and a leading trail surrogate
3653         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3654         // U+20001 == \\uD840\\uDC01
3655         // U+20400 == \\uD841\\uDC00
3656         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3657         "-8cl",
3658         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3659     };
3660     uint32_t whichSpans[96]={ SPAN_ALL };
3661     int32_t whichSpansCount=1;
3662 
3663     UnicodeSet *sets[SET_COUNT]={ NULL };
3664     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3665 
3666     char testName[1024];
3667     char *testNameLimit=testName;
3668 
3669     int32_t i, j;
3670     for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3671         const char *s=testdata[i];
3672         if(s[0]=='[') {
3673             // Create new test sets from this pattern.
3674             for(j=0; j<SET_COUNT; ++j) {
3675                 delete sets_with_str[j];
3676                 delete sets[j];
3677             }
3678             UErrorCode errorCode=U_ZERO_ERROR;
3679             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3680             if(U_FAILURE(errorCode)) {
3681                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3682                 break;
3683             }
3684             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3685             sets[SLOW_NOT]->complement();
3686             // Intermediate set: Test cloning of a frozen set.
3687             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3688             fast->freeze();
3689             sets[FAST]=(UnicodeSet *)fast->clone();
3690             delete fast;
3691             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3692             fastNot->freeze();
3693             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3694             delete fastNot;
3695 
3696             for(j=0; j<SET_COUNT; ++j) {
3697                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3698             }
3699 
3700             strcpy(testName, s);
3701             testNameLimit=strchr(testName, 0);
3702             *testNameLimit++=':';
3703             *testNameLimit=0;
3704 
3705             whichSpans[0]=SPAN_ALL;
3706             whichSpansCount=1;
3707         } else if(s[0]=='-') {
3708             whichSpans[0]=SPAN_ALL;
3709             whichSpansCount=1;
3710 
3711             while(*++s!=0) {
3712                 switch(*s) {
3713                 case 'c':
3714                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3715                                                    ~SPAN_POLARITY,
3716                                                    SPAN_SET,
3717                                                    SPAN_COMPLEMENT,
3718                                                    0);
3719                     break;
3720                 case 'b':
3721                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3722                                                    ~SPAN_DIRS,
3723                                                    SPAN_FWD,
3724                                                    SPAN_BACK,
3725                                                    0);
3726                     break;
3727                 case 'l':
3728                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3729                     // USET_SPAN_SIMPLE only FWD, and separately
3730                     // USET_SPAN_SIMPLE only BACK
3731                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3732                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3733                                                    SPAN_DIRS|SPAN_CONTAINED,
3734                                                    SPAN_FWD|SPAN_SIMPLE,
3735                                                    SPAN_BACK|SPAN_SIMPLE);
3736                     break;
3737                 case '8':
3738                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3739                                                    ~SPAN_UTFS,
3740                                                    SPAN_UTF16,
3741                                                    SPAN_UTF8,
3742                                                    0);
3743                     break;
3744                 default:
3745                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3746                     break;
3747                 }
3748             }
3749         } else if(0==strcmp(s, "*")) {
3750             strcpy(testNameLimit, "bad_string");
3751             for(j=0; j<whichSpansCount; ++j) {
3752                 if(whichSpansCount>1) {
3753                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
3754                             "%%0x%3x",
3755                             whichSpans[j]);
3756                 }
3757                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3758                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3759             }
3760 
3761             strcpy(testNameLimit, "contents");
3762             for(j=0; j<whichSpansCount; ++j) {
3763                 if(whichSpansCount>1) {
3764                     sprintf(testNameLimit+8 /* strlen("contents") */,
3765                             "%%0x%3x",
3766                             whichSpans[j]);
3767                 }
3768                 testSpanContents(sets_with_str, whichSpans[j], testName);
3769             }
3770         } else {
3771             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3772             strcpy(testNameLimit, "test_string");
3773             for(j=0; j<whichSpansCount; ++j) {
3774                 if(whichSpansCount>1) {
3775                     sprintf(testNameLimit+11 /* strlen("test_string") */,
3776                             "%%0x%3x",
3777                             whichSpans[j]);
3778                 }
3779                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3780             }
3781         }
3782     }
3783     for(j=0; j<SET_COUNT; ++j) {
3784         delete sets_with_str[j];
3785         delete sets[j];
3786     }
3787 }
3788 
3789 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3790 void UnicodeSetTest::TestStringSpan() {
3791     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3792     static const char *const string=
3793         "xx"
3794         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3795         "xx"
3796         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3797         "xx"
3798         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3799         "aaaa";
3800 
3801     UErrorCode errorCode=U_ZERO_ERROR;
3802     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3803     UnicodeSet set(pattern16, errorCode);
3804     if(U_FAILURE(errorCode)) {
3805         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3806         return;
3807     }
3808 
3809     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3810 
3811     if(set.containsAll(string16)) {
3812         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3813     }
3814 
3815     // Remove trailing "aaaa".
3816     string16.truncate(string16.length()-4);
3817     if(!set.containsAll(string16)) {
3818         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3819     }
3820 
3821     string16=UNICODE_STRING_SIMPLE("byayaxya");
3822     const UChar *s16=string16.getBuffer();
3823     int32_t length16=string16.length();
3824     (void)length16;   // Suppress set but not used warning.
3825     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3826         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3827         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3828         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3829         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3830         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3831     ) {
3832         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3833     }
3834 
3835     pattern="[a{ab}{abc}{cd}]";
3836     pattern16=UnicodeString(pattern, -1, US_INV);
3837     set.applyPattern(pattern16, errorCode);
3838     if(U_FAILURE(errorCode)) {
3839         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3840         return;
3841     }
3842     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3843     s16=string16.getBuffer();
3844     length16=string16.length();
3845     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3846         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3847         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3848     ) {
3849         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3850     }
3851 
3852     pattern="[d{cd}{bcd}{ab}]";
3853     pattern16=UnicodeString(pattern, -1, US_INV);
3854     set.applyPattern(pattern16, errorCode).freeze();
3855     if(U_FAILURE(errorCode)) {
3856         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3857         return;
3858     }
3859     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3860     s16=string16.getBuffer();
3861     length16=string16.length();
3862     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3863         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3864         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3865     ) {
3866         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3867     }
3868 }
3869 
/**
 * Including collationroot.h fails here with
 *   1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
 * so we skip this test on Windows.
 *
 * The cause is that intltest builds with /Za, which disables language extensions,
 * which means that Windows header files cannot be used.
 */
3878 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
3879 #include "collationroot.h"
3880 #include "collationtailoring.h"
3881 #endif
3882 
TestUCAUnsafeBackwards()3883 void UnicodeSetTest::TestUCAUnsafeBackwards() {
3884 #if U_PLATFORM_HAS_WIN32_API
3885     infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
3886 #elif !UCONFIG_NO_COLLATION
3887     UErrorCode errorCode = U_ZERO_ERROR;
3888 
3889     // Get the unsafeBackwardsSet
3890     const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
3891     if(U_FAILURE(errorCode)) {
3892       dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
3893       return;
3894     }
3895     //const UVersionInfo &version = rootEntry->tailoring->version;
3896     const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
3897 
3898     checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
3899 
3900     if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
3901         // simple test case
3902         // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
3903         // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
3904         UnicodeSet surrogates;
3905         surrogates.add(0xd83a);  // a lead surrogate
3906         surrogates.add(0xdc00, 0xdfff);  // a range of trail surrogates
3907         UnicodeString pat;
3908         surrogates.toPattern(pat, FALSE);  // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
3909         // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
3910         // so that at least one type of surrogate code points are escaped,
3911         // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
3912         errorCode = U_ZERO_ERROR;
3913         UnicodeSet s2;
3914         s2.applyPattern(pat, errorCode);  // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
3915         if(U_FAILURE(errorCode)) {
3916             errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
3917         } else {
3918             checkEqual(surrogates, s2, "surrogates to/from pattern");
3919         }
3920         // This occurs in the UCA unsafe-backwards set.
3921         checkRoundTrip(*unsafeBackwardSet);
3922     }
3923 #endif
3924 }
3925