1 /*
2 ********************************************************************************
3 *   Copyright (C) 1999-2014 International Business Machines Corporation and
4 *   others. All Rights Reserved.
5 ********************************************************************************
6 *   Date        Name        Description
7 *   10/20/99    alan        Creation.
8 *   03/22/2000  Madhu       Added additional tests
9 ********************************************************************************
10 */
11 
12 #include <stdio.h>
13 
14 #include <string.h>
15 #include "unicode/utypes.h"
16 #include "usettest.h"
17 #include "unicode/ucnv.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/usetiter.h"
21 #include "unicode/ustring.h"
22 #include "unicode/parsepos.h"
23 #include "unicode/symtable.h"
24 #include "unicode/uversion.h"
25 #include "hash.h"
26 
// Reports a data error (with file, line and ICU error name) when `status`
// holds a failure code; does nothing otherwise.
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
                  __FILE__, __LINE__, u_errorName(status)); \
    } \
}
30 
// Reports a data error (with file and line) when `expr` evaluates to false.
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
}
33 
/**
 * Message-building helper: appends the escaped pattern form of a UnicodeSet
 * to a string, so sets can be concatenated directly into log messages.
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
39 
// Expands to one `case` of runIndexedTest(): always publishes the test's
// name, and runs the test (after logging a header) only when `exec` is set.
#define CASE(id,test) \
    case id: \
        name = #test; \
        if (exec) { \
            logln(#test "---"); \
            logln(); \
            test(); \
        } \
        break
48 
UnicodeSetTest()49 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
50 }
51 
openUTF8Converter()52 UConverter *UnicodeSetTest::openUTF8Converter() {
53     if(utf8Cnv==NULL) {
54         UErrorCode errorCode=U_ZERO_ERROR;
55         utf8Cnv=ucnv_open("UTF-8", &errorCode);
56     }
57     return utf8Cnv;
58 }
59 
~UnicodeSetTest()60 UnicodeSetTest::~UnicodeSetTest() {
61     ucnv_close(utf8Cnv);
62 }
63 
64 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)65 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
66                                const char* &name, char* /*par*/) {
67     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
68     switch (index) {
69         CASE(0,TestPatterns);
70         CASE(1,TestAddRemove);
71         CASE(2,TestCategories);
72         CASE(3,TestCloneEqualHash);
73         CASE(4,TestMinimalRep);
74         CASE(5,TestAPI);
75         CASE(6,TestScriptSet);
76         CASE(7,TestPropertySet);
77         CASE(8,TestClone);
78         CASE(9,TestExhaustive);
79         CASE(10,TestToPattern);
80         CASE(11,TestIndexOf);
81         CASE(12,TestStrings);
82         CASE(13,Testj2268);
83         CASE(14,TestCloseOver);
84         CASE(15,TestEscapePattern);
85         CASE(16,TestInvalidCodePoint);
86         CASE(17,TestSymbolTable);
87         CASE(18,TestSurrogate);
88         CASE(19,TestPosixClasses);
89         CASE(20,TestIteration);
90         CASE(21,TestFreezable);
91         CASE(22,TestSpan);
92         CASE(23,TestStringSpan);
93         default: name = ""; break;
94     }
95 }
96 
// Sentinel placed inside the string lists passed to expectToPattern().
// NOTE(review): presumably entries before it must be contained in the set
// and entries after it must not be -- confirm against expectToPattern().
static const char NOT[] = "%%%%";
98 
99 /**
100  * UVector was improperly copying contents
101  * This code will crash this is still true
102  */
Testj2268()103 void UnicodeSetTest::Testj2268() {
104   UnicodeSet t;
105   t.add(UnicodeString("abc"));
106   UnicodeSet test(t);
107   UnicodeString ustrPat;
108   test.toPattern(ustrPat, TRUE);
109 }
110 
111 /**
112  * Test toPattern().
113  */
TestToPattern()114 void UnicodeSetTest::TestToPattern() {
115     UErrorCode ec = U_ZERO_ERROR;
116 
117     // Test that toPattern() round trips with syntax characters and
118     // whitespace.
119     {
120         static const char* OTHER_TOPATTERN_TESTS[] = {
121             "[[:latin:]&[:greek:]]",
122             "[[:latin:]-[:greek:]]",
123             "[:nonspacing mark:]",
124             NULL
125         };
126 
127         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
128             ec = U_ZERO_ERROR;
129             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
130             if (U_FAILURE(ec)) {
131                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
132                 continue;
133             }
134             checkPat(OTHER_TOPATTERN_TESTS[j], s);
135         }
136 
137         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
138             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
139 
140                 // check various combinations to make sure they all work.
141                 if (i != 0 && !toPatternAux(i, i)){
142                     continue;
143                 }
144                 if (!toPatternAux(0, i)){
145                     continue;
146                 }
147                 if (!toPatternAux(i, 0xFFFF)){
148                     continue;
149                 }
150             }
151         }
152     }
153 
154     // Test pattern behavior of multicharacter strings.
155     {
156         ec = U_ZERO_ERROR;
157         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
158 
159         // This loop isn't a loop.  It's here to make the compiler happy.
160         // If you're curious, try removing it and changing the 'break'
161         // statements (except for the last) to goto's.
162         for (;;) {
163             if (U_FAILURE(ec)) break;
164             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
165             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
166 
167             s->add("ac");
168             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
169             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
170 
171             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
172             if (U_FAILURE(ec)) break;
173             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
174             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
175 
176             s->add("[]");
177             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
178             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
179 
180             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
181             if (U_FAILURE(ec)) break;
182             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
183             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
184 
185             // j2189
186             s->clear();
187             s->add(UnicodeString("abc", ""));
188             s->add(UnicodeString("abc", ""));
189             const char* exp6[] = {"abc", NOT, "ab", NULL};
190             expectToPattern(*s, "[{abc}]", exp6);
191 
192             break;
193         }
194 
195         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
196         delete s;
197     }
198 
199     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
200     UnicodeSet s;
201     s.add((UChar)97, (UChar)98); // 'a', 'b'
202     expectToPattern(s, "[ab]", NULL);
203 }
204 
toPatternAux(UChar32 start,UChar32 end)205 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
206 
207     // use Integer.toString because Utility.hex doesn't handle ints
208     UnicodeString pat = "";
209     // TODO do these in hex
210     //String source = "0x" + Integer.toString(start,16).toUpperCase();
211     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
212     UnicodeString source;
213     source = source + (uint32_t)start;
214     if (start != end)
215         source = source + ".." + (uint32_t)end;
216     UnicodeSet testSet;
217     testSet.add(start, end);
218     return checkPat(source, testSet);
219 }
220 
checkPat(const UnicodeString & source,const UnicodeSet & testSet)221 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
222                                const UnicodeSet& testSet) {
223     // What we want to make sure of is that a pattern generated
224     // by toPattern(), with or without escaped unprintables, can
225     // be passed back into the UnicodeSet constructor.
226     UnicodeString pat0;
227 
228     testSet.toPattern(pat0, TRUE);
229 
230     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
231 
232     //String pat1 = unescapeLeniently(pat0);
233     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
234 
235     UnicodeString pat2;
236     testSet.toPattern(pat2, FALSE);
237     if (!checkPat(source, testSet, pat2)) return FALSE;
238 
239     //String pat3 = unescapeLeniently(pat2);
240     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
241 
242     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
243     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
244     return TRUE;
245 }
246 
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)247 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
248                                const UnicodeSet& testSet,
249                                const UnicodeString& pat) {
250     UErrorCode ec = U_ZERO_ERROR;
251     UnicodeSet testSet2(pat, ec);
252     if (testSet2 != testSet) {
253         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
254         return FALSE;
255     }
256     return TRUE;
257 }
258 
259 void
TestPatterns(void)260 UnicodeSetTest::TestPatterns(void) {
261     UnicodeSet set;
262     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
263     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
264     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
265     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
266     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
267     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
268 
269     // Throw in a test of complement
270     set.complement();
271     UnicodeString exp;
272     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
273     expectPairs(set, exp);
274 }
275 
276 void
TestCategories(void)277 UnicodeSetTest::TestCategories(void) {
278     UErrorCode status = U_ZERO_ERROR;
279     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
280     UnicodeSet set(pat, status);
281     if (U_FAILURE(status)) {
282         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
283         return;
284     } else {
285         expectContainment(set, pat, "ABC", "abc");
286     }
287 
288     UChar32 i;
289     int32_t failures = 0;
290     // Make sure generation of L doesn't pollute cached Lu set
291     // First generate L, then Lu
292     set.applyPattern("[:L:]", status);
293     if (U_FAILURE(status)) { errln("FAIL"); return; }
294     for (i=0; i<0x200; ++i) {
295         UBool l = u_isalpha((UChar)i);
296         if (l != set.contains(i)) {
297             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
298                   set.contains(i));
299             if (++failures == 10) break;
300         }
301     }
302 
303     set.applyPattern("[:Lu:]", status);
304     if (U_FAILURE(status)) { errln("FAIL"); return; }
305     for (i=0; i<0x200; ++i) {
306         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
307         if (lu != set.contains(i)) {
308             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
309                   set.contains(i));
310             if (++failures == 20) break;
311         }
312     }
313 }
314 void
TestCloneEqualHash(void)315 UnicodeSetTest::TestCloneEqualHash(void) {
316     UErrorCode status = U_ZERO_ERROR;
317     // set1 and set2 used to be built with the obsolete constructor taking
318     // UCharCategory values; replaced with pattern constructors
319     // markus 20030502
320     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
321     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
322     if (U_FAILURE(status)){
323         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
324         return;
325     }
326     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
327     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
328     if (U_FAILURE(status)){
329         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
330         return;
331     }
332 
333     if (*set1 != *set1a) {
334         errln("FAIL: category constructor for Ll broken");
335     }
336     if (*set2 != *set2a) {
337         errln("FAIL: category constructor for Nd broken");
338     }
339     delete set1a;
340     delete set2a;
341 
342     logln("Testing copy construction");
343     UnicodeSet *set1copy=new UnicodeSet(*set1);
344     if(*set1 != *set1copy || *set1 == *set2 ||
345         getPairs(*set1) != getPairs(*set1copy) ||
346         set1->hashCode() != set1copy->hashCode()){
347         errln("FAIL : Error in copy construction");
348         return;
349     }
350 
351     logln("Testing =operator");
352     UnicodeSet set1equal=*set1;
353     UnicodeSet set2equal=*set2;
354     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
355         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
356         errln("FAIL: Error in =operator");
357     }
358 
359     logln("Testing clone()");
360     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
361     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
362     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
363         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
364         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
365         errln("FAIL: Error in clone");
366     }
367 
368     logln("Testing hashcode");
369     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
370         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
371         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
372         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
373         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
374         errln("FAIL: Error in hashCode()");
375     }
376 
377     delete set1;
378     delete set1copy;
379     delete set2;
380     delete set1clone;
381     delete set2clone;
382 
383 
384 }
385 void
TestAddRemove(void)386 UnicodeSetTest::TestAddRemove(void) {
387     UnicodeSet set; // Construct empty set
388     doAssert(set.isEmpty() == TRUE, "set should be empty");
389     doAssert(set.size() == 0, "size should be 0");
390     set.complement();
391     doAssert(set.size() == 0x110000, "size should be 0x110000");
392     set.clear();
393     set.add(0x0061, 0x007a);
394     expectPairs(set, "az");
395     doAssert(set.isEmpty() == FALSE, "set should not be empty");
396     doAssert(set.size() != 0, "size should not be equal to 0");
397     doAssert(set.size() == 26, "size should be equal to 26");
398     set.remove(0x006d, 0x0070);
399     expectPairs(set, "alqz");
400     doAssert(set.size() == 22, "size should be equal to 22");
401     set.remove(0x0065, 0x0067);
402     expectPairs(set, "adhlqz");
403     doAssert(set.size() == 19, "size should be equal to 19");
404     set.remove(0x0064, 0x0069);
405     expectPairs(set, "acjlqz");
406     doAssert(set.size() == 16, "size should be equal to 16");
407     set.remove(0x0063, 0x0072);
408     expectPairs(set, "absz");
409     doAssert(set.size() == 10, "size should be equal to 10");
410     set.add(0x0066, 0x0071);
411     expectPairs(set, "abfqsz");
412     doAssert(set.size() == 22, "size should be equal to 22");
413     set.remove(0x0061, 0x0067);
414     expectPairs(set, "hqsz");
415     set.remove(0x0061, 0x007a);
416     expectPairs(set, "");
417     doAssert(set.isEmpty() == TRUE, "set should be empty");
418     doAssert(set.size() == 0, "size should be 0");
419     set.add(0x0061);
420     doAssert(set.isEmpty() == FALSE, "set should not be empty");
421     doAssert(set.size() == 1, "size should not be equal to 1");
422     set.add(0x0062);
423     set.add(0x0063);
424     expectPairs(set, "ac");
425     doAssert(set.size() == 3, "size should not be equal to 3");
426     set.add(0x0070);
427     set.add(0x0071);
428     expectPairs(set, "acpq");
429     doAssert(set.size() == 5, "size should not be equal to 5");
430     set.clear();
431     expectPairs(set, "");
432     doAssert(set.isEmpty() == TRUE, "set should be empty");
433     doAssert(set.size() == 0, "size should be 0");
434 
435     // Try removing an entire set from another set
436     expectPattern(set, "[c-x]", "cx");
437     UnicodeSet set2;
438     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
439     set.removeAll(set2);
440     expectPairs(set, "deluxx");
441 
442     // Try adding an entire set to another set
443     expectPattern(set, "[jackiemclean]", "aacceein");
444     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
445     set.addAll(set2);
446     expectPairs(set, "aacehort");
447     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
448 
449     // Try retaining an set of elements contained in another set (intersection)
450     UnicodeSet set3;
451     expectPattern(set3, "[a-c]", "ac");
452     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
453     set3.remove(0x0062);
454     expectPairs(set3, "aacc");
455     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
456     set.retainAll(set3);
457     expectPairs(set, "aacc");
458     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
459     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
460     set.clear();
461     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
462 
463     // Test commutativity
464     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
465     expectPattern(set2, "[jackiemclean]", "aacceein");
466     set.addAll(set2);
467     expectPairs(set, "aacehort");
468     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
469 
470 
471 
472 
473 }
474 
475 /**
476  * Make sure minimal representation is maintained.
477  */
TestMinimalRep()478 void UnicodeSetTest::TestMinimalRep() {
479     UErrorCode status = U_ZERO_ERROR;
480     // This is pretty thoroughly tested by checkCanonicalRep()
481     // run against the exhaustive operation results.  Use the code
482     // here for debugging specific spot problems.
483 
484     // 1 overlap against 2
485     UnicodeSet set("[h-km-q]", status);
486     if (U_FAILURE(status)) { errln("FAIL"); return; }
487     UnicodeSet set2("[i-o]", status);
488     if (U_FAILURE(status)) { errln("FAIL"); return; }
489     set.addAll(set2);
490     expectPairs(set, "hq");
491     // right
492     set.applyPattern("[a-m]", status);
493     if (U_FAILURE(status)) { errln("FAIL"); return; }
494     set2.applyPattern("[e-o]", status);
495     if (U_FAILURE(status)) { errln("FAIL"); return; }
496     set.addAll(set2);
497     expectPairs(set, "ao");
498     // left
499     set.applyPattern("[e-o]", status);
500     if (U_FAILURE(status)) { errln("FAIL"); return; }
501     set2.applyPattern("[a-m]", status);
502     if (U_FAILURE(status)) { errln("FAIL"); return; }
503     set.addAll(set2);
504     expectPairs(set, "ao");
505     // 1 overlap against 3
506     set.applyPattern("[a-eg-mo-w]", status);
507     if (U_FAILURE(status)) { errln("FAIL"); return; }
508     set2.applyPattern("[d-q]", status);
509     if (U_FAILURE(status)) { errln("FAIL"); return; }
510     set.addAll(set2);
511     expectPairs(set, "aw");
512 }
513 
TestAPI()514 void UnicodeSetTest::TestAPI() {
515     UErrorCode status = U_ZERO_ERROR;
516     // default ct
517     UnicodeSet set;
518     if (!set.isEmpty() || set.getRangeCount() != 0) {
519         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
520               set);
521     }
522 
523     // clear(), isEmpty()
524     set.add(0x0061);
525     if (set.isEmpty()) {
526         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
527               set);
528     }
529     set.clear();
530     if (!set.isEmpty()) {
531         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
532               set);
533     }
534 
535     // size()
536     set.clear();
537     if (set.size() != 0) {
538         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
539               ": " + set);
540     }
541     set.add(0x0061);
542     if (set.size() != 1) {
543         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
544               ": " + set);
545     }
546     set.add(0x0031, 0x0039);
547     if (set.size() != 10) {
548         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
549               ": " + set);
550     }
551 
552     // contains(first, last)
553     set.clear();
554     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
555     if (U_FAILURE(status)) { errln("FAIL"); return; }
556     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
557         UChar32 a = set.getRangeStart(i);
558         UChar32 b = set.getRangeEnd(i);
559         if (!set.contains(a, b)) {
560             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
561                   " but doesn't: " + set);
562         }
563         if (set.contains((UChar32)(a-1), b)) {
564             errln((UnicodeString)"FAIL, shouldn't contain " +
565                   (unsigned short)(a-1) + '-' + (unsigned short)b +
566                   " but does: " + set);
567         }
568         if (set.contains(a, (UChar32)(b+1))) {
569             errln((UnicodeString)"FAIL, shouldn't contain " +
570                   (unsigned short)a + '-' + (unsigned short)(b+1) +
571                   " but does: " + set);
572         }
573     }
574 
575     // Ported InversionList test.
576     UnicodeSet a((UChar32)3,(UChar32)10);
577     UnicodeSet b((UChar32)7,(UChar32)15);
578     UnicodeSet c;
579 
580     logln((UnicodeString)"a [3-10]: " + a);
581     logln((UnicodeString)"b [7-15]: " + b);
582     c = a;
583     c.addAll(b);
584     UnicodeSet exp((UChar32)3,(UChar32)15);
585     if (c == exp) {
586         logln((UnicodeString)"c.set(a).add(b): " + c);
587     } else {
588         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
589     }
590     c.complement();
591     exp.set((UChar32)0, (UChar32)2);
592     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
593     if (c == exp) {
594         logln((UnicodeString)"c.complement(): " + c);
595     } else {
596         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
597     }
598     c.complement();
599     exp.set((UChar32)3, (UChar32)15);
600     if (c == exp) {
601         logln((UnicodeString)"c.complement(): " + c);
602     } else {
603         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
604     }
605     c = a;
606     c.complementAll(b);
607     exp.set((UChar32)3,(UChar32)6);
608     exp.add((UChar32)11,(UChar32) 15);
609     if (c == exp) {
610         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
611     } else {
612         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
613     }
614 
615     exp = c;
616     bitsToSet(setToBits(c), c);
617     if (c == exp) {
618         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
619     } else {
620         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
621     }
622 
623     // Additional tests for coverage JB#2118
624     //UnicodeSet::complement(class UnicodeString const &)
625     //UnicodeSet::complementAll(class UnicodeString const &)
626     //UnicodeSet::containsNone(class UnicodeSet const &)
627     //UnicodeSet::containsNone(long,long)
628     //UnicodeSet::containsSome(class UnicodeSet const &)
629     //UnicodeSet::containsSome(long,long)
630     //UnicodeSet::removeAll(class UnicodeString const &)
631     //UnicodeSet::retain(long)
632     //UnicodeSet::retainAll(class UnicodeString const &)
633     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
634     //UnicodeSetIterator::getString(void)
635     set.clear();
636     set.complement("ab");
637     exp.applyPattern("[{ab}]", status);
638     if (U_FAILURE(status)) { errln("FAIL"); return; }
639     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
640 
641     UnicodeSetIterator iset(set);
642     if (!iset.next() || !iset.isString()) {
643         errln("FAIL: UnicodeSetIterator::next/isString");
644     } else if (iset.getString() != "ab") {
645         errln("FAIL: UnicodeSetIterator::getString");
646     }
647 
648     set.add((UChar32)0x61, (UChar32)0x7A);
649     set.complementAll("alan");
650     exp.applyPattern("[{ab}b-kmo-z]", status);
651     if (U_FAILURE(status)) { errln("FAIL"); return; }
652     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
653 
654     exp.applyPattern("[a-z]", status);
655     if (U_FAILURE(status)) { errln("FAIL"); return; }
656     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
657     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
658     exp.applyPattern("[aln]", status);
659     if (U_FAILURE(status)) { errln("FAIL"); return; }
660     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
661     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
662 
663     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
664         errln("FAIL: containsNone(UChar32, UChar32)");
665     }
666     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
667         errln("FAIL: containsSome(UChar32, UChar32)");
668     }
669     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
670         errln("FAIL: containsNone(UChar32, UChar32)");
671     }
672     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
673         errln("FAIL: containsSome(UChar32, UChar32)");
674     }
675 
676     set.removeAll("liu");
677     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
678     if (U_FAILURE(status)) { errln("FAIL"); return; }
679     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
680 
681     set.retainAll("star");
682     exp.applyPattern("[rst]", status);
683     if (U_FAILURE(status)) { errln("FAIL"); return; }
684     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
685 
686     set.retain((UChar32)0x73);
687     exp.applyPattern("[s]", status);
688     if (U_FAILURE(status)) { errln("FAIL"); return; }
689     if (set != exp) { errln("FAIL: retain('s')"); return; }
690 
691     uint16_t buf[32];
692     int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
693     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
694     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
695         errln("FAIL: serialize");
696         return;
697     }
698 
699     // Conversions to and from USet
700     UnicodeSet *uniset = &set;
701     USet *uset = uniset->toUSet();
702     TEST_ASSERT((void *)uset == (void *)uniset);
703     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
704     TEST_ASSERT((void *)setx == (void *)uset);
705     const UnicodeSet *constSet = uniset;
706     const USet *constUSet = constSet->toUSet();
707     TEST_ASSERT((void *)constUSet == (void *)constSet);
708     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
709     TEST_ASSERT((void *)constSetx == (void *)constUSet);
710 
711     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
712     UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
713     UnicodeSet ac(0x61, 0x63);
714     ac.remove(0x62).freeze();
715     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
716         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
717         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
718         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
719         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
720         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
721         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
722         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
723         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
724         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
725     ) {
726         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
727     }
728     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
729         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
730         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
731         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
732         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
733         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
734         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
735         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
736         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
737         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
738     ) {
739         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
740     }
741 }
742 
TestIteration()743 void UnicodeSetTest::TestIteration() {
744     UErrorCode ec = U_ZERO_ERROR;
745     int i = 0;
746     int outerLoop;
747 
748     // 6 code points, 3 ranges, 2 strings, 8 total elements
749     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
750     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
751     TEST_ASSERT_SUCCESS(ec);
752     UnicodeSetIterator it(set);
753 
754     for (outerLoop=0; outerLoop<3; outerLoop++) {
755         // Run the test multiple times, to check that iterator.reset() is working.
756         for (i=0; i<10; i++) {
757             UBool         nextv        = it.next();
758             UBool         isString     = it.isString();
759             int32_t       codePoint    = it.getCodepoint();
760             //int32_t       codePointEnd = it.getCodepointEnd();
761             UnicodeString s   = it.getString();
762             switch (i) {
763             case 0:
764                 TEST_ASSERT(nextv == TRUE);
765                 TEST_ASSERT(isString == FALSE);
766                 TEST_ASSERT(codePoint==0x61);
767                 TEST_ASSERT(s == "a");
768                 break;
769             case 1:
770                 TEST_ASSERT(nextv == TRUE);
771                 TEST_ASSERT(isString == FALSE);
772                 TEST_ASSERT(codePoint==0x62);
773                 TEST_ASSERT(s == "b");
774                 break;
775             case 2:
776                 TEST_ASSERT(nextv == TRUE);
777                 TEST_ASSERT(isString == FALSE);
778                 TEST_ASSERT(codePoint==0x63);
779                 TEST_ASSERT(s == "c");
780                 break;
781             case 3:
782                 TEST_ASSERT(nextv == TRUE);
783                 TEST_ASSERT(isString == FALSE);
784                 TEST_ASSERT(codePoint==0x79);
785                 TEST_ASSERT(s == "y");
786                 break;
787             case 4:
788                 TEST_ASSERT(nextv == TRUE);
789                 TEST_ASSERT(isString == FALSE);
790                 TEST_ASSERT(codePoint==0x7a);
791                 TEST_ASSERT(s == "z");
792                 break;
793             case 5:
794                 TEST_ASSERT(nextv == TRUE);
795                 TEST_ASSERT(isString == FALSE);
796                 TEST_ASSERT(codePoint==0x1abcd);
797                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
798                 break;
799             case 6:
800                 TEST_ASSERT(nextv == TRUE);
801                 TEST_ASSERT(isString == TRUE);
802                 TEST_ASSERT(s == "str1");
803                 break;
804             case 7:
805                 TEST_ASSERT(nextv == TRUE);
806                 TEST_ASSERT(isString == TRUE);
807                 TEST_ASSERT(s == "str2");
808                 break;
809             case 8:
810                 TEST_ASSERT(nextv == FALSE);
811                 break;
812             case 9:
813                 TEST_ASSERT(nextv == FALSE);
814                 break;
815             }
816         }
817         it.reset();  // prepare to run the iteration again.
818     }
819 }
820 
821 
822 
823 
TestStrings()824 void UnicodeSetTest::TestStrings() {
825     UErrorCode ec = U_ZERO_ERROR;
826 
827     UnicodeSet* testList[] = {
828         UnicodeSet::createFromAll("abc"),
829         new UnicodeSet("[a-c]", ec),
830 
831         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
832         new UnicodeSet("[{ll}{ch}a-z]", ec),
833 
834         UnicodeSet::createFrom("ab}c"),
835         new UnicodeSet("[{ab\\}c}]", ec),
836 
837         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
838         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
839 
840         NULL
841     };
842 
843     if (U_FAILURE(ec)) {
844         errln("FAIL: couldn't construct test sets");
845     }
846 
847     for (int32_t i = 0; testList[i] != NULL; i+=2) {
848         if (U_SUCCESS(ec)) {
849             UnicodeString pat0, pat1;
850             testList[i]->toPattern(pat0, TRUE);
851             testList[i+1]->toPattern(pat1, TRUE);
852             if (*testList[i] == *testList[i+1]) {
853                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
854             } else {
855                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
856             }
857         }
858         delete testList[i];
859         delete testList[i+1];
860     }
861 }
862 
863 /**
864  * Test the [:Latin:] syntax.
865  */
TestScriptSet()866 void UnicodeSetTest::TestScriptSet() {
867     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
868 
869     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
870 
871     /* Jitterbug 1423 */
872     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
873 
874 }
875 
876 /**
877  * Test the [:Latin:] syntax.
878  */
TestPropertySet()879 void UnicodeSetTest::TestPropertySet() {
880     static const char* const DATA[] = {
881         // Pattern, Chars IN, Chars NOT in
882 
883         "[:Latin:]",
884         "aA",
885         "\\u0391\\u03B1",
886 
887         "[\\p{Greek}]",
888         "\\u0391\\u03B1",
889         "aA",
890 
891         "\\P{ GENERAL Category = upper case letter }",
892         "abc",
893         "ABC",
894 
895 #if !UCONFIG_NO_NORMALIZATION
896         // Combining class: @since ICU 2.2
897         // Check both symbolic and numeric
898         "\\p{ccc=Nukta}",
899         "\\u0ABC",
900         "abc",
901 
902         "\\p{Canonical Combining Class = 11}",
903         "\\u05B1",
904         "\\u05B2",
905 
906         "[:c c c = iota subscript :]",
907         "\\u0345",
908         "xyz",
909 #endif
910 
911         // Bidi class: @since ICU 2.2
912         "\\p{bidiclass=lefttoright}",
913         "abc",
914         "\\u0671\\u0672",
915 
916         // Binary properties: @since ICU 2.2
917         "\\p{ideographic}",
918         "\\u4E0A",
919         "x",
920 
921         "[:math=false:]",
922         "q)*(",
923         // weiv: )(and * were removed from math in Unicode 4.0.1
924         //"(*+)",
925         "+<>^",
926 
927         // JB#1767 \N{}, \p{ASCII}
928         "[:Ascii:]",
929         "abc\\u0000\\u007F",
930         "\\u0080\\u4E00",
931 
932         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
933         "az",
934         "qrs",
935 
936         // JB#2015
937         "[:any:]",
938         "a\\U0010FFFF",
939         "",
940 
941         "[:nv=0.5:]",
942         "\\u00BD\\u0F2A",
943         "\\u00BC",
944 
945         // JB#2653: Age
946         "[:Age=1.1:]",
947         "\\u03D6", // 1.1
948         "\\u03D8\\u03D9", // 3.2
949 
950         "[:Age=3.1:]",
951         "\\u1800\\u3400\\U0002f800",
952         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
953 
954         // JB#2350: Case_Sensitive
955         "[:Case Sensitive:]",
956         "A\\u1FFC\\U00010410",
957         ";\\u00B4\\U00010500",
958 
959         // JB#2832: C99-compatibility props
960         "[:blank:]",
961         " \\u0009",
962         "1-9A-Z",
963 
964         "[:graph:]",
965         "19AZ",
966         " \\u0003\\u0007\\u0009\\u000A\\u000D",
967 
968         "[:punct:]",
969         "!@#%&*()[]{}-_\\/;:,.?'\"",
970         "09azAZ",
971 
972         "[:xdigit:]",
973         "09afAF",
974         "gG!",
975 
976         // Regex compatibility test
977         "[-b]", // leading '-' is literal
978         "-b",
979         "ac",
980 
981         "[^-b]", // leading '-' is literal
982         "ac",
983         "-b",
984 
985         "[b-]", // trailing '-' is literal
986         "-b",
987         "ac",
988 
989         "[^b-]", // trailing '-' is literal
990         "ac",
991         "-b",
992 
993         "[a-b-]", // trailing '-' is literal
994         "ab-",
995         "c=",
996 
997         "[[a-q]&[p-z]-]", // trailing '-' is literal
998         "pq-",
999         "or=",
1000 
1001         "[\\s|\\)|:|$|\\>]", // from regex tests
1002         "s|):$>",
1003         "abc",
1004 
1005         "[\\uDC00cd]", // JB#2906: isolated trail at start
1006         "cd\\uDC00",
1007         "ab\\uD800\\U00010000",
1008 
1009         "[ab\\uD800]", // JB#2906: isolated trail at start
1010         "ab\\uD800",
1011         "cd\\uDC00\\U00010000",
1012 
1013         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1014         "abcd\\uD800",
1015         "ef\\uDC00\\U00010000",
1016 
1017         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1018         "abcd\\uDC00",
1019         "ef\\uD800\\U00010000",
1020 
1021 #if !UCONFIG_NO_NORMALIZATION
1022         "[:^lccc=0:]", // Lead canonical class
1023         "\\u0300\\u0301",
1024         "abcd\\u00c0\\u00c5",
1025 
1026         "[:^tccc=0:]", // Trail canonical class
1027         "\\u0300\\u0301\\u00c0\\u00c5",
1028         "abcd",
1029 
1030         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1031         "\\u0300\\u0301\\u00c0\\u00c5",
1032         "abcd",
1033 
1034         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1035         "",
1036         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1037 
1038         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1039         "\\u0F73\\u0F75\\u0F81",
1040         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1041 #endif /* !UCONFIG_NO_NORMALIZATION */
1042 
1043         "[:Assigned:]",
1044         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1045         "\\u0888\\uFDD3\\uFFFE\\U00050005",
1046 
1047         // Script_Extensions, new in Unicode 6.0
1048         "[:scx=Arab:]",
1049         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1050         "\\u061D\\uFDEF\\uFDFE",
1051 
1052         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1053         // so scx-sc is missing U+FDF2.
1054         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1055         "\\u0640\\u064B\\u0650\\u0655",
1056         "\\uFDF2"
1057     };
1058 
1059     static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1060 
1061     for (int32_t i=0; i<DATA_LEN; i+=3) {
1062         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1063                           CharsToUnicodeString(DATA[i+2]));
1064     }
1065 }
1066 
1067 /**
1068   * Test that Posix style character classes [:digit:], etc.
1069   *   have the Unicode definitions from TR 18.
1070   */
TestPosixClasses()1071 void UnicodeSetTest::TestPosixClasses() {
1072     {
1073         UErrorCode status = U_ZERO_ERROR;
1074         UnicodeSet s1("[:alpha:]", status);
1075         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1076         TEST_ASSERT_SUCCESS(status);
1077         TEST_ASSERT(s1==s2);
1078     }
1079     {
1080         UErrorCode status = U_ZERO_ERROR;
1081         UnicodeSet s1("[:lower:]", status);
1082         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1083         TEST_ASSERT_SUCCESS(status);
1084         TEST_ASSERT(s1==s2);
1085     }
1086     {
1087         UErrorCode status = U_ZERO_ERROR;
1088         UnicodeSet s1("[:upper:]", status);
1089         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1090         TEST_ASSERT_SUCCESS(status);
1091         TEST_ASSERT(s1==s2);
1092     }
1093     {
1094         UErrorCode status = U_ZERO_ERROR;
1095         UnicodeSet s1("[:punct:]", status);
1096         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1097         TEST_ASSERT_SUCCESS(status);
1098         TEST_ASSERT(s1==s2);
1099     }
1100     {
1101         UErrorCode status = U_ZERO_ERROR;
1102         UnicodeSet s1("[:digit:]", status);
1103         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1104         TEST_ASSERT_SUCCESS(status);
1105         TEST_ASSERT(s1==s2);
1106     }
1107     {
1108         UErrorCode status = U_ZERO_ERROR;
1109         UnicodeSet s1("[:xdigit:]", status);
1110         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1111         TEST_ASSERT_SUCCESS(status);
1112         TEST_ASSERT(s1==s2);
1113     }
1114     {
1115         UErrorCode status = U_ZERO_ERROR;
1116         UnicodeSet s1("[:alnum:]", status);
1117         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1118         TEST_ASSERT_SUCCESS(status);
1119         TEST_ASSERT(s1==s2);
1120     }
1121     {
1122         UErrorCode status = U_ZERO_ERROR;
1123         UnicodeSet s1("[:space:]", status);
1124         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1125         TEST_ASSERT_SUCCESS(status);
1126         TEST_ASSERT(s1==s2);
1127     }
1128     {
1129         UErrorCode status = U_ZERO_ERROR;
1130         UnicodeSet s1("[:blank:]", status);
1131         TEST_ASSERT_SUCCESS(status);
1132         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1133             status);
1134         TEST_ASSERT_SUCCESS(status);
1135         TEST_ASSERT(s1==s2);
1136     }
1137     {
1138         UErrorCode status = U_ZERO_ERROR;
1139         UnicodeSet s1("[:cntrl:]", status);
1140         TEST_ASSERT_SUCCESS(status);
1141         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1142         TEST_ASSERT_SUCCESS(status);
1143         TEST_ASSERT(s1==s2);
1144     }
1145     {
1146         UErrorCode status = U_ZERO_ERROR;
1147         UnicodeSet s1("[:graph:]", status);
1148         TEST_ASSERT_SUCCESS(status);
1149         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1150         TEST_ASSERT_SUCCESS(status);
1151         TEST_ASSERT(s1==s2);
1152     }
1153     {
1154         UErrorCode status = U_ZERO_ERROR;
1155         UnicodeSet s1("[:print:]", status);
1156         TEST_ASSERT_SUCCESS(status);
1157         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1158         TEST_ASSERT_SUCCESS(status);
1159         TEST_ASSERT(s1==s2);
1160     }
1161 }
1162 /**
1163  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1164  */
TestClone()1165 void UnicodeSetTest::TestClone() {
1166     UErrorCode ec = U_ZERO_ERROR;
1167     UnicodeSet s("[abcxyz]", ec);
1168     UnicodeSet t(s);
1169     expectContainment(t, "abc", "def");
1170 }
1171 
1172 /**
1173  * Test the indexOf() and charAt() methods.
1174  */
TestIndexOf()1175 void UnicodeSetTest::TestIndexOf() {
1176     UErrorCode ec = U_ZERO_ERROR;
1177     UnicodeSet set("[a-cx-y3578]", ec);
1178     if (U_FAILURE(ec)) {
1179         errln("FAIL: UnicodeSet constructor");
1180         return;
1181     }
1182     for (int32_t i=0; i<set.size(); ++i) {
1183         UChar32 c = set.charAt(i);
1184         if (set.indexOf(c) != i) {
1185             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1186                 i, c, set.indexOf(c));
1187         }
1188     }
1189     UChar32 c = set.charAt(set.size());
1190     if (c != -1) {
1191         errln("FAIL: charAt(<out of range>) = %X", c);
1192     }
1193     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1194     if (j != -1) {
1195         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1196     }
1197 }
1198 
1199 /**
1200  * Test closure API.
1201  */
TestCloseOver()1202 void UnicodeSetTest::TestCloseOver() {
1203     UErrorCode ec = U_ZERO_ERROR;
1204 
1205     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1206     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1207     const char* DATA[] = {
1208         // selector, input, output
1209         CASE,
1210         "[aq\\u00DF{Bc}{bC}{Fi}]",
1211         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1212 
1213         CASE,
1214         "[\\u01F1]", // 'DZ'
1215         "[\\u01F1\\u01F2\\u01F3]",
1216 
1217         CASE,
1218         "[\\u1FB4]",
1219         "[\\u1FB4{\\u03AC\\u03B9}]",
1220 
1221         CASE,
1222         "[{F\\uFB01}]",
1223         "[\\uFB03{ffi}]",
1224 
1225         CASE, // make sure binary search finds limits
1226         "[a\\uFF3A]",
1227         "[aA\\uFF3A\\uFF5A]",
1228 
1229         CASE,
1230         "[a-z]","[A-Za-z\\u017F\\u212A]",
1231         CASE,
1232         "[abc]","[A-Ca-c]",
1233         CASE,
1234         "[ABC]","[A-Ca-c]",
1235 
1236         CASE, "[i]", "[iI]",
1237 
1238         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1239         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1240 
1241         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1242 
1243         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1244 
1245         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1246 
1247         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1248 
1249         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1250 
1251         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1252 
1253         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1254         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1255 
1256         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1257 
1258         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1259 
1260         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1261 
1262 #if !UCONFIG_NO_FILE_IO
1263         CASE_MAPPINGS,
1264         "[aq\\u00DF{Bc}{bC}{Fi}]",
1265         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1266 #endif
1267 
1268         CASE_MAPPINGS,
1269         "[\\u01F1]", // 'DZ'
1270         "[\\u01F1\\u01F2\\u01F3]",
1271 
1272         CASE_MAPPINGS,
1273         "[a-z]",
1274         "[A-Za-z]",
1275 
1276         NULL
1277     };
1278 
1279     UnicodeSet s;
1280     UnicodeSet t;
1281     UnicodeString buf;
1282     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1283         int32_t selector = DATA[i][0];
1284         UnicodeString pat(DATA[i+1], -1, US_INV);
1285         UnicodeString exp(DATA[i+2], -1, US_INV);
1286         s.applyPattern(pat, ec);
1287         s.closeOver(selector);
1288         t.applyPattern(exp, ec);
1289         if (U_FAILURE(ec)) {
1290             errln("FAIL: applyPattern failed");
1291             continue;
1292         }
1293         if (s == t) {
1294             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1295         } else {
1296             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1297                   s.toPattern(buf, TRUE) + ", expected " + exp);
1298         }
1299     }
1300 
1301 #if 0
1302     /*
1303      * Unused test code.
1304      * This was used to compare the old implementation (using USET_CASE)
1305      * with the new one (using 0x100 temporarily)
1306      * while transitioning from hardcoded case closure tables in uniset.cpp
1307      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1308      * and using ucase.c functions for closure.
1309      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1310      *
1311      * Note: The old and new implementation never fully matched because
1312      * the old implementation turned out to not map U+0130 and U+0131 correctly
1313      * (dotted I and dotless i) and because the old implementation's data tables
1314      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1315      * new implementation. (So sigmas and some other characters were not handled
1316      * according to the newer Unicode version.)
1317      */
1318     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1319     UnicodeSetIterator si(sens);
1320     UnicodeString str, buf2;
1321     const UnicodeString *pStr;
1322     UChar32 c;
1323     while(si.next()) {
1324         if(!si.isString()) {
1325             c=si.getCodepoint();
1326             s.clear();
1327             s.add(c);
1328 
1329             str.setTo(c);
1330             str.foldCase();
1331             sens2.add(str);
1332 
1333             t=s;
1334             s.closeOver(USET_CASE);
1335             t.closeOver(0x100);
1336             if(s!=t) {
1337                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1338                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1339             }
1340         }
1341     }
1342     // remove all code points
1343     // should contain all full case folding mapping strings
1344     sens2.remove(0, 0x10ffff);
1345     si.reset(sens2);
1346     while(si.next()) {
1347         if(si.isString()) {
1348             pStr=&si.getString();
1349             s.clear();
1350             s.add(*pStr);
1351             t=s2=s;
1352             s.closeOver(USET_CASE);
1353             t.closeOver(0x100);
1354             if(s!=t) {
1355                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1356                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1357             }
1358         }
1359     }
1360 #endif
1361 
1362     // Test the pattern API
1363     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1364     if (U_FAILURE(ec)) {
1365         errln("FAIL: applyPattern failed");
1366     } else {
1367         expectContainment(s, "abcABC", "defDEF");
1368     }
1369     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1370     if (U_FAILURE(ec)) {
1371         errln("FAIL: constructor failed");
1372     } else {
1373         expectContainment(v, "defDEF", "abcABC");
1374     }
1375     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1376     if (U_FAILURE(ec)) {
1377         errln("FAIL: construct w/case mappings failed");
1378     } else {
1379         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1380     }
1381 }
1382 
TestEscapePattern()1383 void UnicodeSetTest::TestEscapePattern() {
1384     const char pattern[] =
1385         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1386     const char exp[] =
1387         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1388     // We test this with two passes; in the second pass we
1389     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1390     // this fails -- which is what we expect.
1391     for (int32_t pass=1; pass<=2; ++pass) {
1392         UErrorCode ec = U_ZERO_ERROR;
1393         UnicodeString pat(pattern, -1, US_INV);
1394         if (pass==2) {
1395             pat = pat.unescape();
1396         }
1397         // Pattern is only good for pass 1
1398         UBool isPatternValid = (pass==1);
1399 
1400         UnicodeSet set(pat, ec);
1401         if (U_SUCCESS(ec) != isPatternValid){
1402             errln((UnicodeString)"FAIL: applyPattern(" +
1403                   escape(pat) + ") => " +
1404                   u_errorName(ec));
1405             continue;
1406         }
1407         if (U_FAILURE(ec)) {
1408             continue;
1409         }
1410         if (set.contains((UChar)0x0644)){
1411             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1412         }
1413 
1414         UnicodeString newpat;
1415         set.toPattern(newpat, TRUE);
1416         if (newpat == UnicodeString(exp, -1, US_INV)) {
1417             logln(escape(pat) + " => " + newpat);
1418         } else {
1419             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1420         }
1421 
1422         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1423             UnicodeString str("Range ");
1424             str.append((UChar)(0x30 + i))
1425                 .append(": ")
1426                 .append((UChar32)set.getRangeStart(i))
1427                 .append(" - ")
1428                 .append((UChar32)set.getRangeEnd(i));
1429             str = str + " (" + set.getRangeStart(i) + " - " +
1430                 set.getRangeEnd(i) + ")";
1431             if (set.getRangeStart(i) < 0) {
1432                 errln((UnicodeString)"FAIL: " + escape(str));
1433             } else {
1434                 logln(escape(str));
1435             }
1436         }
1437     }
1438 }
1439 
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1440 void UnicodeSetTest::expectRange(const UnicodeString& label,
1441                                  const UnicodeSet& set,
1442                                  UChar32 start, UChar32 end) {
1443     UnicodeSet exp(start, end);
1444     UnicodeString pat;
1445     if (set == exp) {
1446         logln(label + " => " + set.toPattern(pat, TRUE));
1447     } else {
1448         UnicodeString xpat;
1449         errln((UnicodeString)"FAIL: " + label + " => " +
1450               set.toPattern(pat, TRUE) +
1451               ", expected " + exp.toPattern(xpat, TRUE));
1452     }
1453 }
1454 
TestInvalidCodePoint()1455 void UnicodeSetTest::TestInvalidCodePoint() {
1456 
1457     const UChar32 DATA[] = {
1458         // Test range             Expected range
1459         0, 0x10FFFF,              0, 0x10FFFF,
1460         (UChar32)-1, 8,           0, 8,
1461         8, 0x110000,              8, 0x10FFFF
1462     };
1463     const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1464 
1465     UnicodeString pat;
1466     int32_t i;
1467 
1468     for (i=0; i<DATA_LENGTH; i+=4) {
1469         UChar32 start  = DATA[i];
1470         UChar32 end    = DATA[i+1];
1471         UChar32 xstart = DATA[i+2];
1472         UChar32 xend   = DATA[i+3];
1473 
1474         // Try various API using the test code points
1475 
1476         UnicodeSet set(start, end);
1477         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1478                     set, xstart, xend);
1479 
1480         set.clear();
1481         set.set(start, end);
1482         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1483                     set, xstart, xend);
1484 
1485         UBool b = set.contains(start);
1486         b = set.contains(start, end);
1487         b = set.containsNone(start, end);
1488         b = set.containsSome(start, end);
1489         (void)b;   // Suppress set but not used warning.
1490 
1491         /*int32_t index = set.indexOf(start);*/
1492 
1493         set.clear();
1494         set.add(start);
1495         set.add(start, end);
1496         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1497                     set, xstart, xend);
1498 
1499         set.set(0, 0x10FFFF);
1500         set.retain(start, end);
1501         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1502                     set, xstart, xend);
1503         set.retain(start);
1504 
1505         set.set(0, 0x10FFFF);
1506         set.remove(start);
1507         set.remove(start, end);
1508         set.complement();
1509         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1510                     set, xstart, xend);
1511 
1512         set.set(0, 0x10FFFF);
1513         set.complement(start, end);
1514         set.complement();
1515         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1516                     set, xstart, xend);
1517         set.complement(start);
1518     }
1519 
1520     const UChar32 DATA2[] = {
1521         0,
1522         0x10FFFF,
1523         (UChar32)-1,
1524         0x110000
1525     };
1526     const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1527 
1528     for (i=0; i<DATA2_LENGTH; ++i) {
1529         UChar32 c = DATA2[i], end = 0x10FFFF;
1530         UBool valid = (c >= 0 && c <= 0x10FFFF);
1531 
1532         UnicodeSet set(0, 0x10FFFF);
1533 
1534         // For single-codepoint contains, invalid codepoints are NOT contained
1535         UBool b = set.contains(c);
1536         if (b == valid) {
1537             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1538                   ") = " + b);
1539         } else {
1540             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1541                   ") = " + b);
1542         }
1543 
1544         // For codepoint range contains, containsNone, and containsSome,
1545         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1546         b = set.contains(c, end);
1547         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1548               "," + end + ") = " + b);
1549 
1550         b = set.containsNone(c, end);
1551         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1552               "," + end + ") = " + b);
1553 
1554         b = set.containsSome(c, end);
1555         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1556               "," + end + ") = " + b);
1557 
1558         int32_t index = set.indexOf(c);
1559         if ((index >= 0) == valid) {
1560             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1561                   ") = " + index);
1562         } else {
1563             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1564                   ") = " + index);
1565         }
1566     }
1567 }
1568 
1569 // Used by TestSymbolTable
1570 class TokenSymbolTable : public SymbolTable {
1571 public:
1572     Hashtable contents;
1573 
TokenSymbolTable(UErrorCode & ec)1574     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1575         contents.setValueDeleter(uprv_deleteUObject);
1576     }
1577 
~TokenSymbolTable()1578     ~TokenSymbolTable() {}
1579 
1580     /**
1581      * (Non-SymbolTable API) Add the given variable and value to
1582      * the table.  Variable should NOT contain leading '$'.
1583      */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1584     void add(const UnicodeString& var, const UnicodeString& value,
1585              UErrorCode& ec) {
1586         if (U_SUCCESS(ec)) {
1587             contents.put(var, new UnicodeString(value), ec);
1588         }
1589     }
1590 
1591     /**
1592      * SymbolTable API
1593      */
lookup(const UnicodeString & s) const1594     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1595         return (const UnicodeString*) contents.get(s);
1596     }
1597 
1598     /**
1599      * SymbolTable API
1600      */
lookupMatcher(UChar32) const1601     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1602         return NULL;
1603     }
1604 
1605     /**
1606      * SymbolTable API
1607      */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1608     virtual UnicodeString parseReference(const UnicodeString& text,
1609                                          ParsePosition& pos, int32_t limit) const {
1610         int32_t start = pos.getIndex();
1611         int32_t i = start;
1612         UnicodeString result;
1613         while (i < limit) {
1614             UChar c = text.charAt(i);
1615             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1616                 break;
1617             }
1618             ++i;
1619         }
1620         if (i == start) { // No valid name chars
1621             return result; // Indicate failure with empty string
1622         }
1623         pos.setIndex(i);
1624         text.extractBetween(start, i, result);
1625         return result;
1626     }
1627 };
1628 
TestSymbolTable()1629 void UnicodeSetTest::TestSymbolTable() {
1630     // Multiple test cases can be set up here.  Each test case
1631     // is terminated by null:
1632     // var, value, var, value,..., input pat., exp. output pat., null
1633     const char* DATA[] = {
1634         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1635         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1636         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1637         NULL
1638     };
1639 
1640     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1641         UErrorCode ec = U_ZERO_ERROR;
1642         TokenSymbolTable sym(ec);
1643         if (U_FAILURE(ec)) {
1644             errln("FAIL: couldn't construct TokenSymbolTable");
1645             continue;
1646         }
1647 
1648         // Set up variables
1649         while (DATA[i+2] != NULL) {
1650             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1651             if (U_FAILURE(ec)) {
1652                 errln("FAIL: couldn't add to TokenSymbolTable");
1653                 continue;
1654             }
1655             i += 2;
1656         }
1657 
1658         // Input pattern and expected output pattern
1659         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1660         i += 2;
1661 
1662         ParsePosition pos(0);
1663         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1664         if (U_FAILURE(ec)) {
1665             errln("FAIL: couldn't construct UnicodeSet");
1666             continue;
1667         }
1668 
1669         // results
1670         if (pos.getIndex() != inpat.length()) {
1671             errln((UnicodeString)"Failed to read to end of string \""
1672                   + inpat + "\": read to "
1673                   + pos.getIndex() + ", length is "
1674                   + inpat.length());
1675         }
1676 
1677         UnicodeSet us2(exppat, ec);
1678         if (U_FAILURE(ec)) {
1679             errln("FAIL: couldn't construct expected UnicodeSet");
1680             continue;
1681         }
1682 
1683         UnicodeString a, b;
1684         if (us != us2) {
1685             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1686                   ", expected " + us2.toPattern(b, TRUE));
1687         } else {
1688             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1689         }
1690     }
1691 }
1692 
TestSurrogate()1693 void UnicodeSetTest::TestSurrogate() {
1694     const char* DATA[] = {
1695         // These should all behave identically
1696         "[abc\\uD800\\uDC00]",
1697         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1698         "[abc\\U00010000]",
1699         0
1700     };
1701     for (int i=0; DATA[i] != 0; ++i) {
1702         UErrorCode ec = U_ZERO_ERROR;
1703         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1704         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1705         UnicodeSet set(str, ec);
1706         if (U_FAILURE(ec)) {
1707             errln("FAIL: UnicodeSet constructor");
1708             continue;
1709         }
1710         expectContainment(set,
1711                           CharsToUnicodeString("abc\\U00010000"),
1712                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1713         if (set.size() != 4) {
1714             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1715                   set.size() + ", expected 4");
1716         }
1717     }
1718 }
1719 
TestExhaustive()1720 void UnicodeSetTest::TestExhaustive() {
1721     // exhaustive tests. Simulate UnicodeSets with integers.
1722     // That gives us very solid tests (except for large memory tests).
1723 
1724     int32_t limit = 128;
1725 
1726     UnicodeSet x, y, z, aa;
1727 
1728     for (int32_t i = 0; i < limit; ++i) {
1729         bitsToSet(i, x);
1730         logln((UnicodeString)"Testing " + i + ", " + x);
1731         _testComplement(i, x, y);
1732 
1733         // AS LONG AS WE ARE HERE, check roundtrip
1734         checkRoundTrip(bitsToSet(i, aa));
1735 
1736         for (int32_t j = 0; j < limit; ++j) {
1737             _testAdd(i,j,  x,y,z);
1738             _testXor(i,j,  x,y,z);
1739             _testRetain(i,j,  x,y,z);
1740             _testRemove(i,j,  x,y,z);
1741         }
1742     }
1743 }
1744 
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1745 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1746     bitsToSet(a, x);
1747     z = x;
1748     z.complement();
1749     int32_t c = setToBits(z);
1750     if (c != (~a)) {
1751         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1752         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1753     }
1754     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1755 }
1756 
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1757 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1758     bitsToSet(a, x);
1759     bitsToSet(b, y);
1760     z = x;
1761     z.addAll(y);
1762     int32_t c = setToBits(z);
1763     if (c != (a | b)) {
1764         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1765         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1766     }
1767     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1768 }
1769 
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1770 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1771     bitsToSet(a, x);
1772     bitsToSet(b, y);
1773     z = x;
1774     z.retainAll(y);
1775     int32_t c = setToBits(z);
1776     if (c != (a & b)) {
1777         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1778         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1779     }
1780     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1781 }
1782 
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1783 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1784     bitsToSet(a, x);
1785     bitsToSet(b, y);
1786     z = x;
1787     z.removeAll(y);
1788     int32_t c = setToBits(z);
1789     if (c != (a &~ b)) {
1790         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1791         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1792     }
1793     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1794 }
1795 
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1796 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1797     bitsToSet(a, x);
1798     bitsToSet(b, y);
1799     z = x;
1800     z.complementAll(y);
1801     int32_t c = setToBits(z);
1802     if (c != (a ^ b)) {
1803         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1804         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1805     }
1806     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1807 }
1808 
1809 /**
1810  * Check that ranges are monotonically increasing and non-
1811  * overlapping.
1812  */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1813 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1814     int32_t n = set.getRangeCount();
1815     if (n < 0) {
1816         errln((UnicodeString)"FAIL result of " + msg +
1817               ": range count should be >= 0 but is " +
1818               n /*+ " for " + set.toPattern())*/);
1819         return;
1820     }
1821     UChar32 last = 0;
1822     for (int32_t i=0; i<n; ++i) {
1823         UChar32 start = set.getRangeStart(i);
1824         UChar32 end = set.getRangeEnd(i);
1825         if (start > end) {
1826             errln((UnicodeString)"FAIL result of " + msg +
1827                   ": range " + (i+1) +
1828                   " start > end: " + (int)start + ", " + (int)end +
1829                   " for " + set);
1830         }
1831         if (i > 0 && start <= last) {
1832             errln((UnicodeString)"FAIL result of " + msg +
1833                   ": range " + (i+1) +
1834                   " overlaps previous range: " + (int)start + ", " + (int)end +
1835                   " for " + set);
1836         }
1837         last = end;
1838     }
1839 }
1840 
1841 /**
1842  * Convert a bitmask to a UnicodeSet.
1843  */
bitsToSet(int32_t a,UnicodeSet & result)1844 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1845     result.clear();
1846     for (UChar32 i = 0; i < 32; ++i) {
1847         if ((a & (1<<i)) != 0) {
1848             result.add(i);
1849         }
1850     }
1851     return result;
1852 }
1853 
1854 /**
1855  * Convert a UnicodeSet to a bitmask.  Only the characters
1856  * U+0000 to U+0020 are represented in the bitmask.
1857  */
setToBits(const UnicodeSet & x)1858 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1859     int32_t result = 0;
1860     for (int32_t i = 0; i < 32; ++i) {
1861         if (x.contains((UChar32)i)) {
1862             result |= (1<<i);
1863         }
1864     }
1865     return result;
1866 }
1867 
1868 /**
1869  * Return the representation of an inversion list based UnicodeSet
1870  * as a pairs list.  Ranges are listed in ascending Unicode order.
1871  * For example, the set [a-zA-M3] is represented as "33AMaz".
1872  */
getPairs(const UnicodeSet & set)1873 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1874     UnicodeString pairs;
1875     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1876         UChar32 start = set.getRangeStart(i);
1877         UChar32 end = set.getRangeEnd(i);
1878         if (end > 0xFFFF) {
1879             end = 0xFFFF;
1880             i = set.getRangeCount(); // Should be unnecessary
1881         }
1882         pairs.append((UChar)start).append((UChar)end);
1883     }
1884     return pairs;
1885 }
1886 
1887 /**
1888  * Basic consistency check for a few items.
1889  * That the iterator works, and that we can create a pattern and
1890  * get the same thing back
1891  */
checkRoundTrip(const UnicodeSet & s)1892 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1893     UErrorCode ec = U_ZERO_ERROR;
1894 
1895     UnicodeSet t(s);
1896     checkEqual(s, t, "copy ct");
1897 
1898     t = s;
1899     checkEqual(s, t, "operator=");
1900 
1901     copyWithIterator(t, s, FALSE);
1902     checkEqual(s, t, "iterator roundtrip");
1903 
1904     copyWithIterator(t, s, TRUE); // try range
1905     checkEqual(s, t, "iterator roundtrip");
1906 
1907     UnicodeString pat; s.toPattern(pat, FALSE);
1908     t.applyPattern(pat, ec);
1909     if (U_FAILURE(ec)) {
1910         errln("FAIL: applyPattern");
1911         return;
1912     } else {
1913         checkEqual(s, t, "toPattern(false)");
1914     }
1915 
1916     s.toPattern(pat, TRUE);
1917     t.applyPattern(pat, ec);
1918     if (U_FAILURE(ec)) {
1919         errln("FAIL: applyPattern");
1920         return;
1921     } else {
1922         checkEqual(s, t, "toPattern(true)");
1923     }
1924 }
1925 
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)1926 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1927     t.clear();
1928     UnicodeSetIterator it(s);
1929     if (withRange) {
1930         while (it.nextRange()) {
1931             if (it.isString()) {
1932                 t.add(it.getString());
1933             } else {
1934                 t.add(it.getCodepoint(), it.getCodepointEnd());
1935             }
1936         }
1937     } else {
1938         while (it.next()) {
1939             if (it.isString()) {
1940                 t.add(it.getString());
1941             } else {
1942                 t.add(it.getCodepoint());
1943             }
1944         }
1945     }
1946 }
1947 
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)1948 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1949     UnicodeString source; s.toPattern(source, TRUE);
1950     UnicodeString result; t.toPattern(result, TRUE);
1951     if (s != t) {
1952         errln((UnicodeString)"FAIL: " + message
1953               + "; source = " + source
1954               + "; result = " + result
1955               );
1956         return FALSE;
1957     } else {
1958         logln((UnicodeString)"Ok: " + message
1959               + "; source = " + source
1960               + "; result = " + result
1961               );
1962     }
1963     return TRUE;
1964 }
1965 
1966 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)1967 UnicodeSetTest::expectContainment(const UnicodeString& pat,
1968                                   const UnicodeString& charsIn,
1969                                   const UnicodeString& charsOut) {
1970     UErrorCode ec = U_ZERO_ERROR;
1971     UnicodeSet set(pat, ec);
1972     if (U_FAILURE(ec)) {
1973         dataerrln((UnicodeString)"FAIL: pattern \"" +
1974               pat + "\" => " + u_errorName(ec));
1975         return;
1976     }
1977     expectContainment(set, pat, charsIn, charsOut);
1978 }
1979 
1980 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)1981 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1982                                   const UnicodeString& charsIn,
1983                                   const UnicodeString& charsOut) {
1984     UnicodeString pat;
1985     set.toPattern(pat);
1986     expectContainment(set, pat, charsIn, charsOut);
1987 }
1988 
1989 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)1990 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1991                                   const UnicodeString& setName,
1992                                   const UnicodeString& charsIn,
1993                                   const UnicodeString& charsOut) {
1994     UnicodeString bad;
1995     UChar32 c;
1996     int32_t i;
1997 
1998     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
1999         c = charsIn.char32At(i);
2000         if (!set.contains(c)) {
2001             bad.append(c);
2002         }
2003     }
2004     if (bad.length() > 0) {
2005         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2006               ", expected containment of " + prettify(charsIn));
2007     } else {
2008         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2009     }
2010 
2011     bad.truncate(0);
2012     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2013         c = charsOut.char32At(i);
2014         if (set.contains(c)) {
2015             bad.append(c);
2016         }
2017     }
2018     if (bad.length() > 0) {
2019         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2020               ", expected non-containment of " + prettify(charsOut));
2021     } else {
2022         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2023     }
2024 }
2025 
2026 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2027 UnicodeSetTest::expectPattern(UnicodeSet& set,
2028                               const UnicodeString& pattern,
2029                               const UnicodeString& expectedPairs){
2030     UErrorCode status = U_ZERO_ERROR;
2031     set.applyPattern(pattern, status);
2032     if (U_FAILURE(status)) {
2033         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2034               "\") failed");
2035         return;
2036     } else {
2037         if (getPairs(set) != expectedPairs ) {
2038             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2039                   "\") => pairs \"" +
2040                   escape(getPairs(set)) + "\", expected \"" +
2041                   escape(expectedPairs) + "\"");
2042         } else {
2043             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2044                   "\") => pairs \"" +
2045                   escape(getPairs(set)) + "\"");
2046         }
2047     }
2048     // the result of calling set.toPattern(), which is the string representation of
2049     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2050     // will produce another set that is equal to this one.
2051     UnicodeString temppattern;
2052     set.toPattern(temppattern);
2053     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2054     if (U_FAILURE(status)) {
2055         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2056         return;
2057     }
2058     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2059         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2060             escape(getPairs(set)) + "\""));
2061     } else{
2062         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2063     }
2064 
2065     delete tempset;
2066 
2067 }
2068 
2069 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2070 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2071     if (getPairs(set) != expectedPairs) {
2072         errln(UnicodeString("FAIL: Expected pair list \"") +
2073               escape(expectedPairs) + "\", got \"" +
2074               escape(getPairs(set)) + "\"");
2075     }
2076 }
2077 
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2078 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2079                                      const UnicodeString& expPat,
2080                                      const char** expStrings) {
2081     UnicodeString pat;
2082     set.toPattern(pat, TRUE);
2083     if (pat == expPat) {
2084         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2085     } else {
2086         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2087         return;
2088     }
2089     if (expStrings == NULL) {
2090         return;
2091     }
2092     UBool in = TRUE;
2093     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2094         if (expStrings[i] == NOT) { // sic; pointer comparison
2095             in = FALSE;
2096             continue;
2097         }
2098         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2099         UBool contained = set.contains(s);
2100         if (contained == in) {
2101             logln((UnicodeString)"Ok: " + expPat +
2102                   (contained ? " contains {" : " does not contain {") +
2103                   escape(expStrings[i]) + "}");
2104         } else {
2105             errln((UnicodeString)"FAIL: " + expPat +
2106                   (contained ? " contains {" : " does not contain {") +
2107                   escape(expStrings[i]) + "}");
2108         }
2109     }
2110 }
2111 
toHexString(int32_t i)2112 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2113 
2114 void
doAssert(UBool condition,const char * message)2115 UnicodeSetTest::doAssert(UBool condition, const char *message)
2116 {
2117     if (!condition) {
2118         errln(UnicodeString("ERROR : ") + message);
2119     }
2120 }
2121 
2122 UnicodeString
escape(const UnicodeString & s)2123 UnicodeSetTest::escape(const UnicodeString& s) {
2124     UnicodeString buf;
2125     for (int32_t i=0; i<s.length(); )
2126     {
2127         UChar32 c = s.char32At(i);
2128         if (0x0020 <= c && c <= 0x007F) {
2129             buf += c;
2130         } else {
2131             if (c <= 0xFFFF) {
2132                 buf += (UChar)0x5c; buf += (UChar)0x75;
2133             } else {
2134                 buf += (UChar)0x5c; buf += (UChar)0x55;
2135                 buf += toHexString((c & 0xF0000000) >> 28);
2136                 buf += toHexString((c & 0x0F000000) >> 24);
2137                 buf += toHexString((c & 0x00F00000) >> 20);
2138                 buf += toHexString((c & 0x000F0000) >> 16);
2139             }
2140             buf += toHexString((c & 0xF000) >> 12);
2141             buf += toHexString((c & 0x0F00) >> 8);
2142             buf += toHexString((c & 0x00F0) >> 4);
2143             buf += toHexString(c & 0x000F);
2144         }
2145         i += U16_LENGTH(c);
2146     }
2147     return buf;
2148 }
2149 
/**
 * Test freeze()/isFrozen()/clone()/cloneAsThawed() and verify that none of
 * the modifying UnicodeSet functions changes a frozen set.
 *
 * idSet is the unfrozen reference value throughout; "frozen" is a frozen copy
 * of it, and after each group of attempted modifications it is compared
 * against idSet again.
 */
void UnicodeSetTest::TestFreezable() {
    UErrorCode errorCode=U_ZERO_ERROR;
    UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
    UnicodeSet idSet(idPattern, errorCode);
    if(U_FAILURE(errorCode)) {
        dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
        return;
    }

    UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
    UnicodeSet wsSet(wsPattern, errorCode);
    if(U_FAILURE(errorCode)) {
        dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
        return;
    }

    // Add the pattern text itself as a string element of the reference set,
    // then take a copy and freeze only the copy.
    idSet.add(idPattern);
    UnicodeSet frozen(idSet);
    frozen.freeze();

    if(idSet.isFrozen() || !frozen.isFrozen()) {
        errln("FAIL: isFrozen() is wrong");
    }
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: a copy-constructed frozen set differs from its original");
    }

    // Assignment to a frozen set must leave it unchanged.
    frozen=wsSet;
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: a frozen set was modified by operator=");
    }

    // Copies of a frozen set (by construction and by assignment) are
    // expected to be frozen too.
    UnicodeSet frozen2(frozen);
    if(frozen2!=frozen || frozen2!=idSet) {
        errln("FAIL: a copied frozen set differs from its frozen original");
    }
    if(!frozen2.isFrozen()) {
        errln("FAIL: copy-constructing a frozen set results in a thawed one");
    }
    UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
    if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
        errln("FAIL: UnicodeSet(5, 55) failed");
    }
    frozen3=frozen;
    if(!frozen3.isFrozen()) {
        errln("FAIL: copying a frozen set results in a thawed one");
    }

    // clone() of a frozen set: expected to be frozen and equal to the original.
    UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
    if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
        errln("FAIL: clone() failed");
    }
    // NOTE(review): the clone is expected to be frozen (checked above), so this
    // add() should have no effect; the check below fails when the clone WAS
    // modified, which the message text does not say -- confirm wording.
    cloned->add(0xd802, 0xd805);
    if(cloned->containsSome(0xd802, 0xd805)) {
        errln("FAIL: unable to modify clone");
    }
    delete cloned;

    // cloneAsThawed(): expected to be equal to the original but modifiable.
    UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
    if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
        errln("FAIL: cloneAsThawed() failed");
    }
    thawed->add(0xd802, 0xd805);
    if(!thawed->contains(0xd802, 0xd805)) {
        errln("FAIL: unable to modify thawed clone");
    }
    delete thawed;

    // From here on: call each modifying function on the frozen set and
    // verify after each group that the set still equals idSet.
    frozen.set(5, 55);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::set() modified a frozen set");
    }

    frozen.clear();
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::clear() modified a frozen set");
    }

    frozen.closeOver(USET_CASE_INSENSITIVE);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
    }

    frozen.compact();
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::compact() modified a frozen set");
    }

    // All applyXYZ() variants, chained.  errorCode is not inspected
    // afterwards; only the set contents are checked.
    ParsePosition pos;
    frozen.
        applyPattern(wsPattern, errorCode).
        applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
        applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
        applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
        applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
    }

    // add(): single code point, range, string, and the addAll() variants.
    frozen.
        add(0xd800).
        add(0xd802, 0xd805).
        add(wsPattern).
        addAll(idPattern).
        addAll(wsSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
    }

    // retain() and retainAll() variants.
    frozen.
        retain(0x62).
        retain(0x64, 0x69).
        retainAll(wsPattern).
        retainAll(wsSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
    }

    // remove() and removeAll() variants.
    frozen.
        remove(0x62).
        remove(0x64, 0x69).
        remove(idPattern).
        removeAll(idPattern).
        removeAll(idSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
    }

    // complement() and complementAll() variants.
    frozen.
        complement().
        complement(0x62).
        complement(0x64, 0x69).
        complement(idPattern).
        complementAll(idPattern).
        complementAll(idSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
    }
}
2289 
2290 // Test span() etc. -------------------------------------------------------- ***
2291 
2292 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2293 static int32_t
appendUTF8(const UChar * s,int32_t length,char * t,int32_t capacity)2294 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2295     UErrorCode errorCode=U_ZERO_ERROR;
2296     int32_t length8=0;
2297     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2298     if(U_SUCCESS(errorCode)) {
2299         return length8;
2300     } else {
2301         // The string contains an unpaired surrogate.
2302         // Ignore this string.
2303         return 0;
2304     }
2305 }
2306 
2307 class UnicodeSetWithStringsIterator;
2308 
2309 // Make the strings in a UnicodeSet easily accessible.
2310 class UnicodeSetWithStrings {
2311 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2312     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2313             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2314         int32_t size=set.size();
2315         if(size>0 && set.charAt(size-1)<0) {
2316             // If a set's last element is not a code point, then it must contain strings.
2317             // Iterate over the set, skip all code point ranges, and cache the strings.
2318             // Convert them to UTF-8 for spanUTF8().
2319             UnicodeSetIterator iter(set);
2320             const UnicodeString *s;
2321             char *s8=utf8;
2322             int32_t length8, utf8Count=0;
2323             while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2324                 if(iter.isString()) {
2325                     // Store the pointer to the set's string element
2326                     // which we happen to know is a stable pointer.
2327                     strings[stringsLength]=s=&iter.getString();
2328                     utf8Count+=
2329                         utf8Lengths[stringsLength]=length8=
2330                         appendUTF8(s->getBuffer(), s->length(),
2331                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2332                     if(length8==0) {
2333                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
2334                     }
2335                     s8+=length8;
2336                     ++stringsLength;
2337                 }
2338             }
2339         }
2340     }
2341 
getSet() const2342     const UnicodeSet &getSet() const {
2343         return set;
2344     }
2345 
hasStrings() const2346     UBool hasStrings() const {
2347         return (UBool)(stringsLength>0);
2348     }
2349 
hasStringsWithSurrogates() const2350     UBool hasStringsWithSurrogates() const {
2351         return hasSurrogates;
2352     }
2353 
2354 private:
2355     friend class UnicodeSetWithStringsIterator;
2356 
2357     const UnicodeSet &set;
2358 
2359     const UnicodeString *strings[20];
2360     int32_t stringsLength;
2361     UBool hasSurrogates;
2362 
2363     char utf8[1024];
2364     int32_t utf8Lengths[20];
2365 };
2366 
2367 class UnicodeSetWithStringsIterator {
2368 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2369     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2370             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2371     }
2372 
reset()2373     void reset() {
2374         nextStringIndex=nextUTF8Start=0;
2375     }
2376 
nextString()2377     const UnicodeString *nextString() {
2378         if(nextStringIndex<fSet.stringsLength) {
2379             return fSet.strings[nextStringIndex++];
2380         } else {
2381             return NULL;
2382         }
2383     }
2384 
2385     // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2386     const char *nextUTF8(int32_t &length) {
2387         if(nextStringIndex<fSet.stringsLength) {
2388             const char *s8=fSet.utf8+nextUTF8Start;
2389             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2390             return s8;
2391         } else {
2392             length=0;
2393             return NULL;
2394         }
2395     }
2396 
2397 private:
2398     const UnicodeSetWithStrings &fSet;
2399     int32_t nextStringIndex;
2400     int32_t nextUTF8Start;
2401 };
2402 
2403 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2404 // at code point boundaries.
2405 // That is, each edge of a match must not be in the middle of a surrogate pair.
2406 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UnicodeString & t)2407 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2408     s+=start;
2409     limit-=start;
2410     int32_t length=t.length();
2411     return 0==t.compare(s, length) &&
2412            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2413            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2414 }
2415 
2416 // Implement span() with contains() for comparison.
containsSpanUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2417 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2418                                  USetSpanCondition spanCondition) {
2419     const UnicodeSet &realSet(set.getSet());
2420     if(!set.hasStrings()) {
2421         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2422             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2423         }
2424 
2425         UChar32 c;
2426         int32_t start=0, prev;
2427         while((prev=start)<length) {
2428             U16_NEXT(s, start, length, c);
2429             if(realSet.contains(c)!=spanCondition) {
2430                 break;
2431             }
2432         }
2433         return prev;
2434     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2435         UnicodeSetWithStringsIterator iter(set);
2436         UChar32 c;
2437         int32_t start, next;
2438         for(start=next=0; start<length;) {
2439             U16_NEXT(s, next, length, c);
2440             if(realSet.contains(c)) {
2441                 break;
2442             }
2443             const UnicodeString *str;
2444             iter.reset();
2445             while((str=iter.nextString())!=NULL) {
2446                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2447                     // spanNeedsStrings=TRUE;
2448                     return start;
2449                 }
2450             }
2451             start=next;
2452         }
2453         return start;
2454     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2455         UnicodeSetWithStringsIterator iter(set);
2456         UChar32 c;
2457         int32_t start, next, maxSpanLimit=0;
2458         for(start=next=0; start<length;) {
2459             U16_NEXT(s, next, length, c);
2460             if(!realSet.contains(c)) {
2461                 next=start;  // Do not span this single, not-contained code point.
2462             }
2463             const UnicodeString *str;
2464             iter.reset();
2465             while((str=iter.nextString())!=NULL) {
2466                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2467                     // spanNeedsStrings=TRUE;
2468                     int32_t matchLimit=start+str->length();
2469                     if(matchLimit==length) {
2470                         return length;
2471                     }
2472                     if(spanCondition==USET_SPAN_CONTAINED) {
2473                         // Iterate for the shortest match at each position.
2474                         // Recurse for each but the shortest match.
2475                         if(next==start) {
2476                             next=matchLimit;  // First match from start.
2477                         } else {
2478                             if(matchLimit<next) {
2479                                 // Remember shortest match from start for iteration.
2480                                 int32_t temp=next;
2481                                 next=matchLimit;
2482                                 matchLimit=temp;
2483                             }
2484                             // Recurse for non-shortest match from start.
2485                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2486                                                                  USET_SPAN_CONTAINED);
2487                             if((matchLimit+spanLength)>maxSpanLimit) {
2488                                 maxSpanLimit=matchLimit+spanLength;
2489                                 if(maxSpanLimit==length) {
2490                                     return length;
2491                                 }
2492                             }
2493                         }
2494                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2495                         if(matchLimit>next) {
2496                             // Remember longest match from start.
2497                             next=matchLimit;
2498                         }
2499                     }
2500                 }
2501             }
2502             if(next==start) {
2503                 break;  // No match from start.
2504             }
2505             start=next;
2506         }
2507         if(start>maxSpanLimit) {
2508             return start;
2509         } else {
2510             return maxSpanLimit;
2511         }
2512     }
2513 }
2514 
containsSpanBackUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2515 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2516                                      USetSpanCondition spanCondition) {
2517     if(length==0) {
2518         return 0;
2519     }
2520     const UnicodeSet &realSet(set.getSet());
2521     if(!set.hasStrings()) {
2522         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2523             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2524         }
2525 
2526         UChar32 c;
2527         int32_t prev=length;
2528         do {
2529             U16_PREV(s, 0, length, c);
2530             if(realSet.contains(c)!=spanCondition) {
2531                 break;
2532             }
2533         } while((prev=length)>0);
2534         return prev;
2535     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2536         UnicodeSetWithStringsIterator iter(set);
2537         UChar32 c;
2538         int32_t prev=length, length0=length;
2539         do {
2540             U16_PREV(s, 0, length, c);
2541             if(realSet.contains(c)) {
2542                 break;
2543             }
2544             const UnicodeString *str;
2545             iter.reset();
2546             while((str=iter.nextString())!=NULL) {
2547                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2548                     // spanNeedsStrings=TRUE;
2549                     return prev;
2550                 }
2551             }
2552         } while((prev=length)>0);
2553         return prev;
2554     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2555         UnicodeSetWithStringsIterator iter(set);
2556         UChar32 c;
2557         int32_t prev=length, minSpanStart=length, length0=length;
2558         do {
2559             U16_PREV(s, 0, length, c);
2560             if(!realSet.contains(c)) {
2561                 length=prev;  // Do not span this single, not-contained code point.
2562             }
2563             const UnicodeString *str;
2564             iter.reset();
2565             while((str=iter.nextString())!=NULL) {
2566                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2567                     // spanNeedsStrings=TRUE;
2568                     int32_t matchStart=prev-str->length();
2569                     if(matchStart==0) {
2570                         return 0;
2571                     }
2572                     if(spanCondition==USET_SPAN_CONTAINED) {
2573                         // Iterate for the shortest match at each position.
2574                         // Recurse for each but the shortest match.
2575                         if(length==prev) {
2576                             length=matchStart;  // First match from prev.
2577                         } else {
2578                             if(matchStart>length) {
2579                                 // Remember shortest match from prev for iteration.
2580                                 int32_t temp=length;
2581                                 length=matchStart;
2582                                 matchStart=temp;
2583                             }
2584                             // Recurse for non-shortest match from prev.
2585                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2586                                                                     USET_SPAN_CONTAINED);
2587                             if(spanStart<minSpanStart) {
2588                                 minSpanStart=spanStart;
2589                                 if(minSpanStart==0) {
2590                                     return 0;
2591                                 }
2592                             }
2593                         }
2594                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2595                         if(matchStart<length) {
2596                             // Remember longest match from prev.
2597                             length=matchStart;
2598                         }
2599                     }
2600                 }
2601             }
2602             if(length==prev) {
2603                 break;  // No match from prev.
2604             }
2605         } while((prev=length)>0);
2606         if(prev<minSpanStart) {
2607             return prev;
2608         } else {
2609             return minSpanStart;
2610         }
2611     }
2612 }
2613 
containsSpanUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2614 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2615                                 USetSpanCondition spanCondition) {
2616     const UnicodeSet &realSet(set.getSet());
2617     if(!set.hasStrings()) {
2618         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2619             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2620         }
2621 
2622         UChar32 c;
2623         int32_t start=0, prev;
2624         while((prev=start)<length) {
2625             U8_NEXT_OR_FFFD(s, start, length, c);
2626             if(realSet.contains(c)!=spanCondition) {
2627                 break;
2628             }
2629         }
2630         return prev;
2631     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2632         UnicodeSetWithStringsIterator iter(set);
2633         UChar32 c;
2634         int32_t start, next;
2635         for(start=next=0; start<length;) {
2636             U8_NEXT_OR_FFFD(s, next, length, c);
2637             if(realSet.contains(c)) {
2638                 break;
2639             }
2640             const char *s8;
2641             int32_t length8;
2642             iter.reset();
2643             while((s8=iter.nextUTF8(length8))!=NULL) {
2644                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2645                     // spanNeedsStrings=TRUE;
2646                     return start;
2647                 }
2648             }
2649             start=next;
2650         }
2651         return start;
2652     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2653         UnicodeSetWithStringsIterator iter(set);
2654         UChar32 c;
2655         int32_t start, next, maxSpanLimit=0;
2656         for(start=next=0; start<length;) {
2657             U8_NEXT_OR_FFFD(s, next, length, c);
2658             if(!realSet.contains(c)) {
2659                 next=start;  // Do not span this single, not-contained code point.
2660             }
2661             const char *s8;
2662             int32_t length8;
2663             iter.reset();
2664             while((s8=iter.nextUTF8(length8))!=NULL) {
2665                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2666                     // spanNeedsStrings=TRUE;
2667                     int32_t matchLimit=start+length8;
2668                     if(matchLimit==length) {
2669                         return length;
2670                     }
2671                     if(spanCondition==USET_SPAN_CONTAINED) {
2672                         // Iterate for the shortest match at each position.
2673                         // Recurse for each but the shortest match.
2674                         if(next==start) {
2675                             next=matchLimit;  // First match from start.
2676                         } else {
2677                             if(matchLimit<next) {
2678                                 // Remember shortest match from start for iteration.
2679                                 int32_t temp=next;
2680                                 next=matchLimit;
2681                                 matchLimit=temp;
2682                             }
2683                             // Recurse for non-shortest match from start.
2684                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2685                                                                 USET_SPAN_CONTAINED);
2686                             if((matchLimit+spanLength)>maxSpanLimit) {
2687                                 maxSpanLimit=matchLimit+spanLength;
2688                                 if(maxSpanLimit==length) {
2689                                     return length;
2690                                 }
2691                             }
2692                         }
2693                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2694                         if(matchLimit>next) {
2695                             // Remember longest match from start.
2696                             next=matchLimit;
2697                         }
2698                     }
2699                 }
2700             }
2701             if(next==start) {
2702                 break;  // No match from start.
2703             }
2704             start=next;
2705         }
2706         if(start>maxSpanLimit) {
2707             return start;
2708         } else {
2709             return maxSpanLimit;
2710         }
2711     }
2712 }
2713 
containsSpanBackUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2714 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2715                                     USetSpanCondition spanCondition) {
2716     if(length==0) {
2717         return 0;
2718     }
2719     const UnicodeSet &realSet(set.getSet());
2720     if(!set.hasStrings()) {
2721         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2722             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2723         }
2724 
2725         UChar32 c;
2726         int32_t prev=length;
2727         do {
2728             U8_PREV_OR_FFFD(s, 0, length, c);
2729             if(realSet.contains(c)!=spanCondition) {
2730                 break;
2731             }
2732         } while((prev=length)>0);
2733         return prev;
2734     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2735         UnicodeSetWithStringsIterator iter(set);
2736         UChar32 c;
2737         int32_t prev=length;
2738         do {
2739             U8_PREV_OR_FFFD(s, 0, length, c);
2740             if(realSet.contains(c)) {
2741                 break;
2742             }
2743             const char *s8;
2744             int32_t length8;
2745             iter.reset();
2746             while((s8=iter.nextUTF8(length8))!=NULL) {
2747                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2748                     // spanNeedsStrings=TRUE;
2749                     return prev;
2750                 }
2751             }
2752         } while((prev=length)>0);
2753         return prev;
2754     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2755         UnicodeSetWithStringsIterator iter(set);
2756         UChar32 c;
2757         int32_t prev=length, minSpanStart=length;
2758         do {
2759             U8_PREV_OR_FFFD(s, 0, length, c);
2760             if(!realSet.contains(c)) {
2761                 length=prev;  // Do not span this single, not-contained code point.
2762             }
2763             const char *s8;
2764             int32_t length8;
2765             iter.reset();
2766             while((s8=iter.nextUTF8(length8))!=NULL) {
2767                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2768                     // spanNeedsStrings=TRUE;
2769                     int32_t matchStart=prev-length8;
2770                     if(matchStart==0) {
2771                         return 0;
2772                     }
2773                     if(spanCondition==USET_SPAN_CONTAINED) {
2774                         // Iterate for the shortest match at each position.
2775                         // Recurse for each but the shortest match.
2776                         if(length==prev) {
2777                             length=matchStart;  // First match from prev.
2778                         } else {
2779                             if(matchStart>length) {
2780                                 // Remember shortest match from prev for iteration.
2781                                 int32_t temp=length;
2782                                 length=matchStart;
2783                                 matchStart=temp;
2784                             }
2785                             // Recurse for non-shortest match from prev.
2786                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2787                                                                    USET_SPAN_CONTAINED);
2788                             if(spanStart<minSpanStart) {
2789                                 minSpanStart=spanStart;
2790                                 if(minSpanStart==0) {
2791                                     return 0;
2792                                 }
2793                             }
2794                         }
2795                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2796                         if(matchStart<length) {
2797                             // Remember longest match from prev.
2798                             length=matchStart;
2799                         }
2800                     }
2801                 }
2802             }
2803             if(length==prev) {
2804                 break;  // No match from prev.
2805             }
2806         } while((prev=length)>0);
2807         if(prev<minSpanStart) {
2808             return prev;
2809         } else {
2810             return minSpanStart;
2811         }
2812     }
2813 }
2814 
// Bit flags selecting which spans are performed and compared.
// Each group has one bit per choice plus a mask that combines the bits of the group.
enum {
    // text encodings
    SPAN_UTF16          =0x1,
    SPAN_UTF8           =0x2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // original set and/or its complement
    SPAN_SET            =0x4,
    SPAN_COMPLEMENT     =0x8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // span directions
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // span conditions used for "contained" spans
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // everything
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2835 
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)2836 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2837     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2838 }
2839 
slen(const void * s,UBool isUTF16)2840 static inline int32_t slen(const void *s, UBool isUTF16) {
2841     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2842 }
2843 
2844 /*
2845  * Count spans on a string with the method according to type and set the span limits.
2846  * The set may be the complement of the original.
2847  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2848  * according to the expected number of spans.
2849  * Sets typeName to an empty string if there is no such type.
2850  * Returns -1 if the span option is filtered out.
2851  */
getSpans(const UnicodeSetWithStrings & set,UBool isComplement,const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int type,const char * & typeName,int32_t limits[],int32_t limitsCapacity,int32_t expectCount)2852 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2853                         const void *s, int32_t length, UBool isUTF16,
2854                         uint32_t whichSpans,
2855                         int type, const char *&typeName,
2856                         int32_t limits[], int32_t limitsCapacity,
2857                         int32_t expectCount) {
2858     const UnicodeSet &realSet(set.getSet());
2859     int32_t start, count;
2860     USetSpanCondition spanCondition, firstSpanCondition, contained;
2861     UBool isForward;
2862 
2863     if(type<0 || 7<type) {
2864         typeName="";
2865         return 0;
2866     }
2867 
2868     static const char *const typeNames16[]={
2869         "contains", "contains(LM)",
2870         "span", "span(LM)",
2871         "containsBack", "containsBack(LM)",
2872         "spanBack", "spanBack(LM)"
2873     };
2874 
2875     static const char *const typeNames8[]={
2876         "containsUTF8", "containsUTF8(LM)",
2877         "spanUTF8", "spanUTF8(LM)",
2878         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2879         "spanBackUTF8", "spanBackUTF8(LM)"
2880     };
2881 
2882     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2883 
2884     // filter span options
2885     if(type<=3) {
2886         // span forward
2887         if((whichSpans&SPAN_FWD)==0) {
2888             return -1;
2889         }
2890         isForward=TRUE;
2891     } else {
2892         // span backward
2893         if((whichSpans&SPAN_BACK)==0) {
2894             return -1;
2895         }
2896         isForward=FALSE;
2897     }
2898     if((type&1)==0) {
2899         // use USET_SPAN_CONTAINED
2900         if((whichSpans&SPAN_CONTAINED)==0) {
2901             return -1;
2902         }
2903         contained=USET_SPAN_CONTAINED;
2904     } else {
2905         // use USET_SPAN_SIMPLE
2906         if((whichSpans&SPAN_SIMPLE)==0) {
2907             return -1;
2908         }
2909         contained=USET_SPAN_SIMPLE;
2910     }
2911 
2912     // Default first span condition for going forward with an uncomplemented set.
2913     spanCondition=USET_SPAN_NOT_CONTAINED;
2914     if(isComplement) {
2915         spanCondition=invertSpanCondition(spanCondition, contained);
2916     }
2917 
2918     // First span condition for span(), used to terminate the spanBack() iteration.
2919     firstSpanCondition=spanCondition;
2920 
2921     // spanBack(): Its initial span condition is span()'s last span condition,
2922     // which is the opposite of span()'s first span condition
2923     // if we expect an even number of spans.
2924     // (The loop inverts spanCondition (expectCount-1) times
2925     // before the expectCount'th span() call.)
2926     // If we do not compare forward and backward directions, then we do not have an
2927     // expectCount and just start with firstSpanCondition.
2928     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2929         spanCondition=invertSpanCondition(spanCondition, contained);
2930     }
2931 
2932     count=0;
2933     switch(type) {
2934     case 0:
2935     case 1:
2936         start=0;
2937         if(length<0) {
2938             length=slen(s, isUTF16);
2939         }
2940         for(;;) {
2941             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2942                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2943             if(count<limitsCapacity) {
2944                 limits[count]=start;
2945             }
2946             ++count;
2947             if(start>=length) {
2948                 break;
2949             }
2950             spanCondition=invertSpanCondition(spanCondition, contained);
2951         }
2952         break;
2953     case 2:
2954     case 3:
2955         start=0;
2956         for(;;) {
2957             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
2958                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
2959             if(count<limitsCapacity) {
2960                 limits[count]=start;
2961             }
2962             ++count;
2963             if(length>=0 ? start>=length :
2964                            isUTF16 ? ((const UChar *)s)[start]==0 :
2965                                      ((const char *)s)[start]==0
2966             ) {
2967                 break;
2968             }
2969             spanCondition=invertSpanCondition(spanCondition, contained);
2970         }
2971         break;
2972     case 4:
2973     case 5:
2974         if(length<0) {
2975             length=slen(s, isUTF16);
2976         }
2977         for(;;) {
2978             ++count;
2979             if(count<=limitsCapacity) {
2980                 limits[limitsCapacity-count]=length;
2981             }
2982             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
2983                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
2984             if(length==0 && spanCondition==firstSpanCondition) {
2985                 break;
2986             }
2987             spanCondition=invertSpanCondition(spanCondition, contained);
2988         }
2989         if(count<limitsCapacity) {
2990             memmove(limits, limits+(limitsCapacity-count), count*4);
2991         }
2992         break;
2993     case 6:
2994     case 7:
2995         for(;;) {
2996             ++count;
2997             if(count<=limitsCapacity) {
2998                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
2999             }
3000             // Note: Length<0 is tested only for the first spanBack().
3001             // If we wanted to keep length<0 for all spanBack()s, we would have to
3002             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3003             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3004                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
3005             if(length==0 && spanCondition==firstSpanCondition) {
3006                 break;
3007             }
3008             spanCondition=invertSpanCondition(spanCondition, contained);
3009         }
3010         if(count<limitsCapacity) {
3011             memmove(limits, limits+(limitsCapacity-count), count*4);
3012         }
3013         break;
3014     default:
3015         typeName="";
3016         return -1;
3017     }
3018 
3019     return count;
3020 }
3021 
// Indexes of the sets to be tested.
// An odd index means that the set is the complement of the one before it.
enum {
    SLOW      =0,
    SLOW_NOT  =1,
    FAST      =2,
    FAST_NOT  =3,
    SET_COUNT =4
};
3030 
3031 static const char *const setNames[SET_COUNT]={
3032     "slow",
3033     "slow.not",
3034     "fast",
3035     "fast.not"
3036 };
3037 
3038 /*
3039  * Verify that we get the same results whether we look at text with contains(),
3040  * span() or spanBack(), using unfrozen or frozen versions of the set,
3041  * and using the set or its complement (switching the spanConditions accordingly).
3042  * The latter verifies that
3043  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3044  *
3045  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3046  * or returned to the caller (with an input expectCount<0).
3047  */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3048 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3049                               const void *s, int32_t length, UBool isUTF16,
3050                               uint32_t whichSpans,
3051                               int32_t expectLimits[], int32_t &expectCount,
3052                               const char *testName, int32_t index) {
3053     int32_t limits[500];
3054     int32_t limitsCount;
3055     int i, j;
3056 
3057     const char *typeName;
3058     int type;
3059 
3060     for(i=0; i<SET_COUNT; ++i) {
3061         if((i&1)==0) {
3062             // Even-numbered sets are original, uncomplemented sets.
3063             if((whichSpans&SPAN_SET)==0) {
3064                 continue;
3065             }
3066         } else {
3067             // Odd-numbered sets are complemented.
3068             if((whichSpans&SPAN_COMPLEMENT)==0) {
3069                 continue;
3070             }
3071         }
3072         for(type=0;; ++type) {
3073             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3074                                  s, length, isUTF16,
3075                                  whichSpans,
3076                                  type, typeName,
3077                                  limits, UPRV_LENGTHOF(limits), expectCount);
3078             if(typeName[0]==0) {
3079                 break; // All types tried.
3080             }
3081             if(limitsCount<0) {
3082                 continue; // Span option filtered out.
3083             }
3084             if(expectCount<0) {
3085                 expectCount=limitsCount;
3086                 if(limitsCount>UPRV_LENGTHOF(limits)) {
3087                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3088                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3089                     return;
3090                 }
3091                 memcpy(expectLimits, limits, limitsCount*4);
3092             } else if(limitsCount!=expectCount) {
3093                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3094                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3095             } else {
3096                 for(j=0; j<limitsCount; ++j) {
3097                     if(limits[j]!=expectLimits[j]) {
3098                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3099                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3100                               j, (long)limits[j], (long)expectLimits[j]);
3101                         break;
3102                     }
3103                 }
3104             }
3105         }
3106     }
3107 
3108     // Compare span() with containsAll()/containsNone(),
3109     // but only if we have expectLimits[] from the uncomplemented set.
3110     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3111         const UChar *s16=(const UChar *)s;
3112         UnicodeString string;
3113         int32_t prev=0, limit, length;
3114         for(i=0; i<expectCount; ++i) {
3115             limit=expectLimits[i];
3116             length=limit-prev;
3117             if(length>0) {
3118                 string.setTo(FALSE, s16+prev, length);  // read-only alias
3119                 if(i&1) {
3120                     if(!sets[SLOW]->getSet().containsAll(string)) {
3121                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3122                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3123                         return;
3124                     }
3125                     if(!sets[FAST]->getSet().containsAll(string)) {
3126                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3127                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3128                         return;
3129                     }
3130                 } else {
3131                     if(!sets[SLOW]->getSet().containsNone(string)) {
3132                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3133                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3134                         return;
3135                     }
3136                     if(!sets[FAST]->getSet().containsNone(string)) {
3137                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3138                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3139                         return;
3140                     }
3141                 }
3142             }
3143             prev=limit;
3144         }
3145     }
3146 }
3147 
3148 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3149 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3150                               const void *s, int32_t length, UBool isUTF16,
3151                               uint32_t whichSpans,
3152                               const char *testName, int32_t index) {
3153     int32_t expectLimits[500];
3154     int32_t expectCount=-1;
3155     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3156 }
3157 
stringContainsUnpairedSurrogate(const UChar * s,int32_t length)3158 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3159     UChar c, c2;
3160 
3161     if(length>=0) {
3162         while(length>0) {
3163             c=*s++;
3164             --length;
3165             if(0xd800<=c && c<0xe000) {
3166                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3167                     return TRUE;
3168                 }
3169                 --length;
3170             }
3171         }
3172     } else {
3173         while((c=*s++)!=0) {
3174             if(0xd800<=c && c<0xe000) {
3175                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3176                     return TRUE;
3177                 }
3178             }
3179         }
3180     }
3181     return FALSE;
3182 }
3183 
3184 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3185 // unless either UTF is turned off in whichSpans.
3186 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3187 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const UChar * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3188 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3189                                       const UChar *s16, int32_t length16,
3190                                       uint32_t whichSpans,
3191                                       const char *testName, int32_t index) {
3192     int32_t expectLimits[500];
3193     int32_t expectCount;
3194 
3195     expectCount=-1;  // Get expectLimits[] from testSpan().
3196 
3197     if((whichSpans&SPAN_UTF16)!=0) {
3198         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3199     }
3200     if((whichSpans&SPAN_UTF8)==0) {
3201         return;
3202     }
3203 
3204     // Convert s16[] and expectLimits[] to UTF-8.
3205     uint8_t s8[3000];
3206     int32_t offsets[3000];
3207 
3208     const UChar *s16Limit=s16+length16;
3209     char *t=(char *)s8;
3210     char *tLimit=t+sizeof(s8);
3211     int32_t *o=offsets;
3212     UErrorCode errorCode=U_ZERO_ERROR;
3213 
3214     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3215     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3216     if(U_FAILURE(errorCode)) {
3217         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3218               testName, (long)index, u_errorName(errorCode));
3219         ucnv_resetFromUnicode(utf8Cnv);
3220         return;
3221     }
3222     int32_t length8=(int32_t)(t-(char *)s8);
3223 
3224     // Convert expectLimits[].
3225     int32_t i, j, expect;
3226     for(i=j=0; i<expectCount; ++i) {
3227         expect=expectLimits[i];
3228         if(expect==length16) {
3229             expectLimits[i]=length8;
3230         } else {
3231             while(offsets[j]<expect) {
3232                 ++j;
3233             }
3234             expectLimits[i]=j;
3235         }
3236     }
3237 
3238     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3239 }
3240 
nextCodePoint(UChar32 c)3241 static UChar32 nextCodePoint(UChar32 c) {
3242     // Skip some large and boring ranges.
3243     switch(c) {
3244     case 0x3441:
3245         return 0x4d7f;
3246     case 0x5100:
3247         return 0x9f00;
3248     case 0xb040:
3249         return 0xd780;
3250     case 0xe041:
3251         return 0xf8fe;
3252     case 0x10100:
3253         return 0x20000;
3254     case 0x20041:
3255         return 0xe0000;
3256     case 0xe0101:
3257         return 0x10fffd;
3258     default:
3259         return c+1;
3260     }
3261 }
3262 
3263 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3264 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3265     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3266     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3267     // Skip the UTF-8 part of the test - if the string contains surrogates -
3268     // because it is likely to produce a different result.
3269     UBool inconsistentSurrogates=
3270             (!(sets[0]->getSet().contains(0xfffd) ?
3271                sets[0]->getSet().contains(0xd800, 0xdfff) :
3272                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3273              sets[0]->hasStringsWithSurrogates());
3274 
3275     UChar s[1000];
3276     int32_t length=0;
3277     uint32_t localWhichSpans;
3278 
3279     UChar32 c, first;
3280     for(first=c=0;; c=nextCodePoint(c)) {
3281         if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3282             localWhichSpans=whichSpans;
3283             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3284                 localWhichSpans&=~SPAN_UTF8;
3285             }
3286             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3287             if(c>0x10ffff) {
3288                 break;
3289             }
3290             length=0;
3291             first=c;
3292         }
3293         U16_APPEND_UNSAFE(s, length, c);
3294     }
3295 }
3296 
3297 // Test with a particular, interesting string.
3298 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3299 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3300     static const UChar s[]={
3301         0x61, 0x62, 0x20,                       // Latin, space
3302         0x3b1, 0x3b2, 0x3b3,                    // Greek
3303         0xd900,                                 // lead surrogate
3304         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3305         0xdc05,                                 // trail surrogate
3306         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3307         0xd900, 0xdc05,                         // unassigned supplementary
3308         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3309         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3310         0                                       // NUL
3311     };
3312 
3313     if((whichSpans&SPAN_UTF16)==0) {
3314         return;
3315     }
3316     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3317     testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3318 }
3319 
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3320 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3321     static const char s[]={
3322         "abc"                                   // Latin
3323 
3324         /* trail byte in lead position */
3325         "\x80"
3326 
3327         " "                                     // space
3328 
3329         /* truncated multi-byte sequences */
3330         "\xd0"
3331         "\xe0"
3332         "\xe1"
3333         "\xed"
3334         "\xee"
3335         "\xf0"
3336         "\xf1"
3337         "\xf4"
3338         "\xf8"
3339         "\xfc"
3340 
3341         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3342 
3343         /* trail byte in lead position */
3344         "\x80"
3345 
3346         "\xe0\x80"
3347         "\xe0\xa0"
3348         "\xe1\x80"
3349         "\xed\x80"
3350         "\xed\xa0"
3351         "\xee\x80"
3352         "\xf0\x80"
3353         "\xf0\x90"
3354         "\xf1\x80"
3355         "\xf4\x80"
3356         "\xf4\x90"
3357         "\xf8\x80"
3358         "\xfc\x80"
3359 
3360         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3361 
3362         /* trail byte in lead position */
3363         "\x80"
3364 
3365         "\xf0\x80\x80"
3366         "\xf0\x90\x80"
3367         "\xf1\x80\x80"
3368         "\xf4\x80\x80"
3369         "\xf4\x90\x80"
3370         "\xf8\x80\x80"
3371         "\xfc\x80\x80"
3372 
3373         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3374 
3375         /* trail byte in lead position */
3376         "\x80"
3377 
3378         "\xf8\x80\x80\x80"
3379         "\xfc\x80\x80\x80"
3380 
3381         "\xF1\x90\x80\x85"                      // unassigned supplementary
3382 
3383         /* trail byte in lead position */
3384         "\x80"
3385 
3386         "\xfc\x80\x80\x80\x80"
3387 
3388         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3389 
3390         /* trail byte in lead position */
3391         "\x80"
3392 
3393         /* complete sequences but non-shortest forms or out of range etc. */
3394         "\xc0\x80"
3395         "\xe0\x80\x80"
3396         "\xed\xa0\x80"
3397         "\xf0\x80\x80\x80"
3398         "\xf4\x90\x80\x80"
3399         "\xf8\x80\x80\x80\x80"
3400         "\xfc\x80\x80\x80\x80\x80"
3401         "\xfe"
3402         "\xff"
3403 
3404         /* trail byte in lead position */
3405         "\x80"
3406 
3407         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3408     };
3409 
3410     if((whichSpans&SPAN_UTF8)==0) {
3411         return;
3412     }
3413     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3414     testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3415 }
3416 
// Multiply a list of span option sets so that each resulting portion
// carries exactly one of the alternatives a, b and c.
// Every existing entry is first reduced with mask; entry i then receives a,
// entry whichSpansCount+i receives b, and entry 2*whichSpansCount+i receives c.
// - b==0: only a is applied and the list keeps its size (c is ignored).
// - b!=0 and c==0: a and b are applied and the list doubles.
// - b!=0 and c!=0: all three are applied and the list triples.
// The caller must provide room for the enlarged list.
// Returns the new number of entries.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // How many portions are produced: 1 (a), 2 (a, b) or 3 (a, b, c).
    int32_t portions=1;
    if(b!=0) {
        portions= (c!=0) ? 3 : 2;
    }

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(portions>=2) {
            whichSpans[whichSpansCount+k]=kept|b;
        }
        if(portions==3) {
            whichSpans[2*whichSpansCount+k]=kept|c;
        }
    }
    return portions*whichSpansCount;
}
3439 
// Runs of a single repeated letter, named for their intended lengths
// (63 or 64 characters of 'a' or 'b').
// They are concatenated via adjacent string literals in the TestSpan() test data
// to build set strings and test strings with code point spans longer than 254.
#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3444 
TestSpan()3445 void UnicodeSetTest::TestSpan() {
3446     // "[...]" is a UnicodeSet pattern.
3447     // "*" performs tests on all Unicode code points and on a selection of
3448     //   malformed UTF-8/16 strings.
3449     // "-options" limits the scope of testing for the current set.
3450     //   By default, the test verifies that equivalent boundaries are found
3451     //   for UTF-16 and UTF-8, going forward and backward,
3452     //   alternating USET_SPAN_NOT_CONTAINED with
3453     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3454     //   Single-character options:
3455     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3456     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3457     //          or the set contains strings with unpaired surrogates
3458     //          which do not translate to valid UTF-8.
3459     //     c -- set.span() and set.complement().span() boundaries may differ.
3460     //          Cause: Set strings are not complemented.
3461     //     b -- span() and spanBack() boundaries may differ.
3462     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3463     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3464     //          match with non-overlapping substrings.
3465     //          For example, with a set containing "ab" and "ba",
3466     //          span() of "aba" yields boundaries { 0, 2, 3 }
3467     //          because the initial "ab" matches from 0 to 2,
3468     //          while spanBack() yields boundaries { 0, 1, 3 }
3469     //          because the final "ba" matches from 1 to 3.
3470     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3471     //          Cause: Strings in the set overlap, and a longer match may
3472     //          require a sequence including non-longest substrings.
3473     //          For example, with a set containing "ab", "abc" and "cd",
3474     //          span(contained) of "abcd" spans the entire string
3475     //          but span(longest match) only spans the first 3 characters.
3476     //   Each "-options" first resets all options and then applies the specified options.
3477     //   A "-" without options resets the options.
3478     //   The options are also reset for each new set.
3479     // Other strings will be spanned.
3480     static const char *const testdata[]={
3481         "[:ID_Continue:]",
3482         "*",
3483         "[:White_Space:]",
3484         "*",
3485         "[]",
3486         "*",
3487         "[\\u0000-\\U0010FFFF]",
3488         "*",
3489         "[\\u0000\\u0080\\u0800\\U00010000]",
3490         "*",
3491         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3492         "*",
3493         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3494         "-c",
3495         "*",
3496         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3497         "-c",
3498         "*",
3499 
3500         // Overlapping strings cause overlapping attempts to match.
3501         "[x{xy}{xya}{axy}{ax}]",
3502         "-cl",
3503 
3504         // More repetitions of "xya" would take too long with the recursive
3505         // reference implementation.
3506         // containsAll()=FALSE
3507         // test_string 0x14
3508         "xx"
3509         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3510         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3511         "xyaxyaxyaxya"
3512         "xx"
3513         "xyaxyaxyaxya"  // span() ends here.
3514         "aaa",
3515 
3516         // containsAll()=TRUE
3517         // test_string 0x15
3518         "xx"
3519         "xyaxyaxyaxya"
3520         "xx"
3521         "xyaxyaxyaxya"
3522         "xx"
3523         "xyaxyaxyaxy",
3524 
3525         "-bc",
3526         // test_string 0x17
3527         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3528         "-c",
3529         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3530         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3531         "-",
3532         "byaya",     // span() -> { 5 }
3533         "byay",      // span() -> { 4 }
3534         "bya",       // span() -> { 3 }
3535 
3536         // span(longest match) will not span the whole string.
3537         "[a{ab}{bc}]",
3538         "-cl",
3539         // test_string 0x21
3540         "abc",
3541 
3542         "[a{ab}{abc}{cd}]",
3543         "-cl",
3544         "acdabcdabccd",
3545 
3546         // spanBack(longest match) will not span the whole string.
3547         "[c{ab}{bc}]",
3548         "-cl",
3549         "abc",
3550 
3551         "[d{cd}{bcd}{ab}]",
3552         "-cl",
3553         "abbcdabcdabd",
3554 
3555         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3556         // and UTF-8 trail bytes.
3557         // Copies of above test sets and strings, but transliterated to have
3558         // different code points with similar trail units.
3559         // Previous: a      b         c            d
3560         // Unicode:  042B   30AB      200AB        204AB
3561         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3562         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3563         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3564         "-cl",
3565         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3566 
3567         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3568         "-cl",
3569         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3570 
3571         // Stress bookkeeping and recursion.
3572         // The following strings are barely doable with the recursive
3573         // reference implementation.
3574         // The not-contained character at the end prevents an early exit from the span().
3575         "[b{bb}]",
3576         "-c",
3577         // test_string 0x33
3578         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3579         // On complement sets, span() and spanBack() get different results
3580         // because b is not in the complement set and there is an odd number of b's
3581         // in the test string.
3582         "-bc",
3583         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3584 
3585         // Test with set strings with an initial or final code point span
3586         // longer than 254.
3587         "[a{" _64_a _64_a _64_a _64_a "b}"
3588           "{a" _64_b _64_b _64_b _64_b "}]",
3589         "-c",
3590         _64_a _64_a _64_a _63_a "b",
3591         _64_a _64_a _64_a _64_a "b",
3592         _64_a _64_a _64_a _64_a "aaaabbbb",
3593         "a" _64_b _64_b _64_b _63_b,
3594         "a" _64_b _64_b _64_b _64_b,
3595         "aaaabbbb" _64_b _64_b _64_b _64_b,
3596 
3597         // Test with strings containing unpaired surrogates.
3598         // They are not representable in UTF-8, and a leading trail surrogate
3599         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3600         // U+20001 == \\uD840\\uDC01
3601         // U+20400 == \\uD841\\uDC00
3602         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3603         "-8cl",
3604         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3605     };
3606     uint32_t whichSpans[96]={ SPAN_ALL };
3607     int32_t whichSpansCount=1;
3608 
3609     UnicodeSet *sets[SET_COUNT]={ NULL };
3610     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3611 
3612     char testName[1024];
3613     char *testNameLimit=testName;
3614 
3615     int32_t i, j;
3616     for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3617         const char *s=testdata[i];
3618         if(s[0]=='[') {
3619             // Create new test sets from this pattern.
3620             for(j=0; j<SET_COUNT; ++j) {
3621                 delete sets_with_str[j];
3622                 delete sets[j];
3623             }
3624             UErrorCode errorCode=U_ZERO_ERROR;
3625             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3626             if(U_FAILURE(errorCode)) {
3627                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3628                 break;
3629             }
3630             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3631             sets[SLOW_NOT]->complement();
3632             // Intermediate set: Test cloning of a frozen set.
3633             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3634             fast->freeze();
3635             sets[FAST]=(UnicodeSet *)fast->clone();
3636             delete fast;
3637             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3638             fastNot->freeze();
3639             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3640             delete fastNot;
3641 
3642             for(j=0; j<SET_COUNT; ++j) {
3643                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3644             }
3645 
3646             strcpy(testName, s);
3647             testNameLimit=strchr(testName, 0);
3648             *testNameLimit++=':';
3649             *testNameLimit=0;
3650 
3651             whichSpans[0]=SPAN_ALL;
3652             whichSpansCount=1;
3653         } else if(s[0]=='-') {
3654             whichSpans[0]=SPAN_ALL;
3655             whichSpansCount=1;
3656 
3657             while(*++s!=0) {
3658                 switch(*s) {
3659                 case 'c':
3660                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3661                                                    ~SPAN_POLARITY,
3662                                                    SPAN_SET,
3663                                                    SPAN_COMPLEMENT,
3664                                                    0);
3665                     break;
3666                 case 'b':
3667                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3668                                                    ~SPAN_DIRS,
3669                                                    SPAN_FWD,
3670                                                    SPAN_BACK,
3671                                                    0);
3672                     break;
3673                 case 'l':
3674                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3675                     // USET_SPAN_SIMPLE only FWD, and separately
3676                     // USET_SPAN_SIMPLE only BACK
3677                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3678                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3679                                                    SPAN_DIRS|SPAN_CONTAINED,
3680                                                    SPAN_FWD|SPAN_SIMPLE,
3681                                                    SPAN_BACK|SPAN_SIMPLE);
3682                     break;
3683                 case '8':
3684                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3685                                                    ~SPAN_UTFS,
3686                                                    SPAN_UTF16,
3687                                                    SPAN_UTF8,
3688                                                    0);
3689                     break;
3690                 default:
3691                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3692                     break;
3693                 }
3694             }
3695         } else if(0==strcmp(s, "*")) {
3696             strcpy(testNameLimit, "bad_string");
3697             for(j=0; j<whichSpansCount; ++j) {
3698                 if(whichSpansCount>1) {
3699                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
3700                             "%%0x%3x",
3701                             whichSpans[j]);
3702                 }
3703                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3704                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3705             }
3706 
3707             strcpy(testNameLimit, "contents");
3708             for(j=0; j<whichSpansCount; ++j) {
3709                 if(whichSpansCount>1) {
3710                     sprintf(testNameLimit+8 /* strlen("contents") */,
3711                             "%%0x%3x",
3712                             whichSpans[j]);
3713                 }
3714                 testSpanContents(sets_with_str, whichSpans[j], testName);
3715             }
3716         } else {
3717             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3718             strcpy(testNameLimit, "test_string");
3719             for(j=0; j<whichSpansCount; ++j) {
3720                 if(whichSpansCount>1) {
3721                     sprintf(testNameLimit+11 /* strlen("test_string") */,
3722                             "%%0x%3x",
3723                             whichSpans[j]);
3724                 }
3725                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3726             }
3727         }
3728     }
3729     for(j=0; j<SET_COUNT; ++j) {
3730         delete sets_with_str[j];
3731         delete sets[j];
3732     }
3733 }
3734 
3735 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3736 void UnicodeSetTest::TestStringSpan() {
3737     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3738     static const char *const string=
3739         "xx"
3740         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3741         "xx"
3742         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3743         "xx"
3744         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3745         "aaaa";
3746 
3747     UErrorCode errorCode=U_ZERO_ERROR;
3748     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3749     UnicodeSet set(pattern16, errorCode);
3750     if(U_FAILURE(errorCode)) {
3751         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3752         return;
3753     }
3754 
3755     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3756 
3757     if(set.containsAll(string16)) {
3758         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3759     }
3760 
3761     // Remove trailing "aaaa".
3762     string16.truncate(string16.length()-4);
3763     if(!set.containsAll(string16)) {
3764         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3765     }
3766 
3767     string16=UNICODE_STRING_SIMPLE("byayaxya");
3768     const UChar *s16=string16.getBuffer();
3769     int32_t length16=string16.length();
3770     (void)length16;   // Suppress set but not used warning.
3771     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3772         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3773         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3774         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3775         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3776         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3777     ) {
3778         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3779     }
3780 
3781     pattern="[a{ab}{abc}{cd}]";
3782     pattern16=UnicodeString(pattern, -1, US_INV);
3783     set.applyPattern(pattern16, errorCode);
3784     if(U_FAILURE(errorCode)) {
3785         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3786         return;
3787     }
3788     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3789     s16=string16.getBuffer();
3790     length16=string16.length();
3791     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3792         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3793         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3794     ) {
3795         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3796     }
3797 
3798     pattern="[d{cd}{bcd}{ab}]";
3799     pattern16=UnicodeString(pattern, -1, US_INV);
3800     set.applyPattern(pattern16, errorCode).freeze();
3801     if(U_FAILURE(errorCode)) {
3802         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3803         return;
3804     }
3805     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3806     s16=string16.getBuffer();
3807     length16=string16.length();
3808     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3809         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3810         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3811     ) {
3812         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3813     }
3814 }
3815