1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ********************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 * 03/22/2000 Madhu Added additional tests
11 ********************************************************************************
12 */
13
14 #include <stdio.h>
15
16 #include <string.h>
17 #include "unicode/utypes.h"
18 #include "usettest.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/usetiter.h"
23 #include "unicode/ustring.h"
24 #include "unicode/parsepos.h"
25 #include "unicode/symtable.h"
26 #include "unicode/utf8.h"
27 #include "unicode/utf16.h"
28 #include "unicode/uversion.h"
29 #include "cmemory.h"
30 #include "hash.h"
31
/**
 * Calls dataerrln() with the current file, line and ICU error name when
 * `status` holds a failure code. Does nothing otherwise.
 */
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
                  __FILE__, __LINE__, u_errorName(status)); \
    } \
}

/**
 * Calls dataerrln() with the current file and line when `expr` evaluates
 * to false. Does nothing otherwise.
 */
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
}
38
/**
 * Concatenates the escaped pattern of `set` onto `left`, so that a
 * UnicodeSet can be appended directly to a log or error message.
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    // toPattern() fills `pattern` and returns a reference to it.
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
44
UnicodeSetTest()45 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
46 }
47
openUTF8Converter()48 UConverter *UnicodeSetTest::openUTF8Converter() {
49 if(utf8Cnv==NULL) {
50 UErrorCode errorCode=U_ZERO_ERROR;
51 utf8Cnv=ucnv_open("UTF-8", &errorCode);
52 }
53 return utf8Cnv;
54 }
55
~UnicodeSetTest()56 UnicodeSetTest::~UnicodeSetTest() {
57 ucnv_close(utf8Cnv);
58 }
59
60 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)61 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
62 const char* &name, char* /*par*/) {
63 if (exec) {
64 logln(u"TestSuite UnicodeSetTest");
65 }
66 TESTCASE_AUTO_BEGIN;
67 TESTCASE_AUTO(TestPatterns);
68 TESTCASE_AUTO(TestAddRemove);
69 TESTCASE_AUTO(TestCategories);
70 TESTCASE_AUTO(TestCloneEqualHash);
71 TESTCASE_AUTO(TestMinimalRep);
72 TESTCASE_AUTO(TestAPI);
73 TESTCASE_AUTO(TestScriptSet);
74 TESTCASE_AUTO(TestPropertySet);
75 TESTCASE_AUTO(TestClone);
76 TESTCASE_AUTO(TestExhaustive);
77 TESTCASE_AUTO(TestToPattern);
78 TESTCASE_AUTO(TestIndexOf);
79 TESTCASE_AUTO(TestStrings);
80 TESTCASE_AUTO(Testj2268);
81 TESTCASE_AUTO(TestCloseOver);
82 TESTCASE_AUTO(TestEscapePattern);
83 TESTCASE_AUTO(TestInvalidCodePoint);
84 TESTCASE_AUTO(TestSymbolTable);
85 TESTCASE_AUTO(TestSurrogate);
86 TESTCASE_AUTO(TestPosixClasses);
87 TESTCASE_AUTO(TestIteration);
88 TESTCASE_AUTO(TestFreezable);
89 TESTCASE_AUTO(TestSpan);
90 TESTCASE_AUTO(TestStringSpan);
91 TESTCASE_AUTO(TestUCAUnsafeBackwards);
92 TESTCASE_AUTO(TestIntOverflow);
93 TESTCASE_AUTO(TestUnusedCcc);
94 TESTCASE_AUTO(TestDeepPattern);
95 TESTCASE_AUTO_END;
96 }
97
// Marker string placed inside the expected-string arrays handed to
// expectToPattern(). NOTE(review): it appears to separate strings that must be
// in the set from strings that must not be - confirm against expectToPattern().
static const char NOT[] = "%%%%";
99
100 /**
101 * UVector was improperly copying contents
102 * This code will crash this is still true
103 */
Testj2268()104 void UnicodeSetTest::Testj2268() {
105 UnicodeSet t;
106 t.add(UnicodeString("abc"));
107 UnicodeSet test(t);
108 UnicodeString ustrPat;
109 test.toPattern(ustrPat, TRUE);
110 }
111
112 /**
113 * Test toPattern().
114 */
TestToPattern()115 void UnicodeSetTest::TestToPattern() {
116 UErrorCode ec = U_ZERO_ERROR;
117
118 // Test that toPattern() round trips with syntax characters and
119 // whitespace.
120 {
121 static const char* OTHER_TOPATTERN_TESTS[] = {
122 "[[:latin:]&[:greek:]]",
123 "[[:latin:]-[:greek:]]",
124 "[:nonspacing mark:]",
125 NULL
126 };
127
128 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
129 ec = U_ZERO_ERROR;
130 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
131 if (U_FAILURE(ec)) {
132 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
133 continue;
134 }
135 checkPat(OTHER_TOPATTERN_TESTS[j], s);
136 }
137
138 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
139 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
140
141 // check various combinations to make sure they all work.
142 if (i != 0 && !toPatternAux(i, i)){
143 continue;
144 }
145 if (!toPatternAux(0, i)){
146 continue;
147 }
148 if (!toPatternAux(i, 0xFFFF)){
149 continue;
150 }
151 }
152 }
153 }
154
155 // Test pattern behavior of multicharacter strings.
156 {
157 ec = U_ZERO_ERROR;
158 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
159
160 // This loop isn't a loop. It's here to make the compiler happy.
161 // If you're curious, try removing it and changing the 'break'
162 // statements (except for the last) to goto's.
163 for (;;) {
164 if (U_FAILURE(ec)) break;
165 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
166 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
167
168 s->add("ac");
169 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
170 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
171
172 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
173 if (U_FAILURE(ec)) break;
174 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
175 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
176
177 s->add("[]");
178 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
179 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
180
181 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
182 if (U_FAILURE(ec)) break;
183 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
184 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
185
186 // j2189
187 s->clear();
188 s->add(UnicodeString("abc", ""));
189 s->add(UnicodeString("abc", ""));
190 const char* exp6[] = {"abc", NOT, "ab", NULL};
191 expectToPattern(*s, "[{abc}]", exp6);
192
193 break;
194 }
195
196 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
197 delete s;
198 }
199
200 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
201 UnicodeSet s;
202 s.add((UChar)97, (UChar)98); // 'a', 'b'
203 expectToPattern(s, "[ab]", NULL);
204 }
205
toPatternAux(UChar32 start,UChar32 end)206 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
207
208 // use Integer.toString because Utility.hex doesn't handle ints
209 UnicodeString pat = "";
210 // TODO do these in hex
211 //String source = "0x" + Integer.toString(start,16).toUpperCase();
212 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
213 UnicodeString source;
214 source = source + (uint32_t)start;
215 if (start != end)
216 source = source + ".." + (uint32_t)end;
217 UnicodeSet testSet;
218 testSet.add(start, end);
219 return checkPat(source, testSet);
220 }
221
checkPat(const UnicodeString & source,const UnicodeSet & testSet)222 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
223 const UnicodeSet& testSet) {
224 // What we want to make sure of is that a pattern generated
225 // by toPattern(), with or without escaped unprintables, can
226 // be passed back into the UnicodeSet constructor.
227 UnicodeString pat0;
228
229 testSet.toPattern(pat0, TRUE);
230
231 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
232
233 //String pat1 = unescapeLeniently(pat0);
234 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
235
236 UnicodeString pat2;
237 testSet.toPattern(pat2, FALSE);
238 if (!checkPat(source, testSet, pat2)) return FALSE;
239
240 //String pat3 = unescapeLeniently(pat2);
241 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
242
243 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
244 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
245 return TRUE;
246 }
247
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)248 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
249 const UnicodeSet& testSet,
250 const UnicodeString& pat) {
251 UErrorCode ec = U_ZERO_ERROR;
252 UnicodeSet testSet2(pat, ec);
253 if (testSet2 != testSet) {
254 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
255 return FALSE;
256 }
257 return TRUE;
258 }
259
260 void
TestPatterns(void)261 UnicodeSetTest::TestPatterns(void) {
262 UnicodeSet set;
263 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
264 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
265 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
266 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
267 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
268 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
269
270 // Throw in a test of complement
271 set.complement();
272 UnicodeString exp;
273 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
274 expectPairs(set, exp);
275 }
276
277 void
TestCategories(void)278 UnicodeSetTest::TestCategories(void) {
279 UErrorCode status = U_ZERO_ERROR;
280 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
281 UnicodeSet set(pat, status);
282 if (U_FAILURE(status)) {
283 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
284 return;
285 } else {
286 expectContainment(set, pat, "ABC", "abc");
287 }
288
289 UChar32 i;
290 int32_t failures = 0;
291 // Make sure generation of L doesn't pollute cached Lu set
292 // First generate L, then Lu
293 set.applyPattern("[:L:]", status);
294 if (U_FAILURE(status)) { errln("FAIL"); return; }
295 for (i=0; i<0x200; ++i) {
296 UBool l = u_isalpha((UChar)i);
297 if (l != set.contains(i)) {
298 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
299 set.contains(i));
300 if (++failures == 10) break;
301 }
302 }
303
304 set.applyPattern("[:Lu:]", status);
305 if (U_FAILURE(status)) { errln("FAIL"); return; }
306 for (i=0; i<0x200; ++i) {
307 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
308 if (lu != set.contains(i)) {
309 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
310 set.contains(i));
311 if (++failures == 20) break;
312 }
313 }
314 }
315 void
TestCloneEqualHash(void)316 UnicodeSetTest::TestCloneEqualHash(void) {
317 UErrorCode status = U_ZERO_ERROR;
318 // set1 and set2 used to be built with the obsolete constructor taking
319 // UCharCategory values; replaced with pattern constructors
320 // markus 20030502
321 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase
322 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase
323 if (U_FAILURE(status)){
324 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
325 return;
326 }
327 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit
328 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit
329 if (U_FAILURE(status)){
330 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
331 return;
332 }
333
334 if (*set1 != *set1a) {
335 errln("FAIL: category constructor for Ll broken");
336 }
337 if (*set2 != *set2a) {
338 errln("FAIL: category constructor for Nd broken");
339 }
340 delete set1a;
341 delete set2a;
342
343 logln("Testing copy construction");
344 UnicodeSet *set1copy=new UnicodeSet(*set1);
345 if(*set1 != *set1copy || *set1 == *set2 ||
346 getPairs(*set1) != getPairs(*set1copy) ||
347 set1->hashCode() != set1copy->hashCode()){
348 errln("FAIL : Error in copy construction");
349 return;
350 }
351
352 logln("Testing =operator");
353 UnicodeSet set1equal=*set1;
354 UnicodeSet set2equal=*set2;
355 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
356 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
357 errln("FAIL: Error in =operator");
358 }
359
360 logln("Testing clone()");
361 UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
362 UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
363 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
364 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
365 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
366 errln("FAIL: Error in clone");
367 }
368
369 logln("Testing hashcode");
370 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
371 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
372 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
373 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
374 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
375 errln("FAIL: Error in hashCode()");
376 }
377
378 delete set1;
379 delete set1copy;
380 delete set2;
381 delete set1clone;
382 delete set2clone;
383
384
385 }
386 void
TestAddRemove(void)387 UnicodeSetTest::TestAddRemove(void) {
388 UnicodeSet set; // Construct empty set
389 doAssert(set.isEmpty() == TRUE, "set should be empty");
390 doAssert(set.size() == 0, "size should be 0");
391 set.complement();
392 doAssert(set.size() == 0x110000, "size should be 0x110000");
393 set.clear();
394 set.add(0x0061, 0x007a);
395 expectPairs(set, "az");
396 doAssert(set.isEmpty() == FALSE, "set should not be empty");
397 doAssert(set.size() != 0, "size should not be equal to 0");
398 doAssert(set.size() == 26, "size should be equal to 26");
399 set.remove(0x006d, 0x0070);
400 expectPairs(set, "alqz");
401 doAssert(set.size() == 22, "size should be equal to 22");
402 set.remove(0x0065, 0x0067);
403 expectPairs(set, "adhlqz");
404 doAssert(set.size() == 19, "size should be equal to 19");
405 set.remove(0x0064, 0x0069);
406 expectPairs(set, "acjlqz");
407 doAssert(set.size() == 16, "size should be equal to 16");
408 set.remove(0x0063, 0x0072);
409 expectPairs(set, "absz");
410 doAssert(set.size() == 10, "size should be equal to 10");
411 set.add(0x0066, 0x0071);
412 expectPairs(set, "abfqsz");
413 doAssert(set.size() == 22, "size should be equal to 22");
414 set.remove(0x0061, 0x0067);
415 expectPairs(set, "hqsz");
416 set.remove(0x0061, 0x007a);
417 expectPairs(set, "");
418 doAssert(set.isEmpty() == TRUE, "set should be empty");
419 doAssert(set.size() == 0, "size should be 0");
420 set.add(0x0061);
421 doAssert(set.isEmpty() == FALSE, "set should not be empty");
422 doAssert(set.size() == 1, "size should not be equal to 1");
423 set.add(0x0062);
424 set.add(0x0063);
425 expectPairs(set, "ac");
426 doAssert(set.size() == 3, "size should not be equal to 3");
427 set.add(0x0070);
428 set.add(0x0071);
429 expectPairs(set, "acpq");
430 doAssert(set.size() == 5, "size should not be equal to 5");
431 set.clear();
432 expectPairs(set, "");
433 doAssert(set.isEmpty() == TRUE, "set should be empty");
434 doAssert(set.size() == 0, "size should be 0");
435
436 // Try removing an entire set from another set
437 expectPattern(set, "[c-x]", "cx");
438 UnicodeSet set2;
439 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
440 set.removeAll(set2);
441 expectPairs(set, "deluxx");
442
443 // Try adding an entire set to another set
444 expectPattern(set, "[jackiemclean]", "aacceein");
445 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
446 set.addAll(set2);
447 expectPairs(set, "aacehort");
448 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
449
450 // Try retaining an set of elements contained in another set (intersection)
451 UnicodeSet set3;
452 expectPattern(set3, "[a-c]", "ac");
453 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
454 set3.remove(0x0062);
455 expectPairs(set3, "aacc");
456 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
457 set.retainAll(set3);
458 expectPairs(set, "aacc");
459 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
460 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
461 set.clear();
462 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
463
464 // Test commutativity
465 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
466 expectPattern(set2, "[jackiemclean]", "aacceein");
467 set.addAll(set2);
468 expectPairs(set, "aacehort");
469 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
470
471
472
473
474 }
475
476 /**
477 * Make sure minimal representation is maintained.
478 */
TestMinimalRep()479 void UnicodeSetTest::TestMinimalRep() {
480 UErrorCode status = U_ZERO_ERROR;
481 // This is pretty thoroughly tested by checkCanonicalRep()
482 // run against the exhaustive operation results. Use the code
483 // here for debugging specific spot problems.
484
485 // 1 overlap against 2
486 UnicodeSet set("[h-km-q]", status);
487 if (U_FAILURE(status)) { errln("FAIL"); return; }
488 UnicodeSet set2("[i-o]", status);
489 if (U_FAILURE(status)) { errln("FAIL"); return; }
490 set.addAll(set2);
491 expectPairs(set, "hq");
492 // right
493 set.applyPattern("[a-m]", status);
494 if (U_FAILURE(status)) { errln("FAIL"); return; }
495 set2.applyPattern("[e-o]", status);
496 if (U_FAILURE(status)) { errln("FAIL"); return; }
497 set.addAll(set2);
498 expectPairs(set, "ao");
499 // left
500 set.applyPattern("[e-o]", status);
501 if (U_FAILURE(status)) { errln("FAIL"); return; }
502 set2.applyPattern("[a-m]", status);
503 if (U_FAILURE(status)) { errln("FAIL"); return; }
504 set.addAll(set2);
505 expectPairs(set, "ao");
506 // 1 overlap against 3
507 set.applyPattern("[a-eg-mo-w]", status);
508 if (U_FAILURE(status)) { errln("FAIL"); return; }
509 set2.applyPattern("[d-q]", status);
510 if (U_FAILURE(status)) { errln("FAIL"); return; }
511 set.addAll(set2);
512 expectPairs(set, "aw");
513 }
514
TestAPI()515 void UnicodeSetTest::TestAPI() {
516 UErrorCode status = U_ZERO_ERROR;
517 // default ct
518 UnicodeSet set;
519 if (!set.isEmpty() || set.getRangeCount() != 0) {
520 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
521 set);
522 }
523
524 // clear(), isEmpty()
525 set.add(0x0061);
526 if (set.isEmpty()) {
527 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
528 set);
529 }
530 set.clear();
531 if (!set.isEmpty()) {
532 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
533 set);
534 }
535
536 // size()
537 set.clear();
538 if (set.size() != 0) {
539 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
540 ": " + set);
541 }
542 set.add(0x0061);
543 if (set.size() != 1) {
544 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
545 ": " + set);
546 }
547 set.add(0x0031, 0x0039);
548 if (set.size() != 10) {
549 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
550 ": " + set);
551 }
552
553 // contains(first, last)
554 set.clear();
555 set.applyPattern("[A-Y 1-8 b-d l-y]", status);
556 if (U_FAILURE(status)) { errln("FAIL"); return; }
557 for (int32_t i = 0; i<set.getRangeCount(); ++i) {
558 UChar32 a = set.getRangeStart(i);
559 UChar32 b = set.getRangeEnd(i);
560 if (!set.contains(a, b)) {
561 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
562 " but doesn't: " + set);
563 }
564 if (set.contains((UChar32)(a-1), b)) {
565 errln((UnicodeString)"FAIL, shouldn't contain " +
566 (unsigned short)(a-1) + '-' + (unsigned short)b +
567 " but does: " + set);
568 }
569 if (set.contains(a, (UChar32)(b+1))) {
570 errln((UnicodeString)"FAIL, shouldn't contain " +
571 (unsigned short)a + '-' + (unsigned short)(b+1) +
572 " but does: " + set);
573 }
574 }
575
576 // Ported InversionList test.
577 UnicodeSet a((UChar32)3,(UChar32)10);
578 UnicodeSet b((UChar32)7,(UChar32)15);
579 UnicodeSet c;
580
581 logln((UnicodeString)"a [3-10]: " + a);
582 logln((UnicodeString)"b [7-15]: " + b);
583 c = a;
584 c.addAll(b);
585 UnicodeSet exp((UChar32)3,(UChar32)15);
586 if (c == exp) {
587 logln((UnicodeString)"c.set(a).add(b): " + c);
588 } else {
589 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
590 }
591 c.complement();
592 exp.set((UChar32)0, (UChar32)2);
593 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
594 if (c == exp) {
595 logln((UnicodeString)"c.complement(): " + c);
596 } else {
597 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
598 }
599 c.complement();
600 exp.set((UChar32)3, (UChar32)15);
601 if (c == exp) {
602 logln((UnicodeString)"c.complement(): " + c);
603 } else {
604 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
605 }
606 c = a;
607 c.complementAll(b);
608 exp.set((UChar32)3,(UChar32)6);
609 exp.add((UChar32)11,(UChar32) 15);
610 if (c == exp) {
611 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
612 } else {
613 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
614 }
615
616 exp = c;
617 bitsToSet(setToBits(c), c);
618 if (c == exp) {
619 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
620 } else {
621 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
622 }
623
624 // Additional tests for coverage JB#2118
625 //UnicodeSet::complement(class UnicodeString const &)
626 //UnicodeSet::complementAll(class UnicodeString const &)
627 //UnicodeSet::containsNone(class UnicodeSet const &)
628 //UnicodeSet::containsNone(long,long)
629 //UnicodeSet::containsSome(class UnicodeSet const &)
630 //UnicodeSet::containsSome(long,long)
631 //UnicodeSet::removeAll(class UnicodeString const &)
632 //UnicodeSet::retain(long)
633 //UnicodeSet::retainAll(class UnicodeString const &)
634 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
635 //UnicodeSetIterator::getString(void)
636 set.clear();
637 set.complement("ab");
638 exp.applyPattern("[{ab}]", status);
639 if (U_FAILURE(status)) { errln("FAIL"); return; }
640 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
641
642 UnicodeSetIterator iset(set);
643 if (!iset.next() || !iset.isString()) {
644 errln("FAIL: UnicodeSetIterator::next/isString");
645 } else if (iset.getString() != "ab") {
646 errln("FAIL: UnicodeSetIterator::getString");
647 }
648
649 set.add((UChar32)0x61, (UChar32)0x7A);
650 set.complementAll("alan");
651 exp.applyPattern("[{ab}b-kmo-z]", status);
652 if (U_FAILURE(status)) { errln("FAIL"); return; }
653 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
654
655 exp.applyPattern("[a-z]", status);
656 if (U_FAILURE(status)) { errln("FAIL"); return; }
657 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
658 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
659 exp.applyPattern("[aln]", status);
660 if (U_FAILURE(status)) { errln("FAIL"); return; }
661 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
662 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
663
664 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
665 errln("FAIL: containsNone(UChar32, UChar32)");
666 }
667 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
668 errln("FAIL: containsSome(UChar32, UChar32)");
669 }
670 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
671 errln("FAIL: containsNone(UChar32, UChar32)");
672 }
673 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
674 errln("FAIL: containsSome(UChar32, UChar32)");
675 }
676
677 set.removeAll("liu");
678 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
679 if (U_FAILURE(status)) { errln("FAIL"); return; }
680 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
681
682 set.retainAll("star");
683 exp.applyPattern("[rst]", status);
684 if (U_FAILURE(status)) { errln("FAIL"); return; }
685 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
686
687 set.retain((UChar32)0x73);
688 exp.applyPattern("[s]", status);
689 if (U_FAILURE(status)) { errln("FAIL"); return; }
690 if (set != exp) { errln("FAIL: retain('s')"); return; }
691
692 uint16_t buf[32];
693 int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
694 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
695 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
696 errln("FAIL: serialize");
697 return;
698 }
699
700 // Conversions to and from USet
701 UnicodeSet *uniset = &set;
702 USet *uset = uniset->toUSet();
703 TEST_ASSERT((void *)uset == (void *)uniset);
704 UnicodeSet *setx = UnicodeSet::fromUSet(uset);
705 TEST_ASSERT((void *)setx == (void *)uset);
706 const UnicodeSet *constSet = uniset;
707 const USet *constUSet = constSet->toUSet();
708 TEST_ASSERT((void *)constUSet == (void *)constSet);
709 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
710 TEST_ASSERT((void *)constSetx == (void *)constUSet);
711
712 // span(UnicodeString) and spanBack(UnicodeString) convenience methods
713 UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
714 UnicodeSet ac(0x61, 0x63);
715 ac.remove(0x62).freeze();
716 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
717 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
718 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
719 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
720 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
721 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
722 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
723 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
724 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
725 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
726 ) {
727 errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
728 }
729 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
730 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
731 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
732 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
733 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
734 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
735 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
736 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
737 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
738 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
739 ) {
740 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
741 }
742 }
743
TestIteration()744 void UnicodeSetTest::TestIteration() {
745 UErrorCode ec = U_ZERO_ERROR;
746 int i = 0;
747 int outerLoop;
748
749 // 6 code points, 3 ranges, 2 strings, 8 total elements
750 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
751 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
752 TEST_ASSERT_SUCCESS(ec);
753 UnicodeSetIterator it(set);
754
755 for (outerLoop=0; outerLoop<3; outerLoop++) {
756 // Run the test multiple times, to check that iterator.reset() is working.
757 for (i=0; i<10; i++) {
758 UBool nextv = it.next();
759 UBool isString = it.isString();
760 int32_t codePoint = it.getCodepoint();
761 //int32_t codePointEnd = it.getCodepointEnd();
762 UnicodeString s = it.getString();
763 switch (i) {
764 case 0:
765 TEST_ASSERT(nextv == TRUE);
766 TEST_ASSERT(isString == FALSE);
767 TEST_ASSERT(codePoint==0x61);
768 TEST_ASSERT(s == "a");
769 break;
770 case 1:
771 TEST_ASSERT(nextv == TRUE);
772 TEST_ASSERT(isString == FALSE);
773 TEST_ASSERT(codePoint==0x62);
774 TEST_ASSERT(s == "b");
775 break;
776 case 2:
777 TEST_ASSERT(nextv == TRUE);
778 TEST_ASSERT(isString == FALSE);
779 TEST_ASSERT(codePoint==0x63);
780 TEST_ASSERT(s == "c");
781 break;
782 case 3:
783 TEST_ASSERT(nextv == TRUE);
784 TEST_ASSERT(isString == FALSE);
785 TEST_ASSERT(codePoint==0x79);
786 TEST_ASSERT(s == "y");
787 break;
788 case 4:
789 TEST_ASSERT(nextv == TRUE);
790 TEST_ASSERT(isString == FALSE);
791 TEST_ASSERT(codePoint==0x7a);
792 TEST_ASSERT(s == "z");
793 break;
794 case 5:
795 TEST_ASSERT(nextv == TRUE);
796 TEST_ASSERT(isString == FALSE);
797 TEST_ASSERT(codePoint==0x1abcd);
798 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
799 break;
800 case 6:
801 TEST_ASSERT(nextv == TRUE);
802 TEST_ASSERT(isString == TRUE);
803 TEST_ASSERT(s == "str1");
804 break;
805 case 7:
806 TEST_ASSERT(nextv == TRUE);
807 TEST_ASSERT(isString == TRUE);
808 TEST_ASSERT(s == "str2");
809 break;
810 case 8:
811 TEST_ASSERT(nextv == FALSE);
812 break;
813 case 9:
814 TEST_ASSERT(nextv == FALSE);
815 break;
816 }
817 }
818 it.reset(); // prepare to run the iteration again.
819 }
820 }
821
822
823
824
TestStrings()825 void UnicodeSetTest::TestStrings() {
826 UErrorCode ec = U_ZERO_ERROR;
827
828 UnicodeSet* testList[] = {
829 UnicodeSet::createFromAll("abc"),
830 new UnicodeSet("[a-c]", ec),
831
832 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
833 new UnicodeSet("[{ll}{ch}a-z]", ec),
834
835 UnicodeSet::createFrom("ab}c"),
836 new UnicodeSet("[{ab\\}c}]", ec),
837
838 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
839 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
840
841 NULL
842 };
843
844 if (U_FAILURE(ec)) {
845 errln("FAIL: couldn't construct test sets");
846 }
847
848 for (int32_t i = 0; testList[i] != NULL; i+=2) {
849 if (U_SUCCESS(ec)) {
850 UnicodeString pat0, pat1;
851 testList[i]->toPattern(pat0, TRUE);
852 testList[i+1]->toPattern(pat1, TRUE);
853 if (*testList[i] == *testList[i+1]) {
854 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
855 } else {
856 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
857 }
858 }
859 delete testList[i];
860 delete testList[i+1];
861 }
862 }
863
864 /**
865 * Test the [:Latin:] syntax.
866 */
TestScriptSet()867 void UnicodeSetTest::TestScriptSet() {
868 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
869
870 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
871
872 /* Jitterbug 1423 */
873 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
874
875 }
876
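/**
 * Test property-based set syntax ([:prop:], \p{...}, \P{...}, \N{...}) and
 * related pattern edge cases, using a table of pattern / chars-in /
 * chars-not-in triples.
 */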
TestPropertySet()880 void UnicodeSetTest::TestPropertySet() {
881 static const char* const DATA[] = {
882 // Pattern, Chars IN, Chars NOT in
883
884 "[:Latin:]",
885 "aA",
886 "\\u0391\\u03B1",
887
888 "[\\p{Greek}]",
889 "\\u0391\\u03B1",
890 "aA",
891
892 "\\P{ GENERAL Category = upper case letter }",
893 "abc",
894 "ABC",
895
896 #if !UCONFIG_NO_NORMALIZATION
897 // Combining class: @since ICU 2.2
898 // Check both symbolic and numeric
899 "\\p{ccc=Nukta}",
900 "\\u0ABC",
901 "abc",
902
903 "\\p{Canonical Combining Class = 11}",
904 "\\u05B1",
905 "\\u05B2",
906
907 "[:c c c = iota subscript :]",
908 "\\u0345",
909 "xyz",
910 #endif
911
912 // Bidi class: @since ICU 2.2
913 "\\p{bidiclass=lefttoright}",
914 "abc",
915 "\\u0671\\u0672",
916
917 // Binary properties: @since ICU 2.2
918 "\\p{ideographic}",
919 "\\u4E0A",
920 "x",
921
922 "[:math=false:]",
923 "q)*(",
924 // weiv: )(and * were removed from math in Unicode 4.0.1
925 //"(*+)",
926 "+<>^",
927
928 // JB#1767 \N{}, \p{ASCII}
929 "[:Ascii:]",
930 "abc\\u0000\\u007F",
931 "\\u0080\\u4E00",
932
933 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
934 "az",
935 "qrs",
936
937 // JB#2015
938 "[:any:]",
939 "a\\U0010FFFF",
940 "",
941
942 "[:nv=0.5:]",
943 "\\u00BD\\u0F2A",
944 "\\u00BC",
945
946 // JB#2653: Age
947 "[:Age=1.1:]",
948 "\\u03D6", // 1.1
949 "\\u03D8\\u03D9", // 3.2
950
951 "[:Age=3.1:]",
952 "\\u1800\\u3400\\U0002f800",
953 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
954
955 // JB#2350: Case_Sensitive
956 "[:Case Sensitive:]",
957 "A\\u1FFC\\U00010410",
958 ";\\u00B4\\U00010500",
959
960 // JB#2832: C99-compatibility props
961 "[:blank:]",
962 " \\u0009",
963 "1-9A-Z",
964
965 "[:graph:]",
966 "19AZ",
967 " \\u0003\\u0007\\u0009\\u000A\\u000D",
968
969 "[:punct:]",
970 "!@#%&*()[]{}-_\\/;:,.?'\"",
971 "09azAZ",
972
973 "[:xdigit:]",
974 "09afAF",
975 "gG!",
976
977 // Regex compatibility test
978 "[-b]", // leading '-' is literal
979 "-b",
980 "ac",
981
982 "[^-b]", // leading '-' is literal
983 "ac",
984 "-b",
985
986 "[b-]", // trailing '-' is literal
987 "-b",
988 "ac",
989
990 "[^b-]", // trailing '-' is literal
991 "ac",
992 "-b",
993
994 "[a-b-]", // trailing '-' is literal
995 "ab-",
996 "c=",
997
998 "[[a-q]&[p-z]-]", // trailing '-' is literal
999 "pq-",
1000 "or=",
1001
1002 "[\\s|\\)|:|$|\\>]", // from regex tests
1003 "s|):$>",
1004 "abc",
1005
1006 "[\\uDC00cd]", // JB#2906: isolated trail at start
1007 "cd\\uDC00",
1008 "ab\\uD800\\U00010000",
1009
1010 "[ab\\uD800]", // JB#2906: isolated trail at start
1011 "ab\\uD800",
1012 "cd\\uDC00\\U00010000",
1013
1014 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1015 "abcd\\uD800",
1016 "ef\\uDC00\\U00010000",
1017
1018 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1019 "abcd\\uDC00",
1020 "ef\\uD800\\U00010000",
1021
1022 #if !UCONFIG_NO_NORMALIZATION
1023 "[:^lccc=0:]", // Lead canonical class
1024 "\\u0300\\u0301",
1025 "abcd\\u00c0\\u00c5",
1026
1027 "[:^tccc=0:]", // Trail canonical class
1028 "\\u0300\\u0301\\u00c0\\u00c5",
1029 "abcd",
1030
1031 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1032 "\\u0300\\u0301\\u00c0\\u00c5",
1033 "abcd",
1034
1035 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1036 "",
1037 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1038
1039 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1040 "\\u0F73\\u0F75\\u0F81",
1041 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1042 #endif /* !UCONFIG_NO_NORMALIZATION */
1043
1044 "[:Assigned:]",
1045 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1046 "\\u0888\\uFDD3\\uFFFE\\U00050005",
1047
1048 // Script_Extensions, new in Unicode 6.0
1049 "[:scx=Arab:]",
1050 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1051 "\\u061D\\uFDEF\\uFDFE",
1052
1053 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1054 // so scx-sc is missing U+FDF2.
1055 "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1056 "\\u0640\\u064B\\u0650\\u0655",
1057 "\\uFDF2"
1058 };
1059
1060 static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
1061
1062 for (int32_t i=0; i<DATA_LEN; i+=3) {
1063 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1064 CharsToUnicodeString(DATA[i+2]));
1065 }
1066 }
1067
1068 /**
1069 * Test that Posix style character classes [:digit:], etc.
1070 * have the Unicode definitions from TR 18.
1071 */
TestPosixClasses()1072 void UnicodeSetTest::TestPosixClasses() {
1073 {
1074 UErrorCode status = U_ZERO_ERROR;
1075 UnicodeSet s1("[:alpha:]", status);
1076 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1077 TEST_ASSERT_SUCCESS(status);
1078 TEST_ASSERT(s1==s2);
1079 }
1080 {
1081 UErrorCode status = U_ZERO_ERROR;
1082 UnicodeSet s1("[:lower:]", status);
1083 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1084 TEST_ASSERT_SUCCESS(status);
1085 TEST_ASSERT(s1==s2);
1086 }
1087 {
1088 UErrorCode status = U_ZERO_ERROR;
1089 UnicodeSet s1("[:upper:]", status);
1090 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1091 TEST_ASSERT_SUCCESS(status);
1092 TEST_ASSERT(s1==s2);
1093 }
1094 {
1095 UErrorCode status = U_ZERO_ERROR;
1096 UnicodeSet s1("[:punct:]", status);
1097 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1098 TEST_ASSERT_SUCCESS(status);
1099 TEST_ASSERT(s1==s2);
1100 }
1101 {
1102 UErrorCode status = U_ZERO_ERROR;
1103 UnicodeSet s1("[:digit:]", status);
1104 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1105 TEST_ASSERT_SUCCESS(status);
1106 TEST_ASSERT(s1==s2);
1107 }
1108 {
1109 UErrorCode status = U_ZERO_ERROR;
1110 UnicodeSet s1("[:xdigit:]", status);
1111 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1112 TEST_ASSERT_SUCCESS(status);
1113 TEST_ASSERT(s1==s2);
1114 }
1115 {
1116 UErrorCode status = U_ZERO_ERROR;
1117 UnicodeSet s1("[:alnum:]", status);
1118 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1119 TEST_ASSERT_SUCCESS(status);
1120 TEST_ASSERT(s1==s2);
1121 }
1122 {
1123 UErrorCode status = U_ZERO_ERROR;
1124 UnicodeSet s1("[:space:]", status);
1125 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1126 TEST_ASSERT_SUCCESS(status);
1127 TEST_ASSERT(s1==s2);
1128 }
1129 {
1130 UErrorCode status = U_ZERO_ERROR;
1131 UnicodeSet s1("[:blank:]", status);
1132 TEST_ASSERT_SUCCESS(status);
1133 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1134 status);
1135 TEST_ASSERT_SUCCESS(status);
1136 TEST_ASSERT(s1==s2);
1137 }
1138 {
1139 UErrorCode status = U_ZERO_ERROR;
1140 UnicodeSet s1("[:cntrl:]", status);
1141 TEST_ASSERT_SUCCESS(status);
1142 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1143 TEST_ASSERT_SUCCESS(status);
1144 TEST_ASSERT(s1==s2);
1145 }
1146 {
1147 UErrorCode status = U_ZERO_ERROR;
1148 UnicodeSet s1("[:graph:]", status);
1149 TEST_ASSERT_SUCCESS(status);
1150 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1151 TEST_ASSERT_SUCCESS(status);
1152 TEST_ASSERT(s1==s2);
1153 }
1154 {
1155 UErrorCode status = U_ZERO_ERROR;
1156 UnicodeSet s1("[:print:]", status);
1157 TEST_ASSERT_SUCCESS(status);
1158 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1159 TEST_ASSERT_SUCCESS(status);
1160 TEST_ASSERT(s1==s2);
1161 }
1162 }
1163 /**
1164 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1165 */
TestClone()1166 void UnicodeSetTest::TestClone() {
1167 UErrorCode ec = U_ZERO_ERROR;
1168 UnicodeSet s("[abcxyz]", ec);
1169 UnicodeSet t(s);
1170 expectContainment(t, "abc", "def");
1171 }
1172
1173 /**
1174 * Test the indexOf() and charAt() methods.
1175 */
TestIndexOf()1176 void UnicodeSetTest::TestIndexOf() {
1177 UErrorCode ec = U_ZERO_ERROR;
1178 UnicodeSet set("[a-cx-y3578]", ec);
1179 if (U_FAILURE(ec)) {
1180 errln("FAIL: UnicodeSet constructor");
1181 return;
1182 }
1183 for (int32_t i=0; i<set.size(); ++i) {
1184 UChar32 c = set.charAt(i);
1185 if (set.indexOf(c) != i) {
1186 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1187 i, c, set.indexOf(c));
1188 }
1189 }
1190 UChar32 c = set.charAt(set.size());
1191 if (c != -1) {
1192 errln("FAIL: charAt(<out of range>) = %X", c);
1193 }
1194 int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1195 if (j != -1) {
1196 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1197 }
1198 }
1199
1200 /**
1201 * Test closure API.
1202 */
TestCloseOver()1203 void UnicodeSetTest::TestCloseOver() {
1204 UErrorCode ec = U_ZERO_ERROR;
1205
1206 char CASE[] = {(char)USET_CASE_INSENSITIVE};
1207 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1208 const char* DATA[] = {
1209 // selector, input, output
1210 CASE,
1211 "[aq\\u00DF{Bc}{bC}{Fi}]",
1212 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1213
1214 CASE,
1215 "[\\u01F1]", // 'DZ'
1216 "[\\u01F1\\u01F2\\u01F3]",
1217
1218 CASE,
1219 "[\\u1FB4]",
1220 "[\\u1FB4{\\u03AC\\u03B9}]",
1221
1222 CASE,
1223 "[{F\\uFB01}]",
1224 "[\\uFB03{ffi}]",
1225
1226 CASE, // make sure binary search finds limits
1227 "[a\\uFF3A]",
1228 "[aA\\uFF3A\\uFF5A]",
1229
1230 CASE,
1231 "[a-z]","[A-Za-z\\u017F\\u212A]",
1232 CASE,
1233 "[abc]","[A-Ca-c]",
1234 CASE,
1235 "[ABC]","[A-Ca-c]",
1236
1237 CASE, "[i]", "[iI]",
1238
1239 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
1240 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
1241
1242 CASE, "[\\u0131]", "[\\u0131]", // dotless i
1243
1244 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1245
1246 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
1247
1248 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
1249
1250 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
1251
1252 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1253
1254 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
1255 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
1256
1257 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
1258
1259 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
1260
1261 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1262
1263 #if !UCONFIG_NO_FILE_IO
1264 CASE_MAPPINGS,
1265 "[aq\\u00DF{Bc}{bC}{Fi}]",
1266 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1267 #endif
1268
1269 CASE_MAPPINGS,
1270 "[\\u01F1]", // 'DZ'
1271 "[\\u01F1\\u01F2\\u01F3]",
1272
1273 CASE_MAPPINGS,
1274 "[a-z]",
1275 "[A-Za-z]",
1276
1277 NULL
1278 };
1279
1280 UnicodeSet s;
1281 UnicodeSet t;
1282 UnicodeString buf;
1283 for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1284 int32_t selector = DATA[i][0];
1285 UnicodeString pat(DATA[i+1], -1, US_INV);
1286 UnicodeString exp(DATA[i+2], -1, US_INV);
1287 s.applyPattern(pat, ec);
1288 s.closeOver(selector);
1289 t.applyPattern(exp, ec);
1290 if (U_FAILURE(ec)) {
1291 errln("FAIL: applyPattern failed");
1292 continue;
1293 }
1294 if (s == t) {
1295 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1296 } else {
1297 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1298 s.toPattern(buf, TRUE) + ", expected " + exp);
1299 }
1300 }
1301
1302 #if 0
1303 /*
1304 * Unused test code.
1305 * This was used to compare the old implementation (using USET_CASE)
1306 * with the new one (using 0x100 temporarily)
1307 * while transitioning from hardcoded case closure tables in uniset.cpp
1308 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1309 * and using ucase.c functions for closure.
1310 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1311 *
1312 * Note: The old and new implementation never fully matched because
1313 * the old implementation turned out to not map U+0130 and U+0131 correctly
1314 * (dotted I and dotless i) and because the old implementation's data tables
1315 * were outdated compared to Unicode 4.0.1 at the time of the change to the
1316 * new implementation. (So sigmas and some other characters were not handled
1317 * according to the newer Unicode version.)
1318 */
1319 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1320 UnicodeSetIterator si(sens);
1321 UnicodeString str, buf2;
1322 const UnicodeString *pStr;
1323 UChar32 c;
1324 while(si.next()) {
1325 if(!si.isString()) {
1326 c=si.getCodepoint();
1327 s.clear();
1328 s.add(c);
1329
1330 str.setTo(c);
1331 str.foldCase();
1332 sens2.add(str);
1333
1334 t=s;
1335 s.closeOver(USET_CASE);
1336 t.closeOver(0x100);
1337 if(s!=t) {
1338 errln("FAIL: closeOver(U+%04x) differs: ", c);
1339 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1340 }
1341 }
1342 }
1343 // remove all code points
1344 // should contain all full case folding mapping strings
1345 sens2.remove(0, 0x10ffff);
1346 si.reset(sens2);
1347 while(si.next()) {
1348 if(si.isString()) {
1349 pStr=&si.getString();
1350 s.clear();
1351 s.add(*pStr);
1352 t=s2=s;
1353 s.closeOver(USET_CASE);
1354 t.closeOver(0x100);
1355 if(s!=t) {
1356 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1357 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1358 }
1359 }
1360 }
1361 #endif
1362
1363 // Test the pattern API
1364 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1365 if (U_FAILURE(ec)) {
1366 errln("FAIL: applyPattern failed");
1367 } else {
1368 expectContainment(s, "abcABC", "defDEF");
1369 }
1370 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1371 if (U_FAILURE(ec)) {
1372 errln("FAIL: constructor failed");
1373 } else {
1374 expectContainment(v, "defDEF", "abcABC");
1375 }
1376 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1377 if (U_FAILURE(ec)) {
1378 errln("FAIL: construct w/case mappings failed");
1379 } else {
1380 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1381 }
1382 }
1383
TestEscapePattern()1384 void UnicodeSetTest::TestEscapePattern() {
1385 const char pattern[] =
1386 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1387 const char exp[] =
1388 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1389 // We test this with two passes; in the second pass we
1390 // pre-unescape the pattern. Since U+200E is Pattern_White_Space,
1391 // this fails -- which is what we expect.
1392 for (int32_t pass=1; pass<=2; ++pass) {
1393 UErrorCode ec = U_ZERO_ERROR;
1394 UnicodeString pat(pattern, -1, US_INV);
1395 if (pass==2) {
1396 pat = pat.unescape();
1397 }
1398 // Pattern is only good for pass 1
1399 UBool isPatternValid = (pass==1);
1400
1401 UnicodeSet set(pat, ec);
1402 if (U_SUCCESS(ec) != isPatternValid){
1403 errln((UnicodeString)"FAIL: applyPattern(" +
1404 escape(pat) + ") => " +
1405 u_errorName(ec));
1406 continue;
1407 }
1408 if (U_FAILURE(ec)) {
1409 continue;
1410 }
1411 if (set.contains((UChar)0x0644)){
1412 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1413 }
1414
1415 UnicodeString newpat;
1416 set.toPattern(newpat, TRUE);
1417 if (newpat == UnicodeString(exp, -1, US_INV)) {
1418 logln(escape(pat) + " => " + newpat);
1419 } else {
1420 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1421 }
1422
1423 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1424 UnicodeString str("Range ");
1425 str.append((UChar)(0x30 + i))
1426 .append(": ")
1427 .append((UChar32)set.getRangeStart(i))
1428 .append(" - ")
1429 .append((UChar32)set.getRangeEnd(i));
1430 str = str + " (" + set.getRangeStart(i) + " - " +
1431 set.getRangeEnd(i) + ")";
1432 if (set.getRangeStart(i) < 0) {
1433 errln((UnicodeString)"FAIL: " + escape(str));
1434 } else {
1435 logln(escape(str));
1436 }
1437 }
1438 }
1439 }
1440
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1441 void UnicodeSetTest::expectRange(const UnicodeString& label,
1442 const UnicodeSet& set,
1443 UChar32 start, UChar32 end) {
1444 UnicodeSet exp(start, end);
1445 UnicodeString pat;
1446 if (set == exp) {
1447 logln(label + " => " + set.toPattern(pat, TRUE));
1448 } else {
1449 UnicodeString xpat;
1450 errln((UnicodeString)"FAIL: " + label + " => " +
1451 set.toPattern(pat, TRUE) +
1452 ", expected " + exp.toPattern(xpat, TRUE));
1453 }
1454 }
1455
TestInvalidCodePoint()1456 void UnicodeSetTest::TestInvalidCodePoint() {
1457
1458 const UChar32 DATA[] = {
1459 // Test range Expected range
1460 0, 0x10FFFF, 0, 0x10FFFF,
1461 (UChar32)-1, 8, 0, 8,
1462 8, 0x110000, 8, 0x10FFFF
1463 };
1464 const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1465
1466 UnicodeString pat;
1467 int32_t i;
1468
1469 for (i=0; i<DATA_LENGTH; i+=4) {
1470 UChar32 start = DATA[i];
1471 UChar32 end = DATA[i+1];
1472 UChar32 xstart = DATA[i+2];
1473 UChar32 xend = DATA[i+3];
1474
1475 // Try various API using the test code points
1476
1477 UnicodeSet set(start, end);
1478 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1479 set, xstart, xend);
1480
1481 set.clear();
1482 set.set(start, end);
1483 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1484 set, xstart, xend);
1485
1486 UBool b = set.contains(start);
1487 b = set.contains(start, end);
1488 b = set.containsNone(start, end);
1489 b = set.containsSome(start, end);
1490 (void)b; // Suppress set but not used warning.
1491
1492 /*int32_t index = set.indexOf(start);*/
1493
1494 set.clear();
1495 set.add(start);
1496 set.add(start, end);
1497 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1498 set, xstart, xend);
1499
1500 set.set(0, 0x10FFFF);
1501 set.retain(start, end);
1502 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1503 set, xstart, xend);
1504 set.retain(start);
1505
1506 set.set(0, 0x10FFFF);
1507 set.remove(start);
1508 set.remove(start, end);
1509 set.complement();
1510 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1511 set, xstart, xend);
1512
1513 set.set(0, 0x10FFFF);
1514 set.complement(start, end);
1515 set.complement();
1516 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1517 set, xstart, xend);
1518 set.complement(start);
1519 }
1520
1521 const UChar32 DATA2[] = {
1522 0,
1523 0x10FFFF,
1524 (UChar32)-1,
1525 0x110000
1526 };
1527 const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1528
1529 for (i=0; i<DATA2_LENGTH; ++i) {
1530 UChar32 c = DATA2[i], end = 0x10FFFF;
1531 UBool valid = (c >= 0 && c <= 0x10FFFF);
1532
1533 UnicodeSet set(0, 0x10FFFF);
1534
1535 // For single-codepoint contains, invalid codepoints are NOT contained
1536 UBool b = set.contains(c);
1537 if (b == valid) {
1538 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1539 ") = " + b);
1540 } else {
1541 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1542 ") = " + b);
1543 }
1544
1545 // For codepoint range contains, containsNone, and containsSome,
1546 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1547 b = set.contains(c, end);
1548 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1549 "," + end + ") = " + b);
1550
1551 b = set.containsNone(c, end);
1552 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1553 "," + end + ") = " + b);
1554
1555 b = set.containsSome(c, end);
1556 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1557 "," + end + ") = " + b);
1558
1559 int32_t index = set.indexOf(c);
1560 if ((index >= 0) == valid) {
1561 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1562 ") = " + index);
1563 } else {
1564 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1565 ") = " + index);
1566 }
1567 }
1568 }
1569
1570 // Used by TestSymbolTable
1571 class TokenSymbolTable : public SymbolTable {
1572 public:
1573 Hashtable contents;
1574
TokenSymbolTable(UErrorCode & ec)1575 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1576 contents.setValueDeleter(uprv_deleteUObject);
1577 }
1578
~TokenSymbolTable()1579 ~TokenSymbolTable() {}
1580
1581 /**
1582 * (Non-SymbolTable API) Add the given variable and value to
1583 * the table. Variable should NOT contain leading '$'.
1584 */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1585 void add(const UnicodeString& var, const UnicodeString& value,
1586 UErrorCode& ec) {
1587 if (U_SUCCESS(ec)) {
1588 contents.put(var, new UnicodeString(value), ec);
1589 }
1590 }
1591
1592 /**
1593 * SymbolTable API
1594 */
lookup(const UnicodeString & s) const1595 virtual const UnicodeString* lookup(const UnicodeString& s) const {
1596 return (const UnicodeString*) contents.get(s);
1597 }
1598
1599 /**
1600 * SymbolTable API
1601 */
lookupMatcher(UChar32) const1602 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1603 return NULL;
1604 }
1605
1606 /**
1607 * SymbolTable API
1608 */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1609 virtual UnicodeString parseReference(const UnicodeString& text,
1610 ParsePosition& pos, int32_t limit) const {
1611 int32_t start = pos.getIndex();
1612 int32_t i = start;
1613 UnicodeString result;
1614 while (i < limit) {
1615 UChar c = text.charAt(i);
1616 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1617 break;
1618 }
1619 ++i;
1620 }
1621 if (i == start) { // No valid name chars
1622 return result; // Indicate failure with empty string
1623 }
1624 pos.setIndex(i);
1625 text.extractBetween(start, i, result);
1626 return result;
1627 }
1628 };
1629
TestSymbolTable()1630 void UnicodeSetTest::TestSymbolTable() {
1631 // Multiple test cases can be set up here. Each test case
1632 // is terminated by null:
1633 // var, value, var, value,..., input pat., exp. output pat., null
1634 const char* DATA[] = {
1635 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1636 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1637 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1638 NULL
1639 };
1640
1641 for (int32_t i=0; DATA[i]!=NULL; ++i) {
1642 UErrorCode ec = U_ZERO_ERROR;
1643 TokenSymbolTable sym(ec);
1644 if (U_FAILURE(ec)) {
1645 errln("FAIL: couldn't construct TokenSymbolTable");
1646 continue;
1647 }
1648
1649 // Set up variables
1650 while (DATA[i+2] != NULL) {
1651 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1652 if (U_FAILURE(ec)) {
1653 errln("FAIL: couldn't add to TokenSymbolTable");
1654 continue;
1655 }
1656 i += 2;
1657 }
1658
1659 // Input pattern and expected output pattern
1660 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1661 i += 2;
1662
1663 ParsePosition pos(0);
1664 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1665 if (U_FAILURE(ec)) {
1666 errln("FAIL: couldn't construct UnicodeSet");
1667 continue;
1668 }
1669
1670 // results
1671 if (pos.getIndex() != inpat.length()) {
1672 errln((UnicodeString)"Failed to read to end of string \""
1673 + inpat + "\": read to "
1674 + pos.getIndex() + ", length is "
1675 + inpat.length());
1676 }
1677
1678 UnicodeSet us2(exppat, ec);
1679 if (U_FAILURE(ec)) {
1680 errln("FAIL: couldn't construct expected UnicodeSet");
1681 continue;
1682 }
1683
1684 UnicodeString a, b;
1685 if (us != us2) {
1686 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1687 ", expected " + us2.toPattern(b, TRUE));
1688 } else {
1689 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1690 }
1691 }
1692 }
1693
TestSurrogate()1694 void UnicodeSetTest::TestSurrogate() {
1695 const char* DATA[] = {
1696 // These should all behave identically
1697 "[abc\\uD800\\uDC00]",
1698 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1699 "[abc\\U00010000]",
1700 0
1701 };
1702 for (int i=0; DATA[i] != 0; ++i) {
1703 UErrorCode ec = U_ZERO_ERROR;
1704 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1705 UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1706 UnicodeSet set(str, ec);
1707 if (U_FAILURE(ec)) {
1708 errln("FAIL: UnicodeSet constructor");
1709 continue;
1710 }
1711 expectContainment(set,
1712 CharsToUnicodeString("abc\\U00010000"),
1713 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1714 if (set.size() != 4) {
1715 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1716 set.size() + ", expected 4");
1717 }
1718
1719 {
1720 UErrorCode subErr = U_ZERO_ERROR;
1721 checkRoundTrip(set);
1722 checkSerializeRoundTrip(set, subErr);
1723 }
1724 }
1725 }
1726
TestExhaustive()1727 void UnicodeSetTest::TestExhaustive() {
1728 // exhaustive tests. Simulate UnicodeSets with integers.
1729 // That gives us very solid tests (except for large memory tests).
1730
1731 int32_t limit = 128;
1732
1733 UnicodeSet x, y, z, aa;
1734
1735 for (int32_t i = 0; i < limit; ++i) {
1736 bitsToSet(i, x);
1737 logln((UnicodeString)"Testing " + i + ", " + x);
1738 _testComplement(i, x, y);
1739
1740 UnicodeSet &toTest = bitsToSet(i, aa);
1741
1742 // AS LONG AS WE ARE HERE, check roundtrip
1743 checkRoundTrip(toTest);
1744 UErrorCode ec = U_ZERO_ERROR;
1745 checkSerializeRoundTrip(toTest, ec);
1746
1747 for (int32_t j = 0; j < limit; ++j) {
1748 _testAdd(i,j, x,y,z);
1749 _testXor(i,j, x,y,z);
1750 _testRetain(i,j, x,y,z);
1751 _testRemove(i,j, x,y,z);
1752 }
1753 }
1754 }
1755
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1756 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1757 bitsToSet(a, x);
1758 z = x;
1759 z.complement();
1760 int32_t c = setToBits(z);
1761 if (c != (~a)) {
1762 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1763 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1764 }
1765 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1766 }
1767
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1768 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1769 bitsToSet(a, x);
1770 bitsToSet(b, y);
1771 z = x;
1772 z.addAll(y);
1773 int32_t c = setToBits(z);
1774 if (c != (a | b)) {
1775 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1776 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1777 }
1778 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1779 }
1780
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1781 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1782 bitsToSet(a, x);
1783 bitsToSet(b, y);
1784 z = x;
1785 z.retainAll(y);
1786 int32_t c = setToBits(z);
1787 if (c != (a & b)) {
1788 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1789 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1790 }
1791 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1792 }
1793
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1794 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1795 bitsToSet(a, x);
1796 bitsToSet(b, y);
1797 z = x;
1798 z.removeAll(y);
1799 int32_t c = setToBits(z);
1800 if (c != (a &~ b)) {
1801 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1802 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1803 }
1804 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1805 }
1806
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1807 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1808 bitsToSet(a, x);
1809 bitsToSet(b, y);
1810 z = x;
1811 z.complementAll(y);
1812 int32_t c = setToBits(z);
1813 if (c != (a ^ b)) {
1814 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1815 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1816 }
1817 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1818 }
1819
1820 /**
1821 * Check that ranges are monotonically increasing and non-
1822 * overlapping.
1823 */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1824 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1825 int32_t n = set.getRangeCount();
1826 if (n < 0) {
1827 errln((UnicodeString)"FAIL result of " + msg +
1828 ": range count should be >= 0 but is " +
1829 n /*+ " for " + set.toPattern())*/);
1830 return;
1831 }
1832 UChar32 last = 0;
1833 for (int32_t i=0; i<n; ++i) {
1834 UChar32 start = set.getRangeStart(i);
1835 UChar32 end = set.getRangeEnd(i);
1836 if (start > end) {
1837 errln((UnicodeString)"FAIL result of " + msg +
1838 ": range " + (i+1) +
1839 " start > end: " + (int)start + ", " + (int)end +
1840 " for " + set);
1841 }
1842 if (i > 0 && start <= last) {
1843 errln((UnicodeString)"FAIL result of " + msg +
1844 ": range " + (i+1) +
1845 " overlaps previous range: " + (int)start + ", " + (int)end +
1846 " for " + set);
1847 }
1848 last = end;
1849 }
1850 }
1851
1852 /**
1853 * Convert a bitmask to a UnicodeSet.
1854 */
bitsToSet(int32_t a,UnicodeSet & result)1855 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1856 result.clear();
1857 for (UChar32 i = 0; i < 32; ++i) {
1858 if ((a & (1<<i)) != 0) {
1859 result.add(i);
1860 }
1861 }
1862 return result;
1863 }
1864
1865 /**
1866 * Convert a UnicodeSet to a bitmask. Only the characters
1867 * U+0000 to U+0020 are represented in the bitmask.
1868 */
setToBits(const UnicodeSet & x)1869 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1870 int32_t result = 0;
1871 for (int32_t i = 0; i < 32; ++i) {
1872 if (x.contains((UChar32)i)) {
1873 result |= (1<<i);
1874 }
1875 }
1876 return result;
1877 }
1878
1879 /**
1880 * Return the representation of an inversion list based UnicodeSet
1881 * as a pairs list. Ranges are listed in ascending Unicode order.
1882 * For example, the set [a-zA-M3] is represented as "33AMaz".
1883 */
getPairs(const UnicodeSet & set)1884 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1885 UnicodeString pairs;
1886 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1887 UChar32 start = set.getRangeStart(i);
1888 UChar32 end = set.getRangeEnd(i);
1889 if (end > 0xFFFF) {
1890 end = 0xFFFF;
1891 i = set.getRangeCount(); // Should be unnecessary
1892 }
1893 pairs.append((UChar)start).append((UChar)end);
1894 }
1895 return pairs;
1896 }
1897
1898 /**
1899 * Basic consistency check for a few items.
1900 * That the iterator works, and that we can create a pattern and
1901 * get the same thing back
1902 */
checkRoundTrip(const UnicodeSet & s)1903 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1904 {
1905 UnicodeSet t(s);
1906 checkEqual(s, t, "copy ct");
1907 }
1908
1909 {
1910 UnicodeSet t(0xabcd, 0xdef0); // dummy contents should be overwritten
1911 t = s;
1912 checkEqual(s, t, "operator=");
1913 }
1914
1915 {
1916 UnicodeSet t;
1917 copyWithIterator(t, s, FALSE);
1918 checkEqual(s, t, "iterator roundtrip");
1919 }
1920
1921 {
1922 UnicodeSet t;
1923 copyWithIterator(t, s, TRUE); // try range
1924 checkEqual(s, t, "iterator roundtrip");
1925 }
1926
1927 {
1928 UnicodeSet t;
1929 UnicodeString pat;
1930 UErrorCode ec = U_ZERO_ERROR;
1931 s.toPattern(pat, FALSE);
1932 t.applyPattern(pat, ec);
1933 if (U_FAILURE(ec)) {
1934 errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
1935 return;
1936 } else {
1937 checkEqual(s, t, "toPattern(false)");
1938 }
1939 }
1940
1941 {
1942 UnicodeSet t;
1943 UnicodeString pat;
1944 UErrorCode ec = U_ZERO_ERROR;
1945 s.toPattern(pat, TRUE);
1946 t.applyPattern(pat, ec);
1947 if (U_FAILURE(ec)) {
1948 errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
1949 return;
1950 } else {
1951 checkEqual(s, t, "toPattern(true)");
1952 }
1953 }
1954 }
1955
checkSerializeRoundTrip(const UnicodeSet & t,UErrorCode & status)1956 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
1957 if(U_FAILURE(status)) return;
1958 int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1959 if(status == U_BUFFER_OVERFLOW_ERROR) {
1960 status = U_ZERO_ERROR;
1961 serializeBuffer.resize(len);
1962 len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1963 // let 2nd error stand
1964 }
1965 if(U_FAILURE(status)) {
1966 errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
1967 return;
1968 }
1969 UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
1970 if(U_FAILURE(status)) {
1971 errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
1972 return;
1973 }
1974
1975 checkEqual(t, deserialized, "Set was unequal when deserialized");
1976 }
1977
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)1978 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1979 t.clear();
1980 UnicodeSetIterator it(s);
1981 if (withRange) {
1982 while (it.nextRange()) {
1983 if (it.isString()) {
1984 t.add(it.getString());
1985 } else {
1986 t.add(it.getCodepoint(), it.getCodepointEnd());
1987 }
1988 }
1989 } else {
1990 while (it.next()) {
1991 if (it.isString()) {
1992 t.add(it.getString());
1993 } else {
1994 t.add(it.getCodepoint());
1995 }
1996 }
1997 }
1998 }
1999
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)2000 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2001 assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2002 assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2003 UnicodeString source; s.toPattern(source, TRUE);
2004 UnicodeString result; t.toPattern(result, TRUE);
2005 if (s != t) {
2006 errln((UnicodeString)"FAIL: " + message
2007 + "; source = " + source
2008 + "; result = " + result
2009 );
2010 return FALSE;
2011 } else {
2012 logln((UnicodeString)"Ok: " + message
2013 + "; source = " + source
2014 + "; result = " + result
2015 );
2016 }
2017 return TRUE;
2018 }
2019
2020 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)2021 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2022 const UnicodeString& charsIn,
2023 const UnicodeString& charsOut) {
2024 UErrorCode ec = U_ZERO_ERROR;
2025 UnicodeSet set(pat, ec);
2026 if (U_FAILURE(ec)) {
2027 dataerrln((UnicodeString)"FAIL: pattern \"" +
2028 pat + "\" => " + u_errorName(ec));
2029 return;
2030 }
2031 expectContainment(set, pat, charsIn, charsOut);
2032 }
2033
2034 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)2035 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2036 const UnicodeString& charsIn,
2037 const UnicodeString& charsOut) {
2038 UnicodeString pat;
2039 set.toPattern(pat);
2040 expectContainment(set, pat, charsIn, charsOut);
2041 }
2042
2043 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)2044 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2045 const UnicodeString& setName,
2046 const UnicodeString& charsIn,
2047 const UnicodeString& charsOut) {
2048 UnicodeString bad;
2049 UChar32 c;
2050 int32_t i;
2051
2052 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2053 c = charsIn.char32At(i);
2054 if (!set.contains(c)) {
2055 bad.append(c);
2056 }
2057 }
2058 if (bad.length() > 0) {
2059 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2060 ", expected containment of " + prettify(charsIn));
2061 } else {
2062 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2063 }
2064
2065 bad.truncate(0);
2066 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2067 c = charsOut.char32At(i);
2068 if (set.contains(c)) {
2069 bad.append(c);
2070 }
2071 }
2072 if (bad.length() > 0) {
2073 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2074 ", expected non-containment of " + prettify(charsOut));
2075 } else {
2076 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2077 }
2078 }
2079
2080 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2081 UnicodeSetTest::expectPattern(UnicodeSet& set,
2082 const UnicodeString& pattern,
2083 const UnicodeString& expectedPairs){
2084 UErrorCode status = U_ZERO_ERROR;
2085 set.applyPattern(pattern, status);
2086 if (U_FAILURE(status)) {
2087 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2088 "\") failed");
2089 return;
2090 } else {
2091 if (getPairs(set) != expectedPairs ) {
2092 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2093 "\") => pairs \"" +
2094 escape(getPairs(set)) + "\", expected \"" +
2095 escape(expectedPairs) + "\"");
2096 } else {
2097 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
2098 "\") => pairs \"" +
2099 escape(getPairs(set)) + "\"");
2100 }
2101 }
2102 // the result of calling set.toPattern(), which is the string representation of
2103 // this set(set), is passed to a UnicodeSet constructor, and tested that it
2104 // will produce another set that is equal to this one.
2105 UnicodeString temppattern;
2106 set.toPattern(temppattern);
2107 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2108 if (U_FAILURE(status)) {
2109 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2110 return;
2111 }
2112 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2113 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2114 escape(getPairs(set)) + "\""));
2115 } else{
2116 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2117 }
2118
2119 delete tempset;
2120
2121 }
2122
2123 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2124 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2125 if (getPairs(set) != expectedPairs) {
2126 errln(UnicodeString("FAIL: Expected pair list \"") +
2127 escape(expectedPairs) + "\", got \"" +
2128 escape(getPairs(set)) + "\"");
2129 }
2130 }
2131
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2132 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2133 const UnicodeString& expPat,
2134 const char** expStrings) {
2135 UnicodeString pat;
2136 set.toPattern(pat, TRUE);
2137 if (pat == expPat) {
2138 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
2139 } else {
2140 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2141 return;
2142 }
2143 if (expStrings == NULL) {
2144 return;
2145 }
2146 UBool in = TRUE;
2147 for (int32_t i=0; expStrings[i] != NULL; ++i) {
2148 if (expStrings[i] == NOT) { // sic; pointer comparison
2149 in = FALSE;
2150 continue;
2151 }
2152 UnicodeString s = CharsToUnicodeString(expStrings[i]);
2153 UBool contained = set.contains(s);
2154 if (contained == in) {
2155 logln((UnicodeString)"Ok: " + expPat +
2156 (contained ? " contains {" : " does not contain {") +
2157 escape(expStrings[i]) + "}");
2158 } else {
2159 errln((UnicodeString)"FAIL: " + expPat +
2160 (contained ? " contains {" : " does not contain {") +
2161 escape(expStrings[i]) + "}");
2162 }
2163 }
2164 }
2165
toHexString(int32_t i)2166 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2167
2168 void
doAssert(UBool condition,const char * message)2169 UnicodeSetTest::doAssert(UBool condition, const char *message)
2170 {
2171 if (!condition) {
2172 errln(UnicodeString("ERROR : ") + message);
2173 }
2174 }
2175
2176 UnicodeString
escape(const UnicodeString & s)2177 UnicodeSetTest::escape(const UnicodeString& s) {
2178 UnicodeString buf;
2179 for (int32_t i=0; i<s.length(); )
2180 {
2181 UChar32 c = s.char32At(i);
2182 if (0x0020 <= c && c <= 0x007F) {
2183 buf += c;
2184 } else {
2185 if (c <= 0xFFFF) {
2186 buf += (UChar)0x5c; buf += (UChar)0x75;
2187 } else {
2188 buf += (UChar)0x5c; buf += (UChar)0x55;
2189 buf += toHexString((c & 0xF0000000) >> 28);
2190 buf += toHexString((c & 0x0F000000) >> 24);
2191 buf += toHexString((c & 0x00F00000) >> 20);
2192 buf += toHexString((c & 0x000F0000) >> 16);
2193 }
2194 buf += toHexString((c & 0xF000) >> 12);
2195 buf += toHexString((c & 0x0F00) >> 8);
2196 buf += toHexString((c & 0x00F0) >> 4);
2197 buf += toHexString(c & 0x000F);
2198 }
2199 i += U16_LENGTH(c);
2200 }
2201 return buf;
2202 }
2203
TestFreezable()2204 void UnicodeSetTest::TestFreezable() {
2205 UErrorCode errorCode=U_ZERO_ERROR;
2206 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2207 UnicodeSet idSet(idPattern, errorCode);
2208 if(U_FAILURE(errorCode)) {
2209 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2210 return;
2211 }
2212
2213 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2214 UnicodeSet wsSet(wsPattern, errorCode);
2215 if(U_FAILURE(errorCode)) {
2216 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2217 return;
2218 }
2219
2220 idSet.add(idPattern);
2221 UnicodeSet frozen(idSet);
2222 frozen.freeze();
2223
2224 if(idSet.isFrozen() || !frozen.isFrozen()) {
2225 errln("FAIL: isFrozen() is wrong");
2226 }
2227 if(frozen!=idSet || !(frozen==idSet)) {
2228 errln("FAIL: a copy-constructed frozen set differs from its original");
2229 }
2230
2231 frozen=wsSet;
2232 if(frozen!=idSet || !(frozen==idSet)) {
2233 errln("FAIL: a frozen set was modified by operator=");
2234 }
2235
2236 UnicodeSet frozen2(frozen);
2237 if(frozen2!=frozen || frozen2!=idSet) {
2238 errln("FAIL: a copied frozen set differs from its frozen original");
2239 }
2240 if(!frozen2.isFrozen()) {
2241 errln("FAIL: copy-constructing a frozen set results in a thawed one");
2242 }
2243 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
2244 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2245 errln("FAIL: UnicodeSet(5, 55) failed");
2246 }
2247 frozen3=frozen;
2248 if(!frozen3.isFrozen()) {
2249 errln("FAIL: copying a frozen set results in a thawed one");
2250 }
2251
2252 UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2253 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2254 errln("FAIL: clone() failed");
2255 }
2256 cloned->add(0xd802, 0xd805);
2257 if(cloned->containsSome(0xd802, 0xd805)) {
2258 errln("FAIL: unable to modify clone");
2259 }
2260 delete cloned;
2261
2262 UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2263 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2264 errln("FAIL: cloneAsThawed() failed");
2265 }
2266 thawed->add(0xd802, 0xd805);
2267 if(!thawed->contains(0xd802, 0xd805)) {
2268 errln("FAIL: unable to modify thawed clone");
2269 }
2270 delete thawed;
2271
2272 frozen.set(5, 55);
2273 if(frozen!=idSet || !(frozen==idSet)) {
2274 errln("FAIL: UnicodeSet::set() modified a frozen set");
2275 }
2276
2277 frozen.clear();
2278 if(frozen!=idSet || !(frozen==idSet)) {
2279 errln("FAIL: UnicodeSet::clear() modified a frozen set");
2280 }
2281
2282 frozen.closeOver(USET_CASE_INSENSITIVE);
2283 if(frozen!=idSet || !(frozen==idSet)) {
2284 errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2285 }
2286
2287 frozen.compact();
2288 if(frozen!=idSet || !(frozen==idSet)) {
2289 errln("FAIL: UnicodeSet::compact() modified a frozen set");
2290 }
2291
2292 ParsePosition pos;
2293 frozen.
2294 applyPattern(wsPattern, errorCode).
2295 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2296 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2297 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2298 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2299 if(frozen!=idSet || !(frozen==idSet)) {
2300 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2301 }
2302
2303 frozen.
2304 add(0xd800).
2305 add(0xd802, 0xd805).
2306 add(wsPattern).
2307 addAll(idPattern).
2308 addAll(wsSet);
2309 if(frozen!=idSet || !(frozen==idSet)) {
2310 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2311 }
2312
2313 frozen.
2314 retain(0x62).
2315 retain(0x64, 0x69).
2316 retainAll(wsPattern).
2317 retainAll(wsSet);
2318 if(frozen!=idSet || !(frozen==idSet)) {
2319 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2320 }
2321
2322 frozen.
2323 remove(0x62).
2324 remove(0x64, 0x69).
2325 remove(idPattern).
2326 removeAll(idPattern).
2327 removeAll(idSet);
2328 if(frozen!=idSet || !(frozen==idSet)) {
2329 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2330 }
2331
2332 frozen.
2333 complement().
2334 complement(0x62).
2335 complement(0x64, 0x69).
2336 complement(idPattern).
2337 complementAll(idPattern).
2338 complementAll(idSet);
2339 if(frozen!=idSet || !(frozen==idSet)) {
2340 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2341 }
2342 }
2343
2344 // Test span() etc. -------------------------------------------------------- ***
2345
2346 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2347 static int32_t
appendUTF8(const UChar * s,int32_t length,char * t,int32_t capacity)2348 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2349 UErrorCode errorCode=U_ZERO_ERROR;
2350 int32_t length8=0;
2351 u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2352 if(U_SUCCESS(errorCode)) {
2353 return length8;
2354 } else {
2355 // The string contains an unpaired surrogate.
2356 // Ignore this string.
2357 return 0;
2358 }
2359 }
2360
2361 class UnicodeSetWithStringsIterator;
2362
2363 // Make the strings in a UnicodeSet easily accessible.
2364 class UnicodeSetWithStrings {
2365 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2366 UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2367 set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2368 int32_t size=set.size();
2369 if(size>0 && set.charAt(size-1)<0) {
2370 // If a set's last element is not a code point, then it must contain strings.
2371 // Iterate over the set, skip all code point ranges, and cache the strings.
2372 // Convert them to UTF-8 for spanUTF8().
2373 UnicodeSetIterator iter(set);
2374 const UnicodeString *s;
2375 char *s8=utf8;
2376 int32_t length8, utf8Count=0;
2377 while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2378 if(iter.isString()) {
2379 // Store the pointer to the set's string element
2380 // which we happen to know is a stable pointer.
2381 strings[stringsLength]=s=&iter.getString();
2382 utf8Count+=
2383 utf8Lengths[stringsLength]=length8=
2384 appendUTF8(s->getBuffer(), s->length(),
2385 s8, (int32_t)(sizeof(utf8)-utf8Count));
2386 if(length8==0) {
2387 hasSurrogates=TRUE; // Contains unpaired surrogates.
2388 }
2389 s8+=length8;
2390 ++stringsLength;
2391 }
2392 }
2393 }
2394 }
2395
getSet() const2396 const UnicodeSet &getSet() const {
2397 return set;
2398 }
2399
hasStrings() const2400 UBool hasStrings() const {
2401 return (UBool)(stringsLength>0);
2402 }
2403
hasStringsWithSurrogates() const2404 UBool hasStringsWithSurrogates() const {
2405 return hasSurrogates;
2406 }
2407
2408 private:
2409 friend class UnicodeSetWithStringsIterator;
2410
2411 const UnicodeSet &set;
2412
2413 const UnicodeString *strings[20];
2414 int32_t stringsLength;
2415 UBool hasSurrogates;
2416
2417 char utf8[1024];
2418 int32_t utf8Lengths[20];
2419 };
2420
2421 class UnicodeSetWithStringsIterator {
2422 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2423 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2424 fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2425 }
2426
reset()2427 void reset() {
2428 nextStringIndex=nextUTF8Start=0;
2429 }
2430
nextString()2431 const UnicodeString *nextString() {
2432 if(nextStringIndex<fSet.stringsLength) {
2433 return fSet.strings[nextStringIndex++];
2434 } else {
2435 return NULL;
2436 }
2437 }
2438
2439 // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2440 const char *nextUTF8(int32_t &length) {
2441 if(nextStringIndex<fSet.stringsLength) {
2442 const char *s8=fSet.utf8+nextUTF8Start;
2443 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2444 return s8;
2445 } else {
2446 length=0;
2447 return NULL;
2448 }
2449 }
2450
2451 private:
2452 const UnicodeSetWithStrings &fSet;
2453 int32_t nextStringIndex;
2454 int32_t nextUTF8Start;
2455 };
2456
2457 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2458 // at code point boundaries.
2459 // That is, each edge of a match must not be in the middle of a surrogate pair.
2460 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UnicodeString & t)2461 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2462 s+=start;
2463 limit-=start;
2464 int32_t length=t.length();
2465 return 0==t.compare(s, length) &&
2466 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2467 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2468 }
2469
// Implement span() with contains() for comparison.
//
// Brute-force forward span over the UTF-16 text s[0..length):
// returns the number of code units from the start of s that are spanned
// according to spanCondition. Three cases are handled:
// - the set has no cached strings: only code points are tested;
// - USET_SPAN_NOT_CONTAINED with strings: stop at the first contained
//   code point or at the first position where a set string matches;
// - USET_SPAN_CONTAINED or USET_SPAN_SIMPLE with strings: see below.
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        // prev is the start of the code point read in the current iteration;
        // it is returned when that code point ends the span.
        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; the span also ends here
            // if any set string matches at this position.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit is the furthest limit reached by any of the recursive calls.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            // From here on, next==start means "nothing matched at start yet".
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                // The swap leaves the longer limit in matchLimit for the recursion.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // The result is the larger of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2568
// Brute-force backward span over the UTF-16 text s[0..length), implemented
// with contains() and string comparisons:
// returns the start index of the span that ends at length, according to
// spanCondition. Returns 0 for an empty text.
// The same three cases are handled as in containsSpanUTF16(),
// but the text is traversed from its end with U16_PREV.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        // prev is the limit of the code point read in the current iteration;
        // it is returned when that code point ends the span.
        UChar32 c;
        int32_t prev=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // length0 keeps the original text length for the boundary check in matches16CPB().
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; the span also ends here
            // if any set string matches immediately before prev.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart is the smallest start index reached by any of the recursive calls.
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            // From here on, length==prev means "nothing matched before prev yet".
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                // The swap leaves the longer match's start in matchStart for the recursion.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // The result is the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2667
// Brute-force forward span over the UTF-8 text s[0..length), implemented
// with contains() and byte-wise string comparisons:
// returns the number of bytes from the start of s that are spanned
// according to spanCondition.
// Code points are read with U8_NEXT_OR_FFFD; set strings are compared via
// their cached UTF-8 forms, and strings whose cached UTF-8 length is 0
// are skipped.
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        // prev is the start of the code point read in the current iteration;
        // it is returned when that code point ends the span.
        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U8_NEXT_OR_FFFD(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; the span also ends here
            // if any set string matches at this position.
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit is the furthest limit reached by any of the recursive calls.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            // From here on, next==start means "nothing matched at start yet".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                // The swap leaves the longer limit in matchLimit for the recursion.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // The result is the larger of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2767
// Brute-force backward span over the UTF-8 text s[0..length), implemented
// with contains() and byte-wise string comparisons:
// returns the start index of the span that ends at length, according to
// spanCondition. Returns 0 for an empty text.
// Code points are read with U8_PREV_OR_FFFD; set strings are compared via
// their cached UTF-8 forms, and strings whose cached UTF-8 length is 0
// are skipped.
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        // prev is the limit of the code point read in the current iteration;
        // it is returned when that code point ends the span.
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; the span also ends here
            // if any set string matches immediately before prev.
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart is the smallest start index reached by any of the recursive calls.
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            // From here on, length==prev means "nothing matched before prev yet".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                // The swap leaves the longer match's start in matchStart for the recursion.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // The result is the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2868
// Bit flags selecting the spans to be performed and compared.
// Each group consists of two single-bit flags followed by the mask of both.
enum {
    // UTF encoding forms
    SPAN_UTF16          =0x001,
    SPAN_UTF8           =0x002,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // original set vs. its complement
    SPAN_SET            =0x004,
    SPAN_COMPLEMENT     =0x008,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // span directions
    SPAN_FWD            =0x010,
    SPAN_BACK           =0x020,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // span conditions
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // all of the above
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2889
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)2890 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2891 return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2892 }
2893
slen(const void * s,UBool isUTF16)2894 static inline int32_t slen(const void *s, UBool isUTF16) {
2895 return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2896 }
2897
2898 /*
2899 * Count spans on a string with the method according to type and set the span limits.
2900 * The set may be the complement of the original.
2901 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2902 * according to the expected number of spans.
2903 * Sets typeName to an empty string if there is no such type.
2904 * Returns -1 if the span option is filtered out.
2905 */
getSpans(const UnicodeSetWithStrings & set,UBool isComplement,const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int type,const char * & typeName,int32_t limits[],int32_t limitsCapacity,int32_t expectCount)2906 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2907 const void *s, int32_t length, UBool isUTF16,
2908 uint32_t whichSpans,
2909 int type, const char *&typeName,
2910 int32_t limits[], int32_t limitsCapacity,
2911 int32_t expectCount) {
2912 const UnicodeSet &realSet(set.getSet());
2913 int32_t start, count;
2914 USetSpanCondition spanCondition, firstSpanCondition, contained;
2915 UBool isForward;
2916
2917 if(type<0 || 7<type) {
2918 typeName="";
2919 return 0;
2920 }
2921
2922 static const char *const typeNames16[]={
2923 "contains", "contains(LM)",
2924 "span", "span(LM)",
2925 "containsBack", "containsBack(LM)",
2926 "spanBack", "spanBack(LM)"
2927 };
2928
2929 static const char *const typeNames8[]={
2930 "containsUTF8", "containsUTF8(LM)",
2931 "spanUTF8", "spanUTF8(LM)",
2932 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2933 "spanBackUTF8", "spanBackUTF8(LM)"
2934 };
2935
2936 typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2937
2938 // filter span options
2939 if(type<=3) {
2940 // span forward
2941 if((whichSpans&SPAN_FWD)==0) {
2942 return -1;
2943 }
2944 isForward=TRUE;
2945 } else {
2946 // span backward
2947 if((whichSpans&SPAN_BACK)==0) {
2948 return -1;
2949 }
2950 isForward=FALSE;
2951 }
2952 if((type&1)==0) {
2953 // use USET_SPAN_CONTAINED
2954 if((whichSpans&SPAN_CONTAINED)==0) {
2955 return -1;
2956 }
2957 contained=USET_SPAN_CONTAINED;
2958 } else {
2959 // use USET_SPAN_SIMPLE
2960 if((whichSpans&SPAN_SIMPLE)==0) {
2961 return -1;
2962 }
2963 contained=USET_SPAN_SIMPLE;
2964 }
2965
2966 // Default first span condition for going forward with an uncomplemented set.
2967 spanCondition=USET_SPAN_NOT_CONTAINED;
2968 if(isComplement) {
2969 spanCondition=invertSpanCondition(spanCondition, contained);
2970 }
2971
2972 // First span condition for span(), used to terminate the spanBack() iteration.
2973 firstSpanCondition=spanCondition;
2974
2975 // spanBack(): Its initial span condition is span()'s last span condition,
2976 // which is the opposite of span()'s first span condition
2977 // if we expect an even number of spans.
2978 // (The loop inverts spanCondition (expectCount-1) times
2979 // before the expectCount'th span() call.)
2980 // If we do not compare forward and backward directions, then we do not have an
2981 // expectCount and just start with firstSpanCondition.
2982 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2983 spanCondition=invertSpanCondition(spanCondition, contained);
2984 }
2985
2986 count=0;
2987 switch(type) {
2988 case 0:
2989 case 1:
2990 start=0;
2991 if(length<0) {
2992 length=slen(s, isUTF16);
2993 }
2994 for(;;) {
2995 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2996 containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2997 if(count<limitsCapacity) {
2998 limits[count]=start;
2999 }
3000 ++count;
3001 if(start>=length) {
3002 break;
3003 }
3004 spanCondition=invertSpanCondition(spanCondition, contained);
3005 }
3006 break;
3007 case 2:
3008 case 3:
3009 start=0;
3010 for(;;) {
3011 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
3012 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
3013 if(count<limitsCapacity) {
3014 limits[count]=start;
3015 }
3016 ++count;
3017 if(length>=0 ? start>=length :
3018 isUTF16 ? ((const UChar *)s)[start]==0 :
3019 ((const char *)s)[start]==0
3020 ) {
3021 break;
3022 }
3023 spanCondition=invertSpanCondition(spanCondition, contained);
3024 }
3025 break;
3026 case 4:
3027 case 5:
3028 if(length<0) {
3029 length=slen(s, isUTF16);
3030 }
3031 for(;;) {
3032 ++count;
3033 if(count<=limitsCapacity) {
3034 limits[limitsCapacity-count]=length;
3035 }
3036 length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3037 containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3038 if(length==0 && spanCondition==firstSpanCondition) {
3039 break;
3040 }
3041 spanCondition=invertSpanCondition(spanCondition, contained);
3042 }
3043 if(count<limitsCapacity) {
3044 memmove(limits, limits+(limitsCapacity-count), count*4);
3045 }
3046 break;
3047 case 6:
3048 case 7:
3049 for(;;) {
3050 ++count;
3051 if(count<=limitsCapacity) {
3052 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3053 }
3054 // Note: Length<0 is tested only for the first spanBack().
3055 // If we wanted to keep length<0 for all spanBack()s, we would have to
3056 // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3057 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3058 realSet.spanBackUTF8((const char *)s, length, spanCondition);
3059 if(length==0 && spanCondition==firstSpanCondition) {
3060 break;
3061 }
3062 spanCondition=invertSpanCondition(spanCondition, contained);
3063 }
3064 if(count<limitsCapacity) {
3065 memmove(limits, limits+(limitsCapacity-count), count*4);
3066 }
3067 break;
3068 default:
3069 typeName="";
3070 return -1;
3071 }
3072
3073 return count;
3074 }
3075
3076 // sets to be tested; odd index=isComplement
enum {
    SLOW=0,      // original set
    SLOW_NOT=1,  // complement of the original set
    FAST=2,      // second copy of the original set
    FAST_NOT=3,  // second copy of the complement
    SET_COUNT=4
};

// Labels used in failure messages, indexed by the enum constants above.
static const char *const setNames[SET_COUNT]={ "slow", "slow.not", "fast", "fast.not" };
3091
3092 /*
3093 * Verify that we get the same results whether we look at text with contains(),
3094 * span() or spanBack(), using unfrozen or frozen versions of the set,
3095 * and using the set or its complement (switching the spanConditions accordingly).
3096 * The latter verifies that
3097 * set.span(spanCondition) == set.complement().span(!spanCondition).
3098 *
3099 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3100 * or returned to the caller (with an input expectCount<0).
3101 */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3102 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3103 const void *s, int32_t length, UBool isUTF16,
3104 uint32_t whichSpans,
3105 int32_t expectLimits[], int32_t &expectCount,
3106 const char *testName, int32_t index) {
3107 int32_t limits[500];
3108 int32_t limitsCount;
3109 int i, j;
3110
3111 const char *typeName;
3112 int type;
3113
3114 for(i=0; i<SET_COUNT; ++i) {
3115 if((i&1)==0) {
3116 // Even-numbered sets are original, uncomplemented sets.
3117 if((whichSpans&SPAN_SET)==0) {
3118 continue;
3119 }
3120 } else {
3121 // Odd-numbered sets are complemented.
3122 if((whichSpans&SPAN_COMPLEMENT)==0) {
3123 continue;
3124 }
3125 }
3126 for(type=0;; ++type) {
3127 limitsCount=getSpans(*sets[i], (UBool)(i&1),
3128 s, length, isUTF16,
3129 whichSpans,
3130 type, typeName,
3131 limits, UPRV_LENGTHOF(limits), expectCount);
3132 if(typeName[0]==0) {
3133 break; // All types tried.
3134 }
3135 if(limitsCount<0) {
3136 continue; // Span option filtered out.
3137 }
3138 if(expectCount<0) {
3139 expectCount=limitsCount;
3140 if(limitsCount>UPRV_LENGTHOF(limits)) {
3141 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3142 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3143 return;
3144 }
3145 memcpy(expectLimits, limits, limitsCount*4);
3146 } else if(limitsCount!=expectCount) {
3147 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3148 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3149 } else {
3150 for(j=0; j<limitsCount; ++j) {
3151 if(limits[j]!=expectLimits[j]) {
3152 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3153 testName, (long)index, setNames[i], typeName, (long)limitsCount,
3154 j, (long)limits[j], (long)expectLimits[j]);
3155 break;
3156 }
3157 }
3158 }
3159 }
3160 }
3161
3162 // Compare span() with containsAll()/containsNone(),
3163 // but only if we have expectLimits[] from the uncomplemented set.
3164 if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3165 const UChar *s16=(const UChar *)s;
3166 UnicodeString string;
3167 int32_t prev=0, limit, length;
3168 for(i=0; i<expectCount; ++i) {
3169 limit=expectLimits[i];
3170 length=limit-prev;
3171 if(length>0) {
3172 string.setTo(FALSE, s16+prev, length); // read-only alias
3173 if(i&1) {
3174 if(!sets[SLOW]->getSet().containsAll(string)) {
3175 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3176 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3177 return;
3178 }
3179 if(!sets[FAST]->getSet().containsAll(string)) {
3180 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3181 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3182 return;
3183 }
3184 } else {
3185 if(!sets[SLOW]->getSet().containsNone(string)) {
3186 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3187 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3188 return;
3189 }
3190 if(!sets[FAST]->getSet().containsNone(string)) {
3191 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3192 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3193 return;
3194 }
3195 }
3196 }
3197 prev=limit;
3198 }
3199 }
3200 }
3201
3202 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3203 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3204 const void *s, int32_t length, UBool isUTF16,
3205 uint32_t whichSpans,
3206 const char *testName, int32_t index) {
3207 int32_t expectLimits[500];
3208 int32_t expectCount=-1;
3209 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3210 }
3211
stringContainsUnpairedSurrogate(const UChar * s,int32_t length)3212 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3213 UChar c, c2;
3214
3215 if(length>=0) {
3216 while(length>0) {
3217 c=*s++;
3218 --length;
3219 if(0xd800<=c && c<0xe000) {
3220 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3221 return TRUE;
3222 }
3223 --length;
3224 }
3225 }
3226 } else {
3227 while((c=*s++)!=0) {
3228 if(0xd800<=c && c<0xe000) {
3229 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3230 return TRUE;
3231 }
3232 }
3233 }
3234 }
3235 return FALSE;
3236 }
3237
3238 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3239 // unless either UTF is turned off in whichSpans.
3240 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3241 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const UChar * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3242 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3243 const UChar *s16, int32_t length16,
3244 uint32_t whichSpans,
3245 const char *testName, int32_t index) {
3246 int32_t expectLimits[500];
3247 int32_t expectCount;
3248
3249 expectCount=-1; // Get expectLimits[] from testSpan().
3250
3251 if((whichSpans&SPAN_UTF16)!=0) {
3252 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3253 }
3254 if((whichSpans&SPAN_UTF8)==0) {
3255 return;
3256 }
3257
3258 // Convert s16[] and expectLimits[] to UTF-8.
3259 uint8_t s8[3000];
3260 int32_t offsets[3000];
3261
3262 const UChar *s16Limit=s16+length16;
3263 char *t=(char *)s8;
3264 char *tLimit=t+sizeof(s8);
3265 int32_t *o=offsets;
3266 UErrorCode errorCode=U_ZERO_ERROR;
3267
3268 // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3269 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3270 if(U_FAILURE(errorCode)) {
3271 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3272 testName, (long)index, u_errorName(errorCode));
3273 ucnv_resetFromUnicode(utf8Cnv);
3274 return;
3275 }
3276 int32_t length8=(int32_t)(t-(char *)s8);
3277
3278 // Convert expectLimits[].
3279 int32_t i, j, expect;
3280 for(i=j=0; i<expectCount; ++i) {
3281 expect=expectLimits[i];
3282 if(expect==length16) {
3283 expectLimits[i]=length8;
3284 } else {
3285 while(offsets[j]<expect) {
3286 ++j;
3287 }
3288 expectLimits[i]=j;
3289 }
3290 }
3291
3292 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3293 }
3294
nextCodePoint(UChar32 c)3295 static UChar32 nextCodePoint(UChar32 c) {
3296 // Skip some large and boring ranges.
3297 switch(c) {
3298 case 0x3441:
3299 return 0x4d7f;
3300 case 0x5100:
3301 return 0x9f00;
3302 case 0xb040:
3303 return 0xd780;
3304 case 0xe041:
3305 return 0xf8fe;
3306 case 0x10100:
3307 return 0x20000;
3308 case 0x20041:
3309 return 0xe0000;
3310 case 0xe0101:
3311 return 0x10fffd;
3312 default:
3313 return c+1;
3314 }
3315 }
3316
3317 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3318 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3319 // contains(U+FFFD) is inconsistent with contains(some surrogates),
3320 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3321 // Skip the UTF-8 part of the test - if the string contains surrogates -
3322 // because it is likely to produce a different result.
3323 UBool inconsistentSurrogates=
3324 (!(sets[0]->getSet().contains(0xfffd) ?
3325 sets[0]->getSet().contains(0xd800, 0xdfff) :
3326 sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3327 sets[0]->hasStringsWithSurrogates());
3328
3329 UChar s[1000];
3330 int32_t length=0;
3331 uint32_t localWhichSpans;
3332
3333 UChar32 c, first;
3334 for(first=c=0;; c=nextCodePoint(c)) {
3335 if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3336 localWhichSpans=whichSpans;
3337 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3338 localWhichSpans&=~SPAN_UTF8;
3339 }
3340 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3341 if(c>0x10ffff) {
3342 break;
3343 }
3344 length=0;
3345 first=c;
3346 }
3347 U16_APPEND_UNSAFE(s, length, c);
3348 }
3349 }
3350
3351 // Test with a particular, interesting string.
3352 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3353 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3354 static const UChar s[]={
3355 0x61, 0x62, 0x20, // Latin, space
3356 0x3b1, 0x3b2, 0x3b3, // Greek
3357 0xd900, // lead surrogate
3358 0x3000, 0x30ab, 0x30ad, // wide space, Katakana
3359 0xdc05, // trail surrogate
3360 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
3361 0xd900, 0xdc05, // unassigned supplementary
3362 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
3363 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
3364 0 // NUL
3365 };
3366
3367 if((whichSpans&SPAN_UTF16)==0) {
3368 return;
3369 }
3370 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3371 testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3372 }
3373
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3374 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3375 static const char s[]={
3376 "abc" // Latin
3377
3378 /* trail byte in lead position */
3379 "\x80"
3380
3381 " " // space
3382
3383 /* truncated multi-byte sequences */
3384 "\xd0"
3385 "\xe0"
3386 "\xe1"
3387 "\xed"
3388 "\xee"
3389 "\xf0"
3390 "\xf1"
3391 "\xf4"
3392 "\xf8"
3393 "\xfc"
3394
3395 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
3396
3397 /* trail byte in lead position */
3398 "\x80"
3399
3400 "\xe0\x80"
3401 "\xe0\xa0"
3402 "\xe1\x80"
3403 "\xed\x80"
3404 "\xed\xa0"
3405 "\xee\x80"
3406 "\xf0\x80"
3407 "\xf0\x90"
3408 "\xf1\x80"
3409 "\xf4\x80"
3410 "\xf4\x90"
3411 "\xf8\x80"
3412 "\xfc\x80"
3413
3414 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
3415
3416 /* trail byte in lead position */
3417 "\x80"
3418
3419 "\xf0\x80\x80"
3420 "\xf0\x90\x80"
3421 "\xf1\x80\x80"
3422 "\xf4\x80\x80"
3423 "\xf4\x90\x80"
3424 "\xf8\x80\x80"
3425 "\xfc\x80\x80"
3426
3427 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
3428
3429 /* trail byte in lead position */
3430 "\x80"
3431
3432 "\xf8\x80\x80\x80"
3433 "\xfc\x80\x80\x80"
3434
3435 "\xF1\x90\x80\x85" // unassigned supplementary
3436
3437 /* trail byte in lead position */
3438 "\x80"
3439
3440 "\xfc\x80\x80\x80\x80"
3441
3442 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
3443
3444 /* trail byte in lead position */
3445 "\x80"
3446
3447 /* complete sequences but non-shortest forms or out of range etc. */
3448 "\xc0\x80"
3449 "\xe0\x80\x80"
3450 "\xed\xa0\x80"
3451 "\xf0\x80\x80\x80"
3452 "\xf4\x90\x80\x80"
3453 "\xf8\x80\x80\x80\x80"
3454 "\xfc\x80\x80\x80\x80\x80"
3455 "\xfe"
3456 "\xff"
3457
3458 /* trail byte in lead position */
3459 "\x80"
3460
3461 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
3462 };
3463
3464 if((whichSpans&SPAN_UTF8)==0) {
3465 return;
3466 }
3467 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3468 testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3469 }
3470
3471 // Take a set of span options and multiply them so that
3472 // each portion only has one of the options a, b and c.
3473 // If b==0, then the set of options is just modified with mask and a.
3474 // If b!=0 and c==0, then the set of options is just modified with mask, a and b.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // Number of copies of the option list after this call:
    // one for a alone, two for a and b, three for a, b and c.
    // c is ignored when b==0.
    const int32_t portions= b==0 ? 1 : (c==0 ? 2 : 3);

    for(int32_t i=0; i<whichSpansCount; ++i) {
        const uint32_t kept=whichSpans[i]&mask;
        whichSpans[i]=kept|a;
        if(portions>=2) {
            // The b (and c) variants are appended after the existing entries.
            whichSpans[whichSpansCount+i]=kept|b;
        }
        if(portions==3) {
            whichSpans[2*whichSpansCount+i]=kept|c;
        }
    }
    return portions*whichSpansCount;
}
3493
// String-literal building blocks: runs of exactly 63 or 64 'a' or 'b' characters.
// TestSpan() concatenates them to build set strings and test strings
// whose initial or final code point span is longer than 254.
#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3498
TestSpan()3499 void UnicodeSetTest::TestSpan() {
3500 // "[...]" is a UnicodeSet pattern.
3501 // "*" performs tests on all Unicode code points and on a selection of
3502 // malformed UTF-8/16 strings.
3503 // "-options" limits the scope of testing for the current set.
3504 // By default, the test verifies that equivalent boundaries are found
3505 // for UTF-16 and UTF-8, going forward and backward,
3506 // alternating USET_SPAN_NOT_CONTAINED with
3507 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3508 // Single-character options:
3509 // 8 -- UTF-16 and UTF-8 boundaries may differ.
3510 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3511 // or the set contains strings with unpaired surrogates
3512 // which do not translate to valid UTF-8.
3513 // c -- set.span() and set.complement().span() boundaries may differ.
3514 // Cause: Set strings are not complemented.
3515 // b -- span() and spanBack() boundaries may differ.
3516 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3517 // and spanBack(USET_SPAN_SIMPLE) are defined to
3518 // match with non-overlapping substrings.
3519 // For example, with a set containing "ab" and "ba",
3520 // span() of "aba" yields boundaries { 0, 2, 3 }
3521 // because the initial "ab" matches from 0 to 2,
3522 // while spanBack() yields boundaries { 0, 1, 3 }
3523 // because the final "ba" matches from 1 to 3.
3524 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3525 // Cause: Strings in the set overlap, and a longer match may
3526 // require a sequence including non-longest substrings.
3527 // For example, with a set containing "ab", "abc" and "cd",
3528 // span(contained) of "abcd" spans the entire string
3529 // but span(longest match) only spans the first 3 characters.
3530 // Each "-options" first resets all options and then applies the specified options.
3531 // A "-" without options resets the options.
3532 // The options are also reset for each new set.
3533 // Other strings will be spanned.
3534 static const char *const testdata[]={
3535 "[:ID_Continue:]",
3536 "*",
3537 "[:White_Space:]",
3538 "*",
3539 "[]",
3540 "*",
3541 "[\\u0000-\\U0010FFFF]",
3542 "*",
3543 "[\\u0000\\u0080\\u0800\\U00010000]",
3544 "*",
3545 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3546 "*",
3547 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3548 "-c",
3549 "*",
3550 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3551 "-c",
3552 "*",
3553
3554 // Overlapping strings cause overlapping attempts to match.
3555 "[x{xy}{xya}{axy}{ax}]",
3556 "-cl",
3557
3558 // More repetitions of "xya" would take too long with the recursive
3559 // reference implementation.
3560 // containsAll()=FALSE
3561 // test_string 0x14
3562 "xx"
3563 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
3564 "xx" // set.complement().span(contained) will stop between the two 'x'es.
3565 "xyaxyaxyaxya"
3566 "xx"
3567 "xyaxyaxyaxya" // span() ends here.
3568 "aaa",
3569
3570 // containsAll()=TRUE
3571 // test_string 0x15
3572 "xx"
3573 "xyaxyaxyaxya"
3574 "xx"
3575 "xyaxyaxyaxya"
3576 "xx"
3577 "xyaxyaxyaxy",
3578
3579 "-bc",
3580 // test_string 0x17
3581 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
3582 "-c",
3583 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
3584 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
3585 "-",
3586 "byaya", // span() -> { 5 }
3587 "byay", // span() -> { 4 }
3588 "bya", // span() -> { 3 }
3589
3590 // span(longest match) will not span the whole string.
3591 "[a{ab}{bc}]",
3592 "-cl",
3593 // test_string 0x21
3594 "abc",
3595
3596 "[a{ab}{abc}{cd}]",
3597 "-cl",
3598 "acdabcdabccd",
3599
3600 // spanBack(longest match) will not span the whole string.
3601 "[c{ab}{bc}]",
3602 "-cl",
3603 "abc",
3604
3605 "[d{cd}{bcd}{ab}]",
3606 "-cl",
3607 "abbcdabcdabd",
3608
3609 // Test with non-ASCII set strings - test proper handling of surrogate pairs
3610 // and UTF-8 trail bytes.
3611 // Copies of above test sets and strings, but transliterated to have
3612 // different code points with similar trail units.
3613 // Previous: a b c d
3614 // Unicode: 042B 30AB 200AB 204AB
3615 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
3616 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
3617 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3618 "-cl",
3619 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3620
3621 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3622 "-cl",
3623 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3624
3625 // Stress bookkeeping and recursion.
3626 // The following strings are barely doable with the recursive
3627 // reference implementation.
3628 // The not-contained character at the end prevents an early exit from the span().
3629 "[b{bb}]",
3630 "-c",
3631 // test_string 0x33
3632 "bbbbbbbbbbbbbbbbbbbbbbbb-",
3633 // On complement sets, span() and spanBack() get different results
3634 // because b is not in the complement set and there is an odd number of b's
3635 // in the test string.
3636 "-bc",
3637 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3638
3639 // Test with set strings with an initial or final code point span
3640 // longer than 254.
3641 "[a{" _64_a _64_a _64_a _64_a "b}"
3642 "{a" _64_b _64_b _64_b _64_b "}]",
3643 "-c",
3644 _64_a _64_a _64_a _63_a "b",
3645 _64_a _64_a _64_a _64_a "b",
3646 _64_a _64_a _64_a _64_a "aaaabbbb",
3647 "a" _64_b _64_b _64_b _63_b,
3648 "a" _64_b _64_b _64_b _64_b,
3649 "aaaabbbb" _64_b _64_b _64_b _64_b,
3650
3651 // Test with strings containing unpaired surrogates.
3652 // They are not representable in UTF-8, and a leading trail surrogate
3653 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3654 // U+20001 == \\uD840\\uDC01
3655 // U+20400 == \\uD841\\uDC00
3656 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3657 "-8cl",
3658 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3659 };
3660 uint32_t whichSpans[96]={ SPAN_ALL };
3661 int32_t whichSpansCount=1;
3662
3663 UnicodeSet *sets[SET_COUNT]={ NULL };
3664 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3665
3666 char testName[1024];
3667 char *testNameLimit=testName;
3668
3669 int32_t i, j;
3670 for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3671 const char *s=testdata[i];
3672 if(s[0]=='[') {
3673 // Create new test sets from this pattern.
3674 for(j=0; j<SET_COUNT; ++j) {
3675 delete sets_with_str[j];
3676 delete sets[j];
3677 }
3678 UErrorCode errorCode=U_ZERO_ERROR;
3679 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3680 if(U_FAILURE(errorCode)) {
3681 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3682 break;
3683 }
3684 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3685 sets[SLOW_NOT]->complement();
3686 // Intermediate set: Test cloning of a frozen set.
3687 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3688 fast->freeze();
3689 sets[FAST]=(UnicodeSet *)fast->clone();
3690 delete fast;
3691 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3692 fastNot->freeze();
3693 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3694 delete fastNot;
3695
3696 for(j=0; j<SET_COUNT; ++j) {
3697 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3698 }
3699
3700 strcpy(testName, s);
3701 testNameLimit=strchr(testName, 0);
3702 *testNameLimit++=':';
3703 *testNameLimit=0;
3704
3705 whichSpans[0]=SPAN_ALL;
3706 whichSpansCount=1;
3707 } else if(s[0]=='-') {
3708 whichSpans[0]=SPAN_ALL;
3709 whichSpansCount=1;
3710
3711 while(*++s!=0) {
3712 switch(*s) {
3713 case 'c':
3714 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3715 ~SPAN_POLARITY,
3716 SPAN_SET,
3717 SPAN_COMPLEMENT,
3718 0);
3719 break;
3720 case 'b':
3721 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3722 ~SPAN_DIRS,
3723 SPAN_FWD,
3724 SPAN_BACK,
3725 0);
3726 break;
3727 case 'l':
3728 // test USET_SPAN_CONTAINED FWD & BACK, and separately
3729 // USET_SPAN_SIMPLE only FWD, and separately
3730 // USET_SPAN_SIMPLE only BACK
3731 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3732 ~(SPAN_DIRS|SPAN_CONDITION),
3733 SPAN_DIRS|SPAN_CONTAINED,
3734 SPAN_FWD|SPAN_SIMPLE,
3735 SPAN_BACK|SPAN_SIMPLE);
3736 break;
3737 case '8':
3738 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3739 ~SPAN_UTFS,
3740 SPAN_UTF16,
3741 SPAN_UTF8,
3742 0);
3743 break;
3744 default:
3745 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3746 break;
3747 }
3748 }
3749 } else if(0==strcmp(s, "*")) {
3750 strcpy(testNameLimit, "bad_string");
3751 for(j=0; j<whichSpansCount; ++j) {
3752 if(whichSpansCount>1) {
3753 sprintf(testNameLimit+10 /* strlen("bad_string") */,
3754 "%%0x%3x",
3755 whichSpans[j]);
3756 }
3757 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3758 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3759 }
3760
3761 strcpy(testNameLimit, "contents");
3762 for(j=0; j<whichSpansCount; ++j) {
3763 if(whichSpansCount>1) {
3764 sprintf(testNameLimit+8 /* strlen("contents") */,
3765 "%%0x%3x",
3766 whichSpans[j]);
3767 }
3768 testSpanContents(sets_with_str, whichSpans[j], testName);
3769 }
3770 } else {
3771 UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3772 strcpy(testNameLimit, "test_string");
3773 for(j=0; j<whichSpansCount; ++j) {
3774 if(whichSpansCount>1) {
3775 sprintf(testNameLimit+11 /* strlen("test_string") */,
3776 "%%0x%3x",
3777 whichSpans[j]);
3778 }
3779 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3780 }
3781 }
3782 }
3783 for(j=0; j<SET_COUNT; ++j) {
3784 delete sets_with_str[j];
3785 delete sets[j];
3786 }
3787 }
3788
3789 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3790 void UnicodeSetTest::TestStringSpan() {
3791 static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3792 static const char *const string=
3793 "xx"
3794 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3795 "xx"
3796 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3797 "xx"
3798 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3799 "aaaa";
3800
3801 UErrorCode errorCode=U_ZERO_ERROR;
3802 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3803 UnicodeSet set(pattern16, errorCode);
3804 if(U_FAILURE(errorCode)) {
3805 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3806 return;
3807 }
3808
3809 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3810
3811 if(set.containsAll(string16)) {
3812 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3813 }
3814
3815 // Remove trailing "aaaa".
3816 string16.truncate(string16.length()-4);
3817 if(!set.containsAll(string16)) {
3818 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3819 }
3820
3821 string16=UNICODE_STRING_SIMPLE("byayaxya");
3822 const UChar *s16=string16.getBuffer();
3823 int32_t length16=string16.length();
3824 (void)length16; // Suppress set but not used warning.
3825 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3826 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3827 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3828 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3829 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3830 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3831 ) {
3832 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3833 }
3834
3835 pattern="[a{ab}{abc}{cd}]";
3836 pattern16=UnicodeString(pattern, -1, US_INV);
3837 set.applyPattern(pattern16, errorCode);
3838 if(U_FAILURE(errorCode)) {
3839 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3840 return;
3841 }
3842 string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3843 s16=string16.getBuffer();
3844 length16=string16.length();
3845 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3846 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3847 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3848 ) {
3849 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3850 }
3851
3852 pattern="[d{cd}{bcd}{ab}]";
3853 pattern16=UnicodeString(pattern, -1, US_INV);
3854 set.applyPattern(pattern16, errorCode).freeze();
3855 if(U_FAILURE(errorCode)) {
3856 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3857 return;
3858 }
3859 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3860 s16=string16.getBuffer();
3861 length16=string16.length();
3862 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3863 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3864 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3865 ) {
3866 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3867 }
3868 }
3869
3870 /**
3871 * Including collationroot.h fails here with
3872 1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
3873 * .. so, we skip this test on Windows.
3874 *
3875 * the cause is that intltest builds with /Za which disables language extensions - which means
3876 * windows header files can't be used.
3877 */
3878 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
3879 #include "collationroot.h"
3880 #include "collationtailoring.h"
3881 #endif
3882
TestUCAUnsafeBackwards()3883 void UnicodeSetTest::TestUCAUnsafeBackwards() {
3884 #if U_PLATFORM_HAS_WIN32_API
3885 infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
3886 #elif !UCONFIG_NO_COLLATION
3887 UErrorCode errorCode = U_ZERO_ERROR;
3888
3889 // Get the unsafeBackwardsSet
3890 const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
3891 if(U_FAILURE(errorCode)) {
3892 dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
3893 return;
3894 }
3895 //const UVersionInfo &version = rootEntry->tailoring->version;
3896 const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
3897
3898 checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
3899
3900 if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
3901 // simple test case
3902 // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
3903 // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
3904 UnicodeSet surrogates;
3905 surrogates.add(0xd83a); // a lead surrogate
3906 surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
3907 UnicodeString pat;
3908 surrogates.toPattern(pat, FALSE); // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
3909 // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
3910 // so that at least one type of surrogate code points are escaped,
3911 // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
3912 errorCode = U_ZERO_ERROR;
3913 UnicodeSet s2;
3914 s2.applyPattern(pat, errorCode); // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
3915 if(U_FAILURE(errorCode)) {
3916 errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
3917 } else {
3918 checkEqual(surrogates, s2, "surrogates to/from pattern");
3919 }
3920 // This occurs in the UCA unsafe-backwards set.
3921 checkRoundTrip(*unsafeBackwardSet);
3922 }
3923 #endif
3924 }
3925
TestIntOverflow()3926 void UnicodeSetTest::TestIntOverflow() {
3927 // This test triggers undefined double->int conversion behavior
3928 // if the implementation is not careful.
3929 IcuTestErrorCode errorCode(*this, "TestIntOverflow");
3930 UnicodeSet set(u"[:ccc=2222222222222222222:]", errorCode);
3931 assertTrue("[:ccc=int_overflow:] -> empty set", set.isEmpty());
3932 assertEquals("[:ccc=int_overflow:] -> illegal argument",
3933 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3934 }
3935
TestUnusedCcc()3936 void UnicodeSetTest::TestUnusedCcc() {
3937 #if !UCONFIG_NO_NORMALIZATION
3938 // All numeric ccc values 0..255 are valid, but many are unused.
3939 IcuTestErrorCode errorCode(*this, "TestUnusedCcc");
3940 UnicodeSet ccc2(u"[:ccc=2:]", errorCode);
3941 assertSuccess("[:ccc=2:]", errorCode);
3942 assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty());
3943
3944 UnicodeSet ccc255(u"[:ccc=255:]", errorCode);
3945 assertSuccess("[:ccc=255:]", errorCode);
3946 assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty());
3947
3948 // Non-integer values and values outside 0..255 are invalid.
3949 UnicodeSet ccc_1(u"[:ccc=-1:]", errorCode);
3950 assertEquals("[:ccc=-1:] -> illegal argument",
3951 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3952 assertTrue("[:ccc=-1:] -> empty set", ccc_1.isEmpty());
3953
3954 UnicodeSet ccc256(u"[:ccc=256:]", errorCode);
3955 assertEquals("[:ccc=256:] -> illegal argument",
3956 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3957 assertTrue("[:ccc=256:] -> empty set", ccc256.isEmpty());
3958
3959 UnicodeSet ccc1_1(u"[:ccc=1.1:]", errorCode);
3960 assertEquals("[:ccc=1.1:] -> illegal argument",
3961 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3962 assertTrue("[:ccc=1.1:] -> empty set", ccc1_1.isEmpty());
3963 #endif
3964 }
3965
TestDeepPattern()3966 void UnicodeSetTest::TestDeepPattern() {
3967 IcuTestErrorCode errorCode(*this, "TestDeepPattern");
3968 // Nested ranges are parsed via recursion which can use a lot of stack space.
3969 // After a reasonable limit, we should get an error.
3970 constexpr int32_t DEPTH = 20000;
3971 UnicodeString pattern, suffix;
3972 for (int32_t i = 0; i < DEPTH; ++i) {
3973 pattern.append(u"[a", 2);
3974 suffix.append(']');
3975 }
3976 pattern.append(suffix);
3977 UnicodeSet set(pattern, errorCode);
3978 assertTrue("[a[a[a...1000s...]]] -> error", errorCode.isFailure());
3979 errorCode.reset();
3980 }
3981