1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ********************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 * 03/22/2000 Madhu Added additional tests
11 ********************************************************************************
12 */
13
14 #include <stdio.h>
15
16 #include <string.h>
17 #include "unicode/utypes.h"
18 #include "usettest.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/usetiter.h"
23 #include "unicode/ustring.h"
24 #include "unicode/parsepos.h"
25 #include "unicode/symtable.h"
26 #include "unicode/utf8.h"
27 #include "unicode/utf16.h"
28 #include "unicode/uversion.h"
29 #include "cmemory.h"
30 #include "hash.h"
31
/**
 * Reports a data error (file, line and ICU error name) when `status` is a
 * failure code. Execution continues afterwards; the macro does not return
 * from the enclosing function.
 *
 * `status` is evaluated exactly once, so an argument with side effects is
 * safe. dataerrln() is resolved at the expansion site.
 */
#define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
    const UErrorCode testAssertStatus_ = (status); \
    if (U_FAILURE(testAssertStatus_)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
                  u_errorName(testAssertStatus_)); \
    } \
} UPRV_BLOCK_MACRO_END
38
/**
 * Reports a data error with the current file and line when `expr` evaluates
 * to false. Execution continues afterwards; the macro does not return from
 * the enclosing function. dataerrln() is resolved at the expansion site.
 */
#define TEST_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
44
/**
 * Appends the escaped pattern of `set` to `left`, so that a UnicodeSet can be
 * streamed directly into a log or failure message.
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    // toPattern() fills `pattern` and returns a reference to it.
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
50
// The UTF-8 converter is opened lazily by openUTF8Converter(), so it starts
// out as NULL.
UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
}
53
openUTF8Converter()54 UConverter *UnicodeSetTest::openUTF8Converter() {
55 if(utf8Cnv==NULL) {
56 UErrorCode errorCode=U_ZERO_ERROR;
57 utf8Cnv=ucnv_open("UTF-8", &errorCode);
58 }
59 return utf8Cnv;
60 }
61
// Releases the converter opened by openUTF8Converter(). utf8Cnv is still NULL
// if the converter was never opened; NOTE(review): ucnv_close() is documented
// to accept NULL - confirm against the ICU version in use.
UnicodeSetTest::~UnicodeSetTest() {
    ucnv_close(utf8Cnv);
}
65
/**
 * Test dispatcher: maps `index` to one of the test methods below and reports
 * its name through `name`. The TESTCASE_AUTO entries are numbered in the
 * order they appear, so reordering them changes the index of every test that
 * follows.
 *
 * @param index  position of the requested test case
 * @param exec   when true, a suite banner is logged before dispatching
 * @param name   receives the test case name
 */
void
UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
                               const char* &name, char* /*par*/) {
    if (exec) {
        logln(u"TestSuite UnicodeSetTest");
    }
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestPatterns);
    TESTCASE_AUTO(TestAddRemove);
    TESTCASE_AUTO(TestCategories);
    TESTCASE_AUTO(TestCloneEqualHash);
    TESTCASE_AUTO(TestMinimalRep);
    TESTCASE_AUTO(TestAPI);
    TESTCASE_AUTO(TestScriptSet);
    TESTCASE_AUTO(TestPropertySet);
    TESTCASE_AUTO(TestClone);
    TESTCASE_AUTO(TestExhaustive);
    TESTCASE_AUTO(TestToPattern);
    TESTCASE_AUTO(TestIndexOf);
    TESTCASE_AUTO(TestStrings);
    TESTCASE_AUTO(Testj2268);
    TESTCASE_AUTO(TestCloseOver);
    TESTCASE_AUTO(TestEscapePattern);
    TESTCASE_AUTO(TestInvalidCodePoint);
    TESTCASE_AUTO(TestSymbolTable);
    TESTCASE_AUTO(TestSurrogate);
    TESTCASE_AUTO(TestPosixClasses);
    TESTCASE_AUTO(TestIteration);
    TESTCASE_AUTO(TestFreezable);
    TESTCASE_AUTO(TestSpan);
    TESTCASE_AUTO(TestStringSpan);
    TESTCASE_AUTO(TestUCAUnsafeBackwards);
    TESTCASE_AUTO(TestIntOverflow);
    TESTCASE_AUTO(TestUnusedCcc);
    TESTCASE_AUTO(TestDeepPattern);
    TESTCASE_AUTO_END;
}
103
// Sentinel used inside the NULL-terminated string lists handed to
// expectToPattern(). Judging from the call sites in TestToPattern(), entries
// before NOT are strings the set should contain and entries after it are
// strings it should not contain - TODO confirm against expectToPattern().
static const char NOT[] = "%%%%";
105
106 /**
107 * UVector was improperly copying contents
108 * This code will crash this is still true
109 */
Testj2268()110 void UnicodeSetTest::Testj2268() {
111 UnicodeSet t;
112 t.add(UnicodeString("abc"));
113 UnicodeSet test(t);
114 UnicodeString ustrPat;
115 test.toPattern(ustrPat, TRUE);
116 }
117
118 /**
119 * Test toPattern().
120 */
TestToPattern()121 void UnicodeSetTest::TestToPattern() {
122 UErrorCode ec = U_ZERO_ERROR;
123
124 // Test that toPattern() round trips with syntax characters and
125 // whitespace.
126 {
127 static const char* OTHER_TOPATTERN_TESTS[] = {
128 "[[:latin:]&[:greek:]]",
129 "[[:latin:]-[:greek:]]",
130 "[:nonspacing mark:]",
131 NULL
132 };
133
134 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
135 ec = U_ZERO_ERROR;
136 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
137 if (U_FAILURE(ec)) {
138 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
139 continue;
140 }
141 checkPat(OTHER_TOPATTERN_TESTS[j], s);
142 }
143
144 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
145 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
146
147 // check various combinations to make sure they all work.
148 if (i != 0 && !toPatternAux(i, i)){
149 continue;
150 }
151 if (!toPatternAux(0, i)){
152 continue;
153 }
154 if (!toPatternAux(i, 0xFFFF)){
155 continue;
156 }
157 }
158 }
159 }
160
161 // Test pattern behavior of multicharacter strings.
162 {
163 ec = U_ZERO_ERROR;
164 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
165
166 // This loop isn't a loop. It's here to make the compiler happy.
167 // If you're curious, try removing it and changing the 'break'
168 // statements (except for the last) to goto's.
169 for (;;) {
170 if (U_FAILURE(ec)) break;
171 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
172 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
173
174 s->add("ac");
175 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
176 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
177
178 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
179 if (U_FAILURE(ec)) break;
180 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
181 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
182
183 s->add("[]");
184 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
185 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
186
187 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
188 if (U_FAILURE(ec)) break;
189 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
190 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
191
192 // j2189
193 s->clear();
194 s->add(UnicodeString("abc", ""));
195 s->add(UnicodeString("abc", ""));
196 const char* exp6[] = {"abc", NOT, "ab", NULL};
197 expectToPattern(*s, "[{abc}]", exp6);
198
199 break;
200 }
201
202 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
203 delete s;
204 }
205
206 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
207 UnicodeSet s;
208 s.add((UChar)97, (UChar)98); // 'a', 'b'
209 expectToPattern(s, "[ab]", NULL);
210 }
211
toPatternAux(UChar32 start,UChar32 end)212 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
213
214 // use Integer.toString because Utility.hex doesn't handle ints
215 UnicodeString pat = "";
216 // TODO do these in hex
217 //String source = "0x" + Integer.toString(start,16).toUpperCase();
218 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
219 UnicodeString source;
220 source = source + (uint32_t)start;
221 if (start != end)
222 source = source + ".." + (uint32_t)end;
223 UnicodeSet testSet;
224 testSet.add(start, end);
225 return checkPat(source, testSet);
226 }
227
checkPat(const UnicodeString & source,const UnicodeSet & testSet)228 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
229 const UnicodeSet& testSet) {
230 // What we want to make sure of is that a pattern generated
231 // by toPattern(), with or without escaped unprintables, can
232 // be passed back into the UnicodeSet constructor.
233 UnicodeString pat0;
234
235 testSet.toPattern(pat0, TRUE);
236
237 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
238
239 //String pat1 = unescapeLeniently(pat0);
240 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
241
242 UnicodeString pat2;
243 testSet.toPattern(pat2, FALSE);
244 if (!checkPat(source, testSet, pat2)) return FALSE;
245
246 //String pat3 = unescapeLeniently(pat2);
247 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
248
249 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
250 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
251 return TRUE;
252 }
253
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)254 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
255 const UnicodeSet& testSet,
256 const UnicodeString& pat) {
257 UErrorCode ec = U_ZERO_ERROR;
258 UnicodeSet testSet2(pat, ec);
259 if (testSet2 != testSet) {
260 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
261 return FALSE;
262 }
263 return TRUE;
264 }
265
266 void
TestPatterns(void)267 UnicodeSetTest::TestPatterns(void) {
268 UnicodeSet set;
269 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
270 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
271 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
272 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
273 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
274 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
275
276 // Throw in a test of complement
277 set.complement();
278 UnicodeString exp;
279 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
280 expectPairs(set, exp);
281 }
282
283 void
TestCategories(void)284 UnicodeSetTest::TestCategories(void) {
285 UErrorCode status = U_ZERO_ERROR;
286 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
287 UnicodeSet set(pat, status);
288 if (U_FAILURE(status)) {
289 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
290 return;
291 } else {
292 expectContainment(set, pat, "ABC", "abc");
293 }
294
295 UChar32 i;
296 int32_t failures = 0;
297 // Make sure generation of L doesn't pollute cached Lu set
298 // First generate L, then Lu
299 set.applyPattern("[:L:]", status);
300 if (U_FAILURE(status)) { errln("FAIL"); return; }
301 for (i=0; i<0x200; ++i) {
302 UBool l = u_isalpha((UChar)i);
303 if (l != set.contains(i)) {
304 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
305 set.contains(i));
306 if (++failures == 10) break;
307 }
308 }
309
310 set.applyPattern("[:Lu:]", status);
311 if (U_FAILURE(status)) { errln("FAIL"); return; }
312 for (i=0; i<0x200; ++i) {
313 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
314 if (lu != set.contains(i)) {
315 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
316 set.contains(i));
317 if (++failures == 20) break;
318 }
319 }
320 }
321 void
TestCloneEqualHash(void)322 UnicodeSetTest::TestCloneEqualHash(void) {
323 UErrorCode status = U_ZERO_ERROR;
324 // set1 and set2 used to be built with the obsolete constructor taking
325 // UCharCategory values; replaced with pattern constructors
326 // markus 20030502
327 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase
328 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase
329 if (U_FAILURE(status)){
330 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
331 return;
332 }
333 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit
334 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit
335 if (U_FAILURE(status)){
336 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
337 return;
338 }
339
340 if (*set1 != *set1a) {
341 errln("FAIL: category constructor for Ll broken");
342 }
343 if (*set2 != *set2a) {
344 errln("FAIL: category constructor for Nd broken");
345 }
346 delete set1a;
347 delete set2a;
348
349 logln("Testing copy construction");
350 UnicodeSet *set1copy=new UnicodeSet(*set1);
351 if(*set1 != *set1copy || *set1 == *set2 ||
352 getPairs(*set1) != getPairs(*set1copy) ||
353 set1->hashCode() != set1copy->hashCode()){
354 errln("FAIL : Error in copy construction");
355 return;
356 }
357
358 logln("Testing =operator");
359 UnicodeSet set1equal=*set1;
360 UnicodeSet set2equal=*set2;
361 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
362 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
363 errln("FAIL: Error in =operator");
364 }
365
366 logln("Testing clone()");
367 UnicodeSet *set1clone=set1->clone();
368 UnicodeSet *set2clone=set2->clone();
369 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
370 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
371 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
372 errln("FAIL: Error in clone");
373 }
374
375 logln("Testing hashcode");
376 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
377 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
378 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
379 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
380 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
381 errln("FAIL: Error in hashCode()");
382 }
383
384 delete set1;
385 delete set1copy;
386 delete set2;
387 delete set1clone;
388 delete set2clone;
389
390
391 }
392 void
TestAddRemove(void)393 UnicodeSetTest::TestAddRemove(void) {
394 UnicodeSet set; // Construct empty set
395 doAssert(set.isEmpty() == TRUE, "set should be empty");
396 doAssert(set.size() == 0, "size should be 0");
397 set.complement();
398 doAssert(set.size() == 0x110000, "size should be 0x110000");
399 set.clear();
400 set.add(0x0061, 0x007a);
401 expectPairs(set, "az");
402 doAssert(set.isEmpty() == FALSE, "set should not be empty");
403 doAssert(set.size() != 0, "size should not be equal to 0");
404 doAssert(set.size() == 26, "size should be equal to 26");
405 set.remove(0x006d, 0x0070);
406 expectPairs(set, "alqz");
407 doAssert(set.size() == 22, "size should be equal to 22");
408 set.remove(0x0065, 0x0067);
409 expectPairs(set, "adhlqz");
410 doAssert(set.size() == 19, "size should be equal to 19");
411 set.remove(0x0064, 0x0069);
412 expectPairs(set, "acjlqz");
413 doAssert(set.size() == 16, "size should be equal to 16");
414 set.remove(0x0063, 0x0072);
415 expectPairs(set, "absz");
416 doAssert(set.size() == 10, "size should be equal to 10");
417 set.add(0x0066, 0x0071);
418 expectPairs(set, "abfqsz");
419 doAssert(set.size() == 22, "size should be equal to 22");
420 set.remove(0x0061, 0x0067);
421 expectPairs(set, "hqsz");
422 set.remove(0x0061, 0x007a);
423 expectPairs(set, "");
424 doAssert(set.isEmpty() == TRUE, "set should be empty");
425 doAssert(set.size() == 0, "size should be 0");
426 set.add(0x0061);
427 doAssert(set.isEmpty() == FALSE, "set should not be empty");
428 doAssert(set.size() == 1, "size should not be equal to 1");
429 set.add(0x0062);
430 set.add(0x0063);
431 expectPairs(set, "ac");
432 doAssert(set.size() == 3, "size should not be equal to 3");
433 set.add(0x0070);
434 set.add(0x0071);
435 expectPairs(set, "acpq");
436 doAssert(set.size() == 5, "size should not be equal to 5");
437 set.clear();
438 expectPairs(set, "");
439 doAssert(set.isEmpty() == TRUE, "set should be empty");
440 doAssert(set.size() == 0, "size should be 0");
441
442 // Try removing an entire set from another set
443 expectPattern(set, "[c-x]", "cx");
444 UnicodeSet set2;
445 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
446 set.removeAll(set2);
447 expectPairs(set, "deluxx");
448
449 // Try adding an entire set to another set
450 expectPattern(set, "[jackiemclean]", "aacceein");
451 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
452 set.addAll(set2);
453 expectPairs(set, "aacehort");
454 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
455
456 // Try retaining an set of elements contained in another set (intersection)
457 UnicodeSet set3;
458 expectPattern(set3, "[a-c]", "ac");
459 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
460 set3.remove(0x0062);
461 expectPairs(set3, "aacc");
462 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
463 set.retainAll(set3);
464 expectPairs(set, "aacc");
465 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
466 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
467 set.clear();
468 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
469
470 // Test commutativity
471 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
472 expectPattern(set2, "[jackiemclean]", "aacceein");
473 set.addAll(set2);
474 expectPairs(set, "aacehort");
475 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
476
477
478
479
480 }
481
482 /**
483 * Make sure minimal representation is maintained.
484 */
TestMinimalRep()485 void UnicodeSetTest::TestMinimalRep() {
486 UErrorCode status = U_ZERO_ERROR;
487 // This is pretty thoroughly tested by checkCanonicalRep()
488 // run against the exhaustive operation results. Use the code
489 // here for debugging specific spot problems.
490
491 // 1 overlap against 2
492 UnicodeSet set("[h-km-q]", status);
493 if (U_FAILURE(status)) { errln("FAIL"); return; }
494 UnicodeSet set2("[i-o]", status);
495 if (U_FAILURE(status)) { errln("FAIL"); return; }
496 set.addAll(set2);
497 expectPairs(set, "hq");
498 // right
499 set.applyPattern("[a-m]", status);
500 if (U_FAILURE(status)) { errln("FAIL"); return; }
501 set2.applyPattern("[e-o]", status);
502 if (U_FAILURE(status)) { errln("FAIL"); return; }
503 set.addAll(set2);
504 expectPairs(set, "ao");
505 // left
506 set.applyPattern("[e-o]", status);
507 if (U_FAILURE(status)) { errln("FAIL"); return; }
508 set2.applyPattern("[a-m]", status);
509 if (U_FAILURE(status)) { errln("FAIL"); return; }
510 set.addAll(set2);
511 expectPairs(set, "ao");
512 // 1 overlap against 3
513 set.applyPattern("[a-eg-mo-w]", status);
514 if (U_FAILURE(status)) { errln("FAIL"); return; }
515 set2.applyPattern("[d-q]", status);
516 if (U_FAILURE(status)) { errln("FAIL"); return; }
517 set.addAll(set2);
518 expectPairs(set, "aw");
519 }
520
TestAPI()521 void UnicodeSetTest::TestAPI() {
522 UErrorCode status = U_ZERO_ERROR;
523 // default ct
524 UnicodeSet set;
525 if (!set.isEmpty() || set.getRangeCount() != 0) {
526 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
527 set);
528 }
529
530 // clear(), isEmpty()
531 set.add(0x0061);
532 if (set.isEmpty()) {
533 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
534 set);
535 }
536 set.clear();
537 if (!set.isEmpty()) {
538 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
539 set);
540 }
541
542 // size()
543 set.clear();
544 if (set.size() != 0) {
545 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
546 ": " + set);
547 }
548 set.add(0x0061);
549 if (set.size() != 1) {
550 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
551 ": " + set);
552 }
553 set.add(0x0031, 0x0039);
554 if (set.size() != 10) {
555 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
556 ": " + set);
557 }
558
559 // contains(first, last)
560 set.clear();
561 set.applyPattern("[A-Y 1-8 b-d l-y]", status);
562 if (U_FAILURE(status)) { errln("FAIL"); return; }
563 for (int32_t i = 0; i<set.getRangeCount(); ++i) {
564 UChar32 a = set.getRangeStart(i);
565 UChar32 b = set.getRangeEnd(i);
566 if (!set.contains(a, b)) {
567 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
568 " but doesn't: " + set);
569 }
570 if (set.contains((UChar32)(a-1), b)) {
571 errln((UnicodeString)"FAIL, shouldn't contain " +
572 (unsigned short)(a-1) + '-' + (unsigned short)b +
573 " but does: " + set);
574 }
575 if (set.contains(a, (UChar32)(b+1))) {
576 errln((UnicodeString)"FAIL, shouldn't contain " +
577 (unsigned short)a + '-' + (unsigned short)(b+1) +
578 " but does: " + set);
579 }
580 }
581
582 // Ported InversionList test.
583 UnicodeSet a((UChar32)3,(UChar32)10);
584 UnicodeSet b((UChar32)7,(UChar32)15);
585 UnicodeSet c;
586
587 logln((UnicodeString)"a [3-10]: " + a);
588 logln((UnicodeString)"b [7-15]: " + b);
589 c = a;
590 c.addAll(b);
591 UnicodeSet exp((UChar32)3,(UChar32)15);
592 if (c == exp) {
593 logln((UnicodeString)"c.set(a).add(b): " + c);
594 } else {
595 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
596 }
597 c.complement();
598 exp.set((UChar32)0, (UChar32)2);
599 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
600 if (c == exp) {
601 logln((UnicodeString)"c.complement(): " + c);
602 } else {
603 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
604 }
605 c.complement();
606 exp.set((UChar32)3, (UChar32)15);
607 if (c == exp) {
608 logln((UnicodeString)"c.complement(): " + c);
609 } else {
610 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
611 }
612 c = a;
613 c.complementAll(b);
614 exp.set((UChar32)3,(UChar32)6);
615 exp.add((UChar32)11,(UChar32) 15);
616 if (c == exp) {
617 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
618 } else {
619 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
620 }
621
622 exp = c;
623 bitsToSet(setToBits(c), c);
624 if (c == exp) {
625 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
626 } else {
627 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
628 }
629
630 // Additional tests for coverage JB#2118
631 //UnicodeSet::complement(class UnicodeString const &)
632 //UnicodeSet::complementAll(class UnicodeString const &)
633 //UnicodeSet::containsNone(class UnicodeSet const &)
634 //UnicodeSet::containsNone(long,long)
635 //UnicodeSet::containsSome(class UnicodeSet const &)
636 //UnicodeSet::containsSome(long,long)
637 //UnicodeSet::removeAll(class UnicodeString const &)
638 //UnicodeSet::retain(long)
639 //UnicodeSet::retainAll(class UnicodeString const &)
640 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
641 //UnicodeSetIterator::getString(void)
642 set.clear();
643 set.complement("ab");
644 exp.applyPattern("[{ab}]", status);
645 if (U_FAILURE(status)) { errln("FAIL"); return; }
646 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
647
648 UnicodeSetIterator iset(set);
649 if (!iset.next() || !iset.isString()) {
650 errln("FAIL: UnicodeSetIterator::next/isString");
651 } else if (iset.getString() != "ab") {
652 errln("FAIL: UnicodeSetIterator::getString");
653 }
654
655 set.add((UChar32)0x61, (UChar32)0x7A);
656 set.complementAll("alan");
657 exp.applyPattern("[{ab}b-kmo-z]", status);
658 if (U_FAILURE(status)) { errln("FAIL"); return; }
659 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
660
661 exp.applyPattern("[a-z]", status);
662 if (U_FAILURE(status)) { errln("FAIL"); return; }
663 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
664 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
665 exp.applyPattern("[aln]", status);
666 if (U_FAILURE(status)) { errln("FAIL"); return; }
667 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
668 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
669
670 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
671 errln("FAIL: containsNone(UChar32, UChar32)");
672 }
673 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
674 errln("FAIL: containsSome(UChar32, UChar32)");
675 }
676 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
677 errln("FAIL: containsNone(UChar32, UChar32)");
678 }
679 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
680 errln("FAIL: containsSome(UChar32, UChar32)");
681 }
682
683 set.removeAll("liu");
684 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
685 if (U_FAILURE(status)) { errln("FAIL"); return; }
686 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
687
688 set.retainAll("star");
689 exp.applyPattern("[rst]", status);
690 if (U_FAILURE(status)) { errln("FAIL"); return; }
691 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
692
693 set.retain((UChar32)0x73);
694 exp.applyPattern("[s]", status);
695 if (U_FAILURE(status)) { errln("FAIL"); return; }
696 if (set != exp) { errln("FAIL: retain('s')"); return; }
697
698 uint16_t buf[32];
699 int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
700 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
701 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
702 errln("FAIL: serialize");
703 return;
704 }
705
706 // Conversions to and from USet
707 UnicodeSet *uniset = &set;
708 USet *uset = uniset->toUSet();
709 TEST_ASSERT((void *)uset == (void *)uniset);
710 UnicodeSet *setx = UnicodeSet::fromUSet(uset);
711 TEST_ASSERT((void *)setx == (void *)uset);
712 const UnicodeSet *constSet = uniset;
713 const USet *constUSet = constSet->toUSet();
714 TEST_ASSERT((void *)constUSet == (void *)constSet);
715 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
716 TEST_ASSERT((void *)constSetx == (void *)constUSet);
717
718 // span(UnicodeString) and spanBack(UnicodeString) convenience methods
719 UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
720 UnicodeSet ac(0x61, 0x63);
721 ac.remove(0x62).freeze();
722 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
723 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
724 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
725 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
726 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
727 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
728 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
729 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
730 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
731 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
732 ) {
733 errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
734 }
735 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
736 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
737 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
738 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
739 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
740 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
741 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
742 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
743 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
744 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
745 ) {
746 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
747 }
748 }
749
TestIteration()750 void UnicodeSetTest::TestIteration() {
751 UErrorCode ec = U_ZERO_ERROR;
752 int i = 0;
753 int outerLoop;
754
755 // 6 code points, 3 ranges, 2 strings, 8 total elements
756 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
757 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
758 TEST_ASSERT_SUCCESS(ec);
759 UnicodeSetIterator it(set);
760
761 for (outerLoop=0; outerLoop<3; outerLoop++) {
762 // Run the test multiple times, to check that iterator.reset() is working.
763 for (i=0; i<10; i++) {
764 UBool nextv = it.next();
765 UBool isString = it.isString();
766 int32_t codePoint = it.getCodepoint();
767 //int32_t codePointEnd = it.getCodepointEnd();
768 UnicodeString s = it.getString();
769 switch (i) {
770 case 0:
771 TEST_ASSERT(nextv == TRUE);
772 TEST_ASSERT(isString == FALSE);
773 TEST_ASSERT(codePoint==0x61);
774 TEST_ASSERT(s == "a");
775 break;
776 case 1:
777 TEST_ASSERT(nextv == TRUE);
778 TEST_ASSERT(isString == FALSE);
779 TEST_ASSERT(codePoint==0x62);
780 TEST_ASSERT(s == "b");
781 break;
782 case 2:
783 TEST_ASSERT(nextv == TRUE);
784 TEST_ASSERT(isString == FALSE);
785 TEST_ASSERT(codePoint==0x63);
786 TEST_ASSERT(s == "c");
787 break;
788 case 3:
789 TEST_ASSERT(nextv == TRUE);
790 TEST_ASSERT(isString == FALSE);
791 TEST_ASSERT(codePoint==0x79);
792 TEST_ASSERT(s == "y");
793 break;
794 case 4:
795 TEST_ASSERT(nextv == TRUE);
796 TEST_ASSERT(isString == FALSE);
797 TEST_ASSERT(codePoint==0x7a);
798 TEST_ASSERT(s == "z");
799 break;
800 case 5:
801 TEST_ASSERT(nextv == TRUE);
802 TEST_ASSERT(isString == FALSE);
803 TEST_ASSERT(codePoint==0x1abcd);
804 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
805 break;
806 case 6:
807 TEST_ASSERT(nextv == TRUE);
808 TEST_ASSERT(isString == TRUE);
809 TEST_ASSERT(s == "str1");
810 break;
811 case 7:
812 TEST_ASSERT(nextv == TRUE);
813 TEST_ASSERT(isString == TRUE);
814 TEST_ASSERT(s == "str2");
815 break;
816 case 8:
817 TEST_ASSERT(nextv == FALSE);
818 break;
819 case 9:
820 TEST_ASSERT(nextv == FALSE);
821 break;
822 }
823 }
824 it.reset(); // prepare to run the iteration again.
825 }
826 }
827
828
829
830
TestStrings()831 void UnicodeSetTest::TestStrings() {
832 UErrorCode ec = U_ZERO_ERROR;
833
834 UnicodeSet* testList[] = {
835 UnicodeSet::createFromAll("abc"),
836 new UnicodeSet("[a-c]", ec),
837
838 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
839 new UnicodeSet("[{ll}{ch}a-z]", ec),
840
841 UnicodeSet::createFrom("ab}c"),
842 new UnicodeSet("[{ab\\}c}]", ec),
843
844 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
845 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
846
847 NULL
848 };
849
850 if (U_FAILURE(ec)) {
851 errln("FAIL: couldn't construct test sets");
852 }
853
854 for (int32_t i = 0; testList[i] != NULL; i+=2) {
855 if (U_SUCCESS(ec)) {
856 UnicodeString pat0, pat1;
857 testList[i]->toPattern(pat0, TRUE);
858 testList[i+1]->toPattern(pat1, TRUE);
859 if (*testList[i] == *testList[i+1]) {
860 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
861 } else {
862 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
863 }
864 }
865 delete testList[i];
866 delete testList[i+1];
867 }
868 }
869
870 /**
871 * Test the [:Latin:] syntax.
872 */
TestScriptSet()873 void UnicodeSetTest::TestScriptSet() {
874 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
875
876 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
877
878 /* Jitterbug 1423 */
879 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
880
881 }
882
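/**
 * Test property-based set patterns ([:...:], \p{...}, \P{...}, \N{...}) and
 * assorted literal and regex-compatibility syntax; see the DATA table below.
 */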
TestPropertySet()886 void UnicodeSetTest::TestPropertySet() {
887 static const char* const DATA[] = {
888 // Pattern, Chars IN, Chars NOT in
889
890 "[:Latin:]",
891 "aA",
892 "\\u0391\\u03B1",
893
894 "[\\p{Greek}]",
895 "\\u0391\\u03B1",
896 "aA",
897
898 "\\P{ GENERAL Category = upper case letter }",
899 "abc",
900 "ABC",
901
902 #if !UCONFIG_NO_NORMALIZATION
903 // Combining class: @since ICU 2.2
904 // Check both symbolic and numeric
905 "\\p{ccc=Nukta}",
906 "\\u0ABC",
907 "abc",
908
909 "\\p{Canonical Combining Class = 11}",
910 "\\u05B1",
911 "\\u05B2",
912
913 "[:c c c = iota subscript :]",
914 "\\u0345",
915 "xyz",
916 #endif
917
918 // Bidi class: @since ICU 2.2
919 "\\p{bidiclass=lefttoright}",
920 "abc",
921 "\\u0671\\u0672",
922
923 // Binary properties: @since ICU 2.2
924 "\\p{ideographic}",
925 "\\u4E0A",
926 "x",
927
928 "[:math=false:]",
929 "q)*(",
930 // weiv: )(and * were removed from math in Unicode 4.0.1
931 //"(*+)",
932 "+<>^",
933
934 // JB#1767 \N{}, \p{ASCII}
935 "[:Ascii:]",
936 "abc\\u0000\\u007F",
937 "\\u0080\\u4E00",
938
939 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
940 "az",
941 "qrs",
942
943 // JB#2015
944 "[:any:]",
945 "a\\U0010FFFF",
946 "",
947
948 "[:nv=0.5:]",
949 "\\u00BD\\u0F2A",
950 "\\u00BC",
951
952 // JB#2653: Age
953 "[:Age=1.1:]",
954 "\\u03D6", // 1.1
955 "\\u03D8\\u03D9", // 3.2
956
957 "[:Age=3.1:]",
958 "\\u1800\\u3400\\U0002f800",
959 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
960
961 // JB#2350: Case_Sensitive
962 "[:Case Sensitive:]",
963 "A\\u1FFC\\U00010410",
964 ";\\u00B4\\U00010500",
965
966 // JB#2832: C99-compatibility props
967 "[:blank:]",
968 " \\u0009",
969 "1-9A-Z",
970
971 "[:graph:]",
972 "19AZ",
973 " \\u0003\\u0007\\u0009\\u000A\\u000D",
974
975 "[:punct:]",
976 "!@#%&*()[]{}-_\\/;:,.?'\"",
977 "09azAZ",
978
979 "[:xdigit:]",
980 "09afAF",
981 "gG!",
982
983 // Regex compatibility test
984 "[-b]", // leading '-' is literal
985 "-b",
986 "ac",
987
988 "[^-b]", // leading '-' is literal
989 "ac",
990 "-b",
991
992 "[b-]", // trailing '-' is literal
993 "-b",
994 "ac",
995
996 "[^b-]", // trailing '-' is literal
997 "ac",
998 "-b",
999
1000 "[a-b-]", // trailing '-' is literal
1001 "ab-",
1002 "c=",
1003
1004 "[[a-q]&[p-z]-]", // trailing '-' is literal
1005 "pq-",
1006 "or=",
1007
1008 "[\\s|\\)|:|$|\\>]", // from regex tests
1009 "s|):$>",
1010 "abc",
1011
1012 "[\\uDC00cd]", // JB#2906: isolated trail at start
1013 "cd\\uDC00",
1014 "ab\\uD800\\U00010000",
1015
1016 "[ab\\uD800]", // JB#2906: isolated trail at start
1017 "ab\\uD800",
1018 "cd\\uDC00\\U00010000",
1019
1020 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1021 "abcd\\uD800",
1022 "ef\\uDC00\\U00010000",
1023
1024 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1025 "abcd\\uDC00",
1026 "ef\\uD800\\U00010000",
1027
1028 #if !UCONFIG_NO_NORMALIZATION
1029 "[:^lccc=0:]", // Lead canonical class
1030 "\\u0300\\u0301",
1031 "abcd\\u00c0\\u00c5",
1032
1033 "[:^tccc=0:]", // Trail canonical class
1034 "\\u0300\\u0301\\u00c0\\u00c5",
1035 "abcd",
1036
1037 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1038 "\\u0300\\u0301\\u00c0\\u00c5",
1039 "abcd",
1040
1041 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1042 "",
1043 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1044
1045 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1046 "\\u0F73\\u0F75\\u0F81",
1047 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1048 #endif /* !UCONFIG_NO_NORMALIZATION */
1049
1050 "[:Assigned:]",
1051 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1052 "\\u0888\\uFDD3\\uFFFE\\U00050005",
1053
1054 // Script_Extensions, new in Unicode 6.0
1055 "[:scx=Arab:]",
1056 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1057 "\\u061D\\uFDEF\\uFDFE",
1058
1059 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1060 // so scx-sc is missing U+FDF2.
1061 "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1062 "\\u0640\\u064B\\u0650\\u0655",
1063 "\\uFDF2"
1064 };
1065
1066 static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
1067
1068 for (int32_t i=0; i<DATA_LEN; i+=3) {
1069 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1070 CharsToUnicodeString(DATA[i+2]));
1071 }
1072 }
1073
1074 /**
1075 * Test that Posix style character classes [:digit:], etc.
1076 * have the Unicode definitions from TR 18.
1077 */
TestPosixClasses()1078 void UnicodeSetTest::TestPosixClasses() {
1079 {
1080 UErrorCode status = U_ZERO_ERROR;
1081 UnicodeSet s1("[:alpha:]", status);
1082 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1083 TEST_ASSERT_SUCCESS(status);
1084 TEST_ASSERT(s1==s2);
1085 }
1086 {
1087 UErrorCode status = U_ZERO_ERROR;
1088 UnicodeSet s1("[:lower:]", status);
1089 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1090 TEST_ASSERT_SUCCESS(status);
1091 TEST_ASSERT(s1==s2);
1092 }
1093 {
1094 UErrorCode status = U_ZERO_ERROR;
1095 UnicodeSet s1("[:upper:]", status);
1096 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1097 TEST_ASSERT_SUCCESS(status);
1098 TEST_ASSERT(s1==s2);
1099 }
1100 {
1101 UErrorCode status = U_ZERO_ERROR;
1102 UnicodeSet s1("[:punct:]", status);
1103 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1104 TEST_ASSERT_SUCCESS(status);
1105 TEST_ASSERT(s1==s2);
1106 }
1107 {
1108 UErrorCode status = U_ZERO_ERROR;
1109 UnicodeSet s1("[:digit:]", status);
1110 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1111 TEST_ASSERT_SUCCESS(status);
1112 TEST_ASSERT(s1==s2);
1113 }
1114 {
1115 UErrorCode status = U_ZERO_ERROR;
1116 UnicodeSet s1("[:xdigit:]", status);
1117 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1118 TEST_ASSERT_SUCCESS(status);
1119 TEST_ASSERT(s1==s2);
1120 }
1121 {
1122 UErrorCode status = U_ZERO_ERROR;
1123 UnicodeSet s1("[:alnum:]", status);
1124 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1125 TEST_ASSERT_SUCCESS(status);
1126 TEST_ASSERT(s1==s2);
1127 }
1128 {
1129 UErrorCode status = U_ZERO_ERROR;
1130 UnicodeSet s1("[:space:]", status);
1131 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1132 TEST_ASSERT_SUCCESS(status);
1133 TEST_ASSERT(s1==s2);
1134 }
1135 {
1136 UErrorCode status = U_ZERO_ERROR;
1137 UnicodeSet s1("[:blank:]", status);
1138 TEST_ASSERT_SUCCESS(status);
1139 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1140 status);
1141 TEST_ASSERT_SUCCESS(status);
1142 TEST_ASSERT(s1==s2);
1143 }
1144 {
1145 UErrorCode status = U_ZERO_ERROR;
1146 UnicodeSet s1("[:cntrl:]", status);
1147 TEST_ASSERT_SUCCESS(status);
1148 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1149 TEST_ASSERT_SUCCESS(status);
1150 TEST_ASSERT(s1==s2);
1151 }
1152 {
1153 UErrorCode status = U_ZERO_ERROR;
1154 UnicodeSet s1("[:graph:]", status);
1155 TEST_ASSERT_SUCCESS(status);
1156 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1157 TEST_ASSERT_SUCCESS(status);
1158 TEST_ASSERT(s1==s2);
1159 }
1160 {
1161 UErrorCode status = U_ZERO_ERROR;
1162 UnicodeSet s1("[:print:]", status);
1163 TEST_ASSERT_SUCCESS(status);
1164 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1165 TEST_ASSERT_SUCCESS(status);
1166 TEST_ASSERT(s1==s2);
1167 }
1168 }
1169 /**
1170 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1171 */
TestClone()1172 void UnicodeSetTest::TestClone() {
1173 UErrorCode ec = U_ZERO_ERROR;
1174 UnicodeSet s("[abcxyz]", ec);
1175 UnicodeSet t(s);
1176 expectContainment(t, "abc", "def");
1177 }
1178
1179 /**
1180 * Test the indexOf() and charAt() methods.
1181 */
TestIndexOf()1182 void UnicodeSetTest::TestIndexOf() {
1183 UErrorCode ec = U_ZERO_ERROR;
1184 UnicodeSet set("[a-cx-y3578]", ec);
1185 if (U_FAILURE(ec)) {
1186 errln("FAIL: UnicodeSet constructor");
1187 return;
1188 }
1189 for (int32_t i=0; i<set.size(); ++i) {
1190 UChar32 c = set.charAt(i);
1191 if (set.indexOf(c) != i) {
1192 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1193 i, c, set.indexOf(c));
1194 }
1195 }
1196 UChar32 c = set.charAt(set.size());
1197 if (c != -1) {
1198 errln("FAIL: charAt(<out of range>) = %X", c);
1199 }
1200 int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1201 if (j != -1) {
1202 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1203 }
1204 }
1205
1206 /**
1207 * Test closure API.
1208 */
TestCloseOver()1209 void UnicodeSetTest::TestCloseOver() {
1210 UErrorCode ec = U_ZERO_ERROR;
1211
1212 char CASE[] = {(char)USET_CASE_INSENSITIVE};
1213 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1214 const char* DATA[] = {
1215 // selector, input, output
1216 CASE,
1217 "[aq\\u00DF{Bc}{bC}{Fi}]",
1218 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1219
1220 CASE,
1221 "[\\u01F1]", // 'DZ'
1222 "[\\u01F1\\u01F2\\u01F3]",
1223
1224 CASE,
1225 "[\\u1FB4]",
1226 "[\\u1FB4{\\u03AC\\u03B9}]",
1227
1228 CASE,
1229 "[{F\\uFB01}]",
1230 "[\\uFB03{ffi}]",
1231
1232 CASE, // make sure binary search finds limits
1233 "[a\\uFF3A]",
1234 "[aA\\uFF3A\\uFF5A]",
1235
1236 CASE,
1237 "[a-z]","[A-Za-z\\u017F\\u212A]",
1238 CASE,
1239 "[abc]","[A-Ca-c]",
1240 CASE,
1241 "[ABC]","[A-Ca-c]",
1242
1243 CASE, "[i]", "[iI]",
1244
1245 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
1246 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
1247
1248 CASE, "[\\u0131]", "[\\u0131]", // dotless i
1249
1250 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1251
1252 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
1253
1254 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
1255
1256 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
1257
1258 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1259
1260 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
1261 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
1262
1263 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
1264
1265 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
1266
1267 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1268
1269 #if !UCONFIG_NO_FILE_IO
1270 CASE_MAPPINGS,
1271 "[aq\\u00DF{Bc}{bC}{Fi}]",
1272 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1273 #endif
1274
1275 CASE_MAPPINGS,
1276 "[\\u01F1]", // 'DZ'
1277 "[\\u01F1\\u01F2\\u01F3]",
1278
1279 CASE_MAPPINGS,
1280 "[a-z]",
1281 "[A-Za-z]",
1282
1283 NULL
1284 };
1285
1286 UnicodeSet s;
1287 UnicodeSet t;
1288 UnicodeString buf;
1289 for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1290 int32_t selector = DATA[i][0];
1291 UnicodeString pat(DATA[i+1], -1, US_INV);
1292 UnicodeString exp(DATA[i+2], -1, US_INV);
1293 s.applyPattern(pat, ec);
1294 s.closeOver(selector);
1295 t.applyPattern(exp, ec);
1296 if (U_FAILURE(ec)) {
1297 errln("FAIL: applyPattern failed");
1298 continue;
1299 }
1300 if (s == t) {
1301 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1302 } else {
1303 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1304 s.toPattern(buf, TRUE) + ", expected " + exp);
1305 }
1306 }
1307
1308 #if 0
1309 /*
1310 * Unused test code.
1311 * This was used to compare the old implementation (using USET_CASE)
1312 * with the new one (using 0x100 temporarily)
1313 * while transitioning from hardcoded case closure tables in uniset.cpp
1314 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1315 * and using ucase.c functions for closure.
1316 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1317 *
1318 * Note: The old and new implementation never fully matched because
1319 * the old implementation turned out to not map U+0130 and U+0131 correctly
1320 * (dotted I and dotless i) and because the old implementation's data tables
1321 * were outdated compared to Unicode 4.0.1 at the time of the change to the
1322 * new implementation. (So sigmas and some other characters were not handled
1323 * according to the newer Unicode version.)
1324 */
1325 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1326 UnicodeSetIterator si(sens);
1327 UnicodeString str, buf2;
1328 const UnicodeString *pStr;
1329 UChar32 c;
1330 while(si.next()) {
1331 if(!si.isString()) {
1332 c=si.getCodepoint();
1333 s.clear();
1334 s.add(c);
1335
1336 str.setTo(c);
1337 str.foldCase();
1338 sens2.add(str);
1339
1340 t=s;
1341 s.closeOver(USET_CASE);
1342 t.closeOver(0x100);
1343 if(s!=t) {
1344 errln("FAIL: closeOver(U+%04x) differs: ", c);
1345 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1346 }
1347 }
1348 }
1349 // remove all code points
1350 // should contain all full case folding mapping strings
1351 sens2.remove(0, 0x10ffff);
1352 si.reset(sens2);
1353 while(si.next()) {
1354 if(si.isString()) {
1355 pStr=&si.getString();
1356 s.clear();
1357 s.add(*pStr);
1358 t=s2=s;
1359 s.closeOver(USET_CASE);
1360 t.closeOver(0x100);
1361 if(s!=t) {
1362 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1363 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1364 }
1365 }
1366 }
1367 #endif
1368
1369 // Test the pattern API
1370 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1371 if (U_FAILURE(ec)) {
1372 errln("FAIL: applyPattern failed");
1373 } else {
1374 expectContainment(s, "abcABC", "defDEF");
1375 }
1376 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1377 if (U_FAILURE(ec)) {
1378 errln("FAIL: constructor failed");
1379 } else {
1380 expectContainment(v, "defDEF", "abcABC");
1381 }
1382 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1383 if (U_FAILURE(ec)) {
1384 errln("FAIL: construct w/case mappings failed");
1385 } else {
1386 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1387 }
1388 }
1389
TestEscapePattern()1390 void UnicodeSetTest::TestEscapePattern() {
1391 const char pattern[] =
1392 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1393 const char exp[] =
1394 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1395 // We test this with two passes; in the second pass we
1396 // pre-unescape the pattern. Since U+200E is Pattern_White_Space,
1397 // this fails -- which is what we expect.
1398 for (int32_t pass=1; pass<=2; ++pass) {
1399 UErrorCode ec = U_ZERO_ERROR;
1400 UnicodeString pat(pattern, -1, US_INV);
1401 if (pass==2) {
1402 pat = pat.unescape();
1403 }
1404 // Pattern is only good for pass 1
1405 UBool isPatternValid = (pass==1);
1406
1407 UnicodeSet set(pat, ec);
1408 if (U_SUCCESS(ec) != isPatternValid){
1409 errln((UnicodeString)"FAIL: applyPattern(" +
1410 escape(pat) + ") => " +
1411 u_errorName(ec));
1412 continue;
1413 }
1414 if (U_FAILURE(ec)) {
1415 continue;
1416 }
1417 if (set.contains((UChar)0x0644)){
1418 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1419 }
1420
1421 UnicodeString newpat;
1422 set.toPattern(newpat, TRUE);
1423 if (newpat == UnicodeString(exp, -1, US_INV)) {
1424 logln(escape(pat) + " => " + newpat);
1425 } else {
1426 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1427 }
1428
1429 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1430 UnicodeString str("Range ");
1431 str.append((UChar)(0x30 + i))
1432 .append(": ")
1433 .append((UChar32)set.getRangeStart(i))
1434 .append(" - ")
1435 .append((UChar32)set.getRangeEnd(i));
1436 str = str + " (" + set.getRangeStart(i) + " - " +
1437 set.getRangeEnd(i) + ")";
1438 if (set.getRangeStart(i) < 0) {
1439 errln((UnicodeString)"FAIL: " + escape(str));
1440 } else {
1441 logln(escape(str));
1442 }
1443 }
1444 }
1445 }
1446
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1447 void UnicodeSetTest::expectRange(const UnicodeString& label,
1448 const UnicodeSet& set,
1449 UChar32 start, UChar32 end) {
1450 UnicodeSet exp(start, end);
1451 UnicodeString pat;
1452 if (set == exp) {
1453 logln(label + " => " + set.toPattern(pat, TRUE));
1454 } else {
1455 UnicodeString xpat;
1456 errln((UnicodeString)"FAIL: " + label + " => " +
1457 set.toPattern(pat, TRUE) +
1458 ", expected " + exp.toPattern(xpat, TRUE));
1459 }
1460 }
1461
TestInvalidCodePoint()1462 void UnicodeSetTest::TestInvalidCodePoint() {
1463
1464 const UChar32 DATA[] = {
1465 // Test range Expected range
1466 0, 0x10FFFF, 0, 0x10FFFF,
1467 (UChar32)-1, 8, 0, 8,
1468 8, 0x110000, 8, 0x10FFFF
1469 };
1470 const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1471
1472 UnicodeString pat;
1473 int32_t i;
1474
1475 for (i=0; i<DATA_LENGTH; i+=4) {
1476 UChar32 start = DATA[i];
1477 UChar32 end = DATA[i+1];
1478 UChar32 xstart = DATA[i+2];
1479 UChar32 xend = DATA[i+3];
1480
1481 // Try various API using the test code points
1482
1483 UnicodeSet set(start, end);
1484 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1485 set, xstart, xend);
1486
1487 set.clear();
1488 set.set(start, end);
1489 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1490 set, xstart, xend);
1491
1492 UBool b = set.contains(start);
1493 b = set.contains(start, end);
1494 b = set.containsNone(start, end);
1495 b = set.containsSome(start, end);
1496 (void)b; // Suppress set but not used warning.
1497
1498 /*int32_t index = set.indexOf(start);*/
1499
1500 set.clear();
1501 set.add(start);
1502 set.add(start, end);
1503 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1504 set, xstart, xend);
1505
1506 set.set(0, 0x10FFFF);
1507 set.retain(start, end);
1508 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1509 set, xstart, xend);
1510 set.retain(start);
1511
1512 set.set(0, 0x10FFFF);
1513 set.remove(start);
1514 set.remove(start, end);
1515 set.complement();
1516 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1517 set, xstart, xend);
1518
1519 set.set(0, 0x10FFFF);
1520 set.complement(start, end);
1521 set.complement();
1522 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1523 set, xstart, xend);
1524 set.complement(start);
1525 }
1526
1527 const UChar32 DATA2[] = {
1528 0,
1529 0x10FFFF,
1530 (UChar32)-1,
1531 0x110000
1532 };
1533 const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1534
1535 for (i=0; i<DATA2_LENGTH; ++i) {
1536 UChar32 c = DATA2[i], end = 0x10FFFF;
1537 UBool valid = (c >= 0 && c <= 0x10FFFF);
1538
1539 UnicodeSet set(0, 0x10FFFF);
1540
1541 // For single-codepoint contains, invalid codepoints are NOT contained
1542 UBool b = set.contains(c);
1543 if (b == valid) {
1544 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1545 ") = " + b);
1546 } else {
1547 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1548 ") = " + b);
1549 }
1550
1551 // For codepoint range contains, containsNone, and containsSome,
1552 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1553 b = set.contains(c, end);
1554 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1555 "," + end + ") = " + b);
1556
1557 b = set.containsNone(c, end);
1558 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1559 "," + end + ") = " + b);
1560
1561 b = set.containsSome(c, end);
1562 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1563 "," + end + ") = " + b);
1564
1565 int32_t index = set.indexOf(c);
1566 if ((index >= 0) == valid) {
1567 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1568 ") = " + index);
1569 } else {
1570 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1571 ") = " + index);
1572 }
1573 }
1574 }
1575
1576 // Used by TestSymbolTable
1577 class TokenSymbolTable : public SymbolTable {
1578 public:
1579 Hashtable contents;
1580
TokenSymbolTable(UErrorCode & ec)1581 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1582 contents.setValueDeleter(uprv_deleteUObject);
1583 }
1584
~TokenSymbolTable()1585 ~TokenSymbolTable() {}
1586
1587 /**
1588 * (Non-SymbolTable API) Add the given variable and value to
1589 * the table. Variable should NOT contain leading '$'.
1590 */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1591 void add(const UnicodeString& var, const UnicodeString& value,
1592 UErrorCode& ec) {
1593 if (U_SUCCESS(ec)) {
1594 contents.put(var, new UnicodeString(value), ec);
1595 }
1596 }
1597
1598 /**
1599 * SymbolTable API
1600 */
lookup(const UnicodeString & s) const1601 virtual const UnicodeString* lookup(const UnicodeString& s) const {
1602 return (const UnicodeString*) contents.get(s);
1603 }
1604
1605 /**
1606 * SymbolTable API
1607 */
lookupMatcher(UChar32) const1608 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1609 return NULL;
1610 }
1611
1612 /**
1613 * SymbolTable API
1614 */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1615 virtual UnicodeString parseReference(const UnicodeString& text,
1616 ParsePosition& pos, int32_t limit) const {
1617 int32_t start = pos.getIndex();
1618 int32_t i = start;
1619 UnicodeString result;
1620 while (i < limit) {
1621 UChar c = text.charAt(i);
1622 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1623 break;
1624 }
1625 ++i;
1626 }
1627 if (i == start) { // No valid name chars
1628 return result; // Indicate failure with empty string
1629 }
1630 pos.setIndex(i);
1631 text.extractBetween(start, i, result);
1632 return result;
1633 }
1634 };
1635
TestSymbolTable()1636 void UnicodeSetTest::TestSymbolTable() {
1637 // Multiple test cases can be set up here. Each test case
1638 // is terminated by null:
1639 // var, value, var, value,..., input pat., exp. output pat., null
1640 const char* DATA[] = {
1641 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1642 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1643 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1644 NULL
1645 };
1646
1647 for (int32_t i=0; DATA[i]!=NULL; ++i) {
1648 UErrorCode ec = U_ZERO_ERROR;
1649 TokenSymbolTable sym(ec);
1650 if (U_FAILURE(ec)) {
1651 errln("FAIL: couldn't construct TokenSymbolTable");
1652 continue;
1653 }
1654
1655 // Set up variables
1656 while (DATA[i+2] != NULL) {
1657 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1658 if (U_FAILURE(ec)) {
1659 errln("FAIL: couldn't add to TokenSymbolTable");
1660 continue;
1661 }
1662 i += 2;
1663 }
1664
1665 // Input pattern and expected output pattern
1666 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1667 i += 2;
1668
1669 ParsePosition pos(0);
1670 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1671 if (U_FAILURE(ec)) {
1672 errln("FAIL: couldn't construct UnicodeSet");
1673 continue;
1674 }
1675
1676 // results
1677 if (pos.getIndex() != inpat.length()) {
1678 errln((UnicodeString)"Failed to read to end of string \""
1679 + inpat + "\": read to "
1680 + pos.getIndex() + ", length is "
1681 + inpat.length());
1682 }
1683
1684 UnicodeSet us2(exppat, ec);
1685 if (U_FAILURE(ec)) {
1686 errln("FAIL: couldn't construct expected UnicodeSet");
1687 continue;
1688 }
1689
1690 UnicodeString a, b;
1691 if (us != us2) {
1692 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1693 ", expected " + us2.toPattern(b, TRUE));
1694 } else {
1695 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1696 }
1697 }
1698 }
1699
TestSurrogate()1700 void UnicodeSetTest::TestSurrogate() {
1701 const char* DATA[] = {
1702 // These should all behave identically
1703 "[abc\\uD800\\uDC00]",
1704 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1705 "[abc\\U00010000]",
1706 0
1707 };
1708 for (int i=0; DATA[i] != 0; ++i) {
1709 UErrorCode ec = U_ZERO_ERROR;
1710 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1711 UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1712 UnicodeSet set(str, ec);
1713 if (U_FAILURE(ec)) {
1714 errln("FAIL: UnicodeSet constructor");
1715 continue;
1716 }
1717 expectContainment(set,
1718 CharsToUnicodeString("abc\\U00010000"),
1719 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1720 if (set.size() != 4) {
1721 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1722 set.size() + ", expected 4");
1723 }
1724
1725 {
1726 UErrorCode subErr = U_ZERO_ERROR;
1727 checkRoundTrip(set);
1728 checkSerializeRoundTrip(set, subErr);
1729 }
1730 }
1731 }
1732
TestExhaustive()1733 void UnicodeSetTest::TestExhaustive() {
1734 // exhaustive tests. Simulate UnicodeSets with integers.
1735 // That gives us very solid tests (except for large memory tests).
1736
1737 int32_t limit = 128;
1738
1739 UnicodeSet x, y, z, aa;
1740
1741 for (int32_t i = 0; i < limit; ++i) {
1742 bitsToSet(i, x);
1743 logln((UnicodeString)"Testing " + i + ", " + x);
1744 _testComplement(i, x, y);
1745
1746 UnicodeSet &toTest = bitsToSet(i, aa);
1747
1748 // AS LONG AS WE ARE HERE, check roundtrip
1749 checkRoundTrip(toTest);
1750 UErrorCode ec = U_ZERO_ERROR;
1751 checkSerializeRoundTrip(toTest, ec);
1752
1753 for (int32_t j = 0; j < limit; ++j) {
1754 _testAdd(i,j, x,y,z);
1755 _testXor(i,j, x,y,z);
1756 _testRetain(i,j, x,y,z);
1757 _testRemove(i,j, x,y,z);
1758 }
1759 }
1760 }
1761
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1762 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1763 bitsToSet(a, x);
1764 z = x;
1765 z.complement();
1766 int32_t c = setToBits(z);
1767 if (c != (~a)) {
1768 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1769 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1770 }
1771 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1772 }
1773
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1774 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1775 bitsToSet(a, x);
1776 bitsToSet(b, y);
1777 z = x;
1778 z.addAll(y);
1779 int32_t c = setToBits(z);
1780 if (c != (a | b)) {
1781 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1782 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1783 }
1784 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1785 }
1786
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1787 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1788 bitsToSet(a, x);
1789 bitsToSet(b, y);
1790 z = x;
1791 z.retainAll(y);
1792 int32_t c = setToBits(z);
1793 if (c != (a & b)) {
1794 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1795 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1796 }
1797 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1798 }
1799
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1800 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1801 bitsToSet(a, x);
1802 bitsToSet(b, y);
1803 z = x;
1804 z.removeAll(y);
1805 int32_t c = setToBits(z);
1806 if (c != (a &~ b)) {
1807 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1808 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1809 }
1810 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1811 }
1812
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1813 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1814 bitsToSet(a, x);
1815 bitsToSet(b, y);
1816 z = x;
1817 z.complementAll(y);
1818 int32_t c = setToBits(z);
1819 if (c != (a ^ b)) {
1820 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1821 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1822 }
1823 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1824 }
1825
1826 /**
1827 * Check that ranges are monotonically increasing and non-
1828 * overlapping.
1829 */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1830 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1831 int32_t n = set.getRangeCount();
1832 if (n < 0) {
1833 errln((UnicodeString)"FAIL result of " + msg +
1834 ": range count should be >= 0 but is " +
1835 n /*+ " for " + set.toPattern())*/);
1836 return;
1837 }
1838 UChar32 last = 0;
1839 for (int32_t i=0; i<n; ++i) {
1840 UChar32 start = set.getRangeStart(i);
1841 UChar32 end = set.getRangeEnd(i);
1842 if (start > end) {
1843 errln((UnicodeString)"FAIL result of " + msg +
1844 ": range " + (i+1) +
1845 " start > end: " + (int)start + ", " + (int)end +
1846 " for " + set);
1847 }
1848 if (i > 0 && start <= last) {
1849 errln((UnicodeString)"FAIL result of " + msg +
1850 ": range " + (i+1) +
1851 " overlaps previous range: " + (int)start + ", " + (int)end +
1852 " for " + set);
1853 }
1854 last = end;
1855 }
1856 }
1857
1858 /**
1859 * Convert a bitmask to a UnicodeSet.
1860 */
bitsToSet(int32_t a,UnicodeSet & result)1861 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1862 result.clear();
1863 for (UChar32 i = 0; i < 32; ++i) {
1864 if ((a & (1<<i)) != 0) {
1865 result.add(i);
1866 }
1867 }
1868 return result;
1869 }
1870
1871 /**
1872 * Convert a UnicodeSet to a bitmask. Only the characters
1873 * U+0000 to U+0020 are represented in the bitmask.
1874 */
setToBits(const UnicodeSet & x)1875 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1876 int32_t result = 0;
1877 for (int32_t i = 0; i < 32; ++i) {
1878 if (x.contains((UChar32)i)) {
1879 result |= (1<<i);
1880 }
1881 }
1882 return result;
1883 }
1884
1885 /**
1886 * Return the representation of an inversion list based UnicodeSet
1887 * as a pairs list. Ranges are listed in ascending Unicode order.
1888 * For example, the set [a-zA-M3] is represented as "33AMaz".
1889 */
getPairs(const UnicodeSet & set)1890 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1891 UnicodeString pairs;
1892 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1893 UChar32 start = set.getRangeStart(i);
1894 UChar32 end = set.getRangeEnd(i);
1895 if (end > 0xFFFF) {
1896 end = 0xFFFF;
1897 i = set.getRangeCount(); // Should be unnecessary
1898 }
1899 pairs.append((UChar)start).append((UChar)end);
1900 }
1901 return pairs;
1902 }
1903
1904 /**
1905 * Basic consistency check for a few items.
1906 * That the iterator works, and that we can create a pattern and
1907 * get the same thing back
1908 */
checkRoundTrip(const UnicodeSet & s)1909 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1910 {
1911 UnicodeSet t(s);
1912 checkEqual(s, t, "copy ct");
1913 }
1914
1915 {
1916 UnicodeSet t(0xabcd, 0xdef0); // dummy contents should be overwritten
1917 t = s;
1918 checkEqual(s, t, "operator=");
1919 }
1920
1921 {
1922 UnicodeSet t;
1923 copyWithIterator(t, s, FALSE);
1924 checkEqual(s, t, "iterator roundtrip");
1925 }
1926
1927 {
1928 UnicodeSet t;
1929 copyWithIterator(t, s, TRUE); // try range
1930 checkEqual(s, t, "iterator roundtrip");
1931 }
1932
1933 {
1934 UnicodeSet t;
1935 UnicodeString pat;
1936 UErrorCode ec = U_ZERO_ERROR;
1937 s.toPattern(pat, FALSE);
1938 t.applyPattern(pat, ec);
1939 if (U_FAILURE(ec)) {
1940 errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
1941 return;
1942 } else {
1943 checkEqual(s, t, "toPattern(false)");
1944 }
1945 }
1946
1947 {
1948 UnicodeSet t;
1949 UnicodeString pat;
1950 UErrorCode ec = U_ZERO_ERROR;
1951 s.toPattern(pat, TRUE);
1952 t.applyPattern(pat, ec);
1953 if (U_FAILURE(ec)) {
1954 errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
1955 return;
1956 } else {
1957 checkEqual(s, t, "toPattern(true)");
1958 }
1959 }
1960 }
1961
checkSerializeRoundTrip(const UnicodeSet & t,UErrorCode & status)1962 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
1963 if(U_FAILURE(status)) return;
1964 int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1965 if(status == U_BUFFER_OVERFLOW_ERROR) {
1966 status = U_ZERO_ERROR;
1967 serializeBuffer.resize(len);
1968 len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1969 // let 2nd error stand
1970 }
1971 if(U_FAILURE(status)) {
1972 errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
1973 return;
1974 }
1975 UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
1976 if(U_FAILURE(status)) {
1977 errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
1978 return;
1979 }
1980
1981 checkEqual(t, deserialized, "Set was unequal when deserialized");
1982 }
1983
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)1984 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1985 t.clear();
1986 UnicodeSetIterator it(s);
1987 if (withRange) {
1988 while (it.nextRange()) {
1989 if (it.isString()) {
1990 t.add(it.getString());
1991 } else {
1992 t.add(it.getCodepoint(), it.getCodepointEnd());
1993 }
1994 }
1995 } else {
1996 while (it.next()) {
1997 if (it.isString()) {
1998 t.add(it.getString());
1999 } else {
2000 t.add(it.getCodepoint());
2001 }
2002 }
2003 }
2004 }
2005
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)2006 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2007 assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2008 assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2009 UnicodeString source; s.toPattern(source, TRUE);
2010 UnicodeString result; t.toPattern(result, TRUE);
2011 if (s != t) {
2012 errln((UnicodeString)"FAIL: " + message
2013 + "; source = " + source
2014 + "; result = " + result
2015 );
2016 return FALSE;
2017 } else {
2018 logln((UnicodeString)"Ok: " + message
2019 + "; source = " + source
2020 + "; result = " + result
2021 );
2022 }
2023 return TRUE;
2024 }
2025
2026 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)2027 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2028 const UnicodeString& charsIn,
2029 const UnicodeString& charsOut) {
2030 UErrorCode ec = U_ZERO_ERROR;
2031 UnicodeSet set(pat, ec);
2032 if (U_FAILURE(ec)) {
2033 dataerrln((UnicodeString)"FAIL: pattern \"" +
2034 pat + "\" => " + u_errorName(ec));
2035 return;
2036 }
2037 expectContainment(set, pat, charsIn, charsOut);
2038 }
2039
2040 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)2041 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2042 const UnicodeString& charsIn,
2043 const UnicodeString& charsOut) {
2044 UnicodeString pat;
2045 set.toPattern(pat);
2046 expectContainment(set, pat, charsIn, charsOut);
2047 }
2048
2049 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)2050 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2051 const UnicodeString& setName,
2052 const UnicodeString& charsIn,
2053 const UnicodeString& charsOut) {
2054 UnicodeString bad;
2055 UChar32 c;
2056 int32_t i;
2057
2058 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2059 c = charsIn.char32At(i);
2060 if (!set.contains(c)) {
2061 bad.append(c);
2062 }
2063 }
2064 if (bad.length() > 0) {
2065 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2066 ", expected containment of " + prettify(charsIn));
2067 } else {
2068 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2069 }
2070
2071 bad.truncate(0);
2072 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2073 c = charsOut.char32At(i);
2074 if (set.contains(c)) {
2075 bad.append(c);
2076 }
2077 }
2078 if (bad.length() > 0) {
2079 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2080 ", expected non-containment of " + prettify(charsOut));
2081 } else {
2082 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2083 }
2084 }
2085
2086 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2087 UnicodeSetTest::expectPattern(UnicodeSet& set,
2088 const UnicodeString& pattern,
2089 const UnicodeString& expectedPairs){
2090 UErrorCode status = U_ZERO_ERROR;
2091 set.applyPattern(pattern, status);
2092 if (U_FAILURE(status)) {
2093 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2094 "\") failed");
2095 return;
2096 } else {
2097 if (getPairs(set) != expectedPairs ) {
2098 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2099 "\") => pairs \"" +
2100 escape(getPairs(set)) + "\", expected \"" +
2101 escape(expectedPairs) + "\"");
2102 } else {
2103 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
2104 "\") => pairs \"" +
2105 escape(getPairs(set)) + "\"");
2106 }
2107 }
2108 // the result of calling set.toPattern(), which is the string representation of
2109 // this set(set), is passed to a UnicodeSet constructor, and tested that it
2110 // will produce another set that is equal to this one.
2111 UnicodeString temppattern;
2112 set.toPattern(temppattern);
2113 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2114 if (U_FAILURE(status)) {
2115 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2116 return;
2117 }
2118 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2119 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2120 escape(getPairs(set)) + "\""));
2121 } else{
2122 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2123 }
2124
2125 delete tempset;
2126
2127 }
2128
2129 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2130 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2131 if (getPairs(set) != expectedPairs) {
2132 errln(UnicodeString("FAIL: Expected pair list \"") +
2133 escape(expectedPairs) + "\", got \"" +
2134 escape(getPairs(set)) + "\"");
2135 }
2136 }
2137
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2138 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2139 const UnicodeString& expPat,
2140 const char** expStrings) {
2141 UnicodeString pat;
2142 set.toPattern(pat, TRUE);
2143 if (pat == expPat) {
2144 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
2145 } else {
2146 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2147 return;
2148 }
2149 if (expStrings == NULL) {
2150 return;
2151 }
2152 UBool in = TRUE;
2153 for (int32_t i=0; expStrings[i] != NULL; ++i) {
2154 if (expStrings[i] == NOT) { // sic; pointer comparison
2155 in = FALSE;
2156 continue;
2157 }
2158 UnicodeString s = CharsToUnicodeString(expStrings[i]);
2159 UBool contained = set.contains(s);
2160 if (contained == in) {
2161 logln((UnicodeString)"Ok: " + expPat +
2162 (contained ? " contains {" : " does not contain {") +
2163 escape(expStrings[i]) + "}");
2164 } else {
2165 errln((UnicodeString)"FAIL: " + expPat +
2166 (contained ? " contains {" : " does not contain {") +
2167 escape(expStrings[i]) + "}");
2168 }
2169 }
2170 }
2171
toHexString(int32_t i)2172 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2173
2174 void
doAssert(UBool condition,const char * message)2175 UnicodeSetTest::doAssert(UBool condition, const char *message)
2176 {
2177 if (!condition) {
2178 errln(UnicodeString("ERROR : ") + message);
2179 }
2180 }
2181
2182 UnicodeString
escape(const UnicodeString & s)2183 UnicodeSetTest::escape(const UnicodeString& s) {
2184 UnicodeString buf;
2185 for (int32_t i=0; i<s.length(); )
2186 {
2187 UChar32 c = s.char32At(i);
2188 if (0x0020 <= c && c <= 0x007F) {
2189 buf += c;
2190 } else {
2191 if (c <= 0xFFFF) {
2192 buf += (UChar)0x5c; buf += (UChar)0x75;
2193 } else {
2194 buf += (UChar)0x5c; buf += (UChar)0x55;
2195 buf += toHexString((c & 0xF0000000) >> 28);
2196 buf += toHexString((c & 0x0F000000) >> 24);
2197 buf += toHexString((c & 0x00F00000) >> 20);
2198 buf += toHexString((c & 0x000F0000) >> 16);
2199 }
2200 buf += toHexString((c & 0xF000) >> 12);
2201 buf += toHexString((c & 0x0F00) >> 8);
2202 buf += toHexString((c & 0x00F0) >> 4);
2203 buf += toHexString(c & 0x000F);
2204 }
2205 i += U16_LENGTH(c);
2206 }
2207 return buf;
2208 }
2209
// Tests freeze()/isFrozen()/clone()/cloneAsThawed(): a frozen set must compare
// equal to its thawed original (idSet) after every attempted modification.
void UnicodeSetTest::TestFreezable() {
    UErrorCode errorCode=U_ZERO_ERROR;
    UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
    UnicodeSet idSet(idPattern, errorCode);
    if(U_FAILURE(errorCode)) {
        dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
        return;
    }

    UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
    UnicodeSet wsSet(wsPattern, errorCode);
    if(U_FAILURE(errorCode)) {
        dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
        return;
    }

    // idSet stays thawed and is the reference value for all comparisons below;
    // "frozen" is a copy of it that gets frozen.
    idSet.add(idPattern);
    UnicodeSet frozen(idSet);
    frozen.freeze();

    if(idSet.isFrozen() || !frozen.isFrozen()) {
        errln("FAIL: isFrozen() is wrong");
    }
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: a copy-constructed frozen set differs from its original");
    }

    // Assigning to a frozen set must leave it unchanged.
    frozen=wsSet;
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: a frozen set was modified by operator=");
    }

    // Copy construction and assignment from a frozen set must yield frozen sets.
    UnicodeSet frozen2(frozen);
    if(frozen2!=frozen || frozen2!=idSet) {
        errln("FAIL: a copied frozen set differs from its frozen original");
    }
    if(!frozen2.isFrozen()) {
        errln("FAIL: copy-constructing a frozen set results in a thawed one");
    }
    UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
    if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
        errln("FAIL: UnicodeSet(5, 55) failed");
    }
    frozen3=frozen;
    if(!frozen3.isFrozen()) {
        errln("FAIL: copying a frozen set results in a thawed one");
    }

    // clone() is expected to return a frozen set, so the add() below must have no effect.
    UnicodeSet *cloned=frozen.clone();
    if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
        errln("FAIL: clone() failed");
    }
    cloned->add(0xd802, 0xd805);
    if(cloned->containsSome(0xd802, 0xd805)) {
        errln("FAIL: unable to modify clone");
    }
    delete cloned;

    // cloneAsThawed() is expected to return a modifiable set, so the add() below must take effect.
    UnicodeSet *thawed=frozen.cloneAsThawed();
    if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
        errln("FAIL: cloneAsThawed() failed");
    }
    thawed->add(0xd802, 0xd805);
    if(!thawed->contains(0xd802, 0xd805)) {
        errln("FAIL: unable to modify thawed clone");
    }
    delete thawed;

    // From here on: call each family of modifying functions on the frozen set
    // and verify after each one that the set still equals idSet.
    frozen.set(5, 55);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::set() modified a frozen set");
    }

    frozen.clear();
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::clear() modified a frozen set");
    }

    frozen.closeOver(USET_CASE_INSENSITIVE);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
    }

    frozen.compact();
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::compact() modified a frozen set");
    }

    // The applyXYZ() family, chained; errorCode is not inspected afterwards,
    // only the set contents are checked.
    ParsePosition pos;
    frozen.
        applyPattern(wsPattern, errorCode).
        applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
        applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
        applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
        applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
    }

    // The add family.
    frozen.
        add(0xd800).
        add(0xd802, 0xd805).
        add(wsPattern).
        addAll(idPattern).
        addAll(wsSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
    }

    // The retain family.
    frozen.
        retain(0x62).
        retain(0x64, 0x69).
        retainAll(wsPattern).
        retainAll(wsSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
    }

    // The remove family.
    frozen.
        remove(0x62).
        remove(0x64, 0x69).
        remove(idPattern).
        removeAll(idPattern).
        removeAll(idSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
    }

    // The complement family.
    frozen.
        complement().
        complement(0x62).
        complement(0x64, 0x69).
        complement(idPattern).
        complementAll(idPattern).
        complementAll(idSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
    }
}
2349
2350 // Test span() etc. -------------------------------------------------------- ***
2351
2352 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2353 static int32_t
appendUTF8(const UChar * s,int32_t length,char * t,int32_t capacity)2354 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2355 UErrorCode errorCode=U_ZERO_ERROR;
2356 int32_t length8=0;
2357 u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2358 if(U_SUCCESS(errorCode)) {
2359 return length8;
2360 } else {
2361 // The string contains an unpaired surrogate.
2362 // Ignore this string.
2363 return 0;
2364 }
2365 }
2366
2367 class UnicodeSetWithStringsIterator;
2368
2369 // Make the strings in a UnicodeSet easily accessible.
2370 class UnicodeSetWithStrings {
2371 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2372 UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2373 set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2374 int32_t size=set.size();
2375 if(size>0 && set.charAt(size-1)<0) {
2376 // If a set's last element is not a code point, then it must contain strings.
2377 // Iterate over the set, skip all code point ranges, and cache the strings.
2378 // Convert them to UTF-8 for spanUTF8().
2379 UnicodeSetIterator iter(set);
2380 const UnicodeString *s;
2381 char *s8=utf8;
2382 int32_t length8, utf8Count=0;
2383 while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2384 if(iter.isString()) {
2385 // Store the pointer to the set's string element
2386 // which we happen to know is a stable pointer.
2387 strings[stringsLength]=s=&iter.getString();
2388 utf8Count+=
2389 utf8Lengths[stringsLength]=length8=
2390 appendUTF8(s->getBuffer(), s->length(),
2391 s8, (int32_t)(sizeof(utf8)-utf8Count));
2392 if(length8==0) {
2393 hasSurrogates=TRUE; // Contains unpaired surrogates.
2394 }
2395 s8+=length8;
2396 ++stringsLength;
2397 }
2398 }
2399 }
2400 }
2401
getSet() const2402 const UnicodeSet &getSet() const {
2403 return set;
2404 }
2405
hasStrings() const2406 UBool hasStrings() const {
2407 return (UBool)(stringsLength>0);
2408 }
2409
hasStringsWithSurrogates() const2410 UBool hasStringsWithSurrogates() const {
2411 return hasSurrogates;
2412 }
2413
2414 private:
2415 friend class UnicodeSetWithStringsIterator;
2416
2417 const UnicodeSet &set;
2418
2419 const UnicodeString *strings[20];
2420 int32_t stringsLength;
2421 UBool hasSurrogates;
2422
2423 char utf8[1024];
2424 int32_t utf8Lengths[20];
2425 };
2426
2427 class UnicodeSetWithStringsIterator {
2428 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2429 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2430 fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2431 }
2432
reset()2433 void reset() {
2434 nextStringIndex=nextUTF8Start=0;
2435 }
2436
nextString()2437 const UnicodeString *nextString() {
2438 if(nextStringIndex<fSet.stringsLength) {
2439 return fSet.strings[nextStringIndex++];
2440 } else {
2441 return NULL;
2442 }
2443 }
2444
2445 // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2446 const char *nextUTF8(int32_t &length) {
2447 if(nextStringIndex<fSet.stringsLength) {
2448 const char *s8=fSet.utf8+nextUTF8Start;
2449 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2450 return s8;
2451 } else {
2452 length=0;
2453 return NULL;
2454 }
2455 }
2456
2457 private:
2458 const UnicodeSetWithStrings &fSet;
2459 int32_t nextStringIndex;
2460 int32_t nextUTF8Start;
2461 };
2462
2463 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2464 // at code point boundaries.
2465 // That is, each edge of a match must not be in the middle of a surrogate pair.
2466 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UnicodeString & t)2467 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2468 s+=start;
2469 limit-=start;
2470 int32_t length=t.length();
2471 return 0==t.compare(s, length) &&
2472 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2473 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2474 }
2475
// Implement span() with contains() for comparison.
// Works forward over the UTF-16 string s[0..length) and returns the index at
// which the span ends. Three cases are handled separately:
// - the set has no cached strings: plain code point iteration;
// - USET_SPAN_NOT_CONTAINED: stop at the first contained code point or string match;
// - USET_SPAN_CONTAINED / USET_SPAN_SIMPLE: follow code points and string matches,
//   recursing for alternative matches when the condition is USET_SPAN_CONTAINED.
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        // Advance one code point at a time; prev trails start so that the
        // code point that breaks the span is not included in the result.
        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not contained; also stop if any set string matches here.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit tracks the furthest limit reached by any recursive call.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return the larger of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2574
// Backward counterpart of containsSpanUTF16(): works from the end of the UTF-16
// string s[0..length) toward its start, using contains() and explicit string
// matching, and returns the index at which the span starts.
// Returns 0 immediately for an empty string.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        // Step back one code point at a time; prev trails length so that the
        // code point that breaks the span is not included in the result.
        UChar32 c;
        int32_t prev=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // length0 keeps the original length; length itself is moved backward by U16_PREV.
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not contained; also stop if any set string ends at prev.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart tracks the earliest start reached by any recursive call.
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2673
// UTF-8 counterpart of containsSpanUTF16(): works forward over the UTF-8 string
// s[0..length) using contains() and byte-wise comparison against the cached
// UTF-8 forms of the set's strings, and returns the index at which the span ends.
// Code points are read with U8_NEXT_OR_FFFD. Cached strings with a UTF-8 length
// of 0 are skipped.
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        // Advance one code point at a time; prev trails start so that the
        // code point that breaks the span is not included in the result.
        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U8_NEXT_OR_FFFD(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not contained; also stop if any set string matches here.
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit tracks the furthest limit reached by any recursive call.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return the larger of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2773
// Backward counterpart of containsSpanUTF8(): works from the end of the UTF-8
// string s[0..length) toward its start, using contains() and byte-wise comparison
// against the cached UTF-8 forms of the set's strings, and returns the index at
// which the span starts. Code points are read with U8_PREV_OR_FFFD.
// Returns 0 immediately for an empty string.
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        // Step back one code point at a time; prev trails length so that the
        // code point that breaks the span is not included in the result.
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not contained; also stop if any set string ends at prev.
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart tracks the earliest start reached by any recursive call.
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2874
// Bit flags selecting the spans to be performed and compared.
// Each group has its individual flags followed by the mask covering the group.
enum {
    // UTF-16 / UTF-8
    SPAN_UTF16          =0x1,
    SPAN_UTF8           =0x2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // the set itself / its complement
    SPAN_SET            =0x4,
    SPAN_COMPLEMENT     =0x8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // forward / backward
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // span conditions
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // every flag above
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2895
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)2896 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2897 return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2898 }
2899
slen(const void * s,UBool isUTF16)2900 static inline int32_t slen(const void *s, UBool isUTF16) {
2901 return isUTF16 ? u_strlen((const UChar *)s) : static_cast<int32_t>(strlen((const char *)s));
2902 }
2903
2904 /*
2905 * Count spans on a string with the method according to type and set the span limits.
2906 * The set may be the complement of the original.
2907 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2908 * according to the expected number of spans.
2909 * Sets typeName to an empty string if there is no such type.
2910 * Returns -1 if the span option is filtered out.
2911 */
getSpans(const UnicodeSetWithStrings & set,UBool isComplement,const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int type,const char * & typeName,int32_t limits[],int32_t limitsCapacity,int32_t expectCount)2912 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2913 const void *s, int32_t length, UBool isUTF16,
2914 uint32_t whichSpans,
2915 int type, const char *&typeName,
2916 int32_t limits[], int32_t limitsCapacity,
2917 int32_t expectCount) {
2918 const UnicodeSet &realSet(set.getSet());
2919 int32_t start, count;
2920 USetSpanCondition spanCondition, firstSpanCondition, contained;
2921 UBool isForward;
2922
2923 if(type<0 || 7<type) {
2924 typeName="";
2925 return 0;
2926 }
2927
2928 static const char *const typeNames16[]={
2929 "contains", "contains(LM)",
2930 "span", "span(LM)",
2931 "containsBack", "containsBack(LM)",
2932 "spanBack", "spanBack(LM)"
2933 };
2934
2935 static const char *const typeNames8[]={
2936 "containsUTF8", "containsUTF8(LM)",
2937 "spanUTF8", "spanUTF8(LM)",
2938 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2939 "spanBackUTF8", "spanBackUTF8(LM)"
2940 };
2941
2942 typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2943
2944 // filter span options
2945 if(type<=3) {
2946 // span forward
2947 if((whichSpans&SPAN_FWD)==0) {
2948 return -1;
2949 }
2950 isForward=TRUE;
2951 } else {
2952 // span backward
2953 if((whichSpans&SPAN_BACK)==0) {
2954 return -1;
2955 }
2956 isForward=FALSE;
2957 }
2958 if((type&1)==0) {
2959 // use USET_SPAN_CONTAINED
2960 if((whichSpans&SPAN_CONTAINED)==0) {
2961 return -1;
2962 }
2963 contained=USET_SPAN_CONTAINED;
2964 } else {
2965 // use USET_SPAN_SIMPLE
2966 if((whichSpans&SPAN_SIMPLE)==0) {
2967 return -1;
2968 }
2969 contained=USET_SPAN_SIMPLE;
2970 }
2971
2972 // Default first span condition for going forward with an uncomplemented set.
2973 spanCondition=USET_SPAN_NOT_CONTAINED;
2974 if(isComplement) {
2975 spanCondition=invertSpanCondition(spanCondition, contained);
2976 }
2977
2978 // First span condition for span(), used to terminate the spanBack() iteration.
2979 firstSpanCondition=spanCondition;
2980
2981 // spanBack(): Its initial span condition is span()'s last span condition,
2982 // which is the opposite of span()'s first span condition
2983 // if we expect an even number of spans.
2984 // (The loop inverts spanCondition (expectCount-1) times
2985 // before the expectCount'th span() call.)
2986 // If we do not compare forward and backward directions, then we do not have an
2987 // expectCount and just start with firstSpanCondition.
2988 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2989 spanCondition=invertSpanCondition(spanCondition, contained);
2990 }
2991
2992 count=0;
2993 switch(type) {
2994 case 0:
2995 case 1:
2996 start=0;
2997 if(length<0) {
2998 length=slen(s, isUTF16);
2999 }
3000 for(;;) {
3001 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
3002 containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
3003 if(count<limitsCapacity) {
3004 limits[count]=start;
3005 }
3006 ++count;
3007 if(start>=length) {
3008 break;
3009 }
3010 spanCondition=invertSpanCondition(spanCondition, contained);
3011 }
3012 break;
3013 case 2:
3014 case 3:
3015 start=0;
3016 for(;;) {
3017 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
3018 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
3019 if(count<limitsCapacity) {
3020 limits[count]=start;
3021 }
3022 ++count;
3023 if(length>=0 ? start>=length :
3024 isUTF16 ? ((const UChar *)s)[start]==0 :
3025 ((const char *)s)[start]==0
3026 ) {
3027 break;
3028 }
3029 spanCondition=invertSpanCondition(spanCondition, contained);
3030 }
3031 break;
3032 case 4:
3033 case 5:
3034 if(length<0) {
3035 length=slen(s, isUTF16);
3036 }
3037 for(;;) {
3038 ++count;
3039 if(count<=limitsCapacity) {
3040 limits[limitsCapacity-count]=length;
3041 }
3042 length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3043 containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3044 if(length==0 && spanCondition==firstSpanCondition) {
3045 break;
3046 }
3047 spanCondition=invertSpanCondition(spanCondition, contained);
3048 }
3049 if(count<limitsCapacity) {
3050 memmove(limits, limits+(limitsCapacity-count), count*4);
3051 }
3052 break;
3053 case 6:
3054 case 7:
3055 for(;;) {
3056 ++count;
3057 if(count<=limitsCapacity) {
3058 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3059 }
3060 // Note: Length<0 is tested only for the first spanBack().
3061 // If we wanted to keep length<0 for all spanBack()s, we would have to
3062 // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3063 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3064 realSet.spanBackUTF8((const char *)s, length, spanCondition);
3065 if(length==0 && spanCondition==firstSpanCondition) {
3066 break;
3067 }
3068 spanCondition=invertSpanCondition(spanCondition, contained);
3069 }
3070 if(count<limitsCapacity) {
3071 memmove(limits, limits+(limitsCapacity-count), count*4);
3072 }
3073 break;
3074 default:
3075 typeName="";
3076 return -1;
3077 }
3078
3079 return count;
3080 }
3081
// Indexes of the set variants to be tested.
// An odd index means isComplement: it is the complemented form of the
// variant at the preceding even index.
enum {
    SLOW=0,
    SLOW_NOT=1,
    FAST=2,
    FAST_NOT=3,
    SET_COUNT=4
};

// Names used in failure messages, indexed by the enum constants above.
static const char *const setNames[SET_COUNT]={
    "slow", "slow.not", "fast", "fast.not"
};
3097
3098 /*
3099 * Verify that we get the same results whether we look at text with contains(),
3100 * span() or spanBack(), using unfrozen or frozen versions of the set,
3101 * and using the set or its complement (switching the spanConditions accordingly).
3102 * The latter verifies that
3103 * set.span(spanCondition) == set.complement().span(!spanCondition).
3104 *
3105 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3106 * or returned to the caller (with an input expectCount<0).
3107 */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3108 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3109 const void *s, int32_t length, UBool isUTF16,
3110 uint32_t whichSpans,
3111 int32_t expectLimits[], int32_t &expectCount,
3112 const char *testName, int32_t index) {
3113 int32_t limits[500];
3114 int32_t limitsCount;
3115 int i, j;
3116
3117 const char *typeName;
3118 int type;
3119
3120 for(i=0; i<SET_COUNT; ++i) {
3121 if((i&1)==0) {
3122 // Even-numbered sets are original, uncomplemented sets.
3123 if((whichSpans&SPAN_SET)==0) {
3124 continue;
3125 }
3126 } else {
3127 // Odd-numbered sets are complemented.
3128 if((whichSpans&SPAN_COMPLEMENT)==0) {
3129 continue;
3130 }
3131 }
3132 for(type=0;; ++type) {
3133 limitsCount=getSpans(*sets[i], (UBool)(i&1),
3134 s, length, isUTF16,
3135 whichSpans,
3136 type, typeName,
3137 limits, UPRV_LENGTHOF(limits), expectCount);
3138 if(typeName[0]==0) {
3139 break; // All types tried.
3140 }
3141 if(limitsCount<0) {
3142 continue; // Span option filtered out.
3143 }
3144 if(expectCount<0) {
3145 expectCount=limitsCount;
3146 if(limitsCount>UPRV_LENGTHOF(limits)) {
3147 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3148 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3149 return;
3150 }
3151 memcpy(expectLimits, limits, limitsCount*4);
3152 } else if(limitsCount!=expectCount) {
3153 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3154 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3155 } else {
3156 for(j=0; j<limitsCount; ++j) {
3157 if(limits[j]!=expectLimits[j]) {
3158 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3159 testName, (long)index, setNames[i], typeName, (long)limitsCount,
3160 j, (long)limits[j], (long)expectLimits[j]);
3161 break;
3162 }
3163 }
3164 }
3165 }
3166 }
3167
3168 // Compare span() with containsAll()/containsNone(),
3169 // but only if we have expectLimits[] from the uncomplemented set.
3170 if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3171 const UChar *s16=(const UChar *)s;
3172 UnicodeString string;
3173 int32_t prev=0, limit, length;
3174 for(i=0; i<expectCount; ++i) {
3175 limit=expectLimits[i];
3176 length=limit-prev;
3177 if(length>0) {
3178 string.setTo(FALSE, s16+prev, length); // read-only alias
3179 if(i&1) {
3180 if(!sets[SLOW]->getSet().containsAll(string)) {
3181 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3182 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3183 return;
3184 }
3185 if(!sets[FAST]->getSet().containsAll(string)) {
3186 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3187 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3188 return;
3189 }
3190 } else {
3191 if(!sets[SLOW]->getSet().containsNone(string)) {
3192 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3193 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3194 return;
3195 }
3196 if(!sets[FAST]->getSet().containsNone(string)) {
3197 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3198 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3199 return;
3200 }
3201 }
3202 }
3203 prev=limit;
3204 }
3205 }
3206 }
3207
3208 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3209 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3210 const void *s, int32_t length, UBool isUTF16,
3211 uint32_t whichSpans,
3212 const char *testName, int32_t index) {
3213 int32_t expectLimits[500];
3214 int32_t expectCount=-1;
3215 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3216 }
3217
stringContainsUnpairedSurrogate(const UChar * s,int32_t length)3218 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3219 UChar c, c2;
3220
3221 if(length>=0) {
3222 while(length>0) {
3223 c=*s++;
3224 --length;
3225 if(0xd800<=c && c<0xe000) {
3226 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3227 return TRUE;
3228 }
3229 --length;
3230 }
3231 }
3232 } else {
3233 while((c=*s++)!=0) {
3234 if(0xd800<=c && c<0xe000) {
3235 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3236 return TRUE;
3237 }
3238 }
3239 }
3240 }
3241 return FALSE;
3242 }
3243
3244 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3245 // unless either UTF is turned off in whichSpans.
3246 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3247 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const UChar * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3248 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3249 const UChar *s16, int32_t length16,
3250 uint32_t whichSpans,
3251 const char *testName, int32_t index) {
3252 int32_t expectLimits[500];
3253 int32_t expectCount;
3254
3255 expectCount=-1; // Get expectLimits[] from testSpan().
3256
3257 if((whichSpans&SPAN_UTF16)!=0) {
3258 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3259 }
3260 if((whichSpans&SPAN_UTF8)==0) {
3261 return;
3262 }
3263
3264 // Convert s16[] and expectLimits[] to UTF-8.
3265 uint8_t s8[3000];
3266 int32_t offsets[3000];
3267
3268 const UChar *s16Limit=s16+length16;
3269 char *t=(char *)s8;
3270 char *tLimit=t+sizeof(s8);
3271 int32_t *o=offsets;
3272 UErrorCode errorCode=U_ZERO_ERROR;
3273
3274 // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3275 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3276 if(U_FAILURE(errorCode)) {
3277 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3278 testName, (long)index, u_errorName(errorCode));
3279 ucnv_resetFromUnicode(utf8Cnv);
3280 return;
3281 }
3282 int32_t length8=(int32_t)(t-(char *)s8);
3283
3284 // Convert expectLimits[].
3285 int32_t i, j, expect;
3286 for(i=j=0; i<expectCount; ++i) {
3287 expect=expectLimits[i];
3288 if(expect==length16) {
3289 expectLimits[i]=length8;
3290 } else {
3291 while(offsets[j]<expect) {
3292 ++j;
3293 }
3294 expectLimits[i]=j;
3295 }
3296 }
3297
3298 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3299 }
3300
nextCodePoint(UChar32 c)3301 static UChar32 nextCodePoint(UChar32 c) {
3302 // Skip some large and boring ranges.
3303 switch(c) {
3304 case 0x3441:
3305 return 0x4d7f;
3306 case 0x5100:
3307 return 0x9f00;
3308 case 0xb040:
3309 return 0xd780;
3310 case 0xe041:
3311 return 0xf8fe;
3312 case 0x10100:
3313 return 0x20000;
3314 case 0x20041:
3315 return 0xe0000;
3316 case 0xe0101:
3317 return 0x10fffd;
3318 default:
3319 return c+1;
3320 }
3321 }
3322
3323 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3324 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3325 // contains(U+FFFD) is inconsistent with contains(some surrogates),
3326 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3327 // Skip the UTF-8 part of the test - if the string contains surrogates -
3328 // because it is likely to produce a different result.
3329 UBool inconsistentSurrogates=
3330 (!(sets[0]->getSet().contains(0xfffd) ?
3331 sets[0]->getSet().contains(0xd800, 0xdfff) :
3332 sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3333 sets[0]->hasStringsWithSurrogates());
3334
3335 UChar s[1000];
3336 int32_t length=0;
3337 uint32_t localWhichSpans;
3338
3339 UChar32 c, first;
3340 for(first=c=0;; c=nextCodePoint(c)) {
3341 if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3342 localWhichSpans=whichSpans;
3343 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3344 localWhichSpans&=~SPAN_UTF8;
3345 }
3346 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3347 if(c>0x10ffff) {
3348 break;
3349 }
3350 length=0;
3351 first=c;
3352 }
3353 U16_APPEND_UNSAFE(s, length, c);
3354 }
3355 }
3356
3357 // Test with a particular, interesting string.
3358 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3359 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3360 static const UChar s[]={
3361 0x61, 0x62, 0x20, // Latin, space
3362 0x3b1, 0x3b2, 0x3b3, // Greek
3363 0xd900, // lead surrogate
3364 0x3000, 0x30ab, 0x30ad, // wide space, Katakana
3365 0xdc05, // trail surrogate
3366 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
3367 0xd900, 0xdc05, // unassigned supplementary
3368 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
3369 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
3370 0 // NUL
3371 };
3372
3373 if((whichSpans&SPAN_UTF16)==0) {
3374 return;
3375 }
3376 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3377 testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3378 }
3379
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3380 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3381 static const char s[]={
3382 "abc" // Latin
3383
3384 /* trail byte in lead position */
3385 "\x80"
3386
3387 " " // space
3388
3389 /* truncated multi-byte sequences */
3390 "\xd0"
3391 "\xe0"
3392 "\xe1"
3393 "\xed"
3394 "\xee"
3395 "\xf0"
3396 "\xf1"
3397 "\xf4"
3398 "\xf8"
3399 "\xfc"
3400
3401 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
3402
3403 /* trail byte in lead position */
3404 "\x80"
3405
3406 "\xe0\x80"
3407 "\xe0\xa0"
3408 "\xe1\x80"
3409 "\xed\x80"
3410 "\xed\xa0"
3411 "\xee\x80"
3412 "\xf0\x80"
3413 "\xf0\x90"
3414 "\xf1\x80"
3415 "\xf4\x80"
3416 "\xf4\x90"
3417 "\xf8\x80"
3418 "\xfc\x80"
3419
3420 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
3421
3422 /* trail byte in lead position */
3423 "\x80"
3424
3425 "\xf0\x80\x80"
3426 "\xf0\x90\x80"
3427 "\xf1\x80\x80"
3428 "\xf4\x80\x80"
3429 "\xf4\x90\x80"
3430 "\xf8\x80\x80"
3431 "\xfc\x80\x80"
3432
3433 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
3434
3435 /* trail byte in lead position */
3436 "\x80"
3437
3438 "\xf8\x80\x80\x80"
3439 "\xfc\x80\x80\x80"
3440
3441 "\xF1\x90\x80\x85" // unassigned supplementary
3442
3443 /* trail byte in lead position */
3444 "\x80"
3445
3446 "\xfc\x80\x80\x80\x80"
3447
3448 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
3449
3450 /* trail byte in lead position */
3451 "\x80"
3452
3453 /* complete sequences but non-shortest forms or out of range etc. */
3454 "\xc0\x80"
3455 "\xe0\x80\x80"
3456 "\xed\xa0\x80"
3457 "\xf0\x80\x80\x80"
3458 "\xf4\x90\x80\x80"
3459 "\xf8\x80\x80\x80\x80"
3460 "\xfc\x80\x80\x80\x80\x80"
3461 "\xfe"
3462 "\xff"
3463
3464 /* trail byte in lead position */
3465 "\x80"
3466
3467 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
3468 };
3469
3470 if((whichSpans&SPAN_UTF8)==0) {
3471 return;
3472 }
3473 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3474 testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3475 }
3476
// Take a set of span options and multiply them so that
// each portion only has one of the options a, b and c.
// If b==0, then the set of options is just modified with mask and a.
// If b!=0 and c==0, then the set of options is just modified with mask, a and b.
//
// Each of the first whichSpansCount entries is reduced to (entry&mask);
// the a-variants overwrite the originals in place, the b-variants are stored
// in the next whichSpansCount entries and the c-variants in the ones after that.
// Returns the new number of entries. The caller must provide enough room.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // Number of variants generated per input entry; c only counts if b is used.
    int32_t variants=1;
    if(b!=0) {
        variants= (c!=0) ? 3 : 2;
    }

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(variants>=2) {
            whichSpans[whichSpansCount+k]=kept|b;
        }
        if(variants==3) {
            whichSpans[2*whichSpansCount+k]=kept|c;
        }
    }
    return variants*whichSpansCount;
}
3499
// String-literal fragments consisting of 63 or 64 copies of 'a' or 'b'.
// They are assembled from 8-letter pieces (7 letters in the last piece of the
// 63-letter variants) so that the lengths are easy to verify by eye.
// Adjacent string literals are concatenated by the compiler.
#define _63_a "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaa"
#define _64_a "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
#define _63_b "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbb"
#define _64_b "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb"
3504
TestSpan()3505 void UnicodeSetTest::TestSpan() {
3506 // "[...]" is a UnicodeSet pattern.
3507 // "*" performs tests on all Unicode code points and on a selection of
3508 // malformed UTF-8/16 strings.
3509 // "-options" limits the scope of testing for the current set.
3510 // By default, the test verifies that equivalent boundaries are found
3511 // for UTF-16 and UTF-8, going forward and backward,
3512 // alternating USET_SPAN_NOT_CONTAINED with
3513 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3514 // Single-character options:
3515 // 8 -- UTF-16 and UTF-8 boundaries may differ.
3516 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3517 // or the set contains strings with unpaired surrogates
3518 // which do not translate to valid UTF-8.
3519 // c -- set.span() and set.complement().span() boundaries may differ.
3520 // Cause: Set strings are not complemented.
3521 // b -- span() and spanBack() boundaries may differ.
3522 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3523 // and spanBack(USET_SPAN_SIMPLE) are defined to
3524 // match with non-overlapping substrings.
3525 // For example, with a set containing "ab" and "ba",
3526 // span() of "aba" yields boundaries { 0, 2, 3 }
3527 // because the initial "ab" matches from 0 to 2,
3528 // while spanBack() yields boundaries { 0, 1, 3 }
3529 // because the final "ba" matches from 1 to 3.
3530 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3531 // Cause: Strings in the set overlap, and a longer match may
3532 // require a sequence including non-longest substrings.
3533 // For example, with a set containing "ab", "abc" and "cd",
3534 // span(contained) of "abcd" spans the entire string
3535 // but span(longest match) only spans the first 3 characters.
3536 // Each "-options" first resets all options and then applies the specified options.
3537 // A "-" without options resets the options.
3538 // The options are also reset for each new set.
3539 // Other strings will be spanned.
3540 static const char *const testdata[]={
3541 "[:ID_Continue:]",
3542 "*",
3543 "[:White_Space:]",
3544 "*",
3545 "[]",
3546 "*",
3547 "[\\u0000-\\U0010FFFF]",
3548 "*",
3549 "[\\u0000\\u0080\\u0800\\U00010000]",
3550 "*",
3551 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3552 "*",
3553 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3554 "-c",
3555 "*",
3556 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3557 "-c",
3558 "*",
3559
3560 // Overlapping strings cause overlapping attempts to match.
3561 "[x{xy}{xya}{axy}{ax}]",
3562 "-cl",
3563
3564 // More repetitions of "xya" would take too long with the recursive
3565 // reference implementation.
3566 // containsAll()=FALSE
3567 // test_string 0x14
3568 "xx"
3569 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
3570 "xx" // set.complement().span(contained) will stop between the two 'x'es.
3571 "xyaxyaxyaxya"
3572 "xx"
3573 "xyaxyaxyaxya" // span() ends here.
3574 "aaa",
3575
3576 // containsAll()=TRUE
3577 // test_string 0x15
3578 "xx"
3579 "xyaxyaxyaxya"
3580 "xx"
3581 "xyaxyaxyaxya"
3582 "xx"
3583 "xyaxyaxyaxy",
3584
3585 "-bc",
3586 // test_string 0x17
3587 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
3588 "-c",
3589 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
3590 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
3591 "-",
3592 "byaya", // span() -> { 5 }
3593 "byay", // span() -> { 4 }
3594 "bya", // span() -> { 3 }
3595
3596 // span(longest match) will not span the whole string.
3597 "[a{ab}{bc}]",
3598 "-cl",
3599 // test_string 0x21
3600 "abc",
3601
3602 "[a{ab}{abc}{cd}]",
3603 "-cl",
3604 "acdabcdabccd",
3605
3606 // spanBack(longest match) will not span the whole string.
3607 "[c{ab}{bc}]",
3608 "-cl",
3609 "abc",
3610
3611 "[d{cd}{bcd}{ab}]",
3612 "-cl",
3613 "abbcdabcdabd",
3614
3615 // Test with non-ASCII set strings - test proper handling of surrogate pairs
3616 // and UTF-8 trail bytes.
3617 // Copies of above test sets and strings, but transliterated to have
3618 // different code points with similar trail units.
3619 // Previous: a b c d
3620 // Unicode: 042B 30AB 200AB 204AB
3621 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
3622 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
3623 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3624 "-cl",
3625 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3626
3627 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3628 "-cl",
3629 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3630
3631 // Stress bookkeeping and recursion.
3632 // The following strings are barely doable with the recursive
3633 // reference implementation.
3634 // The not-contained character at the end prevents an early exit from the span().
3635 "[b{bb}]",
3636 "-c",
3637 // test_string 0x33
3638 "bbbbbbbbbbbbbbbbbbbbbbbb-",
3639 // On complement sets, span() and spanBack() get different results
3640 // because b is not in the complement set and there is an odd number of b's
3641 // in the test string.
3642 "-bc",
3643 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3644
3645 // Test with set strings with an initial or final code point span
3646 // longer than 254.
3647 "[a{" _64_a _64_a _64_a _64_a "b}"
3648 "{a" _64_b _64_b _64_b _64_b "}]",
3649 "-c",
3650 _64_a _64_a _64_a _63_a "b",
3651 _64_a _64_a _64_a _64_a "b",
3652 _64_a _64_a _64_a _64_a "aaaabbbb",
3653 "a" _64_b _64_b _64_b _63_b,
3654 "a" _64_b _64_b _64_b _64_b,
3655 "aaaabbbb" _64_b _64_b _64_b _64_b,
3656
3657 // Test with strings containing unpaired surrogates.
3658 // They are not representable in UTF-8, and a leading trail surrogate
3659 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3660 // U+20001 == \\uD840\\uDC01
3661 // U+20400 == \\uD841\\uDC00
3662 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3663 "-8cl",
3664 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3665 };
3666 uint32_t whichSpans[96]={ SPAN_ALL };
3667 int32_t whichSpansCount=1;
3668
3669 UnicodeSet *sets[SET_COUNT]={ NULL };
3670 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3671
3672 char testName[1024];
3673 char *testNameLimit=testName;
3674
3675 int32_t i, j;
3676 for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3677 const char *s=testdata[i];
3678 if(s[0]=='[') {
3679 // Create new test sets from this pattern.
3680 for(j=0; j<SET_COUNT; ++j) {
3681 delete sets_with_str[j];
3682 delete sets[j];
3683 }
3684 UErrorCode errorCode=U_ZERO_ERROR;
3685 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3686 if(U_FAILURE(errorCode)) {
3687 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3688 break;
3689 }
3690 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3691 sets[SLOW_NOT]->complement();
3692 // Intermediate set: Test cloning of a frozen set.
3693 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3694 fast->freeze();
3695 sets[FAST]=fast->clone();
3696 delete fast;
3697 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3698 fastNot->freeze();
3699 sets[FAST_NOT]=fastNot->clone();
3700 delete fastNot;
3701
3702 for(j=0; j<SET_COUNT; ++j) {
3703 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3704 }
3705
3706 strcpy(testName, s);
3707 testNameLimit=strchr(testName, 0);
3708 *testNameLimit++=':';
3709 *testNameLimit=0;
3710
3711 whichSpans[0]=SPAN_ALL;
3712 whichSpansCount=1;
3713 } else if(s[0]=='-') {
3714 whichSpans[0]=SPAN_ALL;
3715 whichSpansCount=1;
3716
3717 while(*++s!=0) {
3718 switch(*s) {
3719 case 'c':
3720 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3721 ~SPAN_POLARITY,
3722 SPAN_SET,
3723 SPAN_COMPLEMENT,
3724 0);
3725 break;
3726 case 'b':
3727 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3728 ~SPAN_DIRS,
3729 SPAN_FWD,
3730 SPAN_BACK,
3731 0);
3732 break;
3733 case 'l':
3734 // test USET_SPAN_CONTAINED FWD & BACK, and separately
3735 // USET_SPAN_SIMPLE only FWD, and separately
3736 // USET_SPAN_SIMPLE only BACK
3737 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3738 ~(SPAN_DIRS|SPAN_CONDITION),
3739 SPAN_DIRS|SPAN_CONTAINED,
3740 SPAN_FWD|SPAN_SIMPLE,
3741 SPAN_BACK|SPAN_SIMPLE);
3742 break;
3743 case '8':
3744 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3745 ~SPAN_UTFS,
3746 SPAN_UTF16,
3747 SPAN_UTF8,
3748 0);
3749 break;
3750 default:
3751 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3752 break;
3753 }
3754 }
3755 } else if(0==strcmp(s, "*")) {
3756 strcpy(testNameLimit, "bad_string");
3757 for(j=0; j<whichSpansCount; ++j) {
3758 if(whichSpansCount>1) {
3759 sprintf(testNameLimit+10 /* strlen("bad_string") */,
3760 "%%0x%3x",
3761 whichSpans[j]);
3762 }
3763 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3764 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3765 }
3766
3767 strcpy(testNameLimit, "contents");
3768 for(j=0; j<whichSpansCount; ++j) {
3769 if(whichSpansCount>1) {
3770 sprintf(testNameLimit+8 /* strlen("contents") */,
3771 "%%0x%3x",
3772 whichSpans[j]);
3773 }
3774 testSpanContents(sets_with_str, whichSpans[j], testName);
3775 }
3776 } else {
3777 UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3778 strcpy(testNameLimit, "test_string");
3779 for(j=0; j<whichSpansCount; ++j) {
3780 if(whichSpansCount>1) {
3781 sprintf(testNameLimit+11 /* strlen("test_string") */,
3782 "%%0x%3x",
3783 whichSpans[j]);
3784 }
3785 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3786 }
3787 }
3788 }
3789 for(j=0; j<SET_COUNT; ++j) {
3790 delete sets_with_str[j];
3791 delete sets[j];
3792 }
3793 }
3794
3795 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3796 void UnicodeSetTest::TestStringSpan() {
3797 static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3798 static const char *const string=
3799 "xx"
3800 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3801 "xx"
3802 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3803 "xx"
3804 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3805 "aaaa";
3806
3807 UErrorCode errorCode=U_ZERO_ERROR;
3808 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3809 UnicodeSet set(pattern16, errorCode);
3810 if(U_FAILURE(errorCode)) {
3811 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3812 return;
3813 }
3814
3815 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3816
3817 if(set.containsAll(string16)) {
3818 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3819 }
3820
3821 // Remove trailing "aaaa".
3822 string16.truncate(string16.length()-4);
3823 if(!set.containsAll(string16)) {
3824 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3825 }
3826
3827 string16=UNICODE_STRING_SIMPLE("byayaxya");
3828 const UChar *s16=string16.getBuffer();
3829 int32_t length16=string16.length();
3830 (void)length16; // Suppress set but not used warning.
3831 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3832 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3833 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3834 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3835 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3836 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3837 ) {
3838 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3839 }
3840
3841 pattern="[a{ab}{abc}{cd}]";
3842 pattern16=UnicodeString(pattern, -1, US_INV);
3843 set.applyPattern(pattern16, errorCode);
3844 if(U_FAILURE(errorCode)) {
3845 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3846 return;
3847 }
3848 string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3849 s16=string16.getBuffer();
3850 length16=string16.length();
3851 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3852 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3853 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3854 ) {
3855 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3856 }
3857
3858 pattern="[d{cd}{bcd}{ab}]";
3859 pattern16=UnicodeString(pattern, -1, US_INV);
3860 set.applyPattern(pattern16, errorCode).freeze();
3861 if(U_FAILURE(errorCode)) {
3862 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3863 return;
3864 }
3865 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3866 s16=string16.getBuffer();
3867 length16=string16.length();
3868 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3869 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3870 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3871 ) {
3872 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3873 }
3874 }
3875
3876 /**
3877 * Including collationroot.h fails here with
3878 1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
3879 * .. so, we skip this test on Windows.
3880 *
3881 * the cause is that intltest builds with /Za which disables language extensions - which means
3882 * windows header files can't be used.
3883 */
3884 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
3885 #include "collationroot.h"
3886 #include "collationtailoring.h"
3887 #endif
3888
TestUCAUnsafeBackwards()3889 void UnicodeSetTest::TestUCAUnsafeBackwards() {
3890 #if U_PLATFORM_HAS_WIN32_API
3891 infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
3892 #elif !UCONFIG_NO_COLLATION
3893 UErrorCode errorCode = U_ZERO_ERROR;
3894
3895 // Get the unsafeBackwardsSet
3896 const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
3897 if(U_FAILURE(errorCode)) {
3898 dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
3899 return;
3900 }
3901 //const UVersionInfo &version = rootEntry->tailoring->version;
3902 const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
3903
3904 checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
3905
3906 if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
3907 // simple test case
3908 // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
3909 // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
3910 UnicodeSet surrogates;
3911 surrogates.add(0xd83a); // a lead surrogate
3912 surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
3913 UnicodeString pat;
3914 surrogates.toPattern(pat, FALSE); // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
3915 // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
3916 // so that at least one type of surrogate code points are escaped,
3917 // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
3918 errorCode = U_ZERO_ERROR;
3919 UnicodeSet s2;
3920 s2.applyPattern(pat, errorCode); // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
3921 if(U_FAILURE(errorCode)) {
3922 errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
3923 } else {
3924 checkEqual(surrogates, s2, "surrogates to/from pattern");
3925 }
3926 // This occurs in the UCA unsafe-backwards set.
3927 checkRoundTrip(*unsafeBackwardSet);
3928 }
3929 #endif
3930 }
3931
TestIntOverflow()3932 void UnicodeSetTest::TestIntOverflow() {
3933 // This test triggers undefined double->int conversion behavior
3934 // if the implementation is not careful.
3935 IcuTestErrorCode errorCode(*this, "TestIntOverflow");
3936 UnicodeSet set(u"[:ccc=2222222222222222222:]", errorCode);
3937 assertTrue("[:ccc=int_overflow:] -> empty set", set.isEmpty());
3938 assertEquals("[:ccc=int_overflow:] -> illegal argument",
3939 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3940 }
3941
TestUnusedCcc()3942 void UnicodeSetTest::TestUnusedCcc() {
3943 #if !UCONFIG_NO_NORMALIZATION
3944 // All numeric ccc values 0..255 are valid, but many are unused.
3945 IcuTestErrorCode errorCode(*this, "TestUnusedCcc");
3946 UnicodeSet ccc2(u"[:ccc=2:]", errorCode);
3947 assertSuccess("[:ccc=2:]", errorCode);
3948 assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty());
3949
3950 UnicodeSet ccc255(u"[:ccc=255:]", errorCode);
3951 assertSuccess("[:ccc=255:]", errorCode);
3952 assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty());
3953
3954 // Non-integer values and values outside 0..255 are invalid.
3955 UnicodeSet ccc_1(u"[:ccc=-1:]", errorCode);
3956 assertEquals("[:ccc=-1:] -> illegal argument",
3957 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3958 assertTrue("[:ccc=-1:] -> empty set", ccc_1.isEmpty());
3959
3960 UnicodeSet ccc256(u"[:ccc=256:]", errorCode);
3961 assertEquals("[:ccc=256:] -> illegal argument",
3962 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3963 assertTrue("[:ccc=256:] -> empty set", ccc256.isEmpty());
3964
3965 UnicodeSet ccc1_1(u"[:ccc=1.1:]", errorCode);
3966 assertEquals("[:ccc=1.1:] -> illegal argument",
3967 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3968 assertTrue("[:ccc=1.1:] -> empty set", ccc1_1.isEmpty());
3969 #endif
3970 }
3971
TestDeepPattern()3972 void UnicodeSetTest::TestDeepPattern() {
3973 IcuTestErrorCode errorCode(*this, "TestDeepPattern");
3974 // Nested ranges are parsed via recursion which can use a lot of stack space.
3975 // After a reasonable limit, we should get an error.
3976 constexpr int32_t DEPTH = 20000;
3977 UnicodeString pattern, suffix;
3978 for (int32_t i = 0; i < DEPTH; ++i) {
3979 pattern.append(u"[a", 2);
3980 suffix.append(']');
3981 }
3982 pattern.append(suffix);
3983 UnicodeSet set(pattern, errorCode);
3984 assertTrue("[a[a[a...1000s...]]] -> error", errorCode.isFailure());
3985 errorCode.reset();
3986 }
3987