1 /*
2 ********************************************************************************
3 * Copyright (C) 1999-2014 International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 * 03/22/2000 Madhu Added additional tests
9 ********************************************************************************
10 */
11
12 #include <stdio.h>
13
14 #include <string.h>
15 #include "unicode/utypes.h"
16 #include "usettest.h"
17 #include "unicode/ucnv.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/usetiter.h"
21 #include "unicode/ustring.h"
22 #include "unicode/parsepos.h"
23 #include "unicode/symtable.h"
24 #include "unicode/uversion.h"
25 #include "hash.h"
26
/**
 * Reports a data error (file, line and ICU error name) when `status` holds
 * a failure code.  The body is wrapped in do { } while (0) so the macro
 * expands to exactly one statement and stays correct inside an unbraced
 * if/else; callers must terminate the invocation with a semicolon.
 */
#define TEST_ASSERT_SUCCESS(status) do { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
                  u_errorName(status)); \
    } \
} while (0)

/**
 * Reports a data error (file and line) when `expr` evaluates to false.
 * Same single-statement wrapping as TEST_ASSERT_SUCCESS.
 */
#define TEST_ASSERT(expr) do { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
} while (0)
33
/**
 * Test-only convenience: concatenates `left` with the escaped pattern
 * representation of `set`, so sets can be streamed into log messages.
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString rendered;
    // toPattern() fills `rendered` and returns a reference to it.
    return left + UnicodeSetTest::escape(set.toPattern(rendered));
}
39
/**
 * Expands to one `case` of the runIndexedTest() switch: stores the test's
 * name in `name` and, when `exec` is set, logs a header and runs the test.
 * Relies on `name` and `exec` being in scope at the expansion site.
 */
#define CASE(caseIndex, testMethod) \
    case caseIndex: \
        name = #testMethod; \
        if (exec) { \
            logln(#testMethod "---"); \
            logln(); \
            testMethod(); \
        } \
        break
48
UnicodeSetTest()49 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
50 }
51
openUTF8Converter()52 UConverter *UnicodeSetTest::openUTF8Converter() {
53 if(utf8Cnv==NULL) {
54 UErrorCode errorCode=U_ZERO_ERROR;
55 utf8Cnv=ucnv_open("UTF-8", &errorCode);
56 }
57 return utf8Cnv;
58 }
59
~UnicodeSetTest()60 UnicodeSetTest::~UnicodeSetTest() {
61 ucnv_close(utf8Cnv);
62 }
63
/**
 * Test dispatcher: maps a test index to a test method.
 *
 * For a known index, `name` is set to the test method's name and, when
 * `exec` is true, a header is logged and the test is run (see the CASE
 * macro).  For any other index `name` is set to the empty string.
 *
 * @param index zero-based index of the test to select
 * @param exec  whether to actually run the selected test
 * @param name  receives the selected test's name, or "" if index is unknown
 */
void
UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
                               const char* &name, char* /*par*/) {
    // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
    switch (index) {
        CASE(0,TestPatterns);
        CASE(1,TestAddRemove);
        CASE(2,TestCategories);
        CASE(3,TestCloneEqualHash);
        CASE(4,TestMinimalRep);
        CASE(5,TestAPI);
        CASE(6,TestScriptSet);
        CASE(7,TestPropertySet);
        CASE(8,TestClone);
        CASE(9,TestExhaustive);
        CASE(10,TestToPattern);
        CASE(11,TestIndexOf);
        CASE(12,TestStrings);
        CASE(13,Testj2268);
        CASE(14,TestCloseOver);
        CASE(15,TestEscapePattern);
        CASE(16,TestInvalidCodePoint);
        CASE(17,TestSymbolTable);
        CASE(18,TestSurrogate);
        CASE(19,TestPosixClasses);
        CASE(20,TestIteration);
        CASE(21,TestFreezable);
        CASE(22,TestSpan);
        CASE(23,TestStringSpan);
        // Unknown index: an empty name is reported.
        default: name = ""; break;
    }
}
96
// Marker string placed inside the expected-string arrays handed to
// expectToPattern() (see TestToPattern).
// NOTE(review): presumably separates strings that must be contained from
// strings that must not be -- confirm against expectToPattern().
static const char NOT[] = "%%%%";
98
99 /**
100 * UVector was improperly copying contents
101 * This code will crash this is still true
102 */
Testj2268()103 void UnicodeSetTest::Testj2268() {
104 UnicodeSet t;
105 t.add(UnicodeString("abc"));
106 UnicodeSet test(t);
107 UnicodeString ustrPat;
108 test.toPattern(ustrPat, TRUE);
109 }
110
111 /**
112 * Test toPattern().
113 */
TestToPattern()114 void UnicodeSetTest::TestToPattern() {
115 UErrorCode ec = U_ZERO_ERROR;
116
117 // Test that toPattern() round trips with syntax characters and
118 // whitespace.
119 {
120 static const char* OTHER_TOPATTERN_TESTS[] = {
121 "[[:latin:]&[:greek:]]",
122 "[[:latin:]-[:greek:]]",
123 "[:nonspacing mark:]",
124 NULL
125 };
126
127 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
128 ec = U_ZERO_ERROR;
129 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
130 if (U_FAILURE(ec)) {
131 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
132 continue;
133 }
134 checkPat(OTHER_TOPATTERN_TESTS[j], s);
135 }
136
137 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
138 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
139
140 // check various combinations to make sure they all work.
141 if (i != 0 && !toPatternAux(i, i)){
142 continue;
143 }
144 if (!toPatternAux(0, i)){
145 continue;
146 }
147 if (!toPatternAux(i, 0xFFFF)){
148 continue;
149 }
150 }
151 }
152 }
153
154 // Test pattern behavior of multicharacter strings.
155 {
156 ec = U_ZERO_ERROR;
157 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
158
159 // This loop isn't a loop. It's here to make the compiler happy.
160 // If you're curious, try removing it and changing the 'break'
161 // statements (except for the last) to goto's.
162 for (;;) {
163 if (U_FAILURE(ec)) break;
164 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
165 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
166
167 s->add("ac");
168 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
169 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
170
171 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
172 if (U_FAILURE(ec)) break;
173 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
174 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
175
176 s->add("[]");
177 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
178 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
179
180 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
181 if (U_FAILURE(ec)) break;
182 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
183 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
184
185 // j2189
186 s->clear();
187 s->add(UnicodeString("abc", ""));
188 s->add(UnicodeString("abc", ""));
189 const char* exp6[] = {"abc", NOT, "ab", NULL};
190 expectToPattern(*s, "[{abc}]", exp6);
191
192 break;
193 }
194
195 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
196 delete s;
197 }
198
199 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
200 UnicodeSet s;
201 s.add((UChar)97, (UChar)98); // 'a', 'b'
202 expectToPattern(s, "[ab]", NULL);
203 }
204
toPatternAux(UChar32 start,UChar32 end)205 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
206
207 // use Integer.toString because Utility.hex doesn't handle ints
208 UnicodeString pat = "";
209 // TODO do these in hex
210 //String source = "0x" + Integer.toString(start,16).toUpperCase();
211 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
212 UnicodeString source;
213 source = source + (uint32_t)start;
214 if (start != end)
215 source = source + ".." + (uint32_t)end;
216 UnicodeSet testSet;
217 testSet.add(start, end);
218 return checkPat(source, testSet);
219 }
220
checkPat(const UnicodeString & source,const UnicodeSet & testSet)221 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
222 const UnicodeSet& testSet) {
223 // What we want to make sure of is that a pattern generated
224 // by toPattern(), with or without escaped unprintables, can
225 // be passed back into the UnicodeSet constructor.
226 UnicodeString pat0;
227
228 testSet.toPattern(pat0, TRUE);
229
230 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
231
232 //String pat1 = unescapeLeniently(pat0);
233 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
234
235 UnicodeString pat2;
236 testSet.toPattern(pat2, FALSE);
237 if (!checkPat(source, testSet, pat2)) return FALSE;
238
239 //String pat3 = unescapeLeniently(pat2);
240 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
241
242 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
243 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
244 return TRUE;
245 }
246
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)247 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
248 const UnicodeSet& testSet,
249 const UnicodeString& pat) {
250 UErrorCode ec = U_ZERO_ERROR;
251 UnicodeSet testSet2(pat, ec);
252 if (testSet2 != testSet) {
253 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
254 return FALSE;
255 }
256 return TRUE;
257 }
258
259 void
TestPatterns(void)260 UnicodeSetTest::TestPatterns(void) {
261 UnicodeSet set;
262 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
263 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
264 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
265 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
266 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
267 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
268
269 // Throw in a test of complement
270 set.complement();
271 UnicodeString exp;
272 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
273 expectPairs(set, exp);
274 }
275
276 void
TestCategories(void)277 UnicodeSetTest::TestCategories(void) {
278 UErrorCode status = U_ZERO_ERROR;
279 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
280 UnicodeSet set(pat, status);
281 if (U_FAILURE(status)) {
282 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
283 return;
284 } else {
285 expectContainment(set, pat, "ABC", "abc");
286 }
287
288 UChar32 i;
289 int32_t failures = 0;
290 // Make sure generation of L doesn't pollute cached Lu set
291 // First generate L, then Lu
292 set.applyPattern("[:L:]", status);
293 if (U_FAILURE(status)) { errln("FAIL"); return; }
294 for (i=0; i<0x200; ++i) {
295 UBool l = u_isalpha((UChar)i);
296 if (l != set.contains(i)) {
297 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
298 set.contains(i));
299 if (++failures == 10) break;
300 }
301 }
302
303 set.applyPattern("[:Lu:]", status);
304 if (U_FAILURE(status)) { errln("FAIL"); return; }
305 for (i=0; i<0x200; ++i) {
306 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
307 if (lu != set.contains(i)) {
308 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
309 set.contains(i));
310 if (++failures == 20) break;
311 }
312 }
313 }
314 void
TestCloneEqualHash(void)315 UnicodeSetTest::TestCloneEqualHash(void) {
316 UErrorCode status = U_ZERO_ERROR;
317 // set1 and set2 used to be built with the obsolete constructor taking
318 // UCharCategory values; replaced with pattern constructors
319 // markus 20030502
320 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase
321 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase
322 if (U_FAILURE(status)){
323 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
324 return;
325 }
326 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit
327 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit
328 if (U_FAILURE(status)){
329 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
330 return;
331 }
332
333 if (*set1 != *set1a) {
334 errln("FAIL: category constructor for Ll broken");
335 }
336 if (*set2 != *set2a) {
337 errln("FAIL: category constructor for Nd broken");
338 }
339 delete set1a;
340 delete set2a;
341
342 logln("Testing copy construction");
343 UnicodeSet *set1copy=new UnicodeSet(*set1);
344 if(*set1 != *set1copy || *set1 == *set2 ||
345 getPairs(*set1) != getPairs(*set1copy) ||
346 set1->hashCode() != set1copy->hashCode()){
347 errln("FAIL : Error in copy construction");
348 return;
349 }
350
351 logln("Testing =operator");
352 UnicodeSet set1equal=*set1;
353 UnicodeSet set2equal=*set2;
354 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
355 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
356 errln("FAIL: Error in =operator");
357 }
358
359 logln("Testing clone()");
360 UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
361 UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
362 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
363 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
364 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
365 errln("FAIL: Error in clone");
366 }
367
368 logln("Testing hashcode");
369 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
370 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
371 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
372 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
373 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
374 errln("FAIL: Error in hashCode()");
375 }
376
377 delete set1;
378 delete set1copy;
379 delete set2;
380 delete set1clone;
381 delete set2clone;
382
383
384 }
385 void
TestAddRemove(void)386 UnicodeSetTest::TestAddRemove(void) {
387 UnicodeSet set; // Construct empty set
388 doAssert(set.isEmpty() == TRUE, "set should be empty");
389 doAssert(set.size() == 0, "size should be 0");
390 set.complement();
391 doAssert(set.size() == 0x110000, "size should be 0x110000");
392 set.clear();
393 set.add(0x0061, 0x007a);
394 expectPairs(set, "az");
395 doAssert(set.isEmpty() == FALSE, "set should not be empty");
396 doAssert(set.size() != 0, "size should not be equal to 0");
397 doAssert(set.size() == 26, "size should be equal to 26");
398 set.remove(0x006d, 0x0070);
399 expectPairs(set, "alqz");
400 doAssert(set.size() == 22, "size should be equal to 22");
401 set.remove(0x0065, 0x0067);
402 expectPairs(set, "adhlqz");
403 doAssert(set.size() == 19, "size should be equal to 19");
404 set.remove(0x0064, 0x0069);
405 expectPairs(set, "acjlqz");
406 doAssert(set.size() == 16, "size should be equal to 16");
407 set.remove(0x0063, 0x0072);
408 expectPairs(set, "absz");
409 doAssert(set.size() == 10, "size should be equal to 10");
410 set.add(0x0066, 0x0071);
411 expectPairs(set, "abfqsz");
412 doAssert(set.size() == 22, "size should be equal to 22");
413 set.remove(0x0061, 0x0067);
414 expectPairs(set, "hqsz");
415 set.remove(0x0061, 0x007a);
416 expectPairs(set, "");
417 doAssert(set.isEmpty() == TRUE, "set should be empty");
418 doAssert(set.size() == 0, "size should be 0");
419 set.add(0x0061);
420 doAssert(set.isEmpty() == FALSE, "set should not be empty");
421 doAssert(set.size() == 1, "size should not be equal to 1");
422 set.add(0x0062);
423 set.add(0x0063);
424 expectPairs(set, "ac");
425 doAssert(set.size() == 3, "size should not be equal to 3");
426 set.add(0x0070);
427 set.add(0x0071);
428 expectPairs(set, "acpq");
429 doAssert(set.size() == 5, "size should not be equal to 5");
430 set.clear();
431 expectPairs(set, "");
432 doAssert(set.isEmpty() == TRUE, "set should be empty");
433 doAssert(set.size() == 0, "size should be 0");
434
435 // Try removing an entire set from another set
436 expectPattern(set, "[c-x]", "cx");
437 UnicodeSet set2;
438 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
439 set.removeAll(set2);
440 expectPairs(set, "deluxx");
441
442 // Try adding an entire set to another set
443 expectPattern(set, "[jackiemclean]", "aacceein");
444 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
445 set.addAll(set2);
446 expectPairs(set, "aacehort");
447 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
448
449 // Try retaining an set of elements contained in another set (intersection)
450 UnicodeSet set3;
451 expectPattern(set3, "[a-c]", "ac");
452 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
453 set3.remove(0x0062);
454 expectPairs(set3, "aacc");
455 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
456 set.retainAll(set3);
457 expectPairs(set, "aacc");
458 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
459 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
460 set.clear();
461 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
462
463 // Test commutativity
464 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
465 expectPattern(set2, "[jackiemclean]", "aacceein");
466 set.addAll(set2);
467 expectPairs(set, "aacehort");
468 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
469
470
471
472
473 }
474
475 /**
476 * Make sure minimal representation is maintained.
477 */
TestMinimalRep()478 void UnicodeSetTest::TestMinimalRep() {
479 UErrorCode status = U_ZERO_ERROR;
480 // This is pretty thoroughly tested by checkCanonicalRep()
481 // run against the exhaustive operation results. Use the code
482 // here for debugging specific spot problems.
483
484 // 1 overlap against 2
485 UnicodeSet set("[h-km-q]", status);
486 if (U_FAILURE(status)) { errln("FAIL"); return; }
487 UnicodeSet set2("[i-o]", status);
488 if (U_FAILURE(status)) { errln("FAIL"); return; }
489 set.addAll(set2);
490 expectPairs(set, "hq");
491 // right
492 set.applyPattern("[a-m]", status);
493 if (U_FAILURE(status)) { errln("FAIL"); return; }
494 set2.applyPattern("[e-o]", status);
495 if (U_FAILURE(status)) { errln("FAIL"); return; }
496 set.addAll(set2);
497 expectPairs(set, "ao");
498 // left
499 set.applyPattern("[e-o]", status);
500 if (U_FAILURE(status)) { errln("FAIL"); return; }
501 set2.applyPattern("[a-m]", status);
502 if (U_FAILURE(status)) { errln("FAIL"); return; }
503 set.addAll(set2);
504 expectPairs(set, "ao");
505 // 1 overlap against 3
506 set.applyPattern("[a-eg-mo-w]", status);
507 if (U_FAILURE(status)) { errln("FAIL"); return; }
508 set2.applyPattern("[d-q]", status);
509 if (U_FAILURE(status)) { errln("FAIL"); return; }
510 set.addAll(set2);
511 expectPairs(set, "aw");
512 }
513
/**
 * Broad API test for UnicodeSet.  In order, it exercises: the default
 * constructor, clear()/isEmpty(), size(), contains(first, last), the
 * ported InversionList operations (addAll, complement, complementAll,
 * setToBits/bitsToSet), the string-based and "some/none" methods added
 * for JB#2118 coverage, serialize(), conversions to and from USet, and
 * the span()/spanBack() overloads taking a UnicodeString.
 *
 * The steps share the `set` and `exp` objects, so their order matters.
 */
void UnicodeSetTest::TestAPI() {
    UErrorCode status = U_ZERO_ERROR;
    // default ct
    UnicodeSet set;
    if (!set.isEmpty() || set.getRangeCount() != 0) {
        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
              set);
    }

    // clear(), isEmpty()
    set.add(0x0061);
    if (set.isEmpty()) {
        errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
              set);
    }
    set.clear();
    if (!set.isEmpty()) {
        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
              set);
    }

    // size()
    set.clear();
    if (set.size() != 0) {
        errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
              ": " + set);
    }
    set.add(0x0061);
    if (set.size() != 1) {
        errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
              ": " + set);
    }
    set.add(0x0031, 0x0039);
    if (set.size() != 10) {
        errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
              ": " + set);
    }

    // contains(first, last)
    // Each range must be contained exactly: widening it by one code point
    // on either side must not be contained.
    set.clear();
    set.applyPattern("[A-Y 1-8 b-d l-y]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    for (int32_t i = 0; i<set.getRangeCount(); ++i) {
        UChar32 a = set.getRangeStart(i);
        UChar32 b = set.getRangeEnd(i);
        if (!set.contains(a, b)) {
            errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
                  " but doesn't: " + set);
        }
        if (set.contains((UChar32)(a-1), b)) {
            errln((UnicodeString)"FAIL, shouldn't contain " +
                  (unsigned short)(a-1) + '-' + (unsigned short)b +
                  " but does: " + set);
        }
        if (set.contains(a, (UChar32)(b+1))) {
            errln((UnicodeString)"FAIL, shouldn't contain " +
                  (unsigned short)a + '-' + (unsigned short)(b+1) +
                  " but does: " + set);
        }
    }

    // Ported InversionList test.
    UnicodeSet a((UChar32)3,(UChar32)10);
    UnicodeSet b((UChar32)7,(UChar32)15);
    UnicodeSet c;

    logln((UnicodeString)"a [3-10]: " + a);
    logln((UnicodeString)"b [7-15]: " + b);
    // Union: [3-10] + [7-15] = [3-15]
    c = a;
    c.addAll(b);
    UnicodeSet exp((UChar32)3,(UChar32)15);
    if (c == exp) {
        logln((UnicodeString)"c.set(a).add(b): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
    }
    // Complement of [3-15] is [0-2] plus [16-MAX_VALUE]
    c.complement();
    exp.set((UChar32)0, (UChar32)2);
    exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
    if (c == exp) {
        logln((UnicodeString)"c.complement(): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    }
    // Complementing again must restore [3-15]
    c.complement();
    exp.set((UChar32)3, (UChar32)15);
    if (c == exp) {
        logln((UnicodeString)"c.complement(): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    }
    // Symmetric difference: [3-10] xor [7-15] = [3-6] + [11-15]
    c = a;
    c.complementAll(b);
    exp.set((UChar32)3,(UChar32)6);
    exp.add((UChar32)11,(UChar32) 15);
    if (c == exp) {
        logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
    }

    // Round trip through the bit representation must leave c unchanged.
    exp = c;
    bitsToSet(setToBits(c), c);
    if (c == exp) {
        logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
    } else {
        errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
    }

    // Additional tests for coverage JB#2118
    //UnicodeSet::complement(class UnicodeString const &)
    //UnicodeSet::complementAll(class UnicodeString const &)
    //UnicodeSet::containsNone(class UnicodeSet const &)
    //UnicodeSet::containsNone(long,long)
    //UnicodeSet::containsSome(class UnicodeSet const &)
    //UnicodeSet::containsSome(long,long)
    //UnicodeSet::removeAll(class UnicodeString const &)
    //UnicodeSet::retain(long)
    //UnicodeSet::retainAll(class UnicodeString const &)
    //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
    //UnicodeSetIterator::getString(void)
    set.clear();
    set.complement("ab");
    exp.applyPattern("[{ab}]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }

    // The only element of the set is the string "ab".
    UnicodeSetIterator iset(set);
    if (!iset.next() || !iset.isString()) {
        errln("FAIL: UnicodeSetIterator::next/isString");
    } else if (iset.getString() != "ab") {
        errln("FAIL: UnicodeSetIterator::getString");
    }

    set.add((UChar32)0x61, (UChar32)0x7A);
    set.complementAll("alan");
    exp.applyPattern("[{ab}b-kmo-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }

    exp.applyPattern("[a-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
    exp.applyPattern("[aln]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }

    // Range overloads: a-z overlaps the set, A-Z does not.
    if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
        errln("FAIL: containsNone(UChar32, UChar32)");
    }
    if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
        errln("FAIL: containsSome(UChar32, UChar32)");
    }
    if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
        errln("FAIL: containsNone(UChar32, UChar32)");
    }
    if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
        errln("FAIL: containsSome(UChar32, UChar32)");
    }

    set.removeAll("liu");
    exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }

    set.retainAll("star");
    exp.applyPattern("[rst]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }

    set.retain((UChar32)0x73);
    exp.applyPattern("[s]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: retain('s')"); return; }

    // Serialize the set [s]; the check below expects three units:
    // 2, 0x73, 0x74.
    uint16_t buf[32];
    int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
    if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
    if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
        errln("FAIL: serialize");
        return;
    }

    // Conversions to and from USet
    // Each conversion is expected to yield the same address it was given.
    UnicodeSet *uniset = &set;
    USet *uset = uniset->toUSet();
    TEST_ASSERT((void *)uset == (void *)uniset);
    UnicodeSet *setx = UnicodeSet::fromUSet(uset);
    TEST_ASSERT((void *)setx == (void *)uset);
    const UnicodeSet *constSet = uniset;
    const USet *constUSet = constSet->toUSet();
    TEST_ASSERT((void *)constUSet == (void *)constSet);
    const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
    TEST_ASSERT((void *)constSetx == (void *)constUSet);

    // span(UnicodeString) and spanBack(UnicodeString) convenience methods
    // The string is ten 'a', ten 'b', ten 'c'; the set holds 'a' and 'c'.
    // Start indexes outside [0, 30] are included on purpose.
    UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
    UnicodeSet ac(0x61, 0x63);
    ac.remove(0x62).freeze();
    if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
        ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
    ) {
        errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
    }
    if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
        ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
    ) {
        errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
    }
}
742
TestIteration()743 void UnicodeSetTest::TestIteration() {
744 UErrorCode ec = U_ZERO_ERROR;
745 int i = 0;
746 int outerLoop;
747
748 // 6 code points, 3 ranges, 2 strings, 8 total elements
749 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
750 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
751 TEST_ASSERT_SUCCESS(ec);
752 UnicodeSetIterator it(set);
753
754 for (outerLoop=0; outerLoop<3; outerLoop++) {
755 // Run the test multiple times, to check that iterator.reset() is working.
756 for (i=0; i<10; i++) {
757 UBool nextv = it.next();
758 UBool isString = it.isString();
759 int32_t codePoint = it.getCodepoint();
760 //int32_t codePointEnd = it.getCodepointEnd();
761 UnicodeString s = it.getString();
762 switch (i) {
763 case 0:
764 TEST_ASSERT(nextv == TRUE);
765 TEST_ASSERT(isString == FALSE);
766 TEST_ASSERT(codePoint==0x61);
767 TEST_ASSERT(s == "a");
768 break;
769 case 1:
770 TEST_ASSERT(nextv == TRUE);
771 TEST_ASSERT(isString == FALSE);
772 TEST_ASSERT(codePoint==0x62);
773 TEST_ASSERT(s == "b");
774 break;
775 case 2:
776 TEST_ASSERT(nextv == TRUE);
777 TEST_ASSERT(isString == FALSE);
778 TEST_ASSERT(codePoint==0x63);
779 TEST_ASSERT(s == "c");
780 break;
781 case 3:
782 TEST_ASSERT(nextv == TRUE);
783 TEST_ASSERT(isString == FALSE);
784 TEST_ASSERT(codePoint==0x79);
785 TEST_ASSERT(s == "y");
786 break;
787 case 4:
788 TEST_ASSERT(nextv == TRUE);
789 TEST_ASSERT(isString == FALSE);
790 TEST_ASSERT(codePoint==0x7a);
791 TEST_ASSERT(s == "z");
792 break;
793 case 5:
794 TEST_ASSERT(nextv == TRUE);
795 TEST_ASSERT(isString == FALSE);
796 TEST_ASSERT(codePoint==0x1abcd);
797 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
798 break;
799 case 6:
800 TEST_ASSERT(nextv == TRUE);
801 TEST_ASSERT(isString == TRUE);
802 TEST_ASSERT(s == "str1");
803 break;
804 case 7:
805 TEST_ASSERT(nextv == TRUE);
806 TEST_ASSERT(isString == TRUE);
807 TEST_ASSERT(s == "str2");
808 break;
809 case 8:
810 TEST_ASSERT(nextv == FALSE);
811 break;
812 case 9:
813 TEST_ASSERT(nextv == FALSE);
814 break;
815 }
816 }
817 it.reset(); // prepare to run the iteration again.
818 }
819 }
820
821
822
823
TestStrings()824 void UnicodeSetTest::TestStrings() {
825 UErrorCode ec = U_ZERO_ERROR;
826
827 UnicodeSet* testList[] = {
828 UnicodeSet::createFromAll("abc"),
829 new UnicodeSet("[a-c]", ec),
830
831 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
832 new UnicodeSet("[{ll}{ch}a-z]", ec),
833
834 UnicodeSet::createFrom("ab}c"),
835 new UnicodeSet("[{ab\\}c}]", ec),
836
837 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
838 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
839
840 NULL
841 };
842
843 if (U_FAILURE(ec)) {
844 errln("FAIL: couldn't construct test sets");
845 }
846
847 for (int32_t i = 0; testList[i] != NULL; i+=2) {
848 if (U_SUCCESS(ec)) {
849 UnicodeString pat0, pat1;
850 testList[i]->toPattern(pat0, TRUE);
851 testList[i+1]->toPattern(pat1, TRUE);
852 if (*testList[i] == *testList[i+1]) {
853 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
854 } else {
855 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
856 }
857 }
858 delete testList[i];
859 delete testList[i+1];
860 }
861 }
862
863 /**
864 * Test the [:Latin:] syntax.
865 */
TestScriptSet()866 void UnicodeSetTest::TestScriptSet() {
867 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
868
869 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
870
871 /* Jitterbug 1423 */
872 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
873
874 }
875
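/**
 * Test property-based set patterns ([:prop:], \p{...}, \P{...}, \N{...})
 * together with a few regex-compatibility patterns, driven by a table of
 * pattern / chars-in / chars-not-in triples.
 */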
TestPropertySet()879 void UnicodeSetTest::TestPropertySet() {
880 static const char* const DATA[] = {
881 // Pattern, Chars IN, Chars NOT in
882
883 "[:Latin:]",
884 "aA",
885 "\\u0391\\u03B1",
886
887 "[\\p{Greek}]",
888 "\\u0391\\u03B1",
889 "aA",
890
891 "\\P{ GENERAL Category = upper case letter }",
892 "abc",
893 "ABC",
894
895 #if !UCONFIG_NO_NORMALIZATION
896 // Combining class: @since ICU 2.2
897 // Check both symbolic and numeric
898 "\\p{ccc=Nukta}",
899 "\\u0ABC",
900 "abc",
901
902 "\\p{Canonical Combining Class = 11}",
903 "\\u05B1",
904 "\\u05B2",
905
906 "[:c c c = iota subscript :]",
907 "\\u0345",
908 "xyz",
909 #endif
910
911 // Bidi class: @since ICU 2.2
912 "\\p{bidiclass=lefttoright}",
913 "abc",
914 "\\u0671\\u0672",
915
916 // Binary properties: @since ICU 2.2
917 "\\p{ideographic}",
918 "\\u4E0A",
919 "x",
920
921 "[:math=false:]",
922 "q)*(",
923 // weiv: )(and * were removed from math in Unicode 4.0.1
924 //"(*+)",
925 "+<>^",
926
927 // JB#1767 \N{}, \p{ASCII}
928 "[:Ascii:]",
929 "abc\\u0000\\u007F",
930 "\\u0080\\u4E00",
931
932 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
933 "az",
934 "qrs",
935
936 // JB#2015
937 "[:any:]",
938 "a\\U0010FFFF",
939 "",
940
941 "[:nv=0.5:]",
942 "\\u00BD\\u0F2A",
943 "\\u00BC",
944
945 // JB#2653: Age
946 "[:Age=1.1:]",
947 "\\u03D6", // 1.1
948 "\\u03D8\\u03D9", // 3.2
949
950 "[:Age=3.1:]",
951 "\\u1800\\u3400\\U0002f800",
952 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
953
954 // JB#2350: Case_Sensitive
955 "[:Case Sensitive:]",
956 "A\\u1FFC\\U00010410",
957 ";\\u00B4\\U00010500",
958
959 // JB#2832: C99-compatibility props
960 "[:blank:]",
961 " \\u0009",
962 "1-9A-Z",
963
964 "[:graph:]",
965 "19AZ",
966 " \\u0003\\u0007\\u0009\\u000A\\u000D",
967
968 "[:punct:]",
969 "!@#%&*()[]{}-_\\/;:,.?'\"",
970 "09azAZ",
971
972 "[:xdigit:]",
973 "09afAF",
974 "gG!",
975
976 // Regex compatibility test
977 "[-b]", // leading '-' is literal
978 "-b",
979 "ac",
980
981 "[^-b]", // leading '-' is literal
982 "ac",
983 "-b",
984
985 "[b-]", // trailing '-' is literal
986 "-b",
987 "ac",
988
989 "[^b-]", // trailing '-' is literal
990 "ac",
991 "-b",
992
993 "[a-b-]", // trailing '-' is literal
994 "ab-",
995 "c=",
996
997 "[[a-q]&[p-z]-]", // trailing '-' is literal
998 "pq-",
999 "or=",
1000
1001 "[\\s|\\)|:|$|\\>]", // from regex tests
1002 "s|):$>",
1003 "abc",
1004
1005 "[\\uDC00cd]", // JB#2906: isolated trail at start
1006 "cd\\uDC00",
1007 "ab\\uD800\\U00010000",
1008
1009 "[ab\\uD800]", // JB#2906: isolated trail at start
1010 "ab\\uD800",
1011 "cd\\uDC00\\U00010000",
1012
1013 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1014 "abcd\\uD800",
1015 "ef\\uDC00\\U00010000",
1016
1017 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1018 "abcd\\uDC00",
1019 "ef\\uD800\\U00010000",
1020
1021 #if !UCONFIG_NO_NORMALIZATION
1022 "[:^lccc=0:]", // Lead canonical class
1023 "\\u0300\\u0301",
1024 "abcd\\u00c0\\u00c5",
1025
1026 "[:^tccc=0:]", // Trail canonical class
1027 "\\u0300\\u0301\\u00c0\\u00c5",
1028 "abcd",
1029
1030 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1031 "\\u0300\\u0301\\u00c0\\u00c5",
1032 "abcd",
1033
1034 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1035 "",
1036 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1037
1038 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1039 "\\u0F73\\u0F75\\u0F81",
1040 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1041 #endif /* !UCONFIG_NO_NORMALIZATION */
1042
1043 "[:Assigned:]",
1044 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1045 "\\u0888\\uFDD3\\uFFFE\\U00050005",
1046
1047 // Script_Extensions, new in Unicode 6.0
1048 "[:scx=Arab:]",
1049 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1050 "\\u061D\\uFDEF\\uFDFE",
1051
1052 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1053 // so scx-sc is missing U+FDF2.
1054 "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1055 "\\u0640\\u064B\\u0650\\u0655",
1056 "\\uFDF2"
1057 };
1058
1059 static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1060
1061 for (int32_t i=0; i<DATA_LEN; i+=3) {
1062 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1063 CharsToUnicodeString(DATA[i+2]));
1064 }
1065 }
1066
1067 /**
1068 * Test that Posix style character classes [:digit:], etc.
1069 * have the Unicode definitions from TR 18.
1070 */
TestPosixClasses()1071 void UnicodeSetTest::TestPosixClasses() {
1072 {
1073 UErrorCode status = U_ZERO_ERROR;
1074 UnicodeSet s1("[:alpha:]", status);
1075 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1076 TEST_ASSERT_SUCCESS(status);
1077 TEST_ASSERT(s1==s2);
1078 }
1079 {
1080 UErrorCode status = U_ZERO_ERROR;
1081 UnicodeSet s1("[:lower:]", status);
1082 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1083 TEST_ASSERT_SUCCESS(status);
1084 TEST_ASSERT(s1==s2);
1085 }
1086 {
1087 UErrorCode status = U_ZERO_ERROR;
1088 UnicodeSet s1("[:upper:]", status);
1089 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1090 TEST_ASSERT_SUCCESS(status);
1091 TEST_ASSERT(s1==s2);
1092 }
1093 {
1094 UErrorCode status = U_ZERO_ERROR;
1095 UnicodeSet s1("[:punct:]", status);
1096 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1097 TEST_ASSERT_SUCCESS(status);
1098 TEST_ASSERT(s1==s2);
1099 }
1100 {
1101 UErrorCode status = U_ZERO_ERROR;
1102 UnicodeSet s1("[:digit:]", status);
1103 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1104 TEST_ASSERT_SUCCESS(status);
1105 TEST_ASSERT(s1==s2);
1106 }
1107 {
1108 UErrorCode status = U_ZERO_ERROR;
1109 UnicodeSet s1("[:xdigit:]", status);
1110 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1111 TEST_ASSERT_SUCCESS(status);
1112 TEST_ASSERT(s1==s2);
1113 }
1114 {
1115 UErrorCode status = U_ZERO_ERROR;
1116 UnicodeSet s1("[:alnum:]", status);
1117 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1118 TEST_ASSERT_SUCCESS(status);
1119 TEST_ASSERT(s1==s2);
1120 }
1121 {
1122 UErrorCode status = U_ZERO_ERROR;
1123 UnicodeSet s1("[:space:]", status);
1124 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1125 TEST_ASSERT_SUCCESS(status);
1126 TEST_ASSERT(s1==s2);
1127 }
1128 {
1129 UErrorCode status = U_ZERO_ERROR;
1130 UnicodeSet s1("[:blank:]", status);
1131 TEST_ASSERT_SUCCESS(status);
1132 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1133 status);
1134 TEST_ASSERT_SUCCESS(status);
1135 TEST_ASSERT(s1==s2);
1136 }
1137 {
1138 UErrorCode status = U_ZERO_ERROR;
1139 UnicodeSet s1("[:cntrl:]", status);
1140 TEST_ASSERT_SUCCESS(status);
1141 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1142 TEST_ASSERT_SUCCESS(status);
1143 TEST_ASSERT(s1==s2);
1144 }
1145 {
1146 UErrorCode status = U_ZERO_ERROR;
1147 UnicodeSet s1("[:graph:]", status);
1148 TEST_ASSERT_SUCCESS(status);
1149 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1150 TEST_ASSERT_SUCCESS(status);
1151 TEST_ASSERT(s1==s2);
1152 }
1153 {
1154 UErrorCode status = U_ZERO_ERROR;
1155 UnicodeSet s1("[:print:]", status);
1156 TEST_ASSERT_SUCCESS(status);
1157 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1158 TEST_ASSERT_SUCCESS(status);
1159 TEST_ASSERT(s1==s2);
1160 }
1161 }
1162 /**
1163 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1164 */
TestClone()1165 void UnicodeSetTest::TestClone() {
1166 UErrorCode ec = U_ZERO_ERROR;
1167 UnicodeSet s("[abcxyz]", ec);
1168 UnicodeSet t(s);
1169 expectContainment(t, "abc", "def");
1170 }
1171
1172 /**
1173 * Test the indexOf() and charAt() methods.
1174 */
TestIndexOf()1175 void UnicodeSetTest::TestIndexOf() {
1176 UErrorCode ec = U_ZERO_ERROR;
1177 UnicodeSet set("[a-cx-y3578]", ec);
1178 if (U_FAILURE(ec)) {
1179 errln("FAIL: UnicodeSet constructor");
1180 return;
1181 }
1182 for (int32_t i=0; i<set.size(); ++i) {
1183 UChar32 c = set.charAt(i);
1184 if (set.indexOf(c) != i) {
1185 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1186 i, c, set.indexOf(c));
1187 }
1188 }
1189 UChar32 c = set.charAt(set.size());
1190 if (c != -1) {
1191 errln("FAIL: charAt(<out of range>) = %X", c);
1192 }
1193 int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1194 if (j != -1) {
1195 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1196 }
1197 }
1198
1199 /**
1200 * Test closure API.
1201 */
TestCloseOver()1202 void UnicodeSetTest::TestCloseOver() {
1203 UErrorCode ec = U_ZERO_ERROR;
1204
1205 char CASE[] = {(char)USET_CASE_INSENSITIVE};
1206 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1207 const char* DATA[] = {
1208 // selector, input, output
1209 CASE,
1210 "[aq\\u00DF{Bc}{bC}{Fi}]",
1211 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1212
1213 CASE,
1214 "[\\u01F1]", // 'DZ'
1215 "[\\u01F1\\u01F2\\u01F3]",
1216
1217 CASE,
1218 "[\\u1FB4]",
1219 "[\\u1FB4{\\u03AC\\u03B9}]",
1220
1221 CASE,
1222 "[{F\\uFB01}]",
1223 "[\\uFB03{ffi}]",
1224
1225 CASE, // make sure binary search finds limits
1226 "[a\\uFF3A]",
1227 "[aA\\uFF3A\\uFF5A]",
1228
1229 CASE,
1230 "[a-z]","[A-Za-z\\u017F\\u212A]",
1231 CASE,
1232 "[abc]","[A-Ca-c]",
1233 CASE,
1234 "[ABC]","[A-Ca-c]",
1235
1236 CASE, "[i]", "[iI]",
1237
1238 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
1239 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
1240
1241 CASE, "[\\u0131]", "[\\u0131]", // dotless i
1242
1243 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1244
1245 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
1246
1247 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
1248
1249 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
1250
1251 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1252
1253 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
1254 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
1255
1256 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
1257
1258 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
1259
1260 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1261
1262 #if !UCONFIG_NO_FILE_IO
1263 CASE_MAPPINGS,
1264 "[aq\\u00DF{Bc}{bC}{Fi}]",
1265 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1266 #endif
1267
1268 CASE_MAPPINGS,
1269 "[\\u01F1]", // 'DZ'
1270 "[\\u01F1\\u01F2\\u01F3]",
1271
1272 CASE_MAPPINGS,
1273 "[a-z]",
1274 "[A-Za-z]",
1275
1276 NULL
1277 };
1278
1279 UnicodeSet s;
1280 UnicodeSet t;
1281 UnicodeString buf;
1282 for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1283 int32_t selector = DATA[i][0];
1284 UnicodeString pat(DATA[i+1], -1, US_INV);
1285 UnicodeString exp(DATA[i+2], -1, US_INV);
1286 s.applyPattern(pat, ec);
1287 s.closeOver(selector);
1288 t.applyPattern(exp, ec);
1289 if (U_FAILURE(ec)) {
1290 errln("FAIL: applyPattern failed");
1291 continue;
1292 }
1293 if (s == t) {
1294 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1295 } else {
1296 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1297 s.toPattern(buf, TRUE) + ", expected " + exp);
1298 }
1299 }
1300
1301 #if 0
1302 /*
1303 * Unused test code.
1304 * This was used to compare the old implementation (using USET_CASE)
1305 * with the new one (using 0x100 temporarily)
1306 * while transitioning from hardcoded case closure tables in uniset.cpp
1307 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1308 * and using ucase.c functions for closure.
1309 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1310 *
1311 * Note: The old and new implementation never fully matched because
1312 * the old implementation turned out to not map U+0130 and U+0131 correctly
1313 * (dotted I and dotless i) and because the old implementation's data tables
1314 * were outdated compared to Unicode 4.0.1 at the time of the change to the
1315 * new implementation. (So sigmas and some other characters were not handled
1316 * according to the newer Unicode version.)
1317 */
1318 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1319 UnicodeSetIterator si(sens);
1320 UnicodeString str, buf2;
1321 const UnicodeString *pStr;
1322 UChar32 c;
1323 while(si.next()) {
1324 if(!si.isString()) {
1325 c=si.getCodepoint();
1326 s.clear();
1327 s.add(c);
1328
1329 str.setTo(c);
1330 str.foldCase();
1331 sens2.add(str);
1332
1333 t=s;
1334 s.closeOver(USET_CASE);
1335 t.closeOver(0x100);
1336 if(s!=t) {
1337 errln("FAIL: closeOver(U+%04x) differs: ", c);
1338 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1339 }
1340 }
1341 }
1342 // remove all code points
1343 // should contain all full case folding mapping strings
1344 sens2.remove(0, 0x10ffff);
1345 si.reset(sens2);
1346 while(si.next()) {
1347 if(si.isString()) {
1348 pStr=&si.getString();
1349 s.clear();
1350 s.add(*pStr);
1351 t=s2=s;
1352 s.closeOver(USET_CASE);
1353 t.closeOver(0x100);
1354 if(s!=t) {
1355 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1356 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1357 }
1358 }
1359 }
1360 #endif
1361
1362 // Test the pattern API
1363 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1364 if (U_FAILURE(ec)) {
1365 errln("FAIL: applyPattern failed");
1366 } else {
1367 expectContainment(s, "abcABC", "defDEF");
1368 }
1369 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1370 if (U_FAILURE(ec)) {
1371 errln("FAIL: constructor failed");
1372 } else {
1373 expectContainment(v, "defDEF", "abcABC");
1374 }
1375 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1376 if (U_FAILURE(ec)) {
1377 errln("FAIL: construct w/case mappings failed");
1378 } else {
1379 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1380 }
1381 }
1382
TestEscapePattern()1383 void UnicodeSetTest::TestEscapePattern() {
1384 const char pattern[] =
1385 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1386 const char exp[] =
1387 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1388 // We test this with two passes; in the second pass we
1389 // pre-unescape the pattern. Since U+200E is Pattern_White_Space,
1390 // this fails -- which is what we expect.
1391 for (int32_t pass=1; pass<=2; ++pass) {
1392 UErrorCode ec = U_ZERO_ERROR;
1393 UnicodeString pat(pattern, -1, US_INV);
1394 if (pass==2) {
1395 pat = pat.unescape();
1396 }
1397 // Pattern is only good for pass 1
1398 UBool isPatternValid = (pass==1);
1399
1400 UnicodeSet set(pat, ec);
1401 if (U_SUCCESS(ec) != isPatternValid){
1402 errln((UnicodeString)"FAIL: applyPattern(" +
1403 escape(pat) + ") => " +
1404 u_errorName(ec));
1405 continue;
1406 }
1407 if (U_FAILURE(ec)) {
1408 continue;
1409 }
1410 if (set.contains((UChar)0x0644)){
1411 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1412 }
1413
1414 UnicodeString newpat;
1415 set.toPattern(newpat, TRUE);
1416 if (newpat == UnicodeString(exp, -1, US_INV)) {
1417 logln(escape(pat) + " => " + newpat);
1418 } else {
1419 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1420 }
1421
1422 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1423 UnicodeString str("Range ");
1424 str.append((UChar)(0x30 + i))
1425 .append(": ")
1426 .append((UChar32)set.getRangeStart(i))
1427 .append(" - ")
1428 .append((UChar32)set.getRangeEnd(i));
1429 str = str + " (" + set.getRangeStart(i) + " - " +
1430 set.getRangeEnd(i) + ")";
1431 if (set.getRangeStart(i) < 0) {
1432 errln((UnicodeString)"FAIL: " + escape(str));
1433 } else {
1434 logln(escape(str));
1435 }
1436 }
1437 }
1438 }
1439
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1440 void UnicodeSetTest::expectRange(const UnicodeString& label,
1441 const UnicodeSet& set,
1442 UChar32 start, UChar32 end) {
1443 UnicodeSet exp(start, end);
1444 UnicodeString pat;
1445 if (set == exp) {
1446 logln(label + " => " + set.toPattern(pat, TRUE));
1447 } else {
1448 UnicodeString xpat;
1449 errln((UnicodeString)"FAIL: " + label + " => " +
1450 set.toPattern(pat, TRUE) +
1451 ", expected " + exp.toPattern(xpat, TRUE));
1452 }
1453 }
1454
TestInvalidCodePoint()1455 void UnicodeSetTest::TestInvalidCodePoint() {
1456
1457 const UChar32 DATA[] = {
1458 // Test range Expected range
1459 0, 0x10FFFF, 0, 0x10FFFF,
1460 (UChar32)-1, 8, 0, 8,
1461 8, 0x110000, 8, 0x10FFFF
1462 };
1463 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1464
1465 UnicodeString pat;
1466 int32_t i;
1467
1468 for (i=0; i<DATA_LENGTH; i+=4) {
1469 UChar32 start = DATA[i];
1470 UChar32 end = DATA[i+1];
1471 UChar32 xstart = DATA[i+2];
1472 UChar32 xend = DATA[i+3];
1473
1474 // Try various API using the test code points
1475
1476 UnicodeSet set(start, end);
1477 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1478 set, xstart, xend);
1479
1480 set.clear();
1481 set.set(start, end);
1482 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1483 set, xstart, xend);
1484
1485 UBool b = set.contains(start);
1486 b = set.contains(start, end);
1487 b = set.containsNone(start, end);
1488 b = set.containsSome(start, end);
1489 (void)b; // Suppress set but not used warning.
1490
1491 /*int32_t index = set.indexOf(start);*/
1492
1493 set.clear();
1494 set.add(start);
1495 set.add(start, end);
1496 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1497 set, xstart, xend);
1498
1499 set.set(0, 0x10FFFF);
1500 set.retain(start, end);
1501 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1502 set, xstart, xend);
1503 set.retain(start);
1504
1505 set.set(0, 0x10FFFF);
1506 set.remove(start);
1507 set.remove(start, end);
1508 set.complement();
1509 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1510 set, xstart, xend);
1511
1512 set.set(0, 0x10FFFF);
1513 set.complement(start, end);
1514 set.complement();
1515 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1516 set, xstart, xend);
1517 set.complement(start);
1518 }
1519
1520 const UChar32 DATA2[] = {
1521 0,
1522 0x10FFFF,
1523 (UChar32)-1,
1524 0x110000
1525 };
1526 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1527
1528 for (i=0; i<DATA2_LENGTH; ++i) {
1529 UChar32 c = DATA2[i], end = 0x10FFFF;
1530 UBool valid = (c >= 0 && c <= 0x10FFFF);
1531
1532 UnicodeSet set(0, 0x10FFFF);
1533
1534 // For single-codepoint contains, invalid codepoints are NOT contained
1535 UBool b = set.contains(c);
1536 if (b == valid) {
1537 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1538 ") = " + b);
1539 } else {
1540 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1541 ") = " + b);
1542 }
1543
1544 // For codepoint range contains, containsNone, and containsSome,
1545 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1546 b = set.contains(c, end);
1547 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1548 "," + end + ") = " + b);
1549
1550 b = set.containsNone(c, end);
1551 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1552 "," + end + ") = " + b);
1553
1554 b = set.containsSome(c, end);
1555 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1556 "," + end + ") = " + b);
1557
1558 int32_t index = set.indexOf(c);
1559 if ((index >= 0) == valid) {
1560 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1561 ") = " + index);
1562 } else {
1563 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1564 ") = " + index);
1565 }
1566 }
1567 }
1568
1569 // Used by TestSymbolTable
1570 class TokenSymbolTable : public SymbolTable {
1571 public:
1572 Hashtable contents;
1573
TokenSymbolTable(UErrorCode & ec)1574 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1575 contents.setValueDeleter(uprv_deleteUObject);
1576 }
1577
~TokenSymbolTable()1578 ~TokenSymbolTable() {}
1579
1580 /**
1581 * (Non-SymbolTable API) Add the given variable and value to
1582 * the table. Variable should NOT contain leading '$'.
1583 */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1584 void add(const UnicodeString& var, const UnicodeString& value,
1585 UErrorCode& ec) {
1586 if (U_SUCCESS(ec)) {
1587 contents.put(var, new UnicodeString(value), ec);
1588 }
1589 }
1590
1591 /**
1592 * SymbolTable API
1593 */
lookup(const UnicodeString & s) const1594 virtual const UnicodeString* lookup(const UnicodeString& s) const {
1595 return (const UnicodeString*) contents.get(s);
1596 }
1597
1598 /**
1599 * SymbolTable API
1600 */
lookupMatcher(UChar32) const1601 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1602 return NULL;
1603 }
1604
1605 /**
1606 * SymbolTable API
1607 */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1608 virtual UnicodeString parseReference(const UnicodeString& text,
1609 ParsePosition& pos, int32_t limit) const {
1610 int32_t start = pos.getIndex();
1611 int32_t i = start;
1612 UnicodeString result;
1613 while (i < limit) {
1614 UChar c = text.charAt(i);
1615 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1616 break;
1617 }
1618 ++i;
1619 }
1620 if (i == start) { // No valid name chars
1621 return result; // Indicate failure with empty string
1622 }
1623 pos.setIndex(i);
1624 text.extractBetween(start, i, result);
1625 return result;
1626 }
1627 };
1628
TestSymbolTable()1629 void UnicodeSetTest::TestSymbolTable() {
1630 // Multiple test cases can be set up here. Each test case
1631 // is terminated by null:
1632 // var, value, var, value,..., input pat., exp. output pat., null
1633 const char* DATA[] = {
1634 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1635 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1636 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1637 NULL
1638 };
1639
1640 for (int32_t i=0; DATA[i]!=NULL; ++i) {
1641 UErrorCode ec = U_ZERO_ERROR;
1642 TokenSymbolTable sym(ec);
1643 if (U_FAILURE(ec)) {
1644 errln("FAIL: couldn't construct TokenSymbolTable");
1645 continue;
1646 }
1647
1648 // Set up variables
1649 while (DATA[i+2] != NULL) {
1650 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1651 if (U_FAILURE(ec)) {
1652 errln("FAIL: couldn't add to TokenSymbolTable");
1653 continue;
1654 }
1655 i += 2;
1656 }
1657
1658 // Input pattern and expected output pattern
1659 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1660 i += 2;
1661
1662 ParsePosition pos(0);
1663 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1664 if (U_FAILURE(ec)) {
1665 errln("FAIL: couldn't construct UnicodeSet");
1666 continue;
1667 }
1668
1669 // results
1670 if (pos.getIndex() != inpat.length()) {
1671 errln((UnicodeString)"Failed to read to end of string \""
1672 + inpat + "\": read to "
1673 + pos.getIndex() + ", length is "
1674 + inpat.length());
1675 }
1676
1677 UnicodeSet us2(exppat, ec);
1678 if (U_FAILURE(ec)) {
1679 errln("FAIL: couldn't construct expected UnicodeSet");
1680 continue;
1681 }
1682
1683 UnicodeString a, b;
1684 if (us != us2) {
1685 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1686 ", expected " + us2.toPattern(b, TRUE));
1687 } else {
1688 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1689 }
1690 }
1691 }
1692
TestSurrogate()1693 void UnicodeSetTest::TestSurrogate() {
1694 const char* DATA[] = {
1695 // These should all behave identically
1696 "[abc\\uD800\\uDC00]",
1697 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1698 "[abc\\U00010000]",
1699 0
1700 };
1701 for (int i=0; DATA[i] != 0; ++i) {
1702 UErrorCode ec = U_ZERO_ERROR;
1703 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1704 UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1705 UnicodeSet set(str, ec);
1706 if (U_FAILURE(ec)) {
1707 errln("FAIL: UnicodeSet constructor");
1708 continue;
1709 }
1710 expectContainment(set,
1711 CharsToUnicodeString("abc\\U00010000"),
1712 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1713 if (set.size() != 4) {
1714 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1715 set.size() + ", expected 4");
1716 }
1717 }
1718 }
1719
TestExhaustive()1720 void UnicodeSetTest::TestExhaustive() {
1721 // exhaustive tests. Simulate UnicodeSets with integers.
1722 // That gives us very solid tests (except for large memory tests).
1723
1724 int32_t limit = 128;
1725
1726 UnicodeSet x, y, z, aa;
1727
1728 for (int32_t i = 0; i < limit; ++i) {
1729 bitsToSet(i, x);
1730 logln((UnicodeString)"Testing " + i + ", " + x);
1731 _testComplement(i, x, y);
1732
1733 // AS LONG AS WE ARE HERE, check roundtrip
1734 checkRoundTrip(bitsToSet(i, aa));
1735
1736 for (int32_t j = 0; j < limit; ++j) {
1737 _testAdd(i,j, x,y,z);
1738 _testXor(i,j, x,y,z);
1739 _testRetain(i,j, x,y,z);
1740 _testRemove(i,j, x,y,z);
1741 }
1742 }
1743 }
1744
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1745 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1746 bitsToSet(a, x);
1747 z = x;
1748 z.complement();
1749 int32_t c = setToBits(z);
1750 if (c != (~a)) {
1751 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1752 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1753 }
1754 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1755 }
1756
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1757 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1758 bitsToSet(a, x);
1759 bitsToSet(b, y);
1760 z = x;
1761 z.addAll(y);
1762 int32_t c = setToBits(z);
1763 if (c != (a | b)) {
1764 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1765 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1766 }
1767 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1768 }
1769
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1770 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1771 bitsToSet(a, x);
1772 bitsToSet(b, y);
1773 z = x;
1774 z.retainAll(y);
1775 int32_t c = setToBits(z);
1776 if (c != (a & b)) {
1777 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1778 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1779 }
1780 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1781 }
1782
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1783 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1784 bitsToSet(a, x);
1785 bitsToSet(b, y);
1786 z = x;
1787 z.removeAll(y);
1788 int32_t c = setToBits(z);
1789 if (c != (a &~ b)) {
1790 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1791 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1792 }
1793 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1794 }
1795
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1796 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1797 bitsToSet(a, x);
1798 bitsToSet(b, y);
1799 z = x;
1800 z.complementAll(y);
1801 int32_t c = setToBits(z);
1802 if (c != (a ^ b)) {
1803 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1804 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1805 }
1806 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1807 }
1808
1809 /**
1810 * Check that ranges are monotonically increasing and non-
1811 * overlapping.
1812 */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1813 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1814 int32_t n = set.getRangeCount();
1815 if (n < 0) {
1816 errln((UnicodeString)"FAIL result of " + msg +
1817 ": range count should be >= 0 but is " +
1818 n /*+ " for " + set.toPattern())*/);
1819 return;
1820 }
1821 UChar32 last = 0;
1822 for (int32_t i=0; i<n; ++i) {
1823 UChar32 start = set.getRangeStart(i);
1824 UChar32 end = set.getRangeEnd(i);
1825 if (start > end) {
1826 errln((UnicodeString)"FAIL result of " + msg +
1827 ": range " + (i+1) +
1828 " start > end: " + (int)start + ", " + (int)end +
1829 " for " + set);
1830 }
1831 if (i > 0 && start <= last) {
1832 errln((UnicodeString)"FAIL result of " + msg +
1833 ": range " + (i+1) +
1834 " overlaps previous range: " + (int)start + ", " + (int)end +
1835 " for " + set);
1836 }
1837 last = end;
1838 }
1839 }
1840
1841 /**
1842 * Convert a bitmask to a UnicodeSet.
1843 */
bitsToSet(int32_t a,UnicodeSet & result)1844 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1845 result.clear();
1846 for (UChar32 i = 0; i < 32; ++i) {
1847 if ((a & (1<<i)) != 0) {
1848 result.add(i);
1849 }
1850 }
1851 return result;
1852 }
1853
1854 /**
1855 * Convert a UnicodeSet to a bitmask. Only the characters
1856 * U+0000 to U+0020 are represented in the bitmask.
1857 */
setToBits(const UnicodeSet & x)1858 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1859 int32_t result = 0;
1860 for (int32_t i = 0; i < 32; ++i) {
1861 if (x.contains((UChar32)i)) {
1862 result |= (1<<i);
1863 }
1864 }
1865 return result;
1866 }
1867
1868 /**
1869 * Return the representation of an inversion list based UnicodeSet
1870 * as a pairs list. Ranges are listed in ascending Unicode order.
1871 * For example, the set [a-zA-M3] is represented as "33AMaz".
1872 */
getPairs(const UnicodeSet & set)1873 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1874 UnicodeString pairs;
1875 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1876 UChar32 start = set.getRangeStart(i);
1877 UChar32 end = set.getRangeEnd(i);
1878 if (end > 0xFFFF) {
1879 end = 0xFFFF;
1880 i = set.getRangeCount(); // Should be unnecessary
1881 }
1882 pairs.append((UChar)start).append((UChar)end);
1883 }
1884 return pairs;
1885 }
1886
1887 /**
1888 * Basic consistency check for a few items.
1889 * That the iterator works, and that we can create a pattern and
1890 * get the same thing back
1891 */
checkRoundTrip(const UnicodeSet & s)1892 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1893 UErrorCode ec = U_ZERO_ERROR;
1894
1895 UnicodeSet t(s);
1896 checkEqual(s, t, "copy ct");
1897
1898 t = s;
1899 checkEqual(s, t, "operator=");
1900
1901 copyWithIterator(t, s, FALSE);
1902 checkEqual(s, t, "iterator roundtrip");
1903
1904 copyWithIterator(t, s, TRUE); // try range
1905 checkEqual(s, t, "iterator roundtrip");
1906
1907 UnicodeString pat; s.toPattern(pat, FALSE);
1908 t.applyPattern(pat, ec);
1909 if (U_FAILURE(ec)) {
1910 errln("FAIL: applyPattern");
1911 return;
1912 } else {
1913 checkEqual(s, t, "toPattern(false)");
1914 }
1915
1916 s.toPattern(pat, TRUE);
1917 t.applyPattern(pat, ec);
1918 if (U_FAILURE(ec)) {
1919 errln("FAIL: applyPattern");
1920 return;
1921 } else {
1922 checkEqual(s, t, "toPattern(true)");
1923 }
1924 }
1925
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)1926 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1927 t.clear();
1928 UnicodeSetIterator it(s);
1929 if (withRange) {
1930 while (it.nextRange()) {
1931 if (it.isString()) {
1932 t.add(it.getString());
1933 } else {
1934 t.add(it.getCodepoint(), it.getCodepointEnd());
1935 }
1936 }
1937 } else {
1938 while (it.next()) {
1939 if (it.isString()) {
1940 t.add(it.getString());
1941 } else {
1942 t.add(it.getCodepoint());
1943 }
1944 }
1945 }
1946 }
1947
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)1948 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1949 UnicodeString source; s.toPattern(source, TRUE);
1950 UnicodeString result; t.toPattern(result, TRUE);
1951 if (s != t) {
1952 errln((UnicodeString)"FAIL: " + message
1953 + "; source = " + source
1954 + "; result = " + result
1955 );
1956 return FALSE;
1957 } else {
1958 logln((UnicodeString)"Ok: " + message
1959 + "; source = " + source
1960 + "; result = " + result
1961 );
1962 }
1963 return TRUE;
1964 }
1965
1966 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)1967 UnicodeSetTest::expectContainment(const UnicodeString& pat,
1968 const UnicodeString& charsIn,
1969 const UnicodeString& charsOut) {
1970 UErrorCode ec = U_ZERO_ERROR;
1971 UnicodeSet set(pat, ec);
1972 if (U_FAILURE(ec)) {
1973 dataerrln((UnicodeString)"FAIL: pattern \"" +
1974 pat + "\" => " + u_errorName(ec));
1975 return;
1976 }
1977 expectContainment(set, pat, charsIn, charsOut);
1978 }
1979
1980 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)1981 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1982 const UnicodeString& charsIn,
1983 const UnicodeString& charsOut) {
1984 UnicodeString pat;
1985 set.toPattern(pat);
1986 expectContainment(set, pat, charsIn, charsOut);
1987 }
1988
1989 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)1990 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1991 const UnicodeString& setName,
1992 const UnicodeString& charsIn,
1993 const UnicodeString& charsOut) {
1994 UnicodeString bad;
1995 UChar32 c;
1996 int32_t i;
1997
1998 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
1999 c = charsIn.char32At(i);
2000 if (!set.contains(c)) {
2001 bad.append(c);
2002 }
2003 }
2004 if (bad.length() > 0) {
2005 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2006 ", expected containment of " + prettify(charsIn));
2007 } else {
2008 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2009 }
2010
2011 bad.truncate(0);
2012 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2013 c = charsOut.char32At(i);
2014 if (set.contains(c)) {
2015 bad.append(c);
2016 }
2017 }
2018 if (bad.length() > 0) {
2019 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2020 ", expected non-containment of " + prettify(charsOut));
2021 } else {
2022 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2023 }
2024 }
2025
2026 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2027 UnicodeSetTest::expectPattern(UnicodeSet& set,
2028 const UnicodeString& pattern,
2029 const UnicodeString& expectedPairs){
2030 UErrorCode status = U_ZERO_ERROR;
2031 set.applyPattern(pattern, status);
2032 if (U_FAILURE(status)) {
2033 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2034 "\") failed");
2035 return;
2036 } else {
2037 if (getPairs(set) != expectedPairs ) {
2038 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2039 "\") => pairs \"" +
2040 escape(getPairs(set)) + "\", expected \"" +
2041 escape(expectedPairs) + "\"");
2042 } else {
2043 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
2044 "\") => pairs \"" +
2045 escape(getPairs(set)) + "\"");
2046 }
2047 }
2048 // the result of calling set.toPattern(), which is the string representation of
2049 // this set(set), is passed to a UnicodeSet constructor, and tested that it
2050 // will produce another set that is equal to this one.
2051 UnicodeString temppattern;
2052 set.toPattern(temppattern);
2053 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2054 if (U_FAILURE(status)) {
2055 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2056 return;
2057 }
2058 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2059 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2060 escape(getPairs(set)) + "\""));
2061 } else{
2062 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2063 }
2064
2065 delete tempset;
2066
2067 }
2068
2069 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2070 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2071 if (getPairs(set) != expectedPairs) {
2072 errln(UnicodeString("FAIL: Expected pair list \"") +
2073 escape(expectedPairs) + "\", got \"" +
2074 escape(getPairs(set)) + "\"");
2075 }
2076 }
2077
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2078 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2079 const UnicodeString& expPat,
2080 const char** expStrings) {
2081 UnicodeString pat;
2082 set.toPattern(pat, TRUE);
2083 if (pat == expPat) {
2084 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
2085 } else {
2086 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2087 return;
2088 }
2089 if (expStrings == NULL) {
2090 return;
2091 }
2092 UBool in = TRUE;
2093 for (int32_t i=0; expStrings[i] != NULL; ++i) {
2094 if (expStrings[i] == NOT) { // sic; pointer comparison
2095 in = FALSE;
2096 continue;
2097 }
2098 UnicodeString s = CharsToUnicodeString(expStrings[i]);
2099 UBool contained = set.contains(s);
2100 if (contained == in) {
2101 logln((UnicodeString)"Ok: " + expPat +
2102 (contained ? " contains {" : " does not contain {") +
2103 escape(expStrings[i]) + "}");
2104 } else {
2105 errln((UnicodeString)"FAIL: " + expPat +
2106 (contained ? " contains {" : " does not contain {") +
2107 escape(expStrings[i]) + "}");
2108 }
2109 }
2110 }
2111
toHexString(int32_t i)2112 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2113
2114 void
doAssert(UBool condition,const char * message)2115 UnicodeSetTest::doAssert(UBool condition, const char *message)
2116 {
2117 if (!condition) {
2118 errln(UnicodeString("ERROR : ") + message);
2119 }
2120 }
2121
2122 UnicodeString
escape(const UnicodeString & s)2123 UnicodeSetTest::escape(const UnicodeString& s) {
2124 UnicodeString buf;
2125 for (int32_t i=0; i<s.length(); )
2126 {
2127 UChar32 c = s.char32At(i);
2128 if (0x0020 <= c && c <= 0x007F) {
2129 buf += c;
2130 } else {
2131 if (c <= 0xFFFF) {
2132 buf += (UChar)0x5c; buf += (UChar)0x75;
2133 } else {
2134 buf += (UChar)0x5c; buf += (UChar)0x55;
2135 buf += toHexString((c & 0xF0000000) >> 28);
2136 buf += toHexString((c & 0x0F000000) >> 24);
2137 buf += toHexString((c & 0x00F00000) >> 20);
2138 buf += toHexString((c & 0x000F0000) >> 16);
2139 }
2140 buf += toHexString((c & 0xF000) >> 12);
2141 buf += toHexString((c & 0x0F00) >> 8);
2142 buf += toHexString((c & 0x00F0) >> 4);
2143 buf += toHexString(c & 0x000F);
2144 }
2145 i += U16_LENGTH(c);
2146 }
2147 return buf;
2148 }
2149
TestFreezable()2150 void UnicodeSetTest::TestFreezable() {
2151 UErrorCode errorCode=U_ZERO_ERROR;
2152 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2153 UnicodeSet idSet(idPattern, errorCode);
2154 if(U_FAILURE(errorCode)) {
2155 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2156 return;
2157 }
2158
2159 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2160 UnicodeSet wsSet(wsPattern, errorCode);
2161 if(U_FAILURE(errorCode)) {
2162 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2163 return;
2164 }
2165
2166 idSet.add(idPattern);
2167 UnicodeSet frozen(idSet);
2168 frozen.freeze();
2169
2170 if(idSet.isFrozen() || !frozen.isFrozen()) {
2171 errln("FAIL: isFrozen() is wrong");
2172 }
2173 if(frozen!=idSet || !(frozen==idSet)) {
2174 errln("FAIL: a copy-constructed frozen set differs from its original");
2175 }
2176
2177 frozen=wsSet;
2178 if(frozen!=idSet || !(frozen==idSet)) {
2179 errln("FAIL: a frozen set was modified by operator=");
2180 }
2181
2182 UnicodeSet frozen2(frozen);
2183 if(frozen2!=frozen || frozen2!=idSet) {
2184 errln("FAIL: a copied frozen set differs from its frozen original");
2185 }
2186 if(!frozen2.isFrozen()) {
2187 errln("FAIL: copy-constructing a frozen set results in a thawed one");
2188 }
2189 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
2190 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2191 errln("FAIL: UnicodeSet(5, 55) failed");
2192 }
2193 frozen3=frozen;
2194 if(!frozen3.isFrozen()) {
2195 errln("FAIL: copying a frozen set results in a thawed one");
2196 }
2197
2198 UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2199 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2200 errln("FAIL: clone() failed");
2201 }
2202 cloned->add(0xd802, 0xd805);
2203 if(cloned->containsSome(0xd802, 0xd805)) {
2204 errln("FAIL: unable to modify clone");
2205 }
2206 delete cloned;
2207
2208 UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2209 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2210 errln("FAIL: cloneAsThawed() failed");
2211 }
2212 thawed->add(0xd802, 0xd805);
2213 if(!thawed->contains(0xd802, 0xd805)) {
2214 errln("FAIL: unable to modify thawed clone");
2215 }
2216 delete thawed;
2217
2218 frozen.set(5, 55);
2219 if(frozen!=idSet || !(frozen==idSet)) {
2220 errln("FAIL: UnicodeSet::set() modified a frozen set");
2221 }
2222
2223 frozen.clear();
2224 if(frozen!=idSet || !(frozen==idSet)) {
2225 errln("FAIL: UnicodeSet::clear() modified a frozen set");
2226 }
2227
2228 frozen.closeOver(USET_CASE_INSENSITIVE);
2229 if(frozen!=idSet || !(frozen==idSet)) {
2230 errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2231 }
2232
2233 frozen.compact();
2234 if(frozen!=idSet || !(frozen==idSet)) {
2235 errln("FAIL: UnicodeSet::compact() modified a frozen set");
2236 }
2237
2238 ParsePosition pos;
2239 frozen.
2240 applyPattern(wsPattern, errorCode).
2241 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2242 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2243 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2244 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2245 if(frozen!=idSet || !(frozen==idSet)) {
2246 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2247 }
2248
2249 frozen.
2250 add(0xd800).
2251 add(0xd802, 0xd805).
2252 add(wsPattern).
2253 addAll(idPattern).
2254 addAll(wsSet);
2255 if(frozen!=idSet || !(frozen==idSet)) {
2256 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2257 }
2258
2259 frozen.
2260 retain(0x62).
2261 retain(0x64, 0x69).
2262 retainAll(wsPattern).
2263 retainAll(wsSet);
2264 if(frozen!=idSet || !(frozen==idSet)) {
2265 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2266 }
2267
2268 frozen.
2269 remove(0x62).
2270 remove(0x64, 0x69).
2271 remove(idPattern).
2272 removeAll(idPattern).
2273 removeAll(idSet);
2274 if(frozen!=idSet || !(frozen==idSet)) {
2275 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2276 }
2277
2278 frozen.
2279 complement().
2280 complement(0x62).
2281 complement(0x64, 0x69).
2282 complement(idPattern).
2283 complementAll(idPattern).
2284 complementAll(idSet);
2285 if(frozen!=idSet || !(frozen==idSet)) {
2286 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2287 }
2288 }
2289
2290 // Test span() etc. -------------------------------------------------------- ***
2291
2292 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2293 static int32_t
appendUTF8(const UChar * s,int32_t length,char * t,int32_t capacity)2294 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2295 UErrorCode errorCode=U_ZERO_ERROR;
2296 int32_t length8=0;
2297 u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2298 if(U_SUCCESS(errorCode)) {
2299 return length8;
2300 } else {
2301 // The string contains an unpaired surrogate.
2302 // Ignore this string.
2303 return 0;
2304 }
2305 }
2306
2307 class UnicodeSetWithStringsIterator;
2308
2309 // Make the strings in a UnicodeSet easily accessible.
2310 class UnicodeSetWithStrings {
2311 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2312 UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2313 set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2314 int32_t size=set.size();
2315 if(size>0 && set.charAt(size-1)<0) {
2316 // If a set's last element is not a code point, then it must contain strings.
2317 // Iterate over the set, skip all code point ranges, and cache the strings.
2318 // Convert them to UTF-8 for spanUTF8().
2319 UnicodeSetIterator iter(set);
2320 const UnicodeString *s;
2321 char *s8=utf8;
2322 int32_t length8, utf8Count=0;
2323 while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2324 if(iter.isString()) {
2325 // Store the pointer to the set's string element
2326 // which we happen to know is a stable pointer.
2327 strings[stringsLength]=s=&iter.getString();
2328 utf8Count+=
2329 utf8Lengths[stringsLength]=length8=
2330 appendUTF8(s->getBuffer(), s->length(),
2331 s8, (int32_t)(sizeof(utf8)-utf8Count));
2332 if(length8==0) {
2333 hasSurrogates=TRUE; // Contains unpaired surrogates.
2334 }
2335 s8+=length8;
2336 ++stringsLength;
2337 }
2338 }
2339 }
2340 }
2341
getSet() const2342 const UnicodeSet &getSet() const {
2343 return set;
2344 }
2345
hasStrings() const2346 UBool hasStrings() const {
2347 return (UBool)(stringsLength>0);
2348 }
2349
hasStringsWithSurrogates() const2350 UBool hasStringsWithSurrogates() const {
2351 return hasSurrogates;
2352 }
2353
2354 private:
2355 friend class UnicodeSetWithStringsIterator;
2356
2357 const UnicodeSet &set;
2358
2359 const UnicodeString *strings[20];
2360 int32_t stringsLength;
2361 UBool hasSurrogates;
2362
2363 char utf8[1024];
2364 int32_t utf8Lengths[20];
2365 };
2366
2367 class UnicodeSetWithStringsIterator {
2368 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2369 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2370 fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2371 }
2372
reset()2373 void reset() {
2374 nextStringIndex=nextUTF8Start=0;
2375 }
2376
nextString()2377 const UnicodeString *nextString() {
2378 if(nextStringIndex<fSet.stringsLength) {
2379 return fSet.strings[nextStringIndex++];
2380 } else {
2381 return NULL;
2382 }
2383 }
2384
2385 // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2386 const char *nextUTF8(int32_t &length) {
2387 if(nextStringIndex<fSet.stringsLength) {
2388 const char *s8=fSet.utf8+nextUTF8Start;
2389 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2390 return s8;
2391 } else {
2392 length=0;
2393 return NULL;
2394 }
2395 }
2396
2397 private:
2398 const UnicodeSetWithStrings &fSet;
2399 int32_t nextStringIndex;
2400 int32_t nextUTF8Start;
2401 };
2402
2403 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2404 // at code point boundaries.
2405 // That is, each edge of a match must not be in the middle of a surrogate pair.
2406 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UnicodeString & t)2407 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2408 s+=start;
2409 limit-=start;
2410 int32_t length=t.length();
2411 return 0==t.compare(s, length) &&
2412 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2413 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2414 }
2415
2416 // Implement span() with contains() for comparison.
containsSpanUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2417 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2418 USetSpanCondition spanCondition) {
2419 const UnicodeSet &realSet(set.getSet());
2420 if(!set.hasStrings()) {
2421 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2422 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2423 }
2424
2425 UChar32 c;
2426 int32_t start=0, prev;
2427 while((prev=start)<length) {
2428 U16_NEXT(s, start, length, c);
2429 if(realSet.contains(c)!=spanCondition) {
2430 break;
2431 }
2432 }
2433 return prev;
2434 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2435 UnicodeSetWithStringsIterator iter(set);
2436 UChar32 c;
2437 int32_t start, next;
2438 for(start=next=0; start<length;) {
2439 U16_NEXT(s, next, length, c);
2440 if(realSet.contains(c)) {
2441 break;
2442 }
2443 const UnicodeString *str;
2444 iter.reset();
2445 while((str=iter.nextString())!=NULL) {
2446 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2447 // spanNeedsStrings=TRUE;
2448 return start;
2449 }
2450 }
2451 start=next;
2452 }
2453 return start;
2454 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2455 UnicodeSetWithStringsIterator iter(set);
2456 UChar32 c;
2457 int32_t start, next, maxSpanLimit=0;
2458 for(start=next=0; start<length;) {
2459 U16_NEXT(s, next, length, c);
2460 if(!realSet.contains(c)) {
2461 next=start; // Do not span this single, not-contained code point.
2462 }
2463 const UnicodeString *str;
2464 iter.reset();
2465 while((str=iter.nextString())!=NULL) {
2466 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2467 // spanNeedsStrings=TRUE;
2468 int32_t matchLimit=start+str->length();
2469 if(matchLimit==length) {
2470 return length;
2471 }
2472 if(spanCondition==USET_SPAN_CONTAINED) {
2473 // Iterate for the shortest match at each position.
2474 // Recurse for each but the shortest match.
2475 if(next==start) {
2476 next=matchLimit; // First match from start.
2477 } else {
2478 if(matchLimit<next) {
2479 // Remember shortest match from start for iteration.
2480 int32_t temp=next;
2481 next=matchLimit;
2482 matchLimit=temp;
2483 }
2484 // Recurse for non-shortest match from start.
2485 int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2486 USET_SPAN_CONTAINED);
2487 if((matchLimit+spanLength)>maxSpanLimit) {
2488 maxSpanLimit=matchLimit+spanLength;
2489 if(maxSpanLimit==length) {
2490 return length;
2491 }
2492 }
2493 }
2494 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2495 if(matchLimit>next) {
2496 // Remember longest match from start.
2497 next=matchLimit;
2498 }
2499 }
2500 }
2501 }
2502 if(next==start) {
2503 break; // No match from start.
2504 }
2505 start=next;
2506 }
2507 if(start>maxSpanLimit) {
2508 return start;
2509 } else {
2510 return maxSpanLimit;
2511 }
2512 }
2513 }
2514
containsSpanBackUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2515 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2516 USetSpanCondition spanCondition) {
2517 if(length==0) {
2518 return 0;
2519 }
2520 const UnicodeSet &realSet(set.getSet());
2521 if(!set.hasStrings()) {
2522 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2523 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2524 }
2525
2526 UChar32 c;
2527 int32_t prev=length;
2528 do {
2529 U16_PREV(s, 0, length, c);
2530 if(realSet.contains(c)!=spanCondition) {
2531 break;
2532 }
2533 } while((prev=length)>0);
2534 return prev;
2535 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2536 UnicodeSetWithStringsIterator iter(set);
2537 UChar32 c;
2538 int32_t prev=length, length0=length;
2539 do {
2540 U16_PREV(s, 0, length, c);
2541 if(realSet.contains(c)) {
2542 break;
2543 }
2544 const UnicodeString *str;
2545 iter.reset();
2546 while((str=iter.nextString())!=NULL) {
2547 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2548 // spanNeedsStrings=TRUE;
2549 return prev;
2550 }
2551 }
2552 } while((prev=length)>0);
2553 return prev;
2554 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2555 UnicodeSetWithStringsIterator iter(set);
2556 UChar32 c;
2557 int32_t prev=length, minSpanStart=length, length0=length;
2558 do {
2559 U16_PREV(s, 0, length, c);
2560 if(!realSet.contains(c)) {
2561 length=prev; // Do not span this single, not-contained code point.
2562 }
2563 const UnicodeString *str;
2564 iter.reset();
2565 while((str=iter.nextString())!=NULL) {
2566 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2567 // spanNeedsStrings=TRUE;
2568 int32_t matchStart=prev-str->length();
2569 if(matchStart==0) {
2570 return 0;
2571 }
2572 if(spanCondition==USET_SPAN_CONTAINED) {
2573 // Iterate for the shortest match at each position.
2574 // Recurse for each but the shortest match.
2575 if(length==prev) {
2576 length=matchStart; // First match from prev.
2577 } else {
2578 if(matchStart>length) {
2579 // Remember shortest match from prev for iteration.
2580 int32_t temp=length;
2581 length=matchStart;
2582 matchStart=temp;
2583 }
2584 // Recurse for non-shortest match from prev.
2585 int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2586 USET_SPAN_CONTAINED);
2587 if(spanStart<minSpanStart) {
2588 minSpanStart=spanStart;
2589 if(minSpanStart==0) {
2590 return 0;
2591 }
2592 }
2593 }
2594 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2595 if(matchStart<length) {
2596 // Remember longest match from prev.
2597 length=matchStart;
2598 }
2599 }
2600 }
2601 }
2602 if(length==prev) {
2603 break; // No match from prev.
2604 }
2605 } while((prev=length)>0);
2606 if(prev<minSpanStart) {
2607 return prev;
2608 } else {
2609 return minSpanStart;
2610 }
2611 }
2612 }
2613
containsSpanUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2614 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2615 USetSpanCondition spanCondition) {
2616 const UnicodeSet &realSet(set.getSet());
2617 if(!set.hasStrings()) {
2618 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2619 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2620 }
2621
2622 UChar32 c;
2623 int32_t start=0, prev;
2624 while((prev=start)<length) {
2625 U8_NEXT_OR_FFFD(s, start, length, c);
2626 if(realSet.contains(c)!=spanCondition) {
2627 break;
2628 }
2629 }
2630 return prev;
2631 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2632 UnicodeSetWithStringsIterator iter(set);
2633 UChar32 c;
2634 int32_t start, next;
2635 for(start=next=0; start<length;) {
2636 U8_NEXT_OR_FFFD(s, next, length, c);
2637 if(realSet.contains(c)) {
2638 break;
2639 }
2640 const char *s8;
2641 int32_t length8;
2642 iter.reset();
2643 while((s8=iter.nextUTF8(length8))!=NULL) {
2644 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2645 // spanNeedsStrings=TRUE;
2646 return start;
2647 }
2648 }
2649 start=next;
2650 }
2651 return start;
2652 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2653 UnicodeSetWithStringsIterator iter(set);
2654 UChar32 c;
2655 int32_t start, next, maxSpanLimit=0;
2656 for(start=next=0; start<length;) {
2657 U8_NEXT_OR_FFFD(s, next, length, c);
2658 if(!realSet.contains(c)) {
2659 next=start; // Do not span this single, not-contained code point.
2660 }
2661 const char *s8;
2662 int32_t length8;
2663 iter.reset();
2664 while((s8=iter.nextUTF8(length8))!=NULL) {
2665 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2666 // spanNeedsStrings=TRUE;
2667 int32_t matchLimit=start+length8;
2668 if(matchLimit==length) {
2669 return length;
2670 }
2671 if(spanCondition==USET_SPAN_CONTAINED) {
2672 // Iterate for the shortest match at each position.
2673 // Recurse for each but the shortest match.
2674 if(next==start) {
2675 next=matchLimit; // First match from start.
2676 } else {
2677 if(matchLimit<next) {
2678 // Remember shortest match from start for iteration.
2679 int32_t temp=next;
2680 next=matchLimit;
2681 matchLimit=temp;
2682 }
2683 // Recurse for non-shortest match from start.
2684 int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2685 USET_SPAN_CONTAINED);
2686 if((matchLimit+spanLength)>maxSpanLimit) {
2687 maxSpanLimit=matchLimit+spanLength;
2688 if(maxSpanLimit==length) {
2689 return length;
2690 }
2691 }
2692 }
2693 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2694 if(matchLimit>next) {
2695 // Remember longest match from start.
2696 next=matchLimit;
2697 }
2698 }
2699 }
2700 }
2701 if(next==start) {
2702 break; // No match from start.
2703 }
2704 start=next;
2705 }
2706 if(start>maxSpanLimit) {
2707 return start;
2708 } else {
2709 return maxSpanLimit;
2710 }
2711 }
2712 }
2713
containsSpanBackUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2714 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2715 USetSpanCondition spanCondition) {
2716 if(length==0) {
2717 return 0;
2718 }
2719 const UnicodeSet &realSet(set.getSet());
2720 if(!set.hasStrings()) {
2721 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2722 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2723 }
2724
2725 UChar32 c;
2726 int32_t prev=length;
2727 do {
2728 U8_PREV_OR_FFFD(s, 0, length, c);
2729 if(realSet.contains(c)!=spanCondition) {
2730 break;
2731 }
2732 } while((prev=length)>0);
2733 return prev;
2734 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2735 UnicodeSetWithStringsIterator iter(set);
2736 UChar32 c;
2737 int32_t prev=length;
2738 do {
2739 U8_PREV_OR_FFFD(s, 0, length, c);
2740 if(realSet.contains(c)) {
2741 break;
2742 }
2743 const char *s8;
2744 int32_t length8;
2745 iter.reset();
2746 while((s8=iter.nextUTF8(length8))!=NULL) {
2747 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2748 // spanNeedsStrings=TRUE;
2749 return prev;
2750 }
2751 }
2752 } while((prev=length)>0);
2753 return prev;
2754 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2755 UnicodeSetWithStringsIterator iter(set);
2756 UChar32 c;
2757 int32_t prev=length, minSpanStart=length;
2758 do {
2759 U8_PREV_OR_FFFD(s, 0, length, c);
2760 if(!realSet.contains(c)) {
2761 length=prev; // Do not span this single, not-contained code point.
2762 }
2763 const char *s8;
2764 int32_t length8;
2765 iter.reset();
2766 while((s8=iter.nextUTF8(length8))!=NULL) {
2767 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2768 // spanNeedsStrings=TRUE;
2769 int32_t matchStart=prev-length8;
2770 if(matchStart==0) {
2771 return 0;
2772 }
2773 if(spanCondition==USET_SPAN_CONTAINED) {
2774 // Iterate for the shortest match at each position.
2775 // Recurse for each but the shortest match.
2776 if(length==prev) {
2777 length=matchStart; // First match from prev.
2778 } else {
2779 if(matchStart>length) {
2780 // Remember shortest match from prev for iteration.
2781 int32_t temp=length;
2782 length=matchStart;
2783 matchStart=temp;
2784 }
2785 // Recurse for non-shortest match from prev.
2786 int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2787 USET_SPAN_CONTAINED);
2788 if(spanStart<minSpanStart) {
2789 minSpanStart=spanStart;
2790 if(minSpanStart==0) {
2791 return 0;
2792 }
2793 }
2794 }
2795 } else /* spanCondition==USET_SPAN_SIMPLE */ {
2796 if(matchStart<length) {
2797 // Remember longest match from prev.
2798 length=matchStart;
2799 }
2800 }
2801 }
2802 }
2803 if(length==prev) {
2804 break; // No match from prev.
2805 }
2806 } while((prev=length)>0);
2807 if(prev<minSpanStart) {
2808 return prev;
2809 } else {
2810 return minSpanStart;
2811 }
2812 }
2813 }
2814
// spans to be performed and compared
// Bit flags in four groups; the last constant of each group is the mask
// that combines the flags of the group.
enum {
    SPAN_UTF16      =0x001,
    SPAN_UTF8       =0x002,
    SPAN_UTFS       =SPAN_UTF16|SPAN_UTF8,

    SPAN_SET        =0x004,
    SPAN_COMPLEMENT =0x008,
    SPAN_POLARITY   =SPAN_SET|SPAN_COMPLEMENT,

    SPAN_FWD        =0x010,
    SPAN_BACK       =0x020,
    SPAN_DIRS       =SPAN_FWD|SPAN_BACK,

    SPAN_CONTAINED  =0x100,
    SPAN_SIMPLE     =0x200,
    SPAN_CONDITION  =SPAN_CONTAINED|SPAN_SIMPLE,

    // All of the flags above.
    SPAN_ALL        =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2835
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)2836 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2837 return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2838 }
2839
slen(const void * s,UBool isUTF16)2840 static inline int32_t slen(const void *s, UBool isUTF16) {
2841 return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2842 }
2843
2844 /*
2845 * Count spans on a string with the method according to type and set the span limits.
2846 * The set may be the complement of the original.
2847 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2848 * according to the expected number of spans.
2849 * Sets typeName to an empty string if there is no such type.
2850 * Returns -1 if the span option is filtered out.
2851 */
getSpans(const UnicodeSetWithStrings & set,UBool isComplement,const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int type,const char * & typeName,int32_t limits[],int32_t limitsCapacity,int32_t expectCount)2852 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2853 const void *s, int32_t length, UBool isUTF16,
2854 uint32_t whichSpans,
2855 int type, const char *&typeName,
2856 int32_t limits[], int32_t limitsCapacity,
2857 int32_t expectCount) {
2858 const UnicodeSet &realSet(set.getSet());
2859 int32_t start, count;
2860 USetSpanCondition spanCondition, firstSpanCondition, contained;
2861 UBool isForward;
2862
2863 if(type<0 || 7<type) {
2864 typeName="";
2865 return 0;
2866 }
2867
2868 static const char *const typeNames16[]={
2869 "contains", "contains(LM)",
2870 "span", "span(LM)",
2871 "containsBack", "containsBack(LM)",
2872 "spanBack", "spanBack(LM)"
2873 };
2874
2875 static const char *const typeNames8[]={
2876 "containsUTF8", "containsUTF8(LM)",
2877 "spanUTF8", "spanUTF8(LM)",
2878 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2879 "spanBackUTF8", "spanBackUTF8(LM)"
2880 };
2881
2882 typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2883
2884 // filter span options
2885 if(type<=3) {
2886 // span forward
2887 if((whichSpans&SPAN_FWD)==0) {
2888 return -1;
2889 }
2890 isForward=TRUE;
2891 } else {
2892 // span backward
2893 if((whichSpans&SPAN_BACK)==0) {
2894 return -1;
2895 }
2896 isForward=FALSE;
2897 }
2898 if((type&1)==0) {
2899 // use USET_SPAN_CONTAINED
2900 if((whichSpans&SPAN_CONTAINED)==0) {
2901 return -1;
2902 }
2903 contained=USET_SPAN_CONTAINED;
2904 } else {
2905 // use USET_SPAN_SIMPLE
2906 if((whichSpans&SPAN_SIMPLE)==0) {
2907 return -1;
2908 }
2909 contained=USET_SPAN_SIMPLE;
2910 }
2911
2912 // Default first span condition for going forward with an uncomplemented set.
2913 spanCondition=USET_SPAN_NOT_CONTAINED;
2914 if(isComplement) {
2915 spanCondition=invertSpanCondition(spanCondition, contained);
2916 }
2917
2918 // First span condition for span(), used to terminate the spanBack() iteration.
2919 firstSpanCondition=spanCondition;
2920
2921 // spanBack(): Its initial span condition is span()'s last span condition,
2922 // which is the opposite of span()'s first span condition
2923 // if we expect an even number of spans.
2924 // (The loop inverts spanCondition (expectCount-1) times
2925 // before the expectCount'th span() call.)
2926 // If we do not compare forward and backward directions, then we do not have an
2927 // expectCount and just start with firstSpanCondition.
2928 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2929 spanCondition=invertSpanCondition(spanCondition, contained);
2930 }
2931
2932 count=0;
2933 switch(type) {
2934 case 0:
2935 case 1:
2936 start=0;
2937 if(length<0) {
2938 length=slen(s, isUTF16);
2939 }
2940 for(;;) {
2941 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2942 containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2943 if(count<limitsCapacity) {
2944 limits[count]=start;
2945 }
2946 ++count;
2947 if(start>=length) {
2948 break;
2949 }
2950 spanCondition=invertSpanCondition(spanCondition, contained);
2951 }
2952 break;
2953 case 2:
2954 case 3:
2955 start=0;
2956 for(;;) {
2957 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
2958 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
2959 if(count<limitsCapacity) {
2960 limits[count]=start;
2961 }
2962 ++count;
2963 if(length>=0 ? start>=length :
2964 isUTF16 ? ((const UChar *)s)[start]==0 :
2965 ((const char *)s)[start]==0
2966 ) {
2967 break;
2968 }
2969 spanCondition=invertSpanCondition(spanCondition, contained);
2970 }
2971 break;
2972 case 4:
2973 case 5:
2974 if(length<0) {
2975 length=slen(s, isUTF16);
2976 }
2977 for(;;) {
2978 ++count;
2979 if(count<=limitsCapacity) {
2980 limits[limitsCapacity-count]=length;
2981 }
2982 length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
2983 containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
2984 if(length==0 && spanCondition==firstSpanCondition) {
2985 break;
2986 }
2987 spanCondition=invertSpanCondition(spanCondition, contained);
2988 }
2989 if(count<limitsCapacity) {
2990 memmove(limits, limits+(limitsCapacity-count), count*4);
2991 }
2992 break;
2993 case 6:
2994 case 7:
2995 for(;;) {
2996 ++count;
2997 if(count<=limitsCapacity) {
2998 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
2999 }
3000 // Note: Length<0 is tested only for the first spanBack().
3001 // If we wanted to keep length<0 for all spanBack()s, we would have to
3002 // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3003 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3004 realSet.spanBackUTF8((const char *)s, length, spanCondition);
3005 if(length==0 && spanCondition==firstSpanCondition) {
3006 break;
3007 }
3008 spanCondition=invertSpanCondition(spanCondition, contained);
3009 }
3010 if(count<limitsCapacity) {
3011 memmove(limits, limits+(limitsCapacity-count), count*4);
3012 }
3013 break;
3014 default:
3015 typeName="";
3016 return -1;
3017 }
3018
3019 return count;
3020 }
3021
// Indexes of the set variants exercised by the span tests.
// Bit 0 of an index is the "is complement" flag: even = original set, odd = its complement.
enum {
    SLOW=0,
    SLOW_NOT=1,
    FAST=2,
    FAST_NOT=3,
    SET_COUNT=4
};

// Human-readable labels for failure messages, indexed by the constants above.
static const char *const setNames[SET_COUNT]={
    "slow", "slow.not", "fast", "fast.not"
};
3037
3038 /*
3039 * Verify that we get the same results whether we look at text with contains(),
3040 * span() or spanBack(), using unfrozen or frozen versions of the set,
3041 * and using the set or its complement (switching the spanConditions accordingly).
3042 * The latter verifies that
3043 * set.span(spanCondition) == set.complement().span(!spanCondition).
3044 *
3045 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3046 * or returned to the caller (with an input expectCount<0).
3047 */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3048 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3049 const void *s, int32_t length, UBool isUTF16,
3050 uint32_t whichSpans,
3051 int32_t expectLimits[], int32_t &expectCount,
3052 const char *testName, int32_t index) {
3053 int32_t limits[500];
3054 int32_t limitsCount;
3055 int i, j;
3056
3057 const char *typeName;
3058 int type;
3059
3060 for(i=0; i<SET_COUNT; ++i) {
3061 if((i&1)==0) {
3062 // Even-numbered sets are original, uncomplemented sets.
3063 if((whichSpans&SPAN_SET)==0) {
3064 continue;
3065 }
3066 } else {
3067 // Odd-numbered sets are complemented.
3068 if((whichSpans&SPAN_COMPLEMENT)==0) {
3069 continue;
3070 }
3071 }
3072 for(type=0;; ++type) {
3073 limitsCount=getSpans(*sets[i], (UBool)(i&1),
3074 s, length, isUTF16,
3075 whichSpans,
3076 type, typeName,
3077 limits, UPRV_LENGTHOF(limits), expectCount);
3078 if(typeName[0]==0) {
3079 break; // All types tried.
3080 }
3081 if(limitsCount<0) {
3082 continue; // Span option filtered out.
3083 }
3084 if(expectCount<0) {
3085 expectCount=limitsCount;
3086 if(limitsCount>UPRV_LENGTHOF(limits)) {
3087 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3088 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3089 return;
3090 }
3091 memcpy(expectLimits, limits, limitsCount*4);
3092 } else if(limitsCount!=expectCount) {
3093 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3094 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3095 } else {
3096 for(j=0; j<limitsCount; ++j) {
3097 if(limits[j]!=expectLimits[j]) {
3098 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3099 testName, (long)index, setNames[i], typeName, (long)limitsCount,
3100 j, (long)limits[j], (long)expectLimits[j]);
3101 break;
3102 }
3103 }
3104 }
3105 }
3106 }
3107
3108 // Compare span() with containsAll()/containsNone(),
3109 // but only if we have expectLimits[] from the uncomplemented set.
3110 if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3111 const UChar *s16=(const UChar *)s;
3112 UnicodeString string;
3113 int32_t prev=0, limit, length;
3114 for(i=0; i<expectCount; ++i) {
3115 limit=expectLimits[i];
3116 length=limit-prev;
3117 if(length>0) {
3118 string.setTo(FALSE, s16+prev, length); // read-only alias
3119 if(i&1) {
3120 if(!sets[SLOW]->getSet().containsAll(string)) {
3121 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3122 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3123 return;
3124 }
3125 if(!sets[FAST]->getSet().containsAll(string)) {
3126 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3127 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3128 return;
3129 }
3130 } else {
3131 if(!sets[SLOW]->getSet().containsNone(string)) {
3132 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3133 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3134 return;
3135 }
3136 if(!sets[FAST]->getSet().containsNone(string)) {
3137 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3138 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3139 return;
3140 }
3141 }
3142 }
3143 prev=limit;
3144 }
3145 }
3146 }
3147
3148 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3149 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3150 const void *s, int32_t length, UBool isUTF16,
3151 uint32_t whichSpans,
3152 const char *testName, int32_t index) {
3153 int32_t expectLimits[500];
3154 int32_t expectCount=-1;
3155 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3156 }
3157
stringContainsUnpairedSurrogate(const UChar * s,int32_t length)3158 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3159 UChar c, c2;
3160
3161 if(length>=0) {
3162 while(length>0) {
3163 c=*s++;
3164 --length;
3165 if(0xd800<=c && c<0xe000) {
3166 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3167 return TRUE;
3168 }
3169 --length;
3170 }
3171 }
3172 } else {
3173 while((c=*s++)!=0) {
3174 if(0xd800<=c && c<0xe000) {
3175 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3176 return TRUE;
3177 }
3178 }
3179 }
3180 }
3181 return FALSE;
3182 }
3183
3184 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3185 // unless either UTF is turned off in whichSpans.
3186 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3187 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const UChar * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3188 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3189 const UChar *s16, int32_t length16,
3190 uint32_t whichSpans,
3191 const char *testName, int32_t index) {
3192 int32_t expectLimits[500];
3193 int32_t expectCount;
3194
3195 expectCount=-1; // Get expectLimits[] from testSpan().
3196
3197 if((whichSpans&SPAN_UTF16)!=0) {
3198 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3199 }
3200 if((whichSpans&SPAN_UTF8)==0) {
3201 return;
3202 }
3203
3204 // Convert s16[] and expectLimits[] to UTF-8.
3205 uint8_t s8[3000];
3206 int32_t offsets[3000];
3207
3208 const UChar *s16Limit=s16+length16;
3209 char *t=(char *)s8;
3210 char *tLimit=t+sizeof(s8);
3211 int32_t *o=offsets;
3212 UErrorCode errorCode=U_ZERO_ERROR;
3213
3214 // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3215 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3216 if(U_FAILURE(errorCode)) {
3217 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3218 testName, (long)index, u_errorName(errorCode));
3219 ucnv_resetFromUnicode(utf8Cnv);
3220 return;
3221 }
3222 int32_t length8=(int32_t)(t-(char *)s8);
3223
3224 // Convert expectLimits[].
3225 int32_t i, j, expect;
3226 for(i=j=0; i<expectCount; ++i) {
3227 expect=expectLimits[i];
3228 if(expect==length16) {
3229 expectLimits[i]=length8;
3230 } else {
3231 while(offsets[j]<expect) {
3232 ++j;
3233 }
3234 expectLimits[i]=j;
3235 }
3236 }
3237
3238 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3239 }
3240
nextCodePoint(UChar32 c)3241 static UChar32 nextCodePoint(UChar32 c) {
3242 // Skip some large and boring ranges.
3243 switch(c) {
3244 case 0x3441:
3245 return 0x4d7f;
3246 case 0x5100:
3247 return 0x9f00;
3248 case 0xb040:
3249 return 0xd780;
3250 case 0xe041:
3251 return 0xf8fe;
3252 case 0x10100:
3253 return 0x20000;
3254 case 0x20041:
3255 return 0xe0000;
3256 case 0xe0101:
3257 return 0x10fffd;
3258 default:
3259 return c+1;
3260 }
3261 }
3262
3263 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3264 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3265 // contains(U+FFFD) is inconsistent with contains(some surrogates),
3266 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3267 // Skip the UTF-8 part of the test - if the string contains surrogates -
3268 // because it is likely to produce a different result.
3269 UBool inconsistentSurrogates=
3270 (!(sets[0]->getSet().contains(0xfffd) ?
3271 sets[0]->getSet().contains(0xd800, 0xdfff) :
3272 sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3273 sets[0]->hasStringsWithSurrogates());
3274
3275 UChar s[1000];
3276 int32_t length=0;
3277 uint32_t localWhichSpans;
3278
3279 UChar32 c, first;
3280 for(first=c=0;; c=nextCodePoint(c)) {
3281 if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3282 localWhichSpans=whichSpans;
3283 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3284 localWhichSpans&=~SPAN_UTF8;
3285 }
3286 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3287 if(c>0x10ffff) {
3288 break;
3289 }
3290 length=0;
3291 first=c;
3292 }
3293 U16_APPEND_UNSAFE(s, length, c);
3294 }
3295 }
3296
3297 // Test with a particular, interesting string.
3298 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3299 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3300 static const UChar s[]={
3301 0x61, 0x62, 0x20, // Latin, space
3302 0x3b1, 0x3b2, 0x3b3, // Greek
3303 0xd900, // lead surrogate
3304 0x3000, 0x30ab, 0x30ad, // wide space, Katakana
3305 0xdc05, // trail surrogate
3306 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
3307 0xd900, 0xdc05, // unassigned supplementary
3308 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
3309 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
3310 0 // NUL
3311 };
3312
3313 if((whichSpans&SPAN_UTF16)==0) {
3314 return;
3315 }
3316 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3317 testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3318 }
3319
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3320 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3321 static const char s[]={
3322 "abc" // Latin
3323
3324 /* trail byte in lead position */
3325 "\x80"
3326
3327 " " // space
3328
3329 /* truncated multi-byte sequences */
3330 "\xd0"
3331 "\xe0"
3332 "\xe1"
3333 "\xed"
3334 "\xee"
3335 "\xf0"
3336 "\xf1"
3337 "\xf4"
3338 "\xf8"
3339 "\xfc"
3340
3341 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
3342
3343 /* trail byte in lead position */
3344 "\x80"
3345
3346 "\xe0\x80"
3347 "\xe0\xa0"
3348 "\xe1\x80"
3349 "\xed\x80"
3350 "\xed\xa0"
3351 "\xee\x80"
3352 "\xf0\x80"
3353 "\xf0\x90"
3354 "\xf1\x80"
3355 "\xf4\x80"
3356 "\xf4\x90"
3357 "\xf8\x80"
3358 "\xfc\x80"
3359
3360 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
3361
3362 /* trail byte in lead position */
3363 "\x80"
3364
3365 "\xf0\x80\x80"
3366 "\xf0\x90\x80"
3367 "\xf1\x80\x80"
3368 "\xf4\x80\x80"
3369 "\xf4\x90\x80"
3370 "\xf8\x80\x80"
3371 "\xfc\x80\x80"
3372
3373 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
3374
3375 /* trail byte in lead position */
3376 "\x80"
3377
3378 "\xf8\x80\x80\x80"
3379 "\xfc\x80\x80\x80"
3380
3381 "\xF1\x90\x80\x85" // unassigned supplementary
3382
3383 /* trail byte in lead position */
3384 "\x80"
3385
3386 "\xfc\x80\x80\x80\x80"
3387
3388 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
3389
3390 /* trail byte in lead position */
3391 "\x80"
3392
3393 /* complete sequences but non-shortest forms or out of range etc. */
3394 "\xc0\x80"
3395 "\xe0\x80\x80"
3396 "\xed\xa0\x80"
3397 "\xf0\x80\x80\x80"
3398 "\xf4\x90\x80\x80"
3399 "\xf8\x80\x80\x80\x80"
3400 "\xfc\x80\x80\x80\x80\x80"
3401 "\xfe"
3402 "\xff"
3403
3404 /* trail byte in lead position */
3405 "\x80"
3406
3407 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
3408 };
3409
3410 if((whichSpans&SPAN_UTF8)==0) {
3411 return;
3412 }
3413 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3414 testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3415 }
3416
// Multiply a list of span option words so that each copy carries exactly one
// of the alternative options a, b and c.
// Every existing entry is first reduced with mask; then
//   - the entry itself gets a,
//   - if b!=0, a copy with b is stored whichSpansCount entries further on,
//   - if b!=0 and c!=0, a copy with c is stored 2*whichSpansCount entries further on.
// If b==0, c is ignored and the entries are only modified with mask and a.
// The caller must provide room for all copies.
// Returns the new number of entries (1, 2 or 3 times whichSpansCount).
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // Number of alternatives that are actually in use.
    int32_t portions;
    if(b==0) {
        portions=1;
    } else if(c==0) {
        portions=2;
    } else {
        portions=3;
    }

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t base=whichSpans[k]&mask;
        whichSpans[k]=base|a;
        if(portions>=2) {
            whichSpans[whichSpansCount+k]=base|b;
        }
        if(portions==3) {
            whichSpans[2*whichSpansCount+k]=base|c;
        }
    }
    return portions*whichSpansCount;
}
3439
3440 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3441 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3442 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3443 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3444
TestSpan()3445 void UnicodeSetTest::TestSpan() {
3446 // "[...]" is a UnicodeSet pattern.
3447 // "*" performs tests on all Unicode code points and on a selection of
3448 // malformed UTF-8/16 strings.
3449 // "-options" limits the scope of testing for the current set.
3450 // By default, the test verifies that equivalent boundaries are found
3451 // for UTF-16 and UTF-8, going forward and backward,
3452 // alternating USET_SPAN_NOT_CONTAINED with
3453 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3454 // Single-character options:
3455 // 8 -- UTF-16 and UTF-8 boundaries may differ.
3456 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3457 // or the set contains strings with unpaired surrogates
3458 // which do not translate to valid UTF-8.
3459 // c -- set.span() and set.complement().span() boundaries may differ.
3460 // Cause: Set strings are not complemented.
3461 // b -- span() and spanBack() boundaries may differ.
3462 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3463 // and spanBack(USET_SPAN_SIMPLE) are defined to
3464 // match with non-overlapping substrings.
3465 // For example, with a set containing "ab" and "ba",
3466 // span() of "aba" yields boundaries { 0, 2, 3 }
3467 // because the initial "ab" matches from 0 to 2,
3468 // while spanBack() yields boundaries { 0, 1, 3 }
3469 // because the final "ba" matches from 1 to 3.
3470 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3471 // Cause: Strings in the set overlap, and a longer match may
3472 // require a sequence including non-longest substrings.
3473 // For example, with a set containing "ab", "abc" and "cd",
3474 // span(contained) of "abcd" spans the entire string
3475 // but span(longest match) only spans the first 3 characters.
3476 // Each "-options" first resets all options and then applies the specified options.
3477 // A "-" without options resets the options.
3478 // The options are also reset for each new set.
3479 // Other strings will be spanned.
3480 static const char *const testdata[]={
3481 "[:ID_Continue:]",
3482 "*",
3483 "[:White_Space:]",
3484 "*",
3485 "[]",
3486 "*",
3487 "[\\u0000-\\U0010FFFF]",
3488 "*",
3489 "[\\u0000\\u0080\\u0800\\U00010000]",
3490 "*",
3491 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3492 "*",
3493 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3494 "-c",
3495 "*",
3496 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3497 "-c",
3498 "*",
3499
3500 // Overlapping strings cause overlapping attempts to match.
3501 "[x{xy}{xya}{axy}{ax}]",
3502 "-cl",
3503
3504 // More repetitions of "xya" would take too long with the recursive
3505 // reference implementation.
3506 // containsAll()=FALSE
3507 // test_string 0x14
3508 "xx"
3509 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
3510 "xx" // set.complement().span(contained) will stop between the two 'x'es.
3511 "xyaxyaxyaxya"
3512 "xx"
3513 "xyaxyaxyaxya" // span() ends here.
3514 "aaa",
3515
3516 // containsAll()=TRUE
3517 // test_string 0x15
3518 "xx"
3519 "xyaxyaxyaxya"
3520 "xx"
3521 "xyaxyaxyaxya"
3522 "xx"
3523 "xyaxyaxyaxy",
3524
3525 "-bc",
3526 // test_string 0x17
3527 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
3528 "-c",
3529 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
3530 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
3531 "-",
3532 "byaya", // span() -> { 5 }
3533 "byay", // span() -> { 4 }
3534 "bya", // span() -> { 3 }
3535
3536 // span(longest match) will not span the whole string.
3537 "[a{ab}{bc}]",
3538 "-cl",
3539 // test_string 0x21
3540 "abc",
3541
3542 "[a{ab}{abc}{cd}]",
3543 "-cl",
3544 "acdabcdabccd",
3545
3546 // spanBack(longest match) will not span the whole string.
3547 "[c{ab}{bc}]",
3548 "-cl",
3549 "abc",
3550
3551 "[d{cd}{bcd}{ab}]",
3552 "-cl",
3553 "abbcdabcdabd",
3554
3555 // Test with non-ASCII set strings - test proper handling of surrogate pairs
3556 // and UTF-8 trail bytes.
3557 // Copies of above test sets and strings, but transliterated to have
3558 // different code points with similar trail units.
3559 // Previous: a b c d
3560 // Unicode: 042B 30AB 200AB 204AB
3561 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
3562 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
3563 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3564 "-cl",
3565 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3566
3567 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3568 "-cl",
3569 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3570
3571 // Stress bookkeeping and recursion.
3572 // The following strings are barely doable with the recursive
3573 // reference implementation.
3574 // The not-contained character at the end prevents an early exit from the span().
3575 "[b{bb}]",
3576 "-c",
3577 // test_string 0x33
3578 "bbbbbbbbbbbbbbbbbbbbbbbb-",
3579 // On complement sets, span() and spanBack() get different results
3580 // because b is not in the complement set and there is an odd number of b's
3581 // in the test string.
3582 "-bc",
3583 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3584
3585 // Test with set strings with an initial or final code point span
3586 // longer than 254.
3587 "[a{" _64_a _64_a _64_a _64_a "b}"
3588 "{a" _64_b _64_b _64_b _64_b "}]",
3589 "-c",
3590 _64_a _64_a _64_a _63_a "b",
3591 _64_a _64_a _64_a _64_a "b",
3592 _64_a _64_a _64_a _64_a "aaaabbbb",
3593 "a" _64_b _64_b _64_b _63_b,
3594 "a" _64_b _64_b _64_b _64_b,
3595 "aaaabbbb" _64_b _64_b _64_b _64_b,
3596
3597 // Test with strings containing unpaired surrogates.
3598 // They are not representable in UTF-8, and a leading trail surrogate
3599 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3600 // U+20001 == \\uD840\\uDC01
3601 // U+20400 == \\uD841\\uDC00
3602 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3603 "-8cl",
3604 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3605 };
3606 uint32_t whichSpans[96]={ SPAN_ALL };
3607 int32_t whichSpansCount=1;
3608
3609 UnicodeSet *sets[SET_COUNT]={ NULL };
3610 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3611
3612 char testName[1024];
3613 char *testNameLimit=testName;
3614
3615 int32_t i, j;
3616 for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3617 const char *s=testdata[i];
3618 if(s[0]=='[') {
3619 // Create new test sets from this pattern.
3620 for(j=0; j<SET_COUNT; ++j) {
3621 delete sets_with_str[j];
3622 delete sets[j];
3623 }
3624 UErrorCode errorCode=U_ZERO_ERROR;
3625 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3626 if(U_FAILURE(errorCode)) {
3627 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3628 break;
3629 }
3630 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3631 sets[SLOW_NOT]->complement();
3632 // Intermediate set: Test cloning of a frozen set.
3633 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3634 fast->freeze();
3635 sets[FAST]=(UnicodeSet *)fast->clone();
3636 delete fast;
3637 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3638 fastNot->freeze();
3639 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3640 delete fastNot;
3641
3642 for(j=0; j<SET_COUNT; ++j) {
3643 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3644 }
3645
3646 strcpy(testName, s);
3647 testNameLimit=strchr(testName, 0);
3648 *testNameLimit++=':';
3649 *testNameLimit=0;
3650
3651 whichSpans[0]=SPAN_ALL;
3652 whichSpansCount=1;
3653 } else if(s[0]=='-') {
3654 whichSpans[0]=SPAN_ALL;
3655 whichSpansCount=1;
3656
3657 while(*++s!=0) {
3658 switch(*s) {
3659 case 'c':
3660 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3661 ~SPAN_POLARITY,
3662 SPAN_SET,
3663 SPAN_COMPLEMENT,
3664 0);
3665 break;
3666 case 'b':
3667 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3668 ~SPAN_DIRS,
3669 SPAN_FWD,
3670 SPAN_BACK,
3671 0);
3672 break;
3673 case 'l':
3674 // test USET_SPAN_CONTAINED FWD & BACK, and separately
3675 // USET_SPAN_SIMPLE only FWD, and separately
3676 // USET_SPAN_SIMPLE only BACK
3677 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3678 ~(SPAN_DIRS|SPAN_CONDITION),
3679 SPAN_DIRS|SPAN_CONTAINED,
3680 SPAN_FWD|SPAN_SIMPLE,
3681 SPAN_BACK|SPAN_SIMPLE);
3682 break;
3683 case '8':
3684 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3685 ~SPAN_UTFS,
3686 SPAN_UTF16,
3687 SPAN_UTF8,
3688 0);
3689 break;
3690 default:
3691 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3692 break;
3693 }
3694 }
3695 } else if(0==strcmp(s, "*")) {
3696 strcpy(testNameLimit, "bad_string");
3697 for(j=0; j<whichSpansCount; ++j) {
3698 if(whichSpansCount>1) {
3699 sprintf(testNameLimit+10 /* strlen("bad_string") */,
3700 "%%0x%3x",
3701 whichSpans[j]);
3702 }
3703 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3704 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3705 }
3706
3707 strcpy(testNameLimit, "contents");
3708 for(j=0; j<whichSpansCount; ++j) {
3709 if(whichSpansCount>1) {
3710 sprintf(testNameLimit+8 /* strlen("contents") */,
3711 "%%0x%3x",
3712 whichSpans[j]);
3713 }
3714 testSpanContents(sets_with_str, whichSpans[j], testName);
3715 }
3716 } else {
3717 UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3718 strcpy(testNameLimit, "test_string");
3719 for(j=0; j<whichSpansCount; ++j) {
3720 if(whichSpansCount>1) {
3721 sprintf(testNameLimit+11 /* strlen("test_string") */,
3722 "%%0x%3x",
3723 whichSpans[j]);
3724 }
3725 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3726 }
3727 }
3728 }
3729 for(j=0; j<SET_COUNT; ++j) {
3730 delete sets_with_str[j];
3731 delete sets[j];
3732 }
3733 }
3734
3735 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3736 void UnicodeSetTest::TestStringSpan() {
3737 static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3738 static const char *const string=
3739 "xx"
3740 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3741 "xx"
3742 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3743 "xx"
3744 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3745 "aaaa";
3746
3747 UErrorCode errorCode=U_ZERO_ERROR;
3748 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3749 UnicodeSet set(pattern16, errorCode);
3750 if(U_FAILURE(errorCode)) {
3751 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3752 return;
3753 }
3754
3755 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3756
3757 if(set.containsAll(string16)) {
3758 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3759 }
3760
3761 // Remove trailing "aaaa".
3762 string16.truncate(string16.length()-4);
3763 if(!set.containsAll(string16)) {
3764 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3765 }
3766
3767 string16=UNICODE_STRING_SIMPLE("byayaxya");
3768 const UChar *s16=string16.getBuffer();
3769 int32_t length16=string16.length();
3770 (void)length16; // Suppress set but not used warning.
3771 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3772 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3773 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3774 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3775 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3776 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3777 ) {
3778 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3779 }
3780
3781 pattern="[a{ab}{abc}{cd}]";
3782 pattern16=UnicodeString(pattern, -1, US_INV);
3783 set.applyPattern(pattern16, errorCode);
3784 if(U_FAILURE(errorCode)) {
3785 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3786 return;
3787 }
3788 string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3789 s16=string16.getBuffer();
3790 length16=string16.length();
3791 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3792 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3793 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3794 ) {
3795 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3796 }
3797
3798 pattern="[d{cd}{bcd}{ab}]";
3799 pattern16=UnicodeString(pattern, -1, US_INV);
3800 set.applyPattern(pattern16, errorCode).freeze();
3801 if(U_FAILURE(errorCode)) {
3802 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3803 return;
3804 }
3805 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3806 s16=string16.getBuffer();
3807 length16=string16.length();
3808 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3809 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3810 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3811 ) {
3812 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3813 }
3814 }
3815