1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9 #include "unicode/uset.h"
10 #include "unicode/ustring.h"
11 #include "cintltst.h"
12 #include "cmemory.h"
13 #include <stdlib.h>
14 #include <string.h>
15 
16 #define TEST(x) addTest(root, &x, "uset/" # x)
17 
18 static void TestAPI(void);
19 static void Testj2269(void);
20 static void TestSerialized(void);
21 static void TestNonInvariantPattern(void);
22 static void TestBadPattern(void);
23 static void TestFreezable(void);
24 static void TestSpan(void);
25 
26 void addUSetTest(TestNode** root);
27 
28 static void expect(const USet* set,
29                    const char* inList,
30                    const char* outList,
31                    UErrorCode* ec);
32 static void expectContainment(const USet* set,
33                               const char* list,
34                               UBool isIn);
35 static char oneUCharToChar(UChar32 c);
36 static void expectItems(const USet* set,
37                         const char* items);
38 
39 void
addUSetTest(TestNode ** root)40 addUSetTest(TestNode** root) {
41     TEST(TestAPI);
42     TEST(Testj2269);
43     TEST(TestSerialized);
44     TEST(TestNonInvariantPattern);
45     TEST(TestBadPattern);
46     TEST(TestFreezable);
47     TEST(TestSpan);
48 }
49 
50 /*------------------------------------------------------------------
51  * Tests
52  *------------------------------------------------------------------*/
53 
Testj2269()54 static void Testj2269() {
55   UErrorCode status = U_ZERO_ERROR;
56   UChar a[4] = { 0x61, 0x62, 0x63, 0 };
57   USet *s = uset_open(1, 0);
58   uset_addString(s, a, 3);
59   a[0] = 0x63; a[1] = 0x63;
60   expect(s, "{abc}", "{ccc}", &status);
61   uset_close(s);
62 }
63 
64 static const UChar PAT[] = {91,97,45,99,123,97,98,125,93,0}; /* "[a-c{ab}]" */
65 static const int32_t PAT_LEN = UPRV_LENGTHOF(PAT) - 1;
66 
67 static const UChar PAT_lb[] = {0x6C, 0x62, 0}; /* "lb" */
68 static const int32_t PAT_lb_LEN = UPRV_LENGTHOF(PAT_lb) - 1;
69 
70 static const UChar VAL_SP[] = {0x53, 0x50, 0}; /* "SP" */
71 static const int32_t VAL_SP_LEN = UPRV_LENGTHOF(VAL_SP) - 1;
72 
73 static const UChar STR_bc[] = {98,99,0}; /* "bc" */
74 static const int32_t STR_bc_LEN = UPRV_LENGTHOF(STR_bc) - 1;
75 
76 static const UChar STR_ab[] = {97,98,0}; /* "ab" */
77 static const int32_t STR_ab_LEN = UPRV_LENGTHOF(STR_ab) - 1;
78 
79 /**
80  * Basic API test for uset.x
81  */
TestAPI()82 static void TestAPI() {
83     USet* set;
84     USet* set2;
85     UErrorCode ec;
86 
87     /* [] */
88     set = uset_openEmpty();
89     expect(set, "", "abc{ab}", NULL);
90     uset_close(set);
91 
92     set = uset_open(1, 0);
93     expect(set, "", "abc{ab}", NULL);
94     uset_close(set);
95 
96     set = uset_open(1, 1);
97     uset_clear(set);
98     expect(set, "", "abc{ab}", NULL);
99     uset_close(set);
100 
101     /* [ABC] */
102     set = uset_open(0x0041, 0x0043);
103     expect(set, "ABC", "DEF{ab}", NULL);
104     uset_close(set);
105 
106     /* [a-c{ab}] */
107     ec = U_ZERO_ERROR;
108     set = uset_openPattern(PAT, PAT_LEN, &ec);
109     if(U_FAILURE(ec)) {
110         log_err("uset_openPattern([a-c{ab}]) failed - %s\n", u_errorName(ec));
111         return;
112     }
113     if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) {
114         log_err("uset_resemblesPattern of PAT failed\n");
115     }
116     expect(set, "abc{ab}", "def{bc}", &ec);
117 
118     /* [a-d{ab}] */
119     uset_add(set, 0x64);
120     expect(set, "abcd{ab}", "ef{bc}", NULL);
121 
122     /* [acd{ab}{bc}] */
123     uset_remove(set, 0x62);
124     uset_addString(set, STR_bc, STR_bc_LEN);
125     expect(set, "acd{ab}{bc}", "bef{cd}", NULL);
126 
127     /* [acd{bc}] */
128     uset_removeString(set, STR_ab, STR_ab_LEN);
129     expect(set, "acd{bc}", "bfg{ab}", NULL);
130 
131     /* [^acd{bc}] */
132     uset_complement(set);
133     expect(set, "bef{bc}", "acd{ac}", NULL);
134 
135     /* [a-e{bc}] */
136     uset_complement(set);
137     uset_addRange(set, 0x0062, 0x0065);
138     expect(set, "abcde{bc}", "fg{ab}", NULL);
139 
140     /* [de{bc}] */
141     uset_removeRange(set, 0x0050, 0x0063);
142     expect(set, "de{bc}", "bcfg{ab}", NULL);
143 
144     /* [g-l] */
145     uset_set(set, 0x0067, 0x006C);
146     expect(set, "ghijkl", "de{bc}", NULL);
147 
148     if (uset_indexOf(set, 0x0067) != 0) {
149         log_err("uset_indexOf failed finding correct index of 'g'\n");
150     }
151 
152     if (uset_charAt(set, 0) != 0x0067) {
153         log_err("uset_charAt failed finding correct char 'g' at index 0\n");
154     }
155 
156     /* How to test this one...? */
157     uset_compact(set);
158 
159     /* [g-i] */
160     uset_retain(set, 0x0067, 0x0069);
161     expect(set, "ghi", "dejkl{bc}", NULL);
162 
163     /* UCHAR_ASCII_HEX_DIGIT */
164     uset_applyIntPropertyValue(set, UCHAR_ASCII_HEX_DIGIT, 1, &ec);
165     if(U_FAILURE(ec)) {
166         log_err("uset_applyIntPropertyValue([UCHAR_ASCII_HEX_DIGIT]) failed - %s\n", u_errorName(ec));
167         return;
168     }
169     expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL);
170 
171     /* [ab] */
172     uset_clear(set);
173     uset_addAllCodePoints(set, STR_ab, STR_ab_LEN);
174     expect(set, "ab", "def{ab}", NULL);
175     if (uset_containsAllCodePoints(set, STR_bc, STR_bc_LEN)){
176         log_err("set should not conatin all characters of \"bc\" \n");
177     }
178 
179     /* [] */
180     set2 = uset_open(1, 1);
181     uset_clear(set2);
182 
183     /* space */
184     uset_applyPropertyAlias(set2, PAT_lb, PAT_lb_LEN, VAL_SP, VAL_SP_LEN, &ec);
185     expect(set2, " ", "abcdefghi{bc}", NULL);
186 
187     /* [a-c] */
188     uset_set(set2, 0x0061, 0x0063);
189     /* [g-i] */
190     uset_set(set, 0x0067, 0x0069);
191 
192     /* [a-c g-i] */
193     if (uset_containsSome(set, set2)) {
194         log_err("set should not contain some of set2 yet\n");
195     }
196     uset_complementAll(set, set2);
197     if (!uset_containsSome(set, set2)) {
198         log_err("set should contain some of set2\n");
199     }
200     expect(set, "abcghi", "def{bc}", NULL);
201 
202     /* [g-i] */
203     uset_removeAll(set, set2);
204     expect(set, "ghi", "abcdef{bc}", NULL);
205 
206     /* [a-c g-i] */
207     uset_addAll(set2, set);
208     expect(set2, "abcghi", "def{bc}", NULL);
209 
210     /* [g-i] */
211     uset_retainAll(set2, set);
212     expect(set2, "ghi", "abcdef{bc}", NULL);
213 
214     uset_close(set);
215     uset_close(set2);
216 }
217 
218 /*------------------------------------------------------------------
219  * Support
220  *------------------------------------------------------------------*/
221 
222 /**
223  * Verifies that the given set contains the characters and strings in
224  * inList, and does not contain those in outList.  Also verifies that
225  * 'set' is not NULL and that 'ec' succeeds.
226  * @param set the set to test, or NULL (on error)
227  * @param inList list of set contents, in iteration order.  Format is
228  * list of individual strings, in iteration order, followed by sorted
229  * list of strings, delimited by {}.  This means we do not test
230  * characters '{' or '}' and we do not test strings containing those
231  * characters either.
232  * @param outList list of things not in the set.  Same format as
233  * inList.
234  * @param ec an error code, checked for success.  May be NULL in which
235  * case it is ignored.
236  */
expect(const USet * set,const char * inList,const char * outList,UErrorCode * ec)237 static void expect(const USet* set,
238                    const char* inList,
239                    const char* outList,
240                    UErrorCode* ec) {
241     if (ec!=NULL && U_FAILURE(*ec)) {
242         log_err("FAIL: %s\n", u_errorName(*ec));
243         return;
244     }
245     if (set == NULL) {
246         log_err("FAIL: USet is NULL\n");
247         return;
248     }
249     expectContainment(set, inList, TRUE);
250     expectContainment(set, outList, FALSE);
251     expectItems(set, inList);
252 }
253 
expectContainment(const USet * set,const char * list,UBool isIn)254 static void expectContainment(const USet* set,
255                               const char* list,
256                               UBool isIn) {
257     const char* p = list;
258     UChar ustr[4096];
259     char *pat;
260     UErrorCode ec;
261     int32_t rangeStart = -1, rangeEnd = -1, length;
262 
263     ec = U_ZERO_ERROR;
264     length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec);
265     if(U_FAILURE(ec)) {
266         log_err("FAIL: uset_toPattern() fails in expectContainment() - %s\n", u_errorName(ec));
267         return;
268     }
269     pat=aescstrdup(ustr, length);
270 
271     while (*p) {
272         if (*p=='{') {
273             const char* stringStart = ++p;
274             int32_t stringLength = 0;
275             char strCopy[64];
276 
277             while (*p++ != '}') {
278             }
279             stringLength = (int32_t)(p - stringStart - 1);
280             strncpy(strCopy, stringStart, stringLength);
281             strCopy[stringLength] = 0;
282 
283             u_charsToUChars(stringStart, ustr, stringLength);
284 
285             if (uset_containsString(set, ustr, stringLength) == isIn) {
286                 log_verbose("Ok: %s %s \"%s\"\n", pat,
287                             (isIn ? "contains" : "does not contain"),
288                             strCopy);
289             } else {
290                 log_data_err("FAIL: %s %s \"%s\" (Are you missing data?)\n", pat,
291                         (isIn ? "does not contain" : "contains"),
292                         strCopy);
293             }
294         }
295 
296         else {
297             UChar32 c;
298 
299             u_charsToUChars(p, ustr, 1);
300             c = ustr[0];
301 
302             if (uset_contains(set, c) == isIn) {
303                 log_verbose("Ok: %s %s '%c'\n", pat,
304                             (isIn ? "contains" : "does not contain"),
305                             *p);
306             } else {
307                 log_data_err("FAIL: %s %s '%c' (Are you missing data?)\n", pat,
308                         (isIn ? "does not contain" : "contains"),
309                         *p);
310             }
311 
312             /* Test the range API too by looking for ranges */
313             if (c == rangeEnd+1) {
314                 rangeEnd = c;
315             } else {
316                 if (rangeStart >= 0) {
317                     if (uset_containsRange(set, rangeStart, rangeEnd) == isIn) {
318                         log_verbose("Ok: %s %s U+%04X-U+%04X\n", pat,
319                                     (isIn ? "contains" : "does not contain"),
320                                     rangeStart, rangeEnd);
321                     } else {
322                         log_data_err("FAIL: %s %s U+%04X-U+%04X (Are you missing data?)\n", pat,
323                                 (isIn ? "does not contain" : "contains"),
324                                 rangeStart, rangeEnd);
325                     }
326                 }
327                 rangeStart = rangeEnd = c;
328             }
329 
330             ++p;
331         }
332     }
333 
334     if (rangeStart >= 0) {
335         if (uset_containsRange(set, rangeStart, rangeEnd) == isIn) {
336             log_verbose("Ok: %s %s U+%04X-U+%04X\n", pat,
337                         (isIn ? "contains" : "does not contain"),
338                         rangeStart, rangeEnd);
339         } else {
340             log_data_err("FAIL: %s %s U+%04X-U+%04X (Are you missing data?)\n", pat,
341                     (isIn ? "does not contain" : "contains"),
342                     rangeStart, rangeEnd);
343         }
344     }
345 }
346 
347 /* This only works for invariant BMP chars */
oneUCharToChar(UChar32 c)348 static char oneUCharToChar(UChar32 c) {
349     UChar ubuf[1];
350     char buf[1];
351     ubuf[0] = (UChar) c;
352     u_UCharsToChars(ubuf, buf, 1);
353     return buf[0];
354 }
355 
expectItems(const USet * set,const char * items)356 static void expectItems(const USet* set,
357                         const char* items) {
358     const char* p = items;
359     UChar ustr[4096], itemStr[4096];
360     char buf[4096];
361     char *pat;
362     UErrorCode ec;
363     int32_t expectedSize = 0;
364     int32_t itemCount = uset_getItemCount(set);
365     int32_t itemIndex = 0;
366     UChar32 start = 1, end = 0;
367     int32_t itemLen = 0, length;
368 
369     ec = U_ZERO_ERROR;
370     length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec);
371     if (U_FAILURE(ec)) {
372         log_err("FAIL: uset_toPattern => %s\n", u_errorName(ec));
373         return;
374     }
375     pat=aescstrdup(ustr, length);
376 
377     if (uset_isEmpty(set) != (strlen(items)==0)) {
378         log_data_err("FAIL: %s should return %s from isEmpty (Are you missing data?)\n",
379                 pat,
380                 strlen(items)==0 ? "TRUE" : "FALSE");
381     }
382 
383     /* Don't test patterns starting with "[^" */
384     if (u_strlen(ustr) > 2 && ustr[1] == 0x5e /*'^'*/) {
385         return;
386     }
387 
388     while (*p) {
389 
390         ++expectedSize;
391 
392         if (start > end || start == -1) {
393             /* Fetch our next item */
394             if (itemIndex >= itemCount) {
395                 log_data_err("FAIL: ran out of items iterating %s (Are you missing data?)\n", pat);
396                 return;
397             }
398 
399             itemLen = uset_getItem(set, itemIndex, &start, &end,
400                                    itemStr, sizeof(itemStr), &ec);
401             if (U_FAILURE(ec) || itemLen < 0) {
402                 log_err("FAIL: uset_getItem => %s\n", u_errorName(ec));
403                 return;
404             }
405 
406             if (itemLen == 0) {
407                 log_verbose("Ok: %s item %d is %c-%c\n", pat,
408                             itemIndex, oneUCharToChar(start),
409                             oneUCharToChar(end));
410             } else {
411                 itemStr[itemLen] = 0;
412                 u_UCharsToChars(itemStr, buf, itemLen+1);
413                 log_verbose("Ok: %s item %d is \"%s\"\n", pat, itemIndex, buf);
414             }
415 
416             ++itemIndex;
417         }
418 
419         if (*p=='{') {
420             const char* stringStart = ++p;
421             int32_t stringLength = 0;
422             char strCopy[64];
423 
424             while (*p++ != '}') {
425             }
426             stringLength = (int32_t)(p - stringStart - 1);
427             strncpy(strCopy, stringStart, stringLength);
428             strCopy[stringLength] = 0;
429 
430             u_charsToUChars(stringStart, ustr, stringLength);
431             ustr[stringLength] = 0;
432 
433             if (itemLen == 0) {
434                 log_err("FAIL: for %s expect \"%s\" next, but got a char\n",
435                         pat, strCopy);
436                 return;
437             }
438 
439             if (u_strcmp(ustr, itemStr) != 0) {
440                 log_err("FAIL: for %s expect \"%s\" next\n",
441                         pat, strCopy);
442                 return;
443             }
444         }
445 
446         else {
447             UChar32 c;
448 
449             u_charsToUChars(p, ustr, 1);
450             c = ustr[0];
451 
452             if (itemLen != 0) {
453                 log_err("FAIL: for %s expect '%c' next, but got a string\n",
454                         pat, *p);
455                 return;
456             }
457 
458             if (c != start++) {
459                 log_err("FAIL: for %s expect '%c' next\n",
460                         pat, *p);
461                 return;
462             }
463 
464             ++p;
465         }
466     }
467 
468     if (uset_size(set) == expectedSize) {
469         log_verbose("Ok: %s size is %d\n", pat, expectedSize);
470     } else {
471         log_err("FAIL: %s size is %d, expected %d\n",
472                 pat, uset_size(set), expectedSize);
473     }
474 }
475 
476 static void
TestSerialized()477 TestSerialized() {
478     uint16_t buffer[1000];
479     USerializedSet sset;
480     USet *set;
481     UErrorCode errorCode;
482     UChar32 c;
483     int32_t length;
484 
485     /* use a pattern that generates both BMP and supplementary code points */
486     U_STRING_DECL(pattern, "[:Cf:]", 6);
487     U_STRING_INIT(pattern, "[:Cf:]", 6);
488 
489     errorCode=U_ZERO_ERROR;
490     set=uset_openPattern(pattern, -1, &errorCode);
491     if(U_FAILURE(errorCode)) {
492         log_data_err("uset_openPattern([:Cf:]) failed - %s (Are you missing data?)\n", u_errorName(errorCode));
493         return;
494     }
495 
496     length=uset_serialize(set, buffer, UPRV_LENGTHOF(buffer), &errorCode);
497     if(U_FAILURE(errorCode)) {
498         log_err("unable to uset_serialize([:Cf:]) - %s\n", u_errorName(errorCode));
499         uset_close(set);
500         return;
501     }
502 
503     uset_getSerializedSet(&sset, buffer, length);
504     for(c=0; c<=0x10ffff; ++c) {
505         if(uset_contains(set, c)!=uset_serializedContains(&sset, c)) {
506             log_err("uset_contains(U+%04x)!=uset_serializedContains(U+%04x)\n", c);
507             break;
508         }
509     }
510 
511     uset_close(set);
512 }
513 
514 /**
515  * Make sure that when non-invariant chars are passed to uset_openPattern
516  * they do not cause an ugly failure mode (e.g. assertion failure).
517  * JB#3795.
518  */
519 static void
TestNonInvariantPattern()520 TestNonInvariantPattern() {
521     UErrorCode ec = U_ZERO_ERROR;
522     /* The critical part of this test is that the following pattern
523        must contain a non-invariant character. */
524     static const char *pattern = "[:ccc!=0:]";
525     UChar buf[256];
526     int32_t len = u_unescape(pattern, buf, 256);
527     /* This test 'fails' by having an assertion failure within the
528        following call.  It passes by running to completion with no
529        assertion failure. */
530     USet *set = uset_openPattern(buf, len, &ec);
531     uset_close(set);
532 }
533 
TestBadPattern(void)534 static void TestBadPattern(void) {
535     UErrorCode status = U_ZERO_ERROR;
536     USet *pat;
537     U_STRING_DECL(pattern, "[", 1);
538     U_STRING_INIT(pattern, "[", 1);
539     pat = uset_openPatternOptions(pattern, u_strlen(pattern), 0, &status);
540     if (pat != NULL || U_SUCCESS(status)) {
541         log_err("uset_openPatternOptions did not fail as expected %s\n", u_errorName(status));
542     }
543 }
544 
openIDSet()545 static USet *openIDSet() {
546     UErrorCode errorCode = U_ZERO_ERROR;
547     U_STRING_DECL(pattern, "[:ID_Continue:]", 15);
548     U_STRING_INIT(pattern, "[:ID_Continue:]", 15);
549     return uset_openPattern(pattern, 15, &errorCode);
550 }
551 
TestFreezable()552 static void TestFreezable() {
553     USet *idSet;
554     USet *frozen;
555     USet *thawed;
556 
557     idSet=openIDSet();
558 
559     if (idSet == NULL) {
560         log_data_err("openIDSet() returned NULL. (Are you missing data?)\n");
561         uset_close(idSet);
562         return;
563     }
564 
565     frozen=uset_clone(idSet);
566 
567     if (frozen == NULL) {
568         log_err("uset_Clone() returned NULL\n");
569         return;
570     }
571 
572     if(!uset_equals(frozen, idSet)) {
573         log_err("uset_clone() did not make an equal copy\n");
574     }
575 
576     uset_freeze(frozen);
577     uset_addRange(frozen, 0xd802, 0xd805);
578 
579     if(uset_isFrozen(idSet) || !uset_isFrozen(frozen) || !uset_equals(frozen, idSet)) {
580         log_err("uset_freeze() or uset_isFrozen() does not work\n");
581     }
582 
583     thawed=uset_cloneAsThawed(frozen);
584 
585     if (thawed == NULL) {
586         log_err("uset_cloneAsThawed(frozen) returned NULL");
587         uset_close(frozen);
588         uset_close(idSet);
589         return;
590     }
591 
592     uset_addRange(thawed, 0xd802, 0xd805);
593 
594     if(uset_isFrozen(thawed) || uset_equals(thawed, idSet) || !uset_containsRange(thawed, 0xd802, 0xd805)) {
595         log_err("uset_cloneAsThawed() does not work\n");
596     }
597 
598     uset_close(idSet);
599     uset_close(frozen);
600     uset_close(thawed);
601 }
602 
TestSpan()603 static void TestSpan() {
604     static const UChar s16[2]={ 0xe01, 0x3000 };
605     static const char* s8="\xE0\xB8\x81\xE3\x80\x80";
606 
607     USet *idSet=openIDSet();
608 
609     if (idSet == NULL) {
610         log_data_err("openIDSet() returned NULL (Are you missing data?)\n");
611         return;
612     }
613 
614     if(
615         1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) ||
616         0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) ||
617         2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) ||
618         1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED)
619     ) {
620         log_err("uset_span() or uset_spanBack() does not work\n");
621     }
622 
623     if(
624         3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
625         0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) ||
626         6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
627         3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED)
628     ) {
629         log_err("uset_spanUTF8() or uset_spanBackUTF8() does not work\n");
630     }
631 
632     uset_freeze(idSet);
633 
634     if(
635         1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) ||
636         0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) ||
637         2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) ||
638         1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED)
639     ) {
640         log_err("uset_span(frozen) or uset_spanBack(frozen) does not work\n");
641     }
642 
643     if(
644         3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
645         0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) ||
646         6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
647         3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED)
648     ) {
649         log_err("uset_spanUTF8(frozen) or uset_spanBackUTF8(frozen) does not work\n");
650     }
651 
652     uset_close(idSet);
653 }
654 
655 /*eof*/
656