1 /*
2 **********************************************************************
3 * Copyright (C) 2011-2015, International Business Machines Corporation
4 * and others.  All Rights Reserved.
5 **********************************************************************
6 */
7 /**
8  * IntlTestSpoof tests for USpoofDetector
9  */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
14 
15 #include "itspoof.h"
16 
17 #include "unicode/normlzr.h"
18 #include "unicode/regex.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uscript.h"
21 #include "unicode/uspoof.h"
22 
23 #include "cstring.h"
24 #include "identifier_info.h"
25 #include "scriptset.h"
26 #include "uhash.h"
27 
28 #include <stdlib.h>
29 #include <stdio.h>
30 
// Assertion helpers for this test file.  Each macro expands to a braced block
// that reports a failure, tagged with __FILE__ and __LINE__, and then lets the
// test continue (none of them return from the enclosing function).

// Report (through errcheckln) when an ICU UErrorCode holds a failure value.
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
    errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}}

// Report (through errln) when expr compares equal to FALSE.
// The text of expr is included in the message.
#define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
    errln("Test Failure at file %s, line %d: \"%s\" is false.", __FILE__, __LINE__, #expr);};}

// Same check as TEST_ASSERT, but reports through dataerrln and adds the
// caller-supplied C string msg to the message.
#define TEST_ASSERT_MSG(expr, msg) {if ((expr)==FALSE) { \
    dataerrln("Test Failure at file %s, line %d, %s: \"%s\" is false.", __FILE__, __LINE__, msg, #expr);};}

// Report when a != b.  Both values are printed with %d.
// NOTE: a and b each appear twice in the expansion (once in the comparison,
// once in the message), so an argument with side effects is evaluated again
// when the assertion fails.
#define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
    errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d)", \
             __FILE__, __LINE__, #a, (a), #b, (b)); }}

// Report when a == b.  Same formatting and double-evaluation caveat as
// TEST_ASSERT_EQ.
#define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \
    errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d)", \
             __FILE__, __LINE__, #a, (a), #b, (b)); }}
47 
/*
 *   TEST_SETUP and TEST_TEARDOWN
 *         Macros to handle the boilerplate around setting up a test case.
 *         Put arbitrary test code between SETUP and TEARDOWN.
 *
 *         TEST_SETUP opens a new brace scope that declares
 *             UErrorCode     status   (initialized to U_ZERO_ERROR)
 *             USpoofChecker *sc       (result of uspoof_open(&status))
 *         and then opens an "if (U_SUCCESS(status))" block, so the test code
 *         only runs when the checker was opened successfully.
 *
 *         TEST_TEARDOWN closes that "if" block, asserts that status is still
 *         a success value, closes sc with uspoof_close(), and closes the scope
 *         opened by TEST_SETUP.  The two macros must always be used as a pair.
 */
#define TEST_SETUP {  \
    UErrorCode status = U_ZERO_ERROR; \
    USpoofChecker *sc;     \
    sc = uspoof_open(&status);  \
    TEST_ASSERT_SUCCESS(status);   \
    if (U_SUCCESS(status)){

#define TEST_TEARDOWN  \
    }  \
    TEST_ASSERT_SUCCESS(status);  \
    uspoof_close(sc);  \
}
66 
67 
68 
69 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)70 void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
71 {
72     if (exec) logln("TestSuite spoof: ");
73     switch (index) {
74         case 0:
75             name = "TestSpoofAPI";
76             if (exec) {
77                 testSpoofAPI();
78             }
79             break;
80         case 1:
81             name = "TestSkeleton";
82             if (exec) {
83                 testSkeleton();
84             }
85             break;
86         case 2:
87             name = "TestAreConfusable";
88             if (exec) {
89                 testAreConfusable();
90             }
91             break;
92         case 3:
93             name = "TestInvisible";
94             if (exec) {
95                 testInvisible();
96             }
97             break;
98         case 4:
99             name = "testConfData";
100             if (exec) {
101                 testConfData();
102             }
103             break;
104         case 5:
105             name = "testBug8654";
106             if (exec) {
107                 testBug8654();
108             }
109             break;
110         case 6:
111             name = "testIdentifierInfo";
112             if (exec) {
113                 testIdentifierInfo();
114             }
115             break;
116         case 7:
117             name = "testScriptSet";
118             if (exec) {
119                 testScriptSet();
120             }
121             break;
122         case 8:
123             name = "testRestrictionLevel";
124             if (exec) {
125                 testRestrictionLevel();
126             }
127             break;
128        case 9:
129             name = "testMixedNumbers";
130             if (exec) {
131                 testMixedNumbers();
132             }
133             break;
134 
135 
136         default: name=""; break;
137     }
138 }
139 
testSpoofAPI()140 void IntlTestSpoof::testSpoofAPI() {
141 
142     TEST_SETUP
143         UnicodeString s("xyz");  // Many latin ranges are whole-script confusable with other scripts.
144                                  // If this test starts failing, consult confusablesWholeScript.txt
145         int32_t position = 666;
146         int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
147         TEST_ASSERT_SUCCESS(status);
148         TEST_ASSERT_EQ(0, checkResults);
149         TEST_ASSERT_EQ(0, position);
150     TEST_TEARDOWN;
151 
152     TEST_SETUP
153         UnicodeString s1("cxs");
154         UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape();  // Cyrillic "cxs"
155         int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
156         TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
157 
158     TEST_TEARDOWN;
159 
160     TEST_SETUP
161         UnicodeString s("I1l0O");
162         UnicodeString dest;
163         UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status);
164         TEST_ASSERT_SUCCESS(status);
165         TEST_ASSERT(UnicodeString("lllOO") == dest);
166         TEST_ASSERT(&dest == &retStr);
167     TEST_TEARDOWN;
168 }
169 
170 
// Run one skeleton test case through checkSkeleton(), passing the line number
// of the macro invocation so that a failure can be traced back to the test case.
// Uses a variable named sc from the invoking scope as the spoof checker
// (TEST_SETUP declares one).
#define CHECK_SKELETON(type, input, expected) { \
    checkSkeleton(sc, type, input, expected, __LINE__); \
    }
174 
175 
// testSkeleton.   Spot check a number of confusable skeleton substitutions from the
//                 Unicode data file confusables.txt
//                 Test cases chosen for substitutions of various lengths, and
//                 membership in different mapping tables.
//          Note: for ICU 55, all tables collapsed to the MA table data.
//                Accordingly, every group of cases below expects the same
//                skeleton for all four type values (SL, SA, ML, MA).
//          TODO: for ICU 56 with Unicode 8, revisit this test.
//
void IntlTestSpoof::testSkeleton() {
    // Short names for the four "type" argument values passed to the skeleton
    // function; they are named after the confusables.txt table labels.
    const uint32_t ML = 0;
    const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    const uint32_t MA = USPOOF_ANY_CASE;
    const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;

    TEST_SETUP
        // Input with nothing to substitute comes back unchanged.
        CHECK_SKELETON(SL, "nochange", "nochange");
        CHECK_SKELETON(SA, "nochange", "nochange");
        CHECK_SKELETON(ML, "nochange", "nochange");
        CHECK_SKELETON(MA, "nochange", "nochange");
        CHECK_SKELETON(MA, "love", "love");
        CHECK_SKELETON(MA, "1ove", "love");   // Digit 1 to letter l
        CHECK_SKELETON(ML, "OOPS", "OOPS");
        CHECK_SKELETON(ML, "00PS", "OOPS");   // Digit 0 to letter O
        CHECK_SKELETON(MA, "OOPS", "OOPS");
        CHECK_SKELETON(MA, "00PS", "OOPS");   // Digit 0 to letter O (same expectation as the ML case above)
        // Substitutions whose replacement is 1, 3, 4 and 8 code units long.
        CHECK_SKELETON(SL, "\\u059c", "\\u0301");
        CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D");
        CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u006C\\u006C\\u0029");  // "(ll)"
        CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u006c\\u0644\\u006f");

        // Original note: this mapping was listed in the ML and MA tables only
        //     0C83 ;  0983 ;  ML
        //     0C83 ;  0983 ;  MA
        // The cases below nevertheless expect it for all four types
        // (see the ICU 55 note in the function header).

        CHECK_SKELETON(SL, "\\u0C83", "\\u0983");
        CHECK_SKELETON(SA, "\\u0C83", "\\u0983");
        CHECK_SKELETON(ML, "\\u0C83", "\\u0983");
        CHECK_SKELETON(MA, "\\u0C83", "\\u0983");

        // Original note: 0391 mappings exist only in MA and SA tables.
        // The same skeleton is expected for ML and SL here as well.
        CHECK_SKELETON(MA, "\\u0391", "A");
        CHECK_SKELETON(SA, "\\u0391", "A");
        CHECK_SKELETON(ML, "\\u0391", "A");
        CHECK_SKELETON(SL, "\\u0391", "A");

        // Original note: 13CF mappings in all four tables, different in MA.
        // All four types are expected to give "b" here.
        CHECK_SKELETON(ML, "\\u13CF", "b");
        CHECK_SKELETON(MA, "\\u13CF", "b");
        CHECK_SKELETON(SL, "\\u13CF", "b");
        CHECK_SKELETON(SA, "\\u13CF", "b");

        // 0022 ;  0027 0027 ;
        // all tables.
        CHECK_SKELETON(SL, "\\u0022", "\\u0027\\u0027");
        CHECK_SKELETON(SA, "\\u0022", "\\u0027\\u0027");
        CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027");
        CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027");

        // Original note: 017F mappings exist only in MA and SA tables.
        // The same skeleton is expected for ML and SL here as well.
        CHECK_SKELETON(MA, "\\u017F", "f");
        CHECK_SKELETON(SA, "\\u017F", "f");
        CHECK_SKELETON(ML, "\\u017F", "f");
        CHECK_SKELETON(SL, "\\u017F", "f");

    TEST_TEARDOWN;
}
242 
243 
244 //
245 //  Run a single confusable skeleton transformation test case.
246 //
checkSkeleton(const USpoofChecker * sc,uint32_t type,const char * input,const char * expected,int32_t lineNum)247 void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
248                                   const char *input, const char *expected, int32_t lineNum) {
249     UnicodeString uInput = UnicodeString(input).unescape();
250     UnicodeString uExpected = UnicodeString(expected).unescape();
251 
252     UErrorCode status = U_ZERO_ERROR;
253     UnicodeString actual;
254     uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status);
255     if (U_FAILURE(status)) {
256         errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum,
257               u_errorName(status));
258         return;
259     }
260     if (uExpected != actual) {
261         errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
262                __FILE__, __LINE__, lineNum);
263         errln(UnicodeString(" Actual   Skeleton: \"") + actual + UnicodeString("\"\n") +
264               UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\""));
265     }
266 }
267 
testAreConfusable()268 void IntlTestSpoof::testAreConfusable() {
269     TEST_SETUP
270         UnicodeString s1("A long string that will overflow stack buffers.  A long string that will overflow stack buffers. "
271                          "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. ");
272         UnicodeString s2("A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. "
273                          "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. ");
274         TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status));
275         TEST_ASSERT_SUCCESS(status);
276 
277     TEST_TEARDOWN;
278 }
279 
testInvisible()280 void IntlTestSpoof::testInvisible() {
281     TEST_SETUP
282         UnicodeString  s = UnicodeString("abcd\\u0301ef").unescape();
283         int32_t position = -42;
284         TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status));
285         TEST_ASSERT_SUCCESS(status);
286         TEST_ASSERT(0 == position);
287 
288         UnicodeString  s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
289         TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status));
290         TEST_ASSERT_SUCCESS(status);
291         TEST_ASSERT_EQ(0, position);
292 
293         // Two acute accents, one from the composed a with acute accent, \u00e1,
294         // and one separate.
295         position = -42;
296         UnicodeString  s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
297         TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status));
298         TEST_ASSERT_SUCCESS(status);
299         TEST_ASSERT_EQ(0, position);
300     TEST_TEARDOWN;
301 }
302 
testBug8654()303 void IntlTestSpoof::testBug8654() {
304     TEST_SETUP
305         UnicodeString s = UnicodeString("B\\u00c1\\u0301").unescape();
306         int32_t position = -42;
307         TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s, &position, &status) & USPOOF_INVISIBLE );
308         TEST_ASSERT_SUCCESS(status);
309         TEST_ASSERT_EQ(0, position);
310     TEST_TEARDOWN;
311 }
312 
//
// Convert a series of hex numbers in a UnicodeString to a string holding the
// corresponding code points.
//
//   - Hex digits (0-9, A-F, a-f) accumulate into the current code point.
//   - A space (U+0020) ends the current number; it is appended only if its
//     value is greater than zero.
//   - Any other character is ignored.
//   - A number still pending at the end of the input is appended the same way.
//
static UnicodeString parseHex(const UnicodeString &in) {
    UnicodeString decoded;
    UChar32 codePoint = 0;
    const int32_t inputLength = in.length();

    for (int32_t idx = 0; idx < inputLength; ++idx) {
        const UChar ch = in.charAt(idx);

        if (ch >= 0x30 && ch <= 0x39) {                  // '0'..'9'
            codePoint = (codePoint << 4) + (ch - 0x30);
            continue;
        }

        const UBool isHexLetter = (ch >= 0x41 && ch <= 0x46) ||   // 'A'..'F'
                                  (ch >= 0x61 && ch <= 0x66);     // 'a'..'f'
        if (isHexLetter) {
            // Low nibble of 'A'/'a' is 1, so adding 9 yields 10..15.
            codePoint = (codePoint << 4) + (ch & 0x0f) + 9;
            continue;
        }

        if (ch == 0x20 && codePoint > 0) {               // Space ends a number.
            decoded.append(codePoint);
            codePoint = 0;
        }
        // Anything else is bad input and is currently skipped.
    }

    if (codePoint > 0) {
        decoded.append(codePoint);
    }
    return decoded;
}
338 
339 
340 //
341 // Append the hex form of a UChar32 to a UnicodeString.
342 // Used in formatting error messages.
343 // Match the formatting of numbers in confusables.txt
344 // Minimum of 4 digits, no leading zeroes for positions 5 and up.
345 //
appendHexUChar(UnicodeString & dest,UChar32 c)346 static void appendHexUChar(UnicodeString &dest, UChar32 c) {
347     UBool   doZeroes = FALSE;
348     for (int bitNum=28; bitNum>=0; bitNum-=4) {
349         if (bitNum <= 12) {
350             doZeroes = TRUE;
351         }
352         int hexDigit = (c>>bitNum) & 0x0f;
353         if (hexDigit != 0 || doZeroes) {
354             doZeroes = TRUE;
355             dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x41));
356         }
357     }
358     dest.append((UChar)0x20);
359 }
360 
// Define LocalStdioFilePointer, a local-pointer wrapper for FILE * that is set up
// with fclose as its close function (see U_DEFINE_LOCAL_OPEN_POINTER in ICU's
// unicode/localpointer.h).  testConfData() below uses it to hold the open data file.
U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
362 
363 //  testConfData - Check each data item from the Unicode confusables.txt file,
364 //                 verify that it transforms correctly in a skeleton.
365 //
testConfData()366 void IntlTestSpoof::testConfData() {
367     char buffer[2000];
368     if (getUnidataPath(buffer) == NULL) {
369         errln("Skipping test spoof/testConfData. Unable to find path to source/data/unidata/.");
370         return;
371     }
372     uprv_strcat(buffer, "confusables.txt");
373 
374     LocalStdioFilePointer f(fopen(buffer, "rb"));
375     if (f.isNull()) {
376         errln("Skipping test spoof/testConfData.  File confusables.txt not accessible.");
377         return;
378     }
379     fseek(f.getAlias(), 0, SEEK_END);
380     int32_t  fileSize = ftell(f.getAlias());
381     LocalArray<char> fileBuf(new char[fileSize]);
382     fseek(f.getAlias(), 0, SEEK_SET);
383     int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias());
384     TEST_ASSERT_EQ(amt_read, fileSize);
385     TEST_ASSERT(fileSize>0);
386     if (amt_read != fileSize || fileSize <=0) {
387         return;
388     }
389     UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize));
390 
391     UErrorCode status = U_ZERO_ERROR;
392     LocalUSpoofCheckerPointer sc(uspoof_open(&status));
393     TEST_ASSERT_SUCCESS(status);
394 
395     // Parse lines from the confusables.txt file.  Example Line:
396     // FF44 ;	0064 ;	SL	# ( d -> d ) FULLWIDTH ....
397     // Three fields.  The hex fields can contain more than one character,
398     //                and each character may be more than 4 digits (for supplemntals)
399     // This regular expression matches lines and splits the fields into capture groups.
400     RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status);
401     TEST_ASSERT_SUCCESS(status);
402     while (parseLine.find()) {
403         UnicodeString from = parseHex(parseLine.group(1, status));
404         if (!Normalizer::isNormalized(from, UNORM_NFD, status)) {
405             // The source character was not NFD.
406             // Skip this case; the first step in obtaining a skeleton is to NFD the input,
407             //  so the mapping in this line of confusables.txt will never be applied.
408             continue;
409         }
410 
411         UnicodeString rawExpected = parseHex(parseLine.group(2, status));
412         UnicodeString expected;
413         Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status);
414         TEST_ASSERT_SUCCESS(status);
415 
416         int32_t skeletonType = 0;
417         UnicodeString tableType = parseLine.group(3, status);
418         TEST_ASSERT_SUCCESS(status);
419         if (tableType.indexOf("SL") >= 0) {
420             skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
421         } else if (tableType.indexOf("SA") >= 0) {
422             skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
423         } else if (tableType.indexOf("ML") >= 0) {
424             skeletonType = 0;
425         } else if (tableType.indexOf("MA") >= 0) {
426             skeletonType = USPOOF_ANY_CASE;
427         }
428 
429         UnicodeString actual;
430         uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status);
431         TEST_ASSERT_SUCCESS(status);
432         TEST_ASSERT(actual == expected);
433         if (actual != expected) {
434             errln(parseLine.group(0, status));
435             UnicodeString line = "Actual: ";
436             int i = 0;
437             while (i < actual.length()) {
438                 appendHexUChar(line, actual.char32At(i));
439                 i = actual.moveIndex32(i, 1);
440             }
441             errln(line);
442         }
443         if (U_FAILURE(status)) {
444             break;
445         }
446     }
447 }
448 
449 // testIdentifierInfo. Note that IdentifierInfo is not public ICU API at this time
testIdentifierInfo()450 void IntlTestSpoof::testIdentifierInfo() {
451     UErrorCode status = U_ZERO_ERROR;
452     ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status);
453     ScriptSet bitset2;  bitset2.set(USCRIPT_HANGUL, status);
454     TEST_ASSERT(bitset12.contains(bitset2));
455     TEST_ASSERT(bitset12.contains(bitset12));
456     TEST_ASSERT(!bitset2.contains(bitset12));
457 
458     ScriptSet arabSet;  arabSet.set(USCRIPT_ARABIC, status);
459     ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status);
460     UElement arabEl;  arabEl.pointer = &arabSet;
461     UElement latinEl; latinEl.pointer = &latinSet;
462     TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0);
463     TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0);
464 
465     UnicodeString scriptString;
466     bitset12.displayScripts(scriptString);
467     TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString);
468 
469     status = U_ZERO_ERROR;
470     UHashtable *alternates = uhash_open(uhash_hashScriptSet ,uhash_compareScriptSet, NULL, &status);
471     uhash_puti(alternates, &bitset12, 1, &status);
472     uhash_puti(alternates, &bitset2, 1, &status);
473     UnicodeString alternatesString;
474     IdentifierInfo::displayAlternates(alternatesString, alternates, status);
475     TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang; Hang Latn") == alternatesString);
476     TEST_ASSERT_SUCCESS(status);
477 
478     status = U_ZERO_ERROR;
479     ScriptSet tScriptSet;
480     tScriptSet.parseScripts(scriptString, status);
481     TEST_ASSERT_SUCCESS(status);
482     TEST_ASSERT(bitset12 == tScriptSet);
483     UnicodeString ss;
484     ss.remove();
485     uhash_close(alternates);
486 
487     struct Test {
488         const char         *fTestString;
489         URestrictionLevel   fRestrictionLevel;
490         const char         *fNumerics;
491         const char         *fScripts;
492         const char         *fAlternates;
493         const char         *fCommonAlternates;
494     } tests[] = {
495             {"\\u0061\\u2665",                USPOOF_UNRESTRICTIVE,      "[]", "Latn", "", ""},
496             {"\\u0061\\u3006",                USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"},
497             {"\\u0061\\u30FC\\u3006",         USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hira Kana", "Hira Kana"},
498             {"\\u0061\\u30FC\\u3006\\u30A2",  USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
499             {"\\u30A2\\u0061\\u30FC\\u3006",  USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
500             {"\\u0061\\u0031\\u0661",         USPOOF_UNRESTRICTIVE,      "[\\u0030\\u0660]", "Latn", "Arab Thaa", "Arab Thaa"},
501             {"\\u0061\\u0031\\u0661\\u06F1",  USPOOF_UNRESTRICTIVE,      "[\\u0030\\u0660\\u06F0]", "Latn Arab", "", ""},
502             {"\\u0661\\u30FC\\u3006\\u0061\\u30A2\\u0031\\u0967\\u06F1",  USPOOF_UNRESTRICTIVE,
503                   "[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
504             {"\\u0061\\u30A2\\u30FC\\u3006\\u0031\\u0967\\u0661\\u06F1",  USPOOF_UNRESTRICTIVE,
505                   "[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}
506     };
507 
508     int testNum;
509     for (testNum = 0; testNum < UPRV_LENGTHOF(tests); testNum++) {
510         char testNumStr[40];
511         sprintf(testNumStr, "testNum = %d", testNum);
512         Test &test = tests[testNum];
513         status = U_ZERO_ERROR;
514         UnicodeString testString(test.fTestString);  // Note: may do charset conversion.
515         testString = testString.unescape();
516         IdentifierInfo idInfo(status);
517         TEST_ASSERT_SUCCESS(status);
518         idInfo.setIdentifierProfile(*uspoof_getRecommendedUnicodeSet(&status));
519         idInfo.setIdentifier(testString, status);
520         TEST_ASSERT_MSG(*idInfo.getIdentifier() == testString, testNumStr);
521 
522         URestrictionLevel restrictionLevel = test.fRestrictionLevel;
523         TEST_ASSERT_MSG(restrictionLevel == idInfo.getRestrictionLevel(status), testNumStr);
524 
525         status = U_ZERO_ERROR;
526         UnicodeSet numerics(UnicodeString(test.fNumerics).unescape(), status);
527         TEST_ASSERT_SUCCESS(status);
528         TEST_ASSERT_MSG(numerics == *idInfo.getNumerics(), testNumStr);
529 
530         ScriptSet scripts;
531         scripts.parseScripts(UnicodeString(test.fScripts), status);
532         TEST_ASSERT_MSG(scripts == *idInfo.getScripts(), testNumStr);
533 
534         UnicodeString alternatesStr;
535         IdentifierInfo::displayAlternates(alternatesStr, idInfo.getAlternates(), status);
536         TEST_ASSERT_MSG(UnicodeString(test.fAlternates) == alternatesStr, testNumStr);
537 
538         ScriptSet commonAlternates;
539         commonAlternates.parseScripts(UnicodeString(test.fCommonAlternates), status);
540         TEST_ASSERT_MSG(commonAlternates == *idInfo.getCommonAmongAlternates(), testNumStr);
541     }
542 
543     // Test of getScriptCount()
544     //   Script and or Script Extension for chars used in the tests
545     //     \\u3013  ; Bopo Hang Hani Hira Kana # So       GETA MARK
546     //     \\uA838  ; Deva Gujr Guru Kthi Takr # Sc       NORTH INDIC RUPEE MARK
547     //     \\u0951  ; Deva Latn                # Mn       DEVANAGARI STRESS SIGN UDATTA
548     //
549     //     \\u0370  ; Greek                    # L        GREEK CAPITAL LETTER HETA
550     //     \\u0481  ; Cyrillic                 # L&       CYRILLIC SMALL LETTER KOPPA
551     //     \\u0904  ; Devanagari               # Lo       DEVANAGARI LETTER SHORT A
552     //     \\u3041  ; Hiragana                 # Lo       HIRAGANA LETTER SMALL A
553     //     1234     ; Common                   #          ascii digits
554     //     \\u0300  ; Inherited                # Mn       COMBINING GRAVE ACCENT
555 
556     struct ScriptTest {
557         const char *fTestString;
558         int32_t     fScriptCount;
559     } scriptTests[] = {
560         {"Hello", 1},
561         {"Hello\\u0370", 2},
562         {"1234", 0},
563         {"Hello1234\\u0300", 1},   // Common and Inherited are ignored.
564         {"\\u0030", 0},
565         {"abc\\u0951", 1},
566         {"abc\\u3013", 2},
567         {"\\uA838\\u0951", 1},     // Triggers commonAmongAlternates path.
568         {"\\u3013\\uA838", 2}
569     };
570 
571     status = U_ZERO_ERROR;
572     IdentifierInfo identifierInfo(status);
573     for (testNum=0; testNum<UPRV_LENGTHOF(scriptTests); testNum++) {
574         ScriptTest &test = scriptTests[testNum];
575         char msgBuf[100];
576         sprintf(msgBuf, "testNum = %d ", testNum);
577         UnicodeString testString = UnicodeString(test.fTestString).unescape();
578 
579         status = U_ZERO_ERROR;
580         identifierInfo.setIdentifier(testString, status);
581         int32_t scriptCount = identifierInfo.getScriptCount();
582         TEST_ASSERT_MSG(test.fScriptCount == scriptCount, msgBuf);
583     }
584 }
585 
//
//  testScriptSet - exercise the ScriptSet helper class: set/reset/test of single
//                  scripts, setAll/resetAll, contains/intersects, assignment and
//                  equality, intersect, countMembers, and iteration with nextSetBit.
//                  The steps build on each other, so their order matters.
//
void IntlTestSpoof::testScriptSet() {
    ScriptSet s1;
    ScriptSet s2;
    UErrorCode status = U_ZERO_ERROR;

    // Two default-constructed sets compare equal; setting one script breaks equality.
    TEST_ASSERT(s1 == s2);
    s1.set(USCRIPT_ARABIC,status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT(!(s1 == s2));
    TEST_ASSERT(s1.test(USCRIPT_ARABIC, status));
    TEST_ASSERT(s1.test(USCRIPT_GREEK, status) == FALSE);

    // Resetting the only member makes s1 equal to the untouched s2 again.
    status = U_ZERO_ERROR;
    s1.reset(USCRIPT_ARABIC, status);
    TEST_ASSERT(s1 == s2);

    // setAll / resetAll, probed at USCRIPT_COMMON, USCRIPT_ETHIOPIC and USCRIPT_CODE_LIMIT.
    status = U_ZERO_ERROR;
    s1.setAll();
    TEST_ASSERT(s1.test(USCRIPT_COMMON, status));
    TEST_ASSERT(s1.test(USCRIPT_ETHIOPIC, status));
    TEST_ASSERT(s1.test(USCRIPT_CODE_LIMIT, status));
    s1.resetAll();
    TEST_ASSERT(!s1.test(USCRIPT_COMMON, status));
    TEST_ASSERT(!s1.test(USCRIPT_ETHIOPIC, status));
    TEST_ASSERT(!s1.test(USCRIPT_CODE_LIMIT, status));

    // contains / intersects: s1 = {Takri, Blissymbols}, s2 = everything,
    // then s2 = everything except Takri.
    status = U_ZERO_ERROR;
    s1.set(USCRIPT_TAKRI, status);
    s1.set(USCRIPT_BLISSYMBOLS, status);
    s2.setAll();
    TEST_ASSERT(s2.contains(s1));
    TEST_ASSERT(!s1.contains(s2));
    TEST_ASSERT(s2.intersects(s1));
    TEST_ASSERT(s1.intersects(s2));
    s2.reset(USCRIPT_TAKRI, status);
    TEST_ASSERT(!s2.contains(s1));
    TEST_ASSERT(!s1.contains(s2));
    TEST_ASSERT(s1.intersects(s2));
    TEST_ASSERT(s2.intersects(s1));
    TEST_ASSERT_SUCCESS(status);

    // Assignment, countMembers and intersect, with s1 = {Nko, Common}.
    status = U_ZERO_ERROR;
    s1.resetAll();
    s1.set(USCRIPT_NKO, status);
    s1.set(USCRIPT_COMMON, status);
    s2 = s1;
    TEST_ASSERT(s2 == s1);
    TEST_ASSERT_EQ(2, s2.countMembers());
    s2.intersect(s1);
    TEST_ASSERT(s2 == s1);
    s2.setAll();
    TEST_ASSERT(!(s2 == s1));
    TEST_ASSERT(s2.countMembers() >= USCRIPT_CODE_LIMIT);
    s2.intersect(s1);
    TEST_ASSERT(s2 == s1);

    // Everything except Common, intersected with {Nko, Common}, leaves one member.
    s2.setAll();
    s2.reset(USCRIPT_COMMON, status);
    s2.intersect(s1);
    TEST_ASSERT(s2.countMembers() == 1);

    // nextSetBit iteration over {Afaka, Vai, Inherited}.  The expected order is
    // Inherited, Vai, Afaka, followed by -1 once no members remain.
    s1.resetAll();
    s1.set(USCRIPT_AFAKA, status);
    s1.set(USCRIPT_VAI, status);
    s1.set(USCRIPT_INHERITED, status);
    int32_t n = -1;
    for (int32_t i=0; i<4; i++) {
        n = s1.nextSetBit(n+1);
        switch (i) {
          case 0: TEST_ASSERT_EQ(USCRIPT_INHERITED, n); break;
          case 1: TEST_ASSERT_EQ(USCRIPT_VAI, n); break;
          case 2: TEST_ASSERT_EQ(USCRIPT_AFAKA, n); break;
          case 3: TEST_ASSERT_EQ(-1, (int32_t)n); break;
          default: TEST_ASSERT(FALSE);
        }
    }
    TEST_ASSERT_SUCCESS(status);
}
664 
665 
testRestrictionLevel()666 void IntlTestSpoof::testRestrictionLevel() {
667     struct Test {
668         const char         *fId;
669         URestrictionLevel   fExpectedRestrictionLevel;
670     } tests[] = {
671         {"\\u0061\\u03B3\\u2665", USPOOF_UNRESTRICTIVE},
672         {"a",                     USPOOF_ASCII},
673         {"\\u03B3",               USPOOF_SINGLE_SCRIPT_RESTRICTIVE},
674         {"\\u0061\\u30A2\\u30FC", USPOOF_HIGHLY_RESTRICTIVE},
675         {"\\u0061\\u0904",        USPOOF_MODERATELY_RESTRICTIVE},
676         {"\\u0061\\u03B3",        USPOOF_MINIMALLY_RESTRICTIVE}
677     };
678     char msgBuffer[100];
679 
680     URestrictionLevel restrictionLevels[] = { USPOOF_ASCII, USPOOF_SINGLE_SCRIPT_RESTRICTIVE,
681          USPOOF_HIGHLY_RESTRICTIVE, USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE,
682          USPOOF_UNRESTRICTIVE};
683 
684     UErrorCode status = U_ZERO_ERROR;
685     IdentifierInfo idInfo(status);
686     TEST_ASSERT_SUCCESS(status);
687     idInfo.setIdentifierProfile(*uspoof_getRecommendedUnicodeSet(&status));
688     TEST_ASSERT_SUCCESS(status);
689     for (int32_t testNum=0; testNum < UPRV_LENGTHOF(tests); testNum++) {
690         status = U_ZERO_ERROR;
691         const Test &test = tests[testNum];
692         UnicodeString testString = UnicodeString(test.fId).unescape();
693         URestrictionLevel expectedLevel = test.fExpectedRestrictionLevel;
694         idInfo.setIdentifier(testString, status);
695         sprintf(msgBuffer, "testNum = %d ", testNum);
696         TEST_ASSERT_SUCCESS(status);
697         TEST_ASSERT_MSG(expectedLevel == idInfo.getRestrictionLevel(status), msgBuffer);
698         for (int levelIndex=0; levelIndex<UPRV_LENGTHOF(restrictionLevels); levelIndex++) {
699             status = U_ZERO_ERROR;
700             URestrictionLevel levelSetInSpoofChecker = restrictionLevels[levelIndex];
701             USpoofChecker *sc = uspoof_open(&status);
702             uspoof_setChecks(sc, USPOOF_RESTRICTION_LEVEL, &status);
703             uspoof_setAllowedChars(sc, uspoof_getRecommendedSet(&status), &status);
704             uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker);
705             int32_t actualValue = uspoof_checkUnicodeString(sc, testString, NULL, &status);
706 
707             // we want to fail if the text is (say) MODERATE and the testLevel is ASCII
708             int32_t expectedValue = 0;
709             if (expectedLevel > levelSetInSpoofChecker) {
710                 expectedValue |= USPOOF_RESTRICTION_LEVEL;
711             }
712             if (!uspoof_getRecommendedUnicodeSet(&status)->containsAll(testString)) {
713                 expectedValue |= USPOOF_CHAR_LIMIT;
714             }
715             sprintf(msgBuffer, "testNum = %d, levelIndex = %d, expected = %#x, actual = %#x",
716                     testNum, levelIndex, expectedValue, actualValue);
717             TEST_ASSERT_MSG(expectedValue == actualValue, msgBuffer);
718             TEST_ASSERT_SUCCESS(status);
719 
720             // Run the same check again, with the Spoof Checker configured to return
721             // the actual restriction level.
722             uspoof_setChecks(sc, USPOOF_AUX_INFO | USPOOF_RESTRICTION_LEVEL, &status);
723             uspoof_setAllowedChars(sc, uspoof_getRecommendedSet(&status), &status);
724             uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker);
725             int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status);
726             TEST_ASSERT_SUCCESS(status);
727             if (U_SUCCESS(status)) {
728                 TEST_ASSERT_EQ(expectedLevel, result & USPOOF_RESTRICTION_LEVEL_MASK);
729                 TEST_ASSERT_EQ(expectedValue, result & USPOOF_ALL_CHECKS);
730             }
731             uspoof_close(sc);
732         }
733     }
734 }
735 
736 
testMixedNumbers()737 void IntlTestSpoof::testMixedNumbers() {
738     struct Test {
739         const char *fTestString;
740         const char *fExpectedSet;
741     } tests[] = {
742         {"1",              "[0]"},
743         {"\\u0967",        "[\\u0966]"},
744         {"1\\u0967",       "[0\\u0966]"},
745         {"\\u0661\\u06F1", "[\\u0660\\u06F0]"}
746     };
747     UErrorCode status = U_ZERO_ERROR;
748     IdentifierInfo idInfo(status);
749     for (int32_t testNum=0; testNum < UPRV_LENGTHOF(tests); testNum++) {
750         char msgBuf[100];
751         sprintf(msgBuf, "testNum = %d ", testNum);
752         Test &test = tests[testNum];
753 
754         status = U_ZERO_ERROR;
755         UnicodeString testString = UnicodeString(test.fTestString).unescape();
756         UnicodeSet expectedSet(UnicodeString(test.fExpectedSet).unescape(), status);
757         idInfo.setIdentifier(testString, status);
758         TEST_ASSERT_SUCCESS(status);
759         TEST_ASSERT_MSG(expectedSet == *idInfo.getNumerics(), msgBuf);
760 
761         status = U_ZERO_ERROR;
762         USpoofChecker *sc = uspoof_open(&status);
763         uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this
764         int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status);
765         UBool mixedNumberFailure = ((result & USPOOF_MIXED_NUMBERS) != 0);
766         TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf);
767         uspoof_close(sc);
768     }
769 }
770 
771 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO */
772