1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 1997-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /*******************************************************************************
9 *
10 * File CUCDTST.C
11 *
12 * Modification History:
13 *        Name                     Description
14 *     Madhu Katragadda            Ported for C API, added tests for string functions
15 ********************************************************************************
16 */
17 
18 #include <string.h>
19 #include <math.h>
20 #include <stdlib.h>
21 
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/putil.h"
25 #include "unicode/ustring.h"
26 #include "unicode/uloc.h"
27 #include "unicode/unorm2.h"
28 #include "unicode/utf16.h"
29 #include "unicode/utf_old.h"
30 #include "cintltst.h"
31 #include "putilimp.h"
32 #include "uparse.h"
33 #include "ucase.h"
34 #include "ubidi_props.h"
35 #include "uprops.h"
36 #include "uset_imp.h"
37 #include "usc_impl.h"
38 #include "udatamem.h"
39 #include "cucdapi.h"
40 #include "cmemory.h"
41 
42 /* prototypes --------------------------------------------------------------- */
43 
44 static void TestUpperLower(void);
45 static void TestLetterNumber(void);
46 static void TestMisc(void);
47 static void TestPOSIX(void);
48 static void TestControlPrint(void);
49 static void TestIdentifier(void);
50 static void TestUnicodeData(void);
51 static void TestCodeUnit(void);
52 static void TestCodePoint(void);
53 static void TestCharLength(void);
54 static void TestCharNames(void);
55 static void TestUCharFromNameUnderflow(void);
56 static void TestMirroring(void);
57 static void TestUScriptRunAPI(void);
58 static void TestAdditionalProperties(void);
59 static void TestNumericProperties(void);
60 static void TestPropertyNames(void);
61 static void TestPropertyValues(void);
62 static void TestConsistency(void);
63 static void TestCaseFolding(void);
64 static void TestBinaryCharacterPropertiesAPI(void);
65 static void TestIntCharacterPropertiesAPI(void);
66 
67 /* internal methods used */
68 static int32_t MakeProp(char* str);
69 static int32_t MakeDir(char* str);
70 
71 /* helpers ------------------------------------------------------------------ */
72 
73 static void
parseUCDFile(const char * filename,char * fields[][2],int32_t fieldCount,UParseLineFn * lineFn,void * context,UErrorCode * pErrorCode)74 parseUCDFile(const char *filename,
75              char *fields[][2], int32_t fieldCount,
76              UParseLineFn *lineFn, void *context,
77              UErrorCode *pErrorCode) {
78     char path[256];
79     char backupPath[256];
80 
81     if(U_FAILURE(*pErrorCode)) {
82         return;
83     }
84 
85     /* Look inside ICU_DATA first */
86     strcpy(path, u_getDataDirectory());
87     strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
88     strcat(path, filename);
89 
90     /* As a fallback, try to guess where the source data was located
91      *    at the time ICU was built, and look there.
92      */
93     strcpy(backupPath, ctest_dataSrcDir());
94     strcat(backupPath, U_FILE_SEP_STRING);
95     strcat(backupPath, "unidata" U_FILE_SEP_STRING);
96     strcat(backupPath, filename);
97 
98     u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
99     if(*pErrorCode==U_FILE_ACCESS_ERROR) {
100         *pErrorCode=U_ZERO_ERROR;
101         u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
102     }
103     if(U_FAILURE(*pErrorCode)) {
104         log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
105     }
106 }
107 
108 /* test data ---------------------------------------------------------------- */
109 
/*
 * Two-letter general category abbreviations, concatenated without separators.
 * The pieces are grouped by major class; adjacent literals are joined by the
 * compiler into one string.
 */
static const char tagStrings[] =
    "MnMcMe"        /* marks */
    "NdNlNo"        /* numbers */
    "ZsZlZp"        /* separators */
    "CcCfCsCoCn"    /* other */
    "LuLlLtLmLo"    /* letters */
    "PcPdPsPePo"    /* punctuation */
    "SmScSkSo"      /* symbols */
    "PiPf";         /* initial/final punctuation */
111 static const int32_t tagValues[] =
112     {
113     /* Mn */ U_NON_SPACING_MARK,
114     /* Mc */ U_COMBINING_SPACING_MARK,
115     /* Me */ U_ENCLOSING_MARK,
116     /* Nd */ U_DECIMAL_DIGIT_NUMBER,
117     /* Nl */ U_LETTER_NUMBER,
118     /* No */ U_OTHER_NUMBER,
119     /* Zs */ U_SPACE_SEPARATOR,
120     /* Zl */ U_LINE_SEPARATOR,
121     /* Zp */ U_PARAGRAPH_SEPARATOR,
122     /* Cc */ U_CONTROL_CHAR,
123     /* Cf */ U_FORMAT_CHAR,
124     /* Cs */ U_SURROGATE,
125     /* Co */ U_PRIVATE_USE_CHAR,
126     /* Cn */ U_UNASSIGNED,
127     /* Lu */ U_UPPERCASE_LETTER,
128     /* Ll */ U_LOWERCASE_LETTER,
129     /* Lt */ U_TITLECASE_LETTER,
130     /* Lm */ U_MODIFIER_LETTER,
131     /* Lo */ U_OTHER_LETTER,
132     /* Pc */ U_CONNECTOR_PUNCTUATION,
133     /* Pd */ U_DASH_PUNCTUATION,
134     /* Ps */ U_START_PUNCTUATION,
135     /* Pe */ U_END_PUNCTUATION,
136     /* Po */ U_OTHER_PUNCTUATION,
137     /* Sm */ U_MATH_SYMBOL,
138     /* Sc */ U_CURRENCY_SYMBOL,
139     /* Sk */ U_MODIFIER_SYMBOL,
140     /* So */ U_OTHER_SYMBOL,
141     /* Pi */ U_INITIAL_PUNCTUATION,
142     /* Pf */ U_FINAL_PUNCTUATION
143     };
144 
/*
 * Short names of the bidirectional classes.
 * Each entry is a fixed 5-byte, NUL-terminated slot.
 */
static const char dirStrings[][5] = {
    "L",   "R",   "EN",  "ES",  "ET",
    "AN",  "CS",  "B",   "S",   "WS",
    "ON",  "LRE", "LRO", "AL",  "RLE",
    "RLO", "PDF", "NSM", "BN",
    /* new in Unicode 6.3/ICU 52 */
    "FSI", "LRI", "RLI", "PDI"
};
171 
172 void addUnicodeTest(TestNode** root);
173 
addUnicodeTest(TestNode ** root)174 void addUnicodeTest(TestNode** root)
175 {
176     addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
177     addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
178     addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
179     addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
180     addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
181     addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
182     addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
183     addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
184     addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
185     addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
186     addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
187     addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
188     addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
189     addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
190     addTest(root, &TestUCharFromNameUnderflow, "tsutil/cucdtst/TestUCharFromNameUnderflow");
191     addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
192     addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
193     addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
194     addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
195     addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI");
196     addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
197     addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
198     addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
199     addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
200     addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
201     addTest(root, &TestBinaryCharacterPropertiesAPI,
202             "tsutil/cucdtst/TestBinaryCharacterPropertiesAPI");
203     addTest(root, &TestIntCharacterPropertiesAPI,
204             "tsutil/cucdtst/TestIntCharacterPropertiesAPI");
205 }
206 
207 /*==================================================== */
208 /* test u_toupper() and u_tolower()                    */
209 /*==================================================== */
TestUpperLower()210 static void TestUpperLower()
211 {
212     const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
213     const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
214     U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
215     U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
216     int32_t i;
217 
218     U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
219     U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
220 
221 /*
222 Checks LetterLike Symbols which were previously a source of confusion
223 [Bertrand A. D. 02/04/98]
224 */
225     for (i=0x2100;i<0x2138;i++)
226     {
227         /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
228         if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
229         {
230             if (i != (int)u_tolower(i)) /* itself */
231                 log_err("Failed case conversion with itself: U+%04x\n", i);
232             if (i != (int)u_toupper(i))
233                 log_err("Failed case conversion with itself: U+%04x\n", i);
234         }
235     }
236 
237     for(i=0; i < u_strlen(upper); i++){
238         if(u_tolower(upper[i]) != lower[i]){
239             log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
240         }
241     }
242 
243     log_verbose("testing upper lower\n");
244     for (i = 0; i < 21; i++) {
245 
246         if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
247         {
248             log_err("Failed isLowerCase test at  %c\n", upperTest[i]);
249         }
250         else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
251          {
252             log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
253         }
254         else if (upperTest[i] != u_tolower(lowerTest[i]))
255         {
256             log_err("Failed case conversion from %c  To %c :\n", lowerTest[i], upperTest[i]);
257         }
258         else if (lowerTest[i] != u_toupper(upperTest[i]))
259          {
260             log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
261         }
262         else if (upperTest[i] != u_tolower(upperTest[i]))
263         {
264             log_err("Failed case conversion with itself: %c\n", upperTest[i]);
265         }
266         else if (lowerTest[i] != u_toupper(lowerTest[i]))
267         {
268             log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
269         }
270     }
271     log_verbose("done testing upper lower\n");
272 
273     log_verbose("testing u_istitle\n");
274     {
275         static const UChar expected[] = {
276             0x1F88,
277             0x1F89,
278             0x1F8A,
279             0x1F8B,
280             0x1F8C,
281             0x1F8D,
282             0x1F8E,
283             0x1F8F,
284             0x1F88,
285             0x1F89,
286             0x1F8A,
287             0x1F8B,
288             0x1F8C,
289             0x1F8D,
290             0x1F8E,
291             0x1F8F,
292             0x1F98,
293             0x1F99,
294             0x1F9A,
295             0x1F9B,
296             0x1F9C,
297             0x1F9D,
298             0x1F9E,
299             0x1F9F,
300             0x1F98,
301             0x1F99,
302             0x1F9A,
303             0x1F9B,
304             0x1F9C,
305             0x1F9D,
306             0x1F9E,
307             0x1F9F,
308             0x1FA8,
309             0x1FA9,
310             0x1FAA,
311             0x1FAB,
312             0x1FAC,
313             0x1FAD,
314             0x1FAE,
315             0x1FAF,
316             0x1FA8,
317             0x1FA9,
318             0x1FAA,
319             0x1FAB,
320             0x1FAC,
321             0x1FAD,
322             0x1FAE,
323             0x1FAF,
324             0x1FBC,
325             0x1FBC,
326             0x1FCC,
327             0x1FCC,
328             0x1FFC,
329             0x1FFC,
330         };
331         int32_t num = UPRV_LENGTHOF(expected);
332         for(i=0; i<num; i++){
333             if(!u_istitle(expected[i])){
334                 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
335             }
336         }
337 
338     }
339 }
340 
341 /* compare two sets and verify that their difference or intersection is empty */
342 static UBool
showADiffB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool expect,UBool diffIsError)343 showADiffB(const USet *a, const USet *b,
344            const char *a_name, const char *b_name,
345            UBool expect, UBool diffIsError) {
346     USet *aa;
347     int32_t i, start, end, length;
348     UErrorCode errorCode;
349 
350     /*
351      * expect:
352      * TRUE  -> a-b should be empty, that is, b should contain all of a
353      * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
354      */
355     if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
356         return TRUE;
357     }
358 
359     /* clone a to aa because a is const */
360     aa=uset_open(1, 0);
361     if(aa==NULL) {
362         /* unusual problem - out of memory? */
363         return FALSE;
364     }
365     uset_addAll(aa, a);
366 
367     /* compute the set in question */
368     if(expect) {
369         /* a-b */
370         uset_removeAll(aa, b);
371     } else {
372         /* a&b */
373         uset_retainAll(aa, b);
374     }
375 
376     /* aa is not empty because of the initial tests above; show its contents */
377     errorCode=U_ZERO_ERROR;
378     i=0;
379     for(;;) {
380         length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
381         if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
382             break; /* done */
383         }
384         if(U_FAILURE(errorCode)) {
385             log_err("error comparing %s with %s at difference item %d: %s\n",
386                 a_name, b_name, i, u_errorName(errorCode));
387             break;
388         }
389         if(length!=0) {
390             break; /* done with code points, got a string or -1 */
391         }
392 
393         if(diffIsError) {
394             if(expect) {
395                 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
396             } else {
397                 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
398             }
399         } else {
400             if(expect) {
401                 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
402             } else {
403                 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
404             }
405         }
406 
407         ++i;
408     }
409 
410     uset_close(aa);
411     return FALSE;
412 }
413 
414 static UBool
showAMinusB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)415 showAMinusB(const USet *a, const USet *b,
416             const char *a_name, const char *b_name,
417             UBool diffIsError) {
418     return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
419 }
420 
421 static UBool
showAIntersectB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)422 showAIntersectB(const USet *a, const USet *b,
423                 const char *a_name, const char *b_name,
424                 UBool diffIsError) {
425     return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
426 }
427 
428 static UBool
compareUSets(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)429 compareUSets(const USet *a, const USet *b,
430              const char *a_name, const char *b_name,
431              UBool diffIsError) {
432     /*
433      * Use an arithmetic & not a logical && so that both branches
434      * are always taken and all differences are shown.
435      */
436     return
437         showAMinusB(a, b, a_name, b_name, diffIsError) &
438         showAMinusB(b, a, b_name, a_name, diffIsError);
439 }
440 
441 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
TestLetterNumber()442 static void TestLetterNumber()
443 {
444     UChar i = 0x0000;
445 
446     log_verbose("Testing for isalpha\n");
447     for (i = 0x0041; i < 0x005B; i++) {
448         if (!u_isalpha(i))
449         {
450             log_err("Failed isLetter test at  %.4X\n", i);
451         }
452     }
453     for (i = 0x0660; i < 0x066A; i++) {
454         if (u_isalpha(i))
455         {
456             log_err("Failed isLetter test with numbers at %.4X\n", i);
457         }
458     }
459 
460     log_verbose("Testing for isdigit\n");
461     for (i = 0x0660; i < 0x066A; i++) {
462         if (!u_isdigit(i))
463         {
464             log_verbose("Failed isNumber test at %.4X\n", i);
465         }
466     }
467 
468     log_verbose("Testing for isalnum\n");
469     for (i = 0x0041; i < 0x005B; i++) {
470         if (!u_isalnum(i))
471         {
472             log_err("Failed isAlNum test at  %.4X\n", i);
473         }
474     }
475     for (i = 0x0660; i < 0x066A; i++) {
476         if (!u_isalnum(i))
477         {
478             log_err("Failed isAlNum test at  %.4X\n", i);
479         }
480     }
481 
482     {
483         /*
484          * The following checks work only starting from Unicode 4.0.
485          * Check the version number here.
486          */
487         static UVersionInfo u401={ 4, 0, 1, 0 };
488         UVersionInfo version;
489         u_getUnicodeVersion(version);
490         if(version[0]<4 || 0==memcmp(version, u401, 4)) {
491             return;
492         }
493     }
494 
495     {
496         /*
497          * Sanity check:
498          * Verify that exactly the digit characters have decimal digit values.
499          * This assumption is used in the implementation of u_digit()
500          * (which checks nt=de)
501          * compared with the parallel java.lang.Character.digit()
502          * (which checks Nd).
503          *
504          * This was not true in Unicode 3.2 and earlier.
505          * Unicode 4.0 fixed discrepancies.
506          * Unicode 4.0.1 re-introduced problems in this area due to an
507          * unintentionally incomplete last-minute change.
508          */
509         U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
510         U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
511 
512         USet *digits, *decimalValues;
513         UErrorCode errorCode;
514 
515         U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
516         U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
517         errorCode=U_ZERO_ERROR;
518         digits=uset_openPattern(digitsPattern, 6, &errorCode);
519         decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
520 
521         if(U_SUCCESS(errorCode)) {
522             compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
523         }
524 
525         uset_close(digits);
526         uset_close(decimalValues);
527     }
528 }
529 
/*
 * Applies propFn to each of the sampleCharsLength code points in sampleChars
 * and logs an error for every one whose result differs from expected.
 * propName is used only in the error message.
 */
static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
                                const UChar32 *sampleChars, int32_t sampleCharsLength,
                                UBool expected) {
    const UChar32 *sample = sampleChars;
    int32_t remaining;

    for (remaining = sampleCharsLength; remaining > 0; --remaining, ++sample) {
        UBool actual = propFn(*sample);
        if (actual == expected) {
            continue;
        }
        log_err("error: character property function %s(U+%04x)=%d is wrong\n",
                propName, *sample, actual);
    }
}
542 
543 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
TestMisc()544 static void TestMisc()
545 {
546     static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
547     static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
548     static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
549     static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
550     static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
551     static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
552 /*    static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
553     static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
554     static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
555     static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
556     static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
557 
558     static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
559 
560     uint32_t mask;
561 
562     int32_t i;
563     char icuVersion[U_MAX_VERSION_STRING_LENGTH];
564     UVersionInfo realVersion;
565 
566     memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
567 
568     testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
569     testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
570 
571     testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
572                         sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
573     testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
574                         sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
575 
576     testSampleCharProps(u_isWhitespace, "u_isWhitespace",
577                         sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), TRUE);
578     testSampleCharProps(u_isWhitespace, "u_isWhitespace",
579                         sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), FALSE);
580 
581     testSampleCharProps(u_isdefined, "u_isdefined",
582                         sampleDefined, UPRV_LENGTHOF(sampleDefined), TRUE);
583     testSampleCharProps(u_isdefined, "u_isdefined",
584                         sampleUndefined, UPRV_LENGTHOF(sampleUndefined), FALSE);
585 
586     testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), TRUE);
587     testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), FALSE);
588 
589     testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), TRUE);
590     testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), FALSE);
591 
592     for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
593         if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
594             log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
595                     sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
596         }
597     }
598 
599     /* Tests the ICU version #*/
600     u_getVersion(realVersion);
601     u_versionToString(realVersion, icuVersion);
602     if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
603     {
604         log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
605     }
606 #if defined(ICU_VERSION)
607     /* test only happens where we have configure.in with VERSION - sanity check. */
608     if(strcmp(U_ICU_VERSION, ICU_VERSION))
609     {
610         log_err("ICU version mismatch: Header says %s, build environment says %s.\n",  U_ICU_VERSION, ICU_VERSION);
611     }
612 #endif
613 
614     /* test U_GC_... */
615     if(
616         U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
617         U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
618         U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
619         U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
620         U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
621         U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
622     ) {
623         log_err("error: U_GET_GC_MASK does not work properly\n");
624     }
625 
626     mask=0;
627     mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
628 
629     mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
630     mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
631     mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
632     mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
633     mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
634 
635     mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
636     mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
637     mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
638 
639     mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
640     mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
641     mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
642 
643     mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
644     mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
645     mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
646 
647     mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
648     mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
649     mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
650     mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
651 
652     mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
653     mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
654     mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
655     mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
656     mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
657 
658     mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
659     mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
660     mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
661     mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
662 
663     mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
664     mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
665 
666     if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
667         log_err("error: problems with U_GC_XX_MASK constants\n");
668     }
669 
670     mask=0;
671     mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
672     mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
673     mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
674     mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
675     mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
676     mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
677     mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
678 
679     if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
680         log_err("error: problems with U_GC_Y_MASK constants\n");
681     }
682     {
683         static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
684         for(i=0; i<10; i++){
685             if(digit[i]!=u_forDigit(i,10)){
686                 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
687             }
688         }
689     }
690 
691     /* test u_digit() */
692     {
693         static const struct {
694             UChar32 c;
695             int8_t radix, value;
696         } data[]={
697             /* base 16 */
698             { 0x0031, 16, 1 },
699             { 0x0038, 16, 8 },
700             { 0x0043, 16, 12 },
701             { 0x0066, 16, 15 },
702             { 0x00e4, 16, -1 },
703             { 0x0662, 16, 2 },
704             { 0x06f5, 16, 5 },
705             { 0xff13, 16, 3 },
706             { 0xff41, 16, 10 },
707 
708             /* base 8 */
709             { 0x0031, 8, 1 },
710             { 0x0038, 8, -1 },
711             { 0x0043, 8, -1 },
712             { 0x0066, 8, -1 },
713             { 0x00e4, 8, -1 },
714             { 0x0662, 8, 2 },
715             { 0x06f5, 8, 5 },
716             { 0xff13, 8, 3 },
717             { 0xff41, 8, -1 },
718 
719             /* base 36 */
720             { 0x5a, 36, 35 },
721             { 0x7a, 36, 35 },
722             { 0xff3a, 36, 35 },
723             { 0xff5a, 36, 35 },
724 
725             /* wrong radix values */
726             { 0x0031, 1, -1 },
727             { 0xff3a, 37, -1 }
728         };
729 
730         for(i=0; i<UPRV_LENGTHOF(data); ++i) {
731             if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
732                 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
733                         data[i].c,
734                         data[i].radix,
735                         u_digit(data[i].c, data[i].radix),
736                         data[i].value);
737             }
738         }
739     }
740 }
741 
742 /* test C/POSIX-style functions --------------------------------------------- */
743 
/*
 * Bit flags, one bit per C/POSIX-style character class.
 * The flags are combined with | in the posixResults column of posixData[].
 */
#define ISAL 0x001
#define ISLO 0x002
#define ISUP 0x004

#define ISDI 0x008
#define ISXD 0x010

#define ISAN 0x020

#define ISPU 0x040
#define ISGR 0x080
#define ISPR 0x100

#define ISSP 0x200
#define ISBL 0x400
#define ISCN 0x800
761 
762 /* C/POSIX-style functions, in the same order as the bit flags */
763 typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
764 
765 static const struct {
766     IsPOSIXClass *fn;
767     const char *name;
768 } posixClasses[]={
769     { u_isalpha, "isalpha" },
770     { u_islower, "islower" },
771     { u_isupper, "isupper" },
772     { u_isdigit, "isdigit" },
773     { u_isxdigit, "isxdigit" },
774     { u_isalnum, "isalnum" },
775     { u_ispunct, "ispunct" },
776     { u_isgraph, "isgraph" },
777     { u_isprint, "isprint" },
778     { u_isspace, "isspace" },
779     { u_isblank, "isblank" },
780     { u_iscntrl, "iscntrl" }
781 };
782 
783 static const struct {
784     UChar32 c;
785     uint32_t posixResults;
786 } posixData[]={
787     { 0x0008,                                                        ISCN },    /* backspace */
788     { 0x0009,                                              ISSP|ISBL|ISCN },    /* TAB */
789     { 0x000a,                                              ISSP|     ISCN },    /* LF */
790     { 0x000c,                                              ISSP|     ISCN },    /* FF */
791     { 0x000d,                                              ISSP|     ISCN },    /* CR */
792     { 0x0020,                                         ISPR|ISSP|ISBL      },    /* space */
793     { 0x0021,                               ISPU|ISGR|ISPR                },    /* ! */
794     { 0x0033,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* 3 */
795     { 0x0040,                               ISPU|ISGR|ISPR                },    /* @ */
796     { 0x0041, ISAL|     ISUP|     ISXD|ISAN|     ISGR|ISPR                },    /* A */
797     { 0x007a, ISAL|ISLO|               ISAN|     ISGR|ISPR                },    /* z */
798     { 0x007b,                               ISPU|ISGR|ISPR                },    /* { */
799     { 0x0085,                                              ISSP|     ISCN },    /* NEL */
800     { 0x00a0,                                         ISPR|ISSP|ISBL      },    /* NBSP */
801     { 0x00a4,                                    ISGR|ISPR                },    /* currency sign */
802     { 0x00e4, ISAL|ISLO|               ISAN|     ISGR|ISPR                },    /* a-umlaut */
803     { 0x0300,                                    ISGR|ISPR                },    /* combining grave */
804     { 0x0600,                                                        ISCN },    /* arabic number sign */
805     { 0x0627, ISAL|                    ISAN|     ISGR|ISPR                },    /* alef */
806     { 0x0663,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* arabic 3 */
807     { 0x2002,                                         ISPR|ISSP|ISBL      },    /* en space */
808     { 0x2007,                                         ISPR|ISSP|ISBL      },    /* figure space */
809     { 0x2009,                                         ISPR|ISSP|ISBL      },    /* thin space */
810     { 0x200b,                                                        ISCN },    /* ZWSP */
811   /*{ 0x200b,                                         ISPR|ISSP           },*/    /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
812     { 0x200e,                                                        ISCN },    /* LRM */
813     { 0x2028,                                         ISPR|ISSP|     ISCN },    /* LS */
814     { 0x2029,                                         ISPR|ISSP|     ISCN },    /* PS */
815     { 0x20ac,                                    ISGR|ISPR                },    /* Euro */
816     { 0xff15,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth 5 */
817     { 0xff25, ISAL|     ISUP|     ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth E */
818     { 0xff35, ISAL|     ISUP|          ISAN|     ISGR|ISPR                },    /* fullwidth U */
819     { 0xff45, ISAL|ISLO|          ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth e */
820     { 0xff55, ISAL|ISLO|               ISAN|     ISGR|ISPR                }     /* fullwidth u */
821 };
822 
823 static void
TestPOSIX()824 TestPOSIX() {
825     uint32_t mask;
826     int32_t cl, i;
827     UBool expect;
828 
829     mask=1;
830     for(cl=0; cl<12; ++cl) {
831         for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
832             expect=(UBool)((posixData[i].posixResults&mask)!=0);
833             if(posixClasses[cl].fn(posixData[i].c)!=expect) {
834                 log_err("u_%s(U+%04x)=%s is wrong\n",
835                     posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
836             }
837         }
838         mask<<=1;
839     }
840 }
841 
842 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
TestControlPrint()843 static void TestControlPrint()
844 {
845     const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
846     const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
847     const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
848     const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
849     UChar32 c;
850 
851     testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), TRUE);
852     testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), FALSE);
853 
854     testSampleCharProps(u_isprint, "u_isprint",
855                         samplePrintable, UPRV_LENGTHOF(samplePrintable), TRUE);
856     testSampleCharProps(u_isprint, "u_isprint",
857                         sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), FALSE);
858 
859     /* test all ISO 8 controls */
860     for(c=0; c<=0x9f; ++c) {
861         if(c==0x20) {
862             /* skip ASCII graphic characters and continue with DEL */
863             c=0x7f;
864         }
865         if(!u_iscntrl(c)) {
866             log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
867         }
868         if(!u_isISOControl(c)) {
869             log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
870         }
871         if(u_isprint(c)) {
872             log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
873         }
874     }
875 
876     /* test all Latin-1 graphic characters */
877     for(c=0x20; c<=0xff; ++c) {
878         if(c==0x7f) {
879             c=0xa0;
880         } else if(c==0xad) {
881             /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
882             ++c;
883         }
884         if(!u_isprint(c)) {
885             log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
886         }
887     }
888 }
889 
890 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
TestIdentifier()891 static void TestIdentifier()
892 {
893     const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
894     const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
895     const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
896     const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
897     const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
898     const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
899     const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
900     const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
901     const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
902     const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
903 
904     testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
905                         sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
906     testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
907                         sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), FALSE);
908 
909     testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
910                         sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), TRUE);
911     testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
912                         sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), FALSE);
913 
914     /* IDPart should imply IDStart */
915     testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
916                         sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
917 
918     testSampleCharProps(u_isIDStart, "u_isIDStart",
919                         sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
920     testSampleCharProps(u_isIDStart, "u_isIDStart",
921                         sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), FALSE);
922 
923     testSampleCharProps(u_isIDPart, "u_isIDPart",
924                         sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), TRUE);
925     testSampleCharProps(u_isIDPart, "u_isIDPart",
926                         sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), FALSE);
927 
928     /* IDPart should imply IDStart */
929     testSampleCharProps(u_isIDPart, "u_isIDPart",
930                         sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
931 
932     testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
933                         sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), TRUE);
934     testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
935                         sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), FALSE);
936 }
937 
938 /* for each line of UnicodeData.txt, check some of the properties */
939 typedef struct UnicodeDataContext {
940 #if UCONFIG_NO_NORMALIZATION
941     const void *dummy;
942 #else
943     const UNormalizer2 *nfc;
944     const UNormalizer2 *nfkc;
945 #endif
946 } UnicodeDataContext;
947 
948 /*
949  * ### TODO
950  * This test fails incorrectly if the First or Last code point of a repetitive area
951  * is overridden, which is allowed and is encouraged for the PUAs.
952  * Currently, this means that both area First/Last and override lines are
953  * tested against the properties from the API,
954  * and the area boundary will not match and cause an error.
955  *
956  * This function should detect area boundaries and skip them for the test of individual
957  * code points' properties.
958  * Then it should check that the areas contain all the same properties except where overridden.
959  * For this, it would have had to set a flag for which code points were listed explicitly.
960  */
961 static void U_CALLCONV
unicodeDataLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)962 unicodeDataLineFn(void *context,
963                   char *fields[][2], int32_t fieldCount,
964                   UErrorCode *pErrorCode)
965 {
966     char buffer[100];
967     const char *d;
968     char *end;
969     uint32_t value;
970     UChar32 c;
971     int32_t i;
972     int8_t type;
973     int32_t dt;
974     UChar dm[32], s[32];
975     int32_t dmLength, length;
976 
977 #if !UCONFIG_NO_NORMALIZATION
978     const UNormalizer2 *nfc, *nfkc;
979 #endif
980 
981     /* get the character code, field 0 */
982     c=strtoul(fields[0][0], &end, 16);
983     if(end<=fields[0][0] || end!=fields[0][1]) {
984         log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
985         return;
986     }
987     if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
988         log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
989         return;
990     }
991 
992     /* get general category, field 2 */
993     *fields[2][1]=0;
994     type = (int8_t)tagValues[MakeProp(fields[2][0])];
995     if(u_charType(c)!=type) {
996         log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
997     }
998     if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
999         log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1000     }
1001 
1002     /* get canonical combining class, field 3 */
1003     value=strtoul(fields[3][0], &end, 10);
1004     if(end<=fields[3][0] || end!=fields[3][1]) {
1005         log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1006         return;
1007     }
1008     if(value>255) {
1009         log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1010         return;
1011     }
1012 #if !UCONFIG_NO_NORMALIZATION
1013     if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1014         log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1015     }
1016     nfkc=((UnicodeDataContext *)context)->nfkc;
1017     if(value!=unorm2_getCombiningClass(nfkc, c)) {
1018         log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1019     }
1020 #endif
1021 
1022     /* get BiDi category, field 4 */
1023     *fields[4][1]=0;
1024     i=MakeDir(fields[4][0]);
1025     if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1026         log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1027     }
1028 
1029     /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1030     d=NULL;
1031     if(fields[5][0]==fields[5][1]) {
1032         /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1033         if(c==0xac00 || c==0xd7a3) {
1034             dt=U_DT_CANONICAL;
1035         } else {
1036             dt=U_DT_NONE;
1037         }
1038     } else {
1039         d=fields[5][0];
1040         *fields[5][1]=0;
1041         dt=UCHAR_INVALID_CODE;
1042         if(*d=='<') {
1043             end=strchr(++d, '>');
1044             if(end!=NULL) {
1045                 *end=0;
1046                 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1047                 d=u_skipWhitespace(end+1);
1048             }
1049         } else {
1050             dt=U_DT_CANONICAL;
1051         }
1052     }
1053     if(dt>U_DT_NONE) {
1054         if(c==0xac00) {
1055             dm[0]=0x1100;
1056             dm[1]=0x1161;
1057             dm[2]=0;
1058             dmLength=2;
1059         } else if(c==0xd7a3) {
1060             dm[0]=0xd788;
1061             dm[1]=0x11c2;
1062             dm[2]=0;
1063             dmLength=2;
1064         } else {
1065             dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1066         }
1067     } else {
1068         dmLength=-1;
1069     }
1070     if(dt<0 || U_FAILURE(*pErrorCode)) {
1071         log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1072         return;
1073     }
1074 #if !UCONFIG_NO_NORMALIZATION
1075     i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1076     if(i!=dt) {
1077         log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1078     }
1079     /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1080     length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1081     if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1082         log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1083                 "or the Decomposition_Mapping is different (%s)\n",
1084                 c, length, dmLength, u_errorName(*pErrorCode));
1085         return;
1086     }
1087     /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1088     if(dt!=U_DT_CANONICAL) {
1089         dmLength=-1;
1090     }
1091     nfc=((UnicodeDataContext *)context)->nfc;
1092     length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1093     if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1094         log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1095                 "or the Decomposition_Mapping is different (%s)\n",
1096                 c, length, dmLength, u_errorName(*pErrorCode));
1097         return;
1098     }
1099     /* recompose */
1100     if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1101         UChar32 a, b, composite;
1102         i=0;
1103         U16_NEXT(dm, i, dmLength, a);
1104         U16_NEXT(dm, i, dmLength, b);
1105         /* i==dmLength */
1106         composite=unorm2_composePair(nfc, a, b);
1107         if(composite!=c) {
1108             log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1109                     (long)c, (long)a, (long)b, (long)composite);
1110         }
1111         /*
1112          * Note: NFKC has fewer round-trip mappings than NFC,
1113          * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1114          */
1115     }
1116 #endif
1117 
1118     /* get ISO Comment, field 11 */
1119     *fields[11][1]=0;
1120     i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1121     if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1122         log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1123             c, u_errorName(*pErrorCode),
1124             U_FAILURE(*pErrorCode) ? buffer : "[error]",
1125             fields[11][0]);
1126     }
1127 
1128     /* get uppercase mapping, field 12 */
1129     if(fields[12][0]!=fields[12][1]) {
1130         value=strtoul(fields[12][0], &end, 16);
1131         if(end!=fields[12][1]) {
1132             log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1133             return;
1134         }
1135         if((UChar32)value!=u_toupper(c)) {
1136             log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1137         }
1138     } else {
1139         /* no case mapping: the API must map the code point to itself */
1140         if(c!=u_toupper(c)) {
1141             log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1142         }
1143     }
1144 
1145     /* get lowercase mapping, field 13 */
1146     if(fields[13][0]!=fields[13][1]) {
1147         value=strtoul(fields[13][0], &end, 16);
1148         if(end!=fields[13][1]) {
1149             log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1150             return;
1151         }
1152         if((UChar32)value!=u_tolower(c)) {
1153             log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1154         }
1155     } else {
1156         /* no case mapping: the API must map the code point to itself */
1157         if(c!=u_tolower(c)) {
1158             log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1159         }
1160     }
1161 
1162     /* get titlecase mapping, field 14 */
1163     if(fields[14][0]!=fields[14][1]) {
1164         value=strtoul(fields[14][0], &end, 16);
1165         if(end!=fields[14][1]) {
1166             log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1167             return;
1168         }
1169         if((UChar32)value!=u_totitle(c)) {
1170             log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1171         }
1172     } else {
1173         /* no case mapping: the API must map the code point to itself */
1174         if(c!=u_totitle(c)) {
1175             log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1176         }
1177     }
1178 }
1179 
1180 static UBool U_CALLCONV
enumTypeRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1181 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1182     static const UChar32 test[][2]={
1183         {0x41, U_UPPERCASE_LETTER},
1184         {0x308, U_NON_SPACING_MARK},
1185         {0xfffe, U_GENERAL_OTHER_TYPES},
1186         {0xe0041, U_FORMAT_CHAR},
1187         {0xeffff, U_UNASSIGNED}
1188     };
1189 
1190     int32_t i, count;
1191 
1192     if(0!=strcmp((const char *)context, "a1")) {
1193         log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1194         return FALSE;
1195     }
1196 
1197     count=UPRV_LENGTHOF(test);
1198     for(i=0; i<count; ++i) {
1199         if(start<=test[i][0] && test[i][0]<limit) {
1200             if(type!=(UCharCategory)test[i][1]) {
1201                 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1202                         start, limit, (long)type, test[i][0], test[i][1]);
1203             }
1204             /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1205             return i==(count-1) ? FALSE : TRUE;
1206         }
1207     }
1208 
1209     if(start>test[count-1][0]) {
1210         log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1211                 start, limit, (long)type);
1212         return FALSE;
1213     }
1214 
1215     return TRUE;
1216 }
1217 
1218 static UBool U_CALLCONV
enumDefaultsRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1219 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1220     /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1221     static const int32_t defaultBidi[][2]={ /* { limit, class } */
1222         { 0x0590, U_LEFT_TO_RIGHT },
1223         { 0x0600, U_RIGHT_TO_LEFT },
1224         { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1225         { 0x0860, U_RIGHT_TO_LEFT },
1226         { 0x0870, U_RIGHT_TO_LEFT_ARABIC },  // Unicode 10 changes U+0860..U+086F from R to AL.
1227         { 0x08A0, U_RIGHT_TO_LEFT },
1228         { 0x0900, U_RIGHT_TO_LEFT_ARABIC },  /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1229         { 0x20A0, U_LEFT_TO_RIGHT },
1230         { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR },  /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
1231         { 0xFB1D, U_LEFT_TO_RIGHT },
1232         { 0xFB50, U_RIGHT_TO_LEFT },
1233         { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1234         { 0xFE70, U_LEFT_TO_RIGHT },
1235         { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1236 
1237         { 0x10800, U_LEFT_TO_RIGHT },
1238         { 0x10D00, U_RIGHT_TO_LEFT },  // Unicode 11 changes U+10D00..U+10D3F from R to AL.
1239         { 0x10D40, U_RIGHT_TO_LEFT_ARABIC },
1240         { 0x10F30, U_RIGHT_TO_LEFT },  // Unicode 11 changes U+10F30..U+10F6F from R to AL.
1241         { 0x10F70, U_RIGHT_TO_LEFT_ARABIC },
1242         { 0x11000, U_RIGHT_TO_LEFT },
1243 
1244         { 0x1E800, U_LEFT_TO_RIGHT },  /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1245         { 0x1EC70, U_RIGHT_TO_LEFT },  // Unicode 11 changes U+1EC70..U+1ECBF from R to AL.
1246         { 0x1ECC0, U_RIGHT_TO_LEFT_ARABIC },
1247         { 0x1EE00, U_RIGHT_TO_LEFT },
1248         { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC },  /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1249         { 0x1F000, U_RIGHT_TO_LEFT },
1250         { 0x110000, U_LEFT_TO_RIGHT }
1251     };
1252 
1253     UChar32 c;
1254     int32_t i;
1255     UCharDirection shouldBeDir;
1256 
1257     /*
1258      * LineBreak.txt specifies:
1259      *   #  - Assigned characters that are not listed explicitly are given the value
1260      *   #    "AL".
1261      *   #  - Unassigned characters are given the value "XX".
1262      *
1263      * PUA characters are listed explicitly with "XX".
1264      * Verify that no assigned character has "XX".
1265      */
1266     if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1267         c=start;
1268         while(c<limit) {
1269             if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1270                 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1271             }
1272             ++c;
1273         }
1274     }
1275 
1276     /*
1277      * Verify default Bidi classes.
1278      * See DerivedBidiClass.txt, especially for unassigned code points.
1279      */
1280     if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1281         /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1282         c=start;
1283         for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
1284             if((int32_t)c<defaultBidi[i][0]) {
1285                 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1286                     if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1287                         shouldBeDir=U_BOUNDARY_NEUTRAL;
1288                     } else {
1289                         shouldBeDir=(UCharDirection)defaultBidi[i][1];
1290                     }
1291 
1292                     if( u_charDirection(c)!=shouldBeDir ||
1293                         u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1294                     ) {
1295                         log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1296                             c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1297                     }
1298                     ++c;
1299                 }
1300             }
1301         }
1302     }
1303 
1304     return TRUE;
1305 }
1306 
1307 /* tests for several properties */
TestUnicodeData()1308 static void TestUnicodeData()
1309 {
1310     UVersionInfo expectVersionArray;
1311     UVersionInfo versionArray;
1312     char *fields[15][2];
1313     UErrorCode errorCode;
1314     UChar32 c;
1315     int8_t type;
1316 
1317     UnicodeDataContext context;
1318 
1319     u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1320     u_getUnicodeVersion(versionArray);
1321     if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1322     {
1323         log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1324         versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1325     }
1326 
1327 #if defined(ICU_UNICODE_VERSION)
1328     /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1329     if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1330     {
1331          log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1332     }
1333 #endif
1334 
1335     if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1336         log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1337     }
1338 
1339     errorCode=U_ZERO_ERROR;
1340 #if !UCONFIG_NO_NORMALIZATION
1341     context.nfc=unorm2_getNFCInstance(&errorCode);
1342     context.nfkc=unorm2_getNFKCInstance(&errorCode);
1343     if(U_FAILURE(errorCode)) {
1344         log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1345         return;
1346     }
1347 #endif
1348     parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1349     if(U_FAILURE(errorCode)) {
1350         return; /* if we couldn't parse UnicodeData.txt, we should return */
1351     }
1352 
1353     /* sanity check on repeated properties */
1354     for(c=0xfffe; c<=0x10ffff;) {
1355         type=u_charType(c);
1356         if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1357             log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1358         }
1359         if(type!=U_UNASSIGNED) {
1360             log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1361         }
1362         if((c&0xffff)==0xfffe) {
1363             ++c;
1364         } else {
1365             c+=0xffff;
1366         }
1367     }
1368 
1369     /* test that PUA is not "unassigned" */
1370     for(c=0xe000; c<=0x10fffd;) {
1371         type=u_charType(c);
1372         if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1373             log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1374         }
1375         if(type==U_UNASSIGNED) {
1376             log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1377         } else if(type!=U_PRIVATE_USE_CHAR) {
1378             log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1379         }
1380         if(c==0xf8ff) {
1381             c=0xf0000;
1382         } else if(c==0xffffd) {
1383             c=0x100000;
1384         } else {
1385             ++c;
1386         }
1387     }
1388 
1389     /* test u_enumCharTypes() */
1390     u_enumCharTypes(enumTypeRange, "a1");
1391 
1392     /* check default properties */
1393     u_enumCharTypes(enumDefaultsRange, NULL);
1394 }
1395 
TestCodeUnit()1396 static void TestCodeUnit(){
1397     const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1398 
1399     int32_t i;
1400 
1401     for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
1402         UChar c=codeunit[i];
1403         if(i<4){
1404             if(!(U16_IS_SINGLE(c)) || (U16_IS_LEAD(c)) || (U16_IS_TRAIL(c)) ||
1405                     U16_IS_SURROGATE(c) || U_IS_SURROGATE(c)) {
1406                 log_err("ERROR: U+%04x is a single", c);
1407             }
1408 
1409         }
1410         if(i >= 4 && i< 8){
1411             if(!(U16_IS_LEAD(c)) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c) ||
1412                     !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1413                 log_err("ERROR: U+%04x is a first surrogate", c);
1414             }
1415         }
1416         if(i >= 8 && i< 12){
1417             if(!(U16_IS_TRAIL(c)) || U16_IS_SINGLE(c) || U16_IS_LEAD(c) ||
1418                     !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1419                 log_err("ERROR: U+%04x is a second surrogate", c);
1420             }
1421         }
1422 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1423         if(i<4){
1424             if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1425                 log_err("ERROR: U+%04x is a single", c);
1426             }
1427 
1428         }
1429         if(i >= 4 && i< 8){
1430             if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1431                 log_err("ERROR: U+%04x is a first surrogate", c);
1432             }
1433         }
1434         if(i >= 8 && i< 12){
1435             if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1436                 log_err("ERROR: U+%04x is a second surrogate", c);
1437             }
1438         }
1439 #endif
1440     }
1441 }
1442 
TestCodePoint()1443 static void TestCodePoint(){
1444     const UChar32 codePoint[]={
1445         /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1446         0xd800,
1447         0xdbff,
1448         0xdc00,
1449         0xdfff,
1450         0xdc04,
1451         0xd821,
1452         /*not a surrogate, valid, isUnicodeChar , not Error*/
1453         0x20ac,
1454         0xd7ff,
1455         0xe000,
1456         0xe123,
1457         0x0061,
1458         0xe065,
1459         0x20402,
1460         0x24506,
1461         0x23456,
1462         0x20402,
1463         0x10402,
1464         0x23456,
1465         /*not a surrogate, not valid, isUnicodeChar, isError */
1466         0x0015,
1467         0x009f,
1468         /*not a surrogate, not valid, not isUnicodeChar, isError */
1469         0xffff,
1470         0xfffe,
1471     };
1472     int32_t i;
1473     for(i=0; i<UPRV_LENGTHOF(codePoint); i++) {
1474         UChar32 c=codePoint[i];
1475         if(i<6) {
1476             if(!U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)) {
1477                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1478             }
1479             if(U_IS_UNICODE_CHAR(c)) {
1480                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1481             }
1482         } else if(i >=6 && i<18) {
1483             if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1484                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1485             }
1486             if(!U_IS_UNICODE_CHAR(c)) {
1487                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1488             }
1489         } else if(i >=18 && i<20) {
1490             if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1491                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1492             }
1493             if(!U_IS_UNICODE_CHAR(c)) {
1494                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1495             }
1496         } else if(i >=18 && i<UPRV_LENGTHOF(codePoint)) {
1497             if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1498                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1499             }
1500             if(U_IS_UNICODE_CHAR(c)) {
1501                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1502             }
1503         }
1504 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1505         if(i<6){
1506             if(!UTF_IS_SURROGATE(c)){
1507                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1508             }
1509             if(UTF_IS_VALID(c)){
1510                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1511             }
1512             if(UTF_IS_UNICODE_CHAR(c)){
1513                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1514             }
1515             if(UTF_IS_ERROR(c)){
1516                 log_err("ERROR: isError() failed for U+%04x\n", c);
1517             }
1518         }else if(i >=6 && i<18){
1519             if(UTF_IS_SURROGATE(c)){
1520                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1521             }
1522             if(!UTF_IS_VALID(c)){
1523                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1524             }
1525             if(!UTF_IS_UNICODE_CHAR(c)){
1526                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1527             }
1528             if(UTF_IS_ERROR(c)){
1529                 log_err("ERROR: isError() failed for U+%04x\n", c);
1530             }
1531         }else if(i >=18 && i<20){
1532             if(UTF_IS_SURROGATE(c)){
1533                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1534             }
1535             if(UTF_IS_VALID(c)){
1536                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1537             }
1538             if(!UTF_IS_UNICODE_CHAR(c)){
1539                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1540             }
1541             if(!UTF_IS_ERROR(c)){
1542                 log_err("ERROR: isError() failed for U+%04x\n", c);
1543             }
1544         }
1545         else if(i >=18 && i<UPRV_LENGTHOF(codePoint)){
1546             if(UTF_IS_SURROGATE(c)){
1547                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1548             }
1549             if(UTF_IS_VALID(c)){
1550                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1551             }
1552             if(UTF_IS_UNICODE_CHAR(c)){
1553                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1554             }
1555             if(!UTF_IS_ERROR(c)){
1556                 log_err("ERROR: isError() failed for U+%04x\n", c);
1557             }
1558         }
1559 #endif
1560     }
1561 
1562     if(
1563         !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1564         !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1565         U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1566         U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1567     ) {
1568         log_err("error with U_IS_BMP()\n");
1569     }
1570 
1571     if(
1572         U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1573         U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1574         U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1575         !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1576     ) {
1577         log_err("error with U_IS_SUPPLEMENTARY()\n");
1578     }
1579 }
1580 
TestCharLength()1581 static void TestCharLength()
1582 {
1583     const int32_t codepoint[]={
1584         1, 0x0061,
1585         1, 0xe065,
1586         1, 0x20ac,
1587         2, 0x20402,
1588         2, 0x23456,
1589         2, 0x24506,
1590         2, 0x20402,
1591         2, 0x10402,
1592         1, 0xd7ff,
1593         1, 0xe000
1594     };
1595 
1596     int32_t i;
1597 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1598     UBool multiple;
1599 #endif
1600     for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
1601         UChar32 c=codepoint[i+1];
1602         if(
1603 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1604                 UTF_CHAR_LENGTH(c) != codepoint[i] ||
1605 #endif
1606                 U16_LENGTH(c) != codepoint[i]) {
1607             log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1608         }
1609 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1610         multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1611         if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1612             log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1613         }
1614 #endif
1615     }
1616 }
1617 
1618 /*internal functions ----*/
MakeProp(char * str)1619 static int32_t MakeProp(char* str)
1620 {
1621     int32_t result = 0;
1622     char* matchPosition =0;
1623 
1624     matchPosition = strstr(tagStrings, str);
1625     if (matchPosition == 0)
1626     {
1627         log_err("unrecognized type letter ");
1628         log_err(str);
1629     }
1630     else
1631         result = (int32_t)((matchPosition - tagStrings) / 2);
1632     return result;
1633 }
1634 
MakeDir(char * str)1635 static int32_t MakeDir(char* str)
1636 {
1637     int32_t pos = 0;
1638     for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
1639         if (strcmp(str, dirStrings[pos]) == 0) {
1640             return pos;
1641         }
1642     }
1643     return -1;
1644 }
1645 
1646 /* test u_charName() -------------------------------------------------------- */
1647 
/*
 * Expected character names, used by TestCharNames() and enumCharNamesFn().
 * Entries that list only three strings leave alias as NULL (static
 * aggregate initialization); the users of this table check for that.
 */
static const struct {
    /* code point that the names belong to */
    uint32_t code;
    /*
     * name:    compared with the U_UNICODE_CHAR_NAME result
     * oldName: compared with the U_UNICODE_10_CHAR_NAME result
     * extName: compared with the U_EXTENDED_CHAR_NAME result
     * alias:   compared with the U_CHAR_NAME_ALIAS result; may be NULL
     */
    const char *name, *oldName, *extName, *alias;
} names[]={
    {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A"},
    {0x01a2, "LATIN CAPITAL LETTER OI", "",
             "LATIN CAPITAL LETTER OI",
             "LATIN CAPITAL LETTER GHA"},
    {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "",
             "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" },
    {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
             "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
             "TIBETAN MARK BKA- SHOG GI MGO RGYAN"},
    {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401" },
    {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED" },
    {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA" },
    {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH" },
    /* code points with an empty modern name but a non-empty extended name */
    {0xd800, "", "", "<lead surrogate-D800>" },
    {0xdc00, "", "", "<trail surrogate-DC00>" },
    {0xff08, "FULLWIDTH LEFT PARENTHESIS", "", "FULLWIDTH LEFT PARENTHESIS" },
    {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN" },
    {0xffff, "", "", "<noncharacter-FFFF>" },
    {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
              "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
              "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"},
    {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456" }
};
1675 
1676 static UBool
enumCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1677 enumCharNamesFn(void *context,
1678                 UChar32 code, UCharNameChoice nameChoice,
1679                 const char *name, int32_t length) {
1680     int32_t *pCount=(int32_t *)context;
1681     const char *expected;
1682     int i;
1683 
1684     if(length<=0 || length!=(int32_t)strlen(name)) {
1685         /* should not be called with an empty string or invalid length */
1686         log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1687         return TRUE;
1688     }
1689 
1690     ++*pCount;
1691     for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1692         if(code==(UChar32)names[i].code) {
1693             switch (nameChoice) {
1694                 case U_EXTENDED_CHAR_NAME:
1695                     if(0!=strcmp(name, names[i].extName)) {
1696                         log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1697                     }
1698                     break;
1699                 case U_UNICODE_CHAR_NAME:
1700                     if(0!=strcmp(name, names[i].name)) {
1701                         log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1702                     }
1703                     break;
1704                 case U_UNICODE_10_CHAR_NAME:
1705                     expected=names[i].oldName;
1706                     if(expected[0]==0 || 0!=strcmp(name, expected)) {
1707                         log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1708                     }
1709                     break;
1710                 case U_CHAR_NAME_ALIAS:
1711                     expected=names[i].alias;
1712                     if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1713                         log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1714                     }
1715                     break;
1716                 case U_CHAR_NAME_CHOICE_COUNT:
1717                     break;
1718             }
1719             break;
1720         }
1721     }
1722     return TRUE;
1723 }
1724 
/* State that TestCharNames() hands to enumExtCharNamesFn() through u_enumCharNames(). */
struct enumExtCharNamesContext {
    /* passed on to enumCharNamesFn() as its counter, which increments it once per accepted name */
    uint32_t length;
    /* code point seen by the previous callback; TestCharNames() initializes it to -1 */
    int32_t last;
};
1729 
1730 static UBool
enumExtCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1731 enumExtCharNamesFn(void *context,
1732                 UChar32 code, UCharNameChoice nameChoice,
1733                 const char *name, int32_t length) {
1734     struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1735 
1736     if (ecncp->last != (int32_t) code - 1) {
1737         if (ecncp->last < 0) {
1738             log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1739         } else {
1740             log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1741         }
1742     }
1743     ecncp->last = (int32_t) code;
1744 
1745     if (!*name) {
1746         log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1747     }
1748 
1749     return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1750 }
1751 
1752 /**
1753  * This can be made more efficient by moving it into putil.c and having
1754  * it directly access the ebcdic translation tables.
1755  * TODO: If we get this method in putil.c, then delete it from here.
1756  */
1757 static UChar
u_charToUChar(char c)1758 u_charToUChar(char c) {
1759     UChar uc;
1760     u_charsToUChars(&c, &uc, 1);
1761     return uc;
1762 }
1763 
1764 static void
TestCharNames()1765 TestCharNames() {
1766     static char name[80];
1767     UErrorCode errorCode=U_ZERO_ERROR;
1768     struct enumExtCharNamesContext extContext;
1769     const char *expected;
1770     int32_t length;
1771     UChar32 c;
1772     int32_t i;
1773 
1774     log_verbose("Testing uprv_getMaxCharNameLength()\n");
1775     length=uprv_getMaxCharNameLength();
1776     if(length==0) {
1777         /* no names data available */
1778         return;
1779     }
1780     if(length<83) { /* Unicode 3.2 max char name length */
1781         log_err("uprv_getMaxCharNameLength()=%d is too short");
1782     }
1783     /* ### TODO same tests for max ISO comment length as for max name length */
1784 
1785     log_verbose("Testing u_charName()\n");
1786     for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1787         /* modern Unicode character name */
1788         length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1789         if(U_FAILURE(errorCode)) {
1790             log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1791             return;
1792         }
1793         if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1794             log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1795         }
1796 
1797         /* find the modern name */
1798         if (*names[i].name) {
1799             c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1800             if(U_FAILURE(errorCode)) {
1801                 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1802                 return;
1803             }
1804             if(c!=(UChar32)names[i].code) {
1805                 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1806             }
1807         }
1808 
1809         /* Unicode 1.0 character name */
1810         length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1811         if(U_FAILURE(errorCode)) {
1812             log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1813             return;
1814         }
1815         if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1816             log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1817         }
1818 
1819         /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1820         if(names[i].oldName[0]!=0 /* && length>0 */) {
1821             c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1822             if(U_FAILURE(errorCode)) {
1823                 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1824                 return;
1825             }
1826             if(c!=(UChar32)names[i].code) {
1827                 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1828             }
1829         }
1830 
1831         /* Unicode character name alias */
1832         length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1833         if(U_FAILURE(errorCode)) {
1834             log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1835             return;
1836         }
1837         expected=names[i].alias;
1838         if(expected==NULL) {
1839             expected="";
1840         }
1841         if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1842             log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1843                     names[i].code, name, length, expected);
1844         }
1845 
1846         /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1847         if(expected[0]!=0 /* && length>0 */) {
1848             c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1849             if(U_FAILURE(errorCode)) {
1850                 log_err("u_charFromName(%s - alias) error %s\n",
1851                         expected, u_errorName(errorCode));
1852                 return;
1853             }
1854             if(c!=(UChar32)names[i].code) {
1855                 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1856                         expected, c, names[i].code);
1857             }
1858         }
1859     }
1860 
1861     /* test u_enumCharNames() */
1862     length=0;
1863     errorCode=U_ZERO_ERROR;
1864     u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1865     if(U_FAILURE(errorCode) || length<94140) {
1866         log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1867     }
1868 
1869     extContext.length = 0;
1870     extContext.last = -1;
1871     errorCode=U_ZERO_ERROR;
1872     u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1873     if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1874         log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1875     }
1876 
1877     /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1878     if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1879         log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1880     }
1881 
1882     /* Test getCharNameCharacters */
1883     if(!getTestOption(QUICK_OPTION)) {
1884         enum { BUFSIZE = 256 };
1885         UErrorCode ec = U_ZERO_ERROR;
1886         char buf[BUFSIZE];
1887         int32_t maxLength;
1888         UChar32 cp;
1889         UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1890         int32_t l1, l2;
1891         UBool map[256];
1892         UBool ok;
1893 
1894         USet* set = uset_open(1, 0); /* empty set */
1895         USet* dumb = uset_open(1, 0); /* empty set */
1896 
1897         /*
1898          * uprv_getCharNameCharacters() will likely return more lowercase
1899          * letters than actual character names contain because
1900          * it includes all the characters in lowercased names of
1901          * general categories, for the full possible set of extended names.
1902          */
1903         {
1904             USetAdder sa={
1905                 NULL,
1906                 uset_add,
1907                 uset_addRange,
1908                 uset_addString,
1909                 NULL /* don't need remove() */
1910             };
1911             sa.set=set;
1912             uprv_getCharNameCharacters(&sa);
1913         }
1914 
1915         /* build set the dumb (but sure-fire) way */
1916         for (i=0; i<256; ++i) {
1917             map[i] = FALSE;
1918         }
1919 
1920         maxLength=0;
1921         for (cp=0; cp<0x110000; ++cp) {
1922             int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1923                                      buf, BUFSIZE, &ec);
1924             if (U_FAILURE(ec)) {
1925                 log_err("FAIL: u_charName failed when it shouldn't\n");
1926                 uset_close(set);
1927                 uset_close(dumb);
1928                 return;
1929             }
1930             if(len>maxLength) {
1931                 maxLength=len;
1932             }
1933 
1934             for (i=0; i<len; ++i) {
1935                 if (!map[(uint8_t) buf[i]]) {
1936                     uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1937                     map[(uint8_t) buf[i]] = TRUE;
1938                 }
1939             }
1940 
1941             /* test for leading/trailing whitespace */
1942             if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1943                 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1944             }
1945         }
1946 
1947         if(map[(uint8_t)'\t']) {
1948             log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1949         }
1950 
1951         length=uprv_getMaxCharNameLength();
1952         if(length!=maxLength) {
1953             log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1954                     length, maxLength);
1955         }
1956 
1957         /* compare the sets.  Where is my uset_equals?!! */
1958         ok=TRUE;
1959         for(i=0; i<256; ++i) {
1960             if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1961                 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1962                     /* ignore lowercase a-z that are in set but not in dumb */
1963                     ok=TRUE;
1964                 } else {
1965                     ok=FALSE;
1966                     break;
1967                 }
1968             }
1969         }
1970 
1971         l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1972         l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1973         if (U_FAILURE(ec)) {
1974             log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1975             uset_close(set);
1976             uset_close(dumb);
1977             return;
1978         }
1979 
1980         if (l1 >= BUFSIZE) {
1981             l1 = BUFSIZE-1;
1982             pat[l1] = 0;
1983         }
1984         if (l2 >= BUFSIZE) {
1985             l2 = BUFSIZE-1;
1986             dumbPat[l2] = 0;
1987         }
1988 
1989         if (!ok) {
1990             log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1991                     aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1992         } else if(getTestOption(VERBOSITY_OPTION)) {
1993             log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
1994         }
1995 
1996         uset_close(set);
1997         uset_close(dumb);
1998     }
1999 
2000     /* ### TODO: test error cases and other interesting things */
2001 }
2002 
2003 static void
TestUCharFromNameUnderflow()2004 TestUCharFromNameUnderflow() {
2005     // Ticket #10889: Underflow crash when there is no dash.
2006     UErrorCode errorCode=U_ZERO_ERROR;
2007     UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<NO BREAK SPACE>", &errorCode);
2008     if(U_SUCCESS(errorCode)) {
2009         log_err("u_charFromName(<NO BREAK SPACE>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2010     }
2011 
2012     // Test related edge cases.
2013     errorCode=U_ZERO_ERROR;
2014     c=u_charFromName(U_EXTENDED_CHAR_NAME, "<-00a0>", &errorCode);
2015     if(U_SUCCESS(errorCode)) {
2016         log_err("u_charFromName(<-00a0>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2017     }
2018 
2019     errorCode=U_ZERO_ERROR;
2020     c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control->", &errorCode);
2021     if(U_SUCCESS(errorCode)) {
2022         log_err("u_charFromName(<control->) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2023     }
2024 
2025     errorCode=U_ZERO_ERROR;
2026     c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control-111111>", &errorCode);
2027     if(U_SUCCESS(errorCode)) {
2028         log_err("u_charFromName(<control-111111>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2029     }
2030 }
2031 
2032 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
2033 
2034 static void
TestMirroring()2035 TestMirroring() {
2036     USet *set;
2037     UErrorCode errorCode;
2038 
2039     UChar32 start, end, c2, c3;
2040     int32_t i;
2041 
2042     U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2043 
2044     U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2045 
2046     log_verbose("Testing u_isMirrored()\n");
2047     if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
2048          !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
2049         )
2050     ) {
2051         log_err("u_isMirrored() does not work correctly\n");
2052     }
2053 
2054     log_verbose("Testing u_charMirror()\n");
2055     if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
2056          u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
2057          u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
2058          /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2059          u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
2060          )
2061     ) {
2062         log_err("u_charMirror() does not work correctly\n");
2063     }
2064 
2065     /* verify that Bidi_Mirroring_Glyph roundtrips */
2066     errorCode=U_ZERO_ERROR;
2067     set=uset_openPattern(mirroredPattern, 17, &errorCode);
2068 
2069     if (U_FAILURE(errorCode)) {
2070         log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
2071     } else {
2072         for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2073             do {
2074                 c2=u_charMirror(start);
2075                 c3=u_charMirror(c2);
2076                 if(c3!=start) {
2077                     log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2078                 }
2079                 c3=u_getBidiPairedBracket(start);
2080                 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2081                     if(c3!=start) {
2082                         log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2083                                 (long)start);
2084                     }
2085                 } else {
2086                     if(c3!=c2) {
2087                         log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2088                                 (long)start, (long)c2);
2089                     }
2090                 }
2091             } while(++start<=end);
2092         }
2093     }
2094 
2095     uset_close(set);
2096 }
2097 
2098 
/* One expected script run for the UScriptRun tests. */
struct RunTestData
{
    /* text of the run; TestUScriptRunAPI() passes it through u_unescape() */
    const char *runText;
    /* script that CheckScriptRuns() expects uscript_nextRun() to report for this run */
    UScriptCode runCode;
};

typedef struct RunTestData RunTestData;
2106 
2107 static void
CheckScriptRuns(UScriptRun * scriptRun,int32_t * runStarts,const RunTestData * testData,int32_t nRuns,const char * prefix)2108 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2109                 const char *prefix)
2110 {
2111     int32_t run, runStart, runLimit;
2112     UScriptCode runCode;
2113 
2114     /* iterate over all the runs */
2115     run = 0;
2116     while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2117         if (runStart != runStarts[run]) {
2118             log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2119                 prefix, run, runStarts[run], runStart);
2120         }
2121 
2122         if (runLimit != runStarts[run + 1]) {
2123             log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2124                 prefix, run, runStarts[run + 1], runLimit);
2125         }
2126 
2127         if (runCode != testData[run].runCode) {
2128             log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2129                 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2130         }
2131 
2132         run += 1;
2133 
2134         /* stop when we've seen all the runs we expect to see */
2135         if (run >= nRuns) {
2136             break;
2137         }
2138     }
2139 
2140     /* Complain if we didn't see then number of runs we expected */
2141     if (run != nRuns) {
2142         log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2143     }
2144 }
2145 
2146 static void
TestUScriptRunAPI()2147 TestUScriptRunAPI()
2148 {
2149     static const RunTestData testData1[] = {
2150         {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2151         {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2152         {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2153         {"English (", USCRIPT_LATIN},
2154         {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2155         {") ", USCRIPT_LATIN},
2156         {"\\u6F22\\u5B75", USCRIPT_HAN},
2157         {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2158         {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2159         {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2160     };
2161 
2162     static const RunTestData testData2[] = {
2163        {"((((((((((abc))))))))))", USCRIPT_LATIN}
2164     };
2165 
2166     static const struct {
2167       const RunTestData *testData;
2168       int32_t nRuns;
2169     } testDataEntries[] = {
2170         {testData1, UPRV_LENGTHOF(testData1)},
2171         {testData2, UPRV_LENGTHOF(testData2)}
2172     };
2173 
2174     static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
2175     int32_t testEntry;
2176 
2177     for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2178         UChar testString[1024];
2179         int32_t runStarts[256];
2180         int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2181         const RunTestData *testData = testDataEntries[testEntry].testData;
2182 
2183         int32_t run, stringLimit;
2184         UScriptRun *scriptRun = NULL;
2185         UErrorCode err;
2186 
2187         /*
2188          * Fill in the test string and the runStarts array.
2189          */
2190         stringLimit = 0;
2191         for (run = 0; run < nTestRuns; run += 1) {
2192             runStarts[run] = stringLimit;
2193             stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2194             /*stringLimit -= 1;*/
2195         }
2196 
2197         /* The limit of the last run */
2198         runStarts[nTestRuns] = stringLimit;
2199 
2200         /*
2201          * Make sure that calling uscript_OpenRun with a NULL text pointer
2202          * and a non-zero text length returns the correct error.
2203          */
2204         err = U_ZERO_ERROR;
2205         scriptRun = uscript_openRun(NULL, stringLimit, &err);
2206 
2207         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2208             log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2209         }
2210 
2211         if (scriptRun != NULL) {
2212             log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2213             uscript_closeRun(scriptRun);
2214         }
2215 
2216         /*
2217          * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2218          * and a zero text length returns the correct error.
2219          */
2220         err = U_ZERO_ERROR;
2221         scriptRun = uscript_openRun(testString, 0, &err);
2222 
2223         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2224             log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2225         }
2226 
2227         if (scriptRun != NULL) {
2228             log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2229             uscript_closeRun(scriptRun);
2230         }
2231 
2232         /*
2233          * Make sure that calling uscript_openRun with a NULL text pointer
2234          * and a zero text length doesn't return an error.
2235          */
2236         err = U_ZERO_ERROR;
2237         scriptRun = uscript_openRun(NULL, 0, &err);
2238 
2239         if (U_FAILURE(err)) {
2240             log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2241         }
2242 
2243         /* Make sure that the empty iterator doesn't find any runs */
2244         if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2245             log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2246         }
2247 
2248         /*
2249          * Make sure that calling uscript_setRunText with a NULL text pointer
2250          * and a non-zero text length returns the correct error.
2251          */
2252         err = U_ZERO_ERROR;
2253         uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2254 
2255         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2256             log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2257         }
2258 
2259         /*
2260          * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2261          * and a zero text length returns the correct error.
2262          */
2263         err = U_ZERO_ERROR;
2264         uscript_setRunText(scriptRun, testString, 0, &err);
2265 
2266         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2267             log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2268         }
2269 
2270         /*
2271          * Now call uscript_setRunText on the empty iterator
2272          * and make sure that it works.
2273          */
2274         err = U_ZERO_ERROR;
2275         uscript_setRunText(scriptRun, testString, stringLimit, &err);
2276 
2277         if (U_FAILURE(err)) {
2278             log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2279         } else {
2280             CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2281         }
2282 
2283         uscript_closeRun(scriptRun);
2284 
2285         /*
2286          * Now open an iterator over the testString
2287          * using uscript_openRun and make sure that it works
2288          */
2289         scriptRun = uscript_openRun(testString, stringLimit, &err);
2290 
2291         if (U_FAILURE(err)) {
2292             log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2293         } else {
2294             CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2295         }
2296 
2297         /* Now reset the iterator, and make sure
2298          * that it still works.
2299          */
2300         uscript_resetRun(scriptRun);
2301 
2302         CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2303 
2304         /* Close the iterator */
2305         uscript_closeRun(scriptRun);
2306     }
2307 }
2308 
2309 /* test additional, non-core properties */
2310 static void
TestAdditionalProperties()2311 TestAdditionalProperties() {
2312     /* test data for u_charAge() */
2313     static const struct {
2314         UChar32 c;
2315         UVersionInfo version;
2316     } charAges[]={
2317         {0x41,    { 1, 1, 0, 0 }},
2318         {0xffff,  { 1, 1, 0, 0 }},
2319         {0x20ab,  { 2, 0, 0, 0 }},
2320         {0x2fffe, { 2, 0, 0, 0 }},
2321         {0x20ac,  { 2, 1, 0, 0 }},
2322         {0xfb1d,  { 3, 0, 0, 0 }},
2323         {0x3f4,   { 3, 1, 0, 0 }},
2324         {0x10300, { 3, 1, 0, 0 }},
2325         {0x220,   { 3, 2, 0, 0 }},
2326         {0xff60,  { 3, 2, 0, 0 }}
2327     };
2328 
2329     /* test data for u_hasBinaryProperty() */
2330     static const int32_t
2331     props[][3]={ /* code point, property, value */
2332         { 0x0627, UCHAR_ALPHABETIC, TRUE },
2333         { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2334         { 0x2028, UCHAR_ALPHABETIC, FALSE },
2335 
2336         { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2337         { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2338 
2339         { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2340         { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2341 
2342         { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2343         { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2344 
2345         /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2346         { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2347         { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2348         { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2349         { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2350 
2351         { 0x058a, UCHAR_DASH, TRUE },
2352         { 0x007e, UCHAR_DASH, FALSE },
2353 
2354         { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2355         { 0x3000, UCHAR_DIACRITIC, FALSE },
2356 
2357         { 0x0e46, UCHAR_EXTENDER, TRUE },
2358         { 0x0020, UCHAR_EXTENDER, FALSE },
2359 
2360 #if !UCONFIG_NO_NORMALIZATION
2361         { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2362         { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2363         { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2364 
2365         { 0x110a, UCHAR_NFD_INERT, TRUE },      /* Jamo L */
2366         { 0x0308, UCHAR_NFD_INERT, FALSE },
2367 
2368         { 0x1164, UCHAR_NFKD_INERT, TRUE },     /* Jamo V */
2369         { 0x1d79d, UCHAR_NFKD_INERT, FALSE },   /* math compat version of xi */
2370 
2371         { 0x0021, UCHAR_NFC_INERT, TRUE },      /* ! */
2372         { 0x0061, UCHAR_NFC_INERT, FALSE },     /* a */
2373         { 0x00e4, UCHAR_NFC_INERT, FALSE },     /* a-umlaut */
2374         { 0x0102, UCHAR_NFC_INERT, FALSE },     /* a-breve */
2375         { 0xac1c, UCHAR_NFC_INERT, FALSE },     /* Hangul LV */
2376         { 0xac1d, UCHAR_NFC_INERT, TRUE },      /* Hangul LVT */
2377 
2378         { 0x1d79d, UCHAR_NFKC_INERT, FALSE },   /* math compat version of xi */
2379         { 0x2a6d6, UCHAR_NFKC_INERT, TRUE },    /* Han, last of CJK ext. B */
2380 
2381         { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2382         { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2383         { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2384         { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2385         { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2386         { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2387 #endif
2388 
2389         { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2390         { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2391         { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2392 
2393         { 0x30fb, UCHAR_HYPHEN, TRUE },
2394         { 0xfe58, UCHAR_HYPHEN, FALSE },
2395 
2396         { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2397         { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2398         { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2399 
2400         { 0x2172, UCHAR_ID_START, TRUE },
2401         { 0x007a, UCHAR_ID_START, TRUE },
2402         { 0x0039, UCHAR_ID_START, FALSE },
2403 
2404         { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2405         { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2406         { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2407 
2408         { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2409         { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2410 
2411         { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2412         { 0x0345, UCHAR_LOWERCASE, TRUE },
2413         { 0x0030, UCHAR_LOWERCASE, FALSE },
2414 
2415         { 0x1d7a9, UCHAR_MATH, TRUE },
2416         { 0x2135, UCHAR_MATH, TRUE },
2417         { 0x0062, UCHAR_MATH, FALSE },
2418 
2419         { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2420         { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2421         { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2422 
2423         { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2424         { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2425         { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2426 
2427         { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2428         { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2429 
2430         { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2431         { 0x2162, UCHAR_UPPERCASE, TRUE },
2432         { 0x0345, UCHAR_UPPERCASE, FALSE },
2433 
2434         { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2435         { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2436         { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2437 
2438         { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2439         { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2440         { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2441 
2442         { 0x16ee, UCHAR_XID_START, TRUE },
2443         { 0x23456, UCHAR_XID_START, TRUE },
2444         { 0x1d1aa, UCHAR_XID_START, FALSE },
2445 
2446         /*
2447          * Version break:
2448          * The following properties are only supported starting with the
2449          * Unicode version indicated in the second field.
2450          */
2451         { -1, 0x320, 0 },
2452 
2453         { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2454         { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2455         { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2456 
2457         { 0x0149, UCHAR_DEPRECATED, TRUE },         /* changed in Unicode 5.2 */
2458         { 0x0341, UCHAR_DEPRECATED, FALSE },        /* changed in Unicode 5.2 */
2459         { 0xe0001, UCHAR_DEPRECATED, TRUE },        /* changed from Unicode 5 to 5.1 */
2460         { 0xe0100, UCHAR_DEPRECATED, FALSE },
2461 
2462         { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2463         { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2464         { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2465         { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE },     /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2466 
2467         { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2468         { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2469         { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE },    /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2470         { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2471 
2472         { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2473         { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2474 
2475         { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2476         { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2477 
2478         { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2479         { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2480 
2481         { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2482         { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2483 
2484         { 0x2e9b, UCHAR_RADICAL, TRUE },
2485         { 0x4e00, UCHAR_RADICAL, FALSE },
2486 
2487         { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2488         { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2489 
2490         { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2491         { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2492 
2493         { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2494 
2495         { 0x002e, UCHAR_S_TERM, TRUE },
2496         { 0x0061, UCHAR_S_TERM, FALSE },
2497 
2498         { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2499         { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2500         { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2501         { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2502 
2503         /* enum/integer type properties */
2504 
2505         /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2506         /* test default Bidi classes for unassigned code points */
2507         { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2508         { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2509         { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2510         { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2511         { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2512         { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2513         { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2514         { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2515         { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2516         { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2517         { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2518 
2519         { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2520         { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2521         { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2522         { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2523         { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2524         { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2525         { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2526 
2527         { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2528         { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2529         { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2530         { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2531         { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2532         { 0x0870, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2533         { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2534         { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2535         { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2536         { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2537         { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2538 
2539         /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2540         { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2541 
2542         { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2543         { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2544         { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2545         { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2546         { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2547         { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2548         { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2549         { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2550         { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2551 
2552         { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2553         { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2554         { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2555         { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2556         { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2557         { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2558         { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2559         { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2560         { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2561         { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2562         { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2563         { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2564         { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2565         { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2566         { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2567         { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2568         { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2569 
2570         /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2571         { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2572         { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER },     /* changed in Unicode 5.2 */
2573 
2574         { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2575         { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2576         { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2577         { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2578         { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2579 
2580         { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2581         { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2582         { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2583         { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2584         { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2585         { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2586         { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2587         { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2588 
2589         /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2590         { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2591         { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2592         { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2593         { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2594         { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2595         { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2596         { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2597         { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2598         { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2599         { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2600         { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2601         { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2602         { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2603         { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2604         { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2605 
2606         /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2607 
2608         /* UCHAR_SCRIPT tested in cucdapi.c TestUScriptCodeAPI() */
2609 
2610         { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2611         { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2612         { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2613         { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2614         { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2615         { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2616         { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2617 
2618         { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2619         { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2620         { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2621         { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2622 
2623         { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2624         { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2625         { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2626         { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2627         { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2628         { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2629 
2630         { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2631         { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2632         { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2633         { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2634 
2635         { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2636         { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2637         { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2638         { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2639         { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2640         { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2641         { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2642 
2643         { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2644         { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2645         { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2646         { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2647 
2648         { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2649         { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2650         { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2651         { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2652 
2653         { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2654         { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2655         { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2656         { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2657         { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2658 
2659         { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2660 
2661         { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2662 
2663         { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2664         { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2665         { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2666 
2667         { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2668         { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2669         { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2670         { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2671         { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2672 
2673         { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2674         { 0x2c8e,  UCHAR_BLOCK, UBLOCK_COPTIC },
2675         { 0xfe17,  UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2676 
2677         { 0x1a00,  UCHAR_SCRIPT, USCRIPT_BUGINESE },
2678         { 0x2cea,  UCHAR_SCRIPT, USCRIPT_COPTIC },
2679         { 0xa82b,  UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2680         { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2681 
2682         { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2683         { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2684         { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2685         { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2686         { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2687         { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2688 
2689         { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2690         { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2691         { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2692         { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2693 
2694         { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2695         { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2696         { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2697         { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2698 
2699         { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2700         { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2701         { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2702         { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2703 
2704         { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2705 
2706         /* unassigned code points in new default Bidi R blocks */
2707         { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2708         { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2709 
2710         /* test some script codes >127 */
2711         { 0xa6e6,  UCHAR_SCRIPT, USCRIPT_BAMUM },
2712         { 0xa4d0,  UCHAR_SCRIPT, USCRIPT_LISU },
2713         { 0x10a7f,  UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2714 
2715         { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2716 
2717         /* value changed in Unicode 6.0 */
2718         { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2719 
2720         { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2721 
2722         /* unassigned code points in new/changed default Bidi AL blocks */
2723         { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2724         { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2725 
2726         { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2727 
2728         /* unassigned code points in the currency symbols block now default to ET */
2729         { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2730         { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2731 
2732         /* new property in Unicode 6.3 */
2733         { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2734         { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2735         { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2736         { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2737         { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2738         { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2739 
2740         { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2741 
2742         /* new character range with Joining_Group values */
2743         { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2744         { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2745         { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2746         { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2747         { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2748 
2749         { -1, 0xa00, 0 },  // version break for Unicode 10
2750 
2751         { 0x1F1E5, UCHAR_REGIONAL_INDICATOR, FALSE },
2752         { 0x1F1E7, UCHAR_REGIONAL_INDICATOR, TRUE },
2753         { 0x1F1FF, UCHAR_REGIONAL_INDICATOR, TRUE },
2754         { 0x1F200, UCHAR_REGIONAL_INDICATOR, FALSE },
2755 
2756         { 0x0600, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2757         { 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, FALSE },
2758         { 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2759 
2760         /* undefined UProperty values */
2761         { 0x61, 0x4a7, 0 },
2762         { 0x234bc, 0x15ed, 0 }
2763     };
2764 
2765     UVersionInfo version;
2766     UChar32 c;
2767     int32_t i, result, uVersion;
2768     UProperty which;
2769 
2770     /* what is our Unicode version? */
2771     u_getUnicodeVersion(version);
2772     uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2773 
2774     u_charAge(0x20, version);
2775     if(version[0]==0) {
2776         /* no additional properties available */
2777         log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2778         return;
2779     }
2780 
2781     /* test u_charAge() */
2782     for(i=0; i<UPRV_LENGTHOF(charAges); ++i) {
2783         u_charAge(charAges[i].c, version);
2784         if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2785             log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2786                 charAges[i].c,
2787                 version[0], version[1], version[2], version[3],
2788                 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2789         }
2790     }
2791 
2792     if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2793         u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2794         u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 ||   /* j2478 */
2795         u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2796         u_getIntPropertyMinValue(0x2345)!=0
2797     ) {
2798         log_err("error: u_getIntPropertyMinValue() wrong\n");
2799     }
2800     if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2801         log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2802     }
2803     if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2804         log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2805     }
2806     if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2807         log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2808     }
2809     if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2810         log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2811     }
2812     if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2813         log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2814     }
2815     if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2816         log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2817     }
2818     if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2819         log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2820     }
2821     if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2822         log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2823     }
2824     if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2825         log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2826     }
2827     if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2828         log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2829     }
2830     if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2831         log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2832     }
2833     if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2834         log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2835     }
2836     if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2837         log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2838     }
2839     if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2840         log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2841     }
2842     /*JB#2410*/
2843     if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2844         log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2845     }
2846     if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2847         log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2848     }
2849     if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) !=  (int32_t) (U_JG_COUNT -1)) {
2850         log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2851     }
2852     if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2853         log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2854     }
2855     if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2856         log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2857     }
2858 
2859     /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2860     for(i=0; i<UPRV_LENGTHOF(props); ++i) {
2861         const char *whichName;
2862 
2863         if(props[i][0]<0) {
2864             /* Unicode version break */
2865             if(uVersion<props[i][1]) {
2866                 break; /* do not test properties that are not yet supported */
2867             } else {
2868                 continue; /* skip this row */
2869             }
2870         }
2871 
2872         c=(UChar32)props[i][0];
2873         which=(UProperty)props[i][1];
2874         whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2875 
2876         if(which<UCHAR_INT_START) {
2877             result=u_hasBinaryProperty(c, which);
2878             if(result!=props[i][2]) {
2879                 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2880                         c, whichName, result, i);
2881             }
2882         }
2883 
2884         result=u_getIntPropertyValue(c, which);
2885         if(result!=props[i][2]) {
2886             log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2887                     c, whichName, result, props[i][2], i);
2888         }
2889 
2890         /* test separate functions, too */
2891         switch((UProperty)props[i][1]) {
2892         case UCHAR_ALPHABETIC:
2893             if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2894                 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2895                         props[i][0], result, i);
2896             }
2897             break;
2898         case UCHAR_LOWERCASE:
2899             if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2900                 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2901                         props[i][0], result, i);
2902             }
2903             break;
2904         case UCHAR_UPPERCASE:
2905             if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2906                 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2907                         props[i][0], result, i);
2908             }
2909             break;
2910         case UCHAR_WHITE_SPACE:
2911             if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2912                 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2913                         props[i][0], result, i);
2914             }
2915             break;
2916         default:
2917             break;
2918         }
2919     }
2920 }
2921 
2922 static void
TestNumericProperties(void)2923 TestNumericProperties(void) {
2924     /* see UnicodeData.txt, DerivedNumericValues.txt */
2925     static const struct {
2926         UChar32 c;
2927         int32_t type;
2928         double numValue;
2929     } values[]={
2930         { 0x0F33, U_NT_NUMERIC, -1./2. },
2931         { 0x0C66, U_NT_DECIMAL, 0 },
2932         { 0x96f6, U_NT_NUMERIC, 0 },
2933         { 0xa833, U_NT_NUMERIC, 1./16. },
2934         { 0x2152, U_NT_NUMERIC, 1./10. },
2935         { 0x2151, U_NT_NUMERIC, 1./9. },
2936         { 0x1245f, U_NT_NUMERIC, 1./8. },
2937         { 0x2150, U_NT_NUMERIC, 1./7. },
2938         { 0x2159, U_NT_NUMERIC, 1./6. },
2939         { 0x09f6, U_NT_NUMERIC, 3./16. },
2940         { 0x2155, U_NT_NUMERIC, 1./5. },
2941         { 0x00BD, U_NT_NUMERIC, 1./2. },
2942         { 0x0031, U_NT_DECIMAL, 1. },
2943         { 0x4e00, U_NT_NUMERIC, 1. },
2944         { 0x58f1, U_NT_NUMERIC, 1. },
2945         { 0x10320, U_NT_NUMERIC, 1. },
2946         { 0x0F2B, U_NT_NUMERIC, 3./2. },
2947         { 0x00B2, U_NT_DIGIT, 2. },
2948         { 0x5f10, U_NT_NUMERIC, 2. },
2949         { 0x1813, U_NT_DECIMAL, 3. },
2950         { 0x5f0e, U_NT_NUMERIC, 3. },
2951         { 0x2173, U_NT_NUMERIC, 4. },
2952         { 0x8086, U_NT_NUMERIC, 4. },
2953         { 0x278E, U_NT_DIGIT, 5. },
2954         { 0x1D7F2, U_NT_DECIMAL, 6. },
2955         { 0x247A, U_NT_DIGIT, 7. },
2956         { 0x7396, U_NT_NUMERIC, 9. },
2957         { 0x1372, U_NT_NUMERIC, 10. },
2958         { 0x216B, U_NT_NUMERIC, 12. },
2959         { 0x16EE, U_NT_NUMERIC, 17. },
2960         { 0x249A, U_NT_NUMERIC, 19. },
2961         { 0x303A, U_NT_NUMERIC, 30. },
2962         { 0x5345, U_NT_NUMERIC, 30. },
2963         { 0x32B2, U_NT_NUMERIC, 37. },
2964         { 0x1375, U_NT_NUMERIC, 40. },
2965         { 0x10323, U_NT_NUMERIC, 50. },
2966         { 0x0BF1, U_NT_NUMERIC, 100. },
2967         { 0x964c, U_NT_NUMERIC, 100. },
2968         { 0x217E, U_NT_NUMERIC, 500. },
2969         { 0x2180, U_NT_NUMERIC, 1000. },
2970         { 0x4edf, U_NT_NUMERIC, 1000. },
2971         { 0x2181, U_NT_NUMERIC, 5000. },
2972         { 0x137C, U_NT_NUMERIC, 10000. },
2973         { 0x4e07, U_NT_NUMERIC, 10000. },
2974         { 0x12432, U_NT_NUMERIC, 216000. },
2975         { 0x12433, U_NT_NUMERIC, 432000. },
2976         { 0x4ebf, U_NT_NUMERIC, 100000000. },
2977         { 0x5146, U_NT_NUMERIC, 1000000000000. },
2978         { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
2979         { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
2980         { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
2981         { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
2982         { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
2983         { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
2984         { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
2985         { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
2986     };
2987 
2988     double nv;
2989     UChar32 c;
2990     int32_t i, type;
2991 
2992     for(i=0; i<UPRV_LENGTHOF(values); ++i) {
2993         c=values[i].c;
2994         type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
2995         nv=u_getNumericValue(c);
2996 
2997         if(type!=values[i].type) {
2998             log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
2999         }
3000         if(0.000001 <= fabs(nv - values[i].numValue)) {
3001             log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
3002         }
3003     }
3004 }
3005 
3006 /**
3007  * Test the property names and property value names API.
3008  */
3009 static void
TestPropertyNames(void)3010 TestPropertyNames(void) {
3011     int32_t p, v, choice=0, rev;
3012     UBool atLeastSomething = FALSE;
3013 
3014     for (p=0; ; ++p) {
3015         UProperty propEnum = (UProperty)p;
3016         UBool sawProp = FALSE;
3017         if(p > 10 && !atLeastSomething) {
3018           log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
3019           return;
3020         }
3021 
3022         for (choice=0; ; ++choice) {
3023             const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
3024             if (name) {
3025                 if (!sawProp)
3026                     log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
3027                 log_verbose("%d=\"%s\"", choice, name);
3028                 sawProp = TRUE;
3029                 atLeastSomething = TRUE;
3030 
3031                 /* test reverse mapping */
3032                 rev = u_getPropertyEnum(name);
3033                 if (rev != p) {
3034                     log_err("Property round-trip failure: %d -> %s -> %d\n",
3035                             p, name, rev);
3036                 }
3037             }
3038             if (!name && choice>0) break;
3039         }
3040         if (sawProp) {
3041             /* looks like a valid property; check the values */
3042             const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3043             int32_t max = 0;
3044             if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
3045                 max = 255;
3046             } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
3047                 /* it's far too slow to iterate all the way up to
3048                    the real max, U_GC_P_MASK */
3049                 max = U_GC_NL_MASK;
3050             } else if (p == UCHAR_BLOCK) {
3051                 /* UBlockCodes, unlike other values, start at 1 */
3052                 max = 1;
3053             }
3054             log_verbose("\n");
3055             for (v=-1; ; ++v) {
3056                 UBool sawValue = FALSE;
3057                 for (choice=0; ; ++choice) {
3058                     const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
3059                     if (vname) {
3060                         if (!sawValue) log_verbose(" %s, value %d:", pname, v);
3061                         log_verbose("%d=\"%s\"", choice, vname);
3062                         sawValue = TRUE;
3063 
3064                         /* test reverse mapping */
3065                         rev = u_getPropertyValueEnum(propEnum, vname);
3066                         if (rev != v) {
3067                             log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
3068                                     pname, v, vname, rev);
3069                         }
3070                     }
3071                     if (!vname && choice>0) break;
3072                 }
3073                 if (sawValue) {
3074                     log_verbose("\n");
3075                 }
3076                 if (!sawValue && v>=max) break;
3077             }
3078         }
3079         if (!sawProp) {
3080             if (p>=UCHAR_STRING_LIMIT) {
3081                 break;
3082             } else if (p>=UCHAR_DOUBLE_LIMIT) {
3083                 p = UCHAR_STRING_START - 1;
3084             } else if (p>=UCHAR_MASK_LIMIT) {
3085                 p = UCHAR_DOUBLE_START - 1;
3086             } else if (p>=UCHAR_INT_LIMIT) {
3087                 p = UCHAR_MASK_START - 1;
3088             } else if (p>=UCHAR_BINARY_LIMIT) {
3089                 p = UCHAR_INT_START - 1;
3090             }
3091         }
3092     }
3093 }
3094 
3095 /**
3096  * Test the property values API.  See JB#2410.
3097  */
3098 static void
TestPropertyValues(void)3099 TestPropertyValues(void) {
3100     int32_t i, p, min, max;
3101     UErrorCode ec;
3102 
3103     /* Min should be 0 for everything. */
3104     /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3105     for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
3106         UProperty propEnum = (UProperty)p;
3107         min = u_getIntPropertyMinValue(propEnum);
3108         if (min != 0) {
3109             if (p == UCHAR_BLOCK) {
3110                 /* This is okay...for now.  See JB#2487.
3111                    TODO Update this for JB#2487. */
3112             } else {
3113                 const char* name;
3114                 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3115                 if (name == NULL)
3116                     name = "<ERROR>";
3117                 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3118                         name, min);
3119             }
3120         }
3121     }
3122 
3123     if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3124         u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3125         log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3126     }
3127 
3128     /* Max should be -1 for invalid properties. */
3129     max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
3130     if (max != -1) {
3131         log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3132                 max);
3133     }
3134 
3135     /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
3136     for (i=0; i<2; ++i) {
3137         int32_t script;
3138         const char* desc;
3139         ec = U_ZERO_ERROR;
3140         switch (i) {
3141         case 0:
3142             script = uscript_getScript(-1, &ec);
3143             desc = "uscript_getScript(-1)";
3144             break;
3145         case 1:
3146             script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3147             desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3148             break;
3149         default:
3150             log_err("Internal test error. Too many scripts\n");
3151             return;
3152         }
3153         /* We don't explicitly test ec.  It should be U_FAILURE but it
3154            isn't documented as such. */
3155         if (script != (int32_t)USCRIPT_INVALID_CODE) {
3156             log_err("FAIL: %s = %d, exp. 0\n",
3157                     desc, script);
3158         }
3159     }
3160 }
3161 
3162 /* various tests for consistency of UCD data and API behavior */
3163 static void
TestConsistency()3164 TestConsistency() {
3165     char buffer[300];
3166     USet *set1, *set2, *set3, *set4;
3167     UErrorCode errorCode;
3168 
3169     UChar32 start, end;
3170     int32_t i, length;
3171 
3172     U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3173     U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3174     U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3175     U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3176     U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3177 
3178     U_STRING_DECL(mathBlocksPattern,
3179         "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3180         214);
3181     U_STRING_DECL(mathPattern, "[:Math:]", 8);
3182     U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3183     U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3184     U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3185 
3186     U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3187     U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3188     U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3189     U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3190     U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3191 
3192     U_STRING_INIT(mathBlocksPattern,
3193         "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3194         214);
3195     U_STRING_INIT(mathPattern, "[:Math:]", 8);
3196     U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3197     U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3198     U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3199 
3200     /*
3201      * It used to be that UCD.html and its precursors said
3202      * "Those dashes used to mark connections between pieces of words,
3203      *  plus the Katakana middle dot."
3204      *
3205      * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3206      * but not from Hyphen.
3207      * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3208      * Therefore, do not show errors when testing the Hyphen property.
3209      */
3210     log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3211                 "known to the UTC and not considered errors.\n");
3212 
3213     errorCode=U_ZERO_ERROR;
3214     set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3215     set2=uset_openPattern(dashPattern, 8, &errorCode);
3216     if(U_SUCCESS(errorCode)) {
3217         /* remove the Katakana middle dot(s) from set1 */
3218         uset_remove(set1, 0x30fb);
3219         uset_remove(set1, 0xff65); /* halfwidth variant */
3220         showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3221     } else {
3222         log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3223     }
3224 
3225     /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3226     set3=uset_openPattern(formatPattern, 6, &errorCode);
3227     set4=uset_openPattern(alphaPattern, 14, &errorCode);
3228     if(U_SUCCESS(errorCode)) {
3229         showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3230         showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3231         showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3232     } else {
3233         log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3234     }
3235 
3236     uset_close(set1);
3237     uset_close(set2);
3238     uset_close(set3);
3239     uset_close(set4);
3240 
3241     /*
3242      * Check that each lowercase character has "small" in its name
3243      * and not "capital".
3244      * There are some such characters, some of which seem odd.
3245      * Use the verbose flag to see these notices.
3246      */
3247     errorCode=U_ZERO_ERROR;
3248     set1=uset_openPattern(lowerPattern, 13, &errorCode);
3249     if(U_SUCCESS(errorCode)) {
3250         for(i=0;; ++i) {
3251             length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3252             if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3253                 break; /* done */
3254             }
3255             if(U_FAILURE(errorCode)) {
3256                 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3257                         i, u_errorName(errorCode));
3258                 break;
3259             }
3260             if(length!=0) {
3261                 break; /* done with code points, got a string or -1 */
3262             }
3263 
3264             while(start<=end) {
3265                 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3266                 if(U_FAILURE(errorCode)) {
3267                     log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3268                     errorCode=U_ZERO_ERROR;
3269                 }
3270                 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3271                     strstr(buffer, "SMALL CAPITAL")==NULL
3272                 ) {
3273                     log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3274                 }
3275                 ++start;
3276             }
3277         }
3278     } else {
3279         log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3280     }
3281     uset_close(set1);
3282 
3283     /* verify that all assigned characters in Math blocks are exactly Math characters */
3284     errorCode=U_ZERO_ERROR;
3285     set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3286     set2=uset_openPattern(mathPattern, 8, &errorCode);
3287     set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3288     if(U_SUCCESS(errorCode)) {
3289         uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3290         uset_complement(set3);      /* assigned characters */
3291         uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3292         compareUSets(set1, set2,
3293                      "[assigned Math block chars]", "[math blocks]&[:Math:]",
3294                      TRUE);
3295     } else {
3296         log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3297     }
3298     uset_close(set1);
3299     uset_close(set2);
3300     uset_close(set3);
3301 
3302     /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3303     errorCode=U_ZERO_ERROR;
3304     set1=uset_openPattern(unknownPattern, 14, &errorCode);
3305     set2=uset_openPattern(reservedPattern, 20, &errorCode);
3306     if(U_SUCCESS(errorCode)) {
3307         compareUSets(set1, set2,
3308                      "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3309                      TRUE);
3310     } else {
3311         log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3312     }
3313     uset_close(set1);
3314     uset_close(set2);
3315 }
3316 
3317 /* test case folding, compare return values with CaseFolding.txt ------------ */
3318 
/*
 * Bit flags recording which case foldings of a character have been tested already.
 * CF_ALL is the union of the three individual flags.
 */
enum {
    CF_SIMPLE=1<<0,
    CF_FULL=1<<1,
    CF_TURKIC=1<<2,
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC
};
3326 
3327 static void
testFold(UChar32 c,int which,UChar32 simple,UChar32 turkic,const UChar * full,int32_t fullLength,const UChar * turkicFull,int32_t turkicFullLength)3328 testFold(UChar32 c, int which,
3329          UChar32 simple, UChar32 turkic,
3330          const UChar *full, int32_t fullLength,
3331          const UChar *turkicFull, int32_t turkicFullLength) {
3332     UChar s[2], t[32];
3333     UChar32 c2;
3334     int32_t length, length2;
3335 
3336     UErrorCode errorCode=U_ZERO_ERROR;
3337 
3338     length=0;
3339     U16_APPEND_UNSAFE(s, length, c);
3340 
3341     if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3342         log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3343     }
3344     if((which&CF_FULL)!=0) {
3345         length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
3346         if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3347             log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3348         }
3349     }
3350     if((which&CF_TURKIC)!=0) {
3351         if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3352             log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3353         }
3354 
3355         length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3356         if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3357             log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3358         }
3359     }
3360 }
3361 
3362 /* test that c case-folds to itself */
3363 static void
testFoldToSelf(UChar32 c,int which)3364 testFoldToSelf(UChar32 c, int which) {
3365     UChar s[2];
3366     int32_t length;
3367 
3368     length=0;
3369     U16_APPEND_UNSAFE(s, length, c);
3370     testFold(c, which, c, c, s, length, s, length);
3371 }
3372 
3373 struct CaseFoldingData {
3374     USet *notSeen;
3375     UChar32 prev, prevSimple;
3376     UChar prevFull[32];
3377     int32_t prevFullLength;
3378     int which;
3379 };
3380 typedef struct CaseFoldingData CaseFoldingData;
3381 
3382 static void U_CALLCONV
caseFoldingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)3383 caseFoldingLineFn(void *context,
3384                   char *fields[][2], int32_t fieldCount,
3385                   UErrorCode *pErrorCode) {
3386     CaseFoldingData *pData=(CaseFoldingData *)context;
3387     char *end;
3388     UChar full[32];
3389     UChar32 c, prev, simple;
3390     int32_t count;
3391     int which;
3392     char status;
3393 
3394     /* get code point */
3395     const char *s=u_skipWhitespace(fields[0][0]);
3396     if(0==strncmp(s, "0000..10FFFF", 12)) {
3397         /*
3398          * Ignore the line
3399          * # @missing: 0000..10FFFF; C; <code point>
3400          * because maps-to-self is already our default, and this line breaks this parser.
3401          */
3402         return;
3403     }
3404     c=(UChar32)strtoul(s, &end, 16);
3405     end=(char *)u_skipWhitespace(end);
3406     if(end<=fields[0][0] || end!=fields[0][1]) {
3407         log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3408         *pErrorCode=U_PARSE_ERROR;
3409         return;
3410     }
3411 
3412     /* get the status of this mapping */
3413     status=*u_skipWhitespace(fields[1][0]);
3414     if(status!='C' && status!='S' && status!='F' && status!='T') {
3415         log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3416         *pErrorCode=U_PARSE_ERROR;
3417         return;
3418     }
3419 
3420     /* get the mapping */
3421     count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3422     if(U_FAILURE(*pErrorCode)) {
3423         log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3424         return;
3425     }
3426 
3427     /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3428     if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3429         simple=c;
3430     }
3431 
3432     if(c!=(prev=pData->prev)) {
3433         /*
3434          * Test remaining mappings for the previous code point.
3435          * If a turkic folding was not mentioned, then it should fold the same
3436          * as the regular simple case folding.
3437          */
3438         UChar prevString[2];
3439         int32_t length;
3440 
3441         length=0;
3442         U16_APPEND_UNSAFE(prevString, length, prev);
3443         testFold(prev, (~pData->which)&CF_ALL,
3444                  prev, pData->prevSimple,
3445                  prevString, length,
3446                  pData->prevFull, pData->prevFullLength);
3447         pData->prev=pData->prevSimple=c;
3448         length=0;
3449         U16_APPEND_UNSAFE(pData->prevFull, length, c);
3450         pData->prevFullLength=length;
3451         pData->which=0;
3452     }
3453 
3454     /*
3455      * Turn the status into a bit set of case foldings to test.
3456      * Remember non-Turkic case foldings as defaults for Turkic mode.
3457      */
3458     switch(status) {
3459     case 'C':
3460         which=CF_SIMPLE|CF_FULL;
3461         pData->prevSimple=simple;
3462         u_memcpy(pData->prevFull, full, count);
3463         pData->prevFullLength=count;
3464         break;
3465     case 'S':
3466         which=CF_SIMPLE;
3467         pData->prevSimple=simple;
3468         break;
3469     case 'F':
3470         which=CF_FULL;
3471         u_memcpy(pData->prevFull, full, count);
3472         pData->prevFullLength=count;
3473         break;
3474     case 'T':
3475         which=CF_TURKIC;
3476         break;
3477     default:
3478         which=0;
3479         break; /* won't happen because of test above */
3480     }
3481 
3482     testFold(c, which, simple, simple, full, count, full, count);
3483 
3484     /* remember which case foldings of c have been tested */
3485     pData->which|=which;
3486 
3487     /* remove c from the set of ones not mentioned in CaseFolding.txt */
3488     uset_remove(pData->notSeen, c);
3489 }
3490 
3491 static void
TestCaseFolding()3492 TestCaseFolding() {
3493     CaseFoldingData data={ NULL };
3494     char *fields[3][2];
3495     UErrorCode errorCode;
3496 
3497     static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3498 
3499     errorCode=U_ZERO_ERROR;
3500     /* test BMP & plane 1 - nothing interesting above */
3501     data.notSeen=uset_open(0, 0x1ffff);
3502     data.prevFullLength=1; /* length of full case folding of U+0000 */
3503 
3504     parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3505     if(U_SUCCESS(errorCode)) {
3506         int32_t i, start, end;
3507 
3508         /* add a pseudo-last line to finish testing of the actual last one */
3509         fields[0][0]=lastLine;
3510         fields[0][1]=lastLine+6;
3511         fields[1][0]=lastLine+7;
3512         fields[1][1]=lastLine+9;
3513         fields[2][0]=lastLine+10;
3514         fields[2][1]=lastLine+17;
3515         caseFoldingLineFn(&data, fields, 3, &errorCode);
3516 
3517         /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3518         for(i=0;
3519             0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3520                 U_SUCCESS(errorCode);
3521             ++i
3522         ) {
3523             do {
3524                 testFoldToSelf(start, CF_ALL);
3525             } while(++start<=end);
3526         }
3527     }
3528 
3529     uset_close(data.notSeen);
3530 }
3531 
TestBinaryCharacterPropertiesAPI()3532 static void TestBinaryCharacterPropertiesAPI() {
3533     // API test only. See intltest/ucdtest.cpp for functional test.
3534     UErrorCode errorCode = U_ZERO_ERROR;
3535     const USet *set = u_getBinaryPropertySet(-1, &errorCode);
3536     if (U_SUCCESS(errorCode)) {
3537         log_err("u_getBinaryPropertySet(-1) did not fail\n");
3538     }
3539     errorCode = U_ZERO_ERROR;
3540     set = u_getBinaryPropertySet(UCHAR_BINARY_LIMIT, &errorCode);
3541     if (U_SUCCESS(errorCode)) {
3542         log_err("u_getBinaryPropertySet(UCHAR_BINARY_LIMIT) did not fail\n");
3543     }
3544     errorCode = U_ZERO_ERROR;
3545     set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &errorCode);
3546     if (!uset_contains(set, 0x20) || uset_contains(set, 0x61)) {
3547         log_err("u_getBinaryPropertySet(UCHAR_WHITE_SPACE) wrong contents\n");
3548     }
3549 }
3550 
TestIntCharacterPropertiesAPI()3551 static void TestIntCharacterPropertiesAPI() {
3552     // API test only. See intltest/ucdtest.cpp for functional test.
3553     UErrorCode errorCode = U_ZERO_ERROR;
3554     const UCPMap *map = u_getIntPropertyMap(UCHAR_INT_START - 1, &errorCode);
3555     if (U_SUCCESS(errorCode)) {
3556         log_err("u_getIntPropertyMap(UCHAR_INT_START - 1) did not fail\n");
3557     }
3558     errorCode = U_ZERO_ERROR;
3559     map = u_getIntPropertyMap(UCHAR_INT_LIMIT, &errorCode);
3560     if (U_SUCCESS(errorCode)) {
3561         log_err("u_getIntPropertyMap(UCHAR_INT_LIMIT) did not fail\n");
3562     }
3563     errorCode = U_ZERO_ERROR;
3564     map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &errorCode);
3565     if (ucpmap_get(map, 0x20) != U_SPACE_SEPARATOR || ucpmap_get(map, 0x23456) != U_OTHER_LETTER) {
3566         log_err("u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY) wrong contents\n");
3567     }
3568 }
3569