1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (c) 1997-2015, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  ********************************************************************/
6 /*******************************************************************************
7 *
8 * File CUCDTST.C
9 *
10 * Modification History:
11 *        Name                     Description
12 *     Madhu Katragadda            Ported for C API, added tests for string functions
13 ********************************************************************************
14 */
15 
16 #include <string.h>
17 #include <math.h>
18 #include <stdlib.h>
19 
20 #include "unicode/utypes.h"
21 #include "unicode/uchar.h"
22 #include "unicode/putil.h"
23 #include "unicode/ustring.h"
24 #include "unicode/uloc.h"
25 #include "unicode/unorm2.h"
26 
27 #include "cintltst.h"
28 #include "putilimp.h"
29 #include "uparse.h"
30 #include "ucase.h"
31 #include "ubidi_props.h"
32 #include "uprops.h"
33 #include "uset_imp.h"
34 #include "usc_impl.h"
35 #include "udatamem.h" /* for testing ucase_openBinary() */
36 #include "cucdapi.h"
37 #include "cmemory.h"
38 
39 /* prototypes --------------------------------------------------------------- */
40 
41 static void TestUpperLower(void);
42 static void TestLetterNumber(void);
43 static void TestMisc(void);
44 static void TestPOSIX(void);
45 static void TestControlPrint(void);
46 static void TestIdentifier(void);
47 static void TestUnicodeData(void);
48 static void TestCodeUnit(void);
49 static void TestCodePoint(void);
50 static void TestCharLength(void);
51 static void TestCharNames(void);
52 static void TestUCharFromNameUnderflow(void);
53 static void TestMirroring(void);
54 static void TestUScriptRunAPI(void);
55 static void TestAdditionalProperties(void);
56 static void TestNumericProperties(void);
57 static void TestPropertyNames(void);
58 static void TestPropertyValues(void);
59 static void TestConsistency(void);
60 static void TestUCase(void);
61 static void TestUBiDiProps(void);
62 static void TestCaseFolding(void);
63 
64 /* internal methods used */
65 static int32_t MakeProp(char* str);
66 static int32_t MakeDir(char* str);
67 
68 /* helpers ------------------------------------------------------------------ */
69 
70 static void
parseUCDFile(const char * filename,char * fields[][2],int32_t fieldCount,UParseLineFn * lineFn,void * context,UErrorCode * pErrorCode)71 parseUCDFile(const char *filename,
72              char *fields[][2], int32_t fieldCount,
73              UParseLineFn *lineFn, void *context,
74              UErrorCode *pErrorCode) {
75     char path[256];
76     char backupPath[256];
77 
78     if(U_FAILURE(*pErrorCode)) {
79         return;
80     }
81 
82     /* Look inside ICU_DATA first */
83     strcpy(path, u_getDataDirectory());
84     strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
85     strcat(path, filename);
86 
87     /* As a fallback, try to guess where the source data was located
88      *    at the time ICU was built, and look there.
89      */
90     strcpy(backupPath, ctest_dataSrcDir());
91     strcat(backupPath, U_FILE_SEP_STRING);
92     strcat(backupPath, "unidata" U_FILE_SEP_STRING);
93     strcat(backupPath, filename);
94 
95     u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
96     if(*pErrorCode==U_FILE_ACCESS_ERROR) {
97         *pErrorCode=U_ZERO_ERROR;
98         u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
99     }
100     if(U_FAILURE(*pErrorCode)) {
101         log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
102     }
103 }
104 
105 /* test data ---------------------------------------------------------------- */
106 
/*
 * Two-letter general category abbreviations, concatenated without
 * separators. The n-th pair of characters corresponds to tagValues[n]
 * (see the per-entry comments there), so the two tables must be kept
 * in the same order.
 */
static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
/*
 * General category constants, parallel to the two-letter tags in tagStrings.
 * Each entry is annotated with the abbreviation it belongs to.
 */
static const int32_t tagValues[] =
    {
    /* Mn */ U_NON_SPACING_MARK,
    /* Mc */ U_COMBINING_SPACING_MARK,
    /* Me */ U_ENCLOSING_MARK,
    /* Nd */ U_DECIMAL_DIGIT_NUMBER,
    /* Nl */ U_LETTER_NUMBER,
    /* No */ U_OTHER_NUMBER,
    /* Zs */ U_SPACE_SEPARATOR,
    /* Zl */ U_LINE_SEPARATOR,
    /* Zp */ U_PARAGRAPH_SEPARATOR,
    /* Cc */ U_CONTROL_CHAR,
    /* Cf */ U_FORMAT_CHAR,
    /* Cs */ U_SURROGATE,
    /* Co */ U_PRIVATE_USE_CHAR,
    /* Cn */ U_UNASSIGNED,
    /* Lu */ U_UPPERCASE_LETTER,
    /* Ll */ U_LOWERCASE_LETTER,
    /* Lt */ U_TITLECASE_LETTER,
    /* Lm */ U_MODIFIER_LETTER,
    /* Lo */ U_OTHER_LETTER,
    /* Pc */ U_CONNECTOR_PUNCTUATION,
    /* Pd */ U_DASH_PUNCTUATION,
    /* Ps */ U_START_PUNCTUATION,
    /* Pe */ U_END_PUNCTUATION,
    /* Po */ U_OTHER_PUNCTUATION,
    /* Sm */ U_MATH_SYMBOL,
    /* Sc */ U_CURRENCY_SYMBOL,
    /* Sk */ U_MODIFIER_SYMBOL,
    /* So */ U_OTHER_SYMBOL,
    /* Pi */ U_INITIAL_PUNCTUATION,
    /* Pf */ U_FINAL_PUNCTUATION
    };
141 
/*
 * Short names of bidirectional categories. Each row is a fixed 5-byte
 * slot, which is enough for the longest names here (3 characters) plus
 * the terminating NUL.
 * NOTE(review): presumably the row index is the numeric direction value
 * that MakeDir() maps a name to; the lookup code is outside this section,
 * so do not reorder the entries without checking it.
 */
static const char dirStrings[][5] = {
    "L",
    "R",
    "EN",
    "ES",
    "ET",
    "AN",
    "CS",
    "B",
    "S",
    "WS",
    "ON",
    "LRE",
    "LRO",
    "AL",
    "RLE",
    "RLO",
    "PDF",
    "NSM",
    "BN",
    /* new in Unicode 6.3/ICU 52 */
    "FSI",
    "LRI",
    "RLI",
    "PDI"
};
168 
169 void addUnicodeTest(TestNode** root);
170 
addUnicodeTest(TestNode ** root)171 void addUnicodeTest(TestNode** root)
172 {
173     addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
174     addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
175     addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
176     addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
177     addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
178     addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
179     addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
180     addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
181     addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
182     addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
183     addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
184     addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
185     addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
186     addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
187     addTest(root, &TestUCharFromNameUnderflow, "tsutil/cucdtst/TestUCharFromNameUnderflow");
188     addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
189     addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
190     addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
191     addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
192     addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI");
193     addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
194     addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
195     addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
196     addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
197     addTest(root, &TestUCase, "tsutil/cucdtst/TestUCase");
198     addTest(root, &TestUBiDiProps, "tsutil/cucdtst/TestUBiDiProps");
199     addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
200 }
201 
202 /*==================================================== */
203 /* test u_toupper() and u_tolower()                    */
204 /*==================================================== */
TestUpperLower()205 static void TestUpperLower()
206 {
207     const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
208     const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
209     U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
210     U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
211     int32_t i;
212 
213     U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
214     U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
215 
216 /*
217 Checks LetterLike Symbols which were previously a source of confusion
218 [Bertrand A. D. 02/04/98]
219 */
220     for (i=0x2100;i<0x2138;i++)
221     {
222         /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
223         if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
224         {
225             if (i != (int)u_tolower(i)) /* itself */
226                 log_err("Failed case conversion with itself: U+%04x\n", i);
227             if (i != (int)u_toupper(i))
228                 log_err("Failed case conversion with itself: U+%04x\n", i);
229         }
230     }
231 
232     for(i=0; i < u_strlen(upper); i++){
233         if(u_tolower(upper[i]) != lower[i]){
234             log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
235         }
236     }
237 
238     log_verbose("testing upper lower\n");
239     for (i = 0; i < 21; i++) {
240 
241         if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
242         {
243             log_err("Failed isLowerCase test at  %c\n", upperTest[i]);
244         }
245         else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
246          {
247             log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
248         }
249         else if (upperTest[i] != u_tolower(lowerTest[i]))
250         {
251             log_err("Failed case conversion from %c  To %c :\n", lowerTest[i], upperTest[i]);
252         }
253         else if (lowerTest[i] != u_toupper(upperTest[i]))
254          {
255             log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
256         }
257         else if (upperTest[i] != u_tolower(upperTest[i]))
258         {
259             log_err("Failed case conversion with itself: %c\n", upperTest[i]);
260         }
261         else if (lowerTest[i] != u_toupper(lowerTest[i]))
262         {
263             log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
264         }
265     }
266     log_verbose("done testing upper lower\n");
267 
268     log_verbose("testing u_istitle\n");
269     {
270         static const UChar expected[] = {
271             0x1F88,
272             0x1F89,
273             0x1F8A,
274             0x1F8B,
275             0x1F8C,
276             0x1F8D,
277             0x1F8E,
278             0x1F8F,
279             0x1F88,
280             0x1F89,
281             0x1F8A,
282             0x1F8B,
283             0x1F8C,
284             0x1F8D,
285             0x1F8E,
286             0x1F8F,
287             0x1F98,
288             0x1F99,
289             0x1F9A,
290             0x1F9B,
291             0x1F9C,
292             0x1F9D,
293             0x1F9E,
294             0x1F9F,
295             0x1F98,
296             0x1F99,
297             0x1F9A,
298             0x1F9B,
299             0x1F9C,
300             0x1F9D,
301             0x1F9E,
302             0x1F9F,
303             0x1FA8,
304             0x1FA9,
305             0x1FAA,
306             0x1FAB,
307             0x1FAC,
308             0x1FAD,
309             0x1FAE,
310             0x1FAF,
311             0x1FA8,
312             0x1FA9,
313             0x1FAA,
314             0x1FAB,
315             0x1FAC,
316             0x1FAD,
317             0x1FAE,
318             0x1FAF,
319             0x1FBC,
320             0x1FBC,
321             0x1FCC,
322             0x1FCC,
323             0x1FFC,
324             0x1FFC,
325         };
326         int32_t num = sizeof(expected)/sizeof(expected[0]);
327         for(i=0; i<num; i++){
328             if(!u_istitle(expected[i])){
329                 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
330             }
331         }
332 
333     }
334 }
335 
336 /* compare two sets and verify that their difference or intersection is empty */
337 static UBool
showADiffB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool expect,UBool diffIsError)338 showADiffB(const USet *a, const USet *b,
339            const char *a_name, const char *b_name,
340            UBool expect, UBool diffIsError) {
341     USet *aa;
342     int32_t i, start, end, length;
343     UErrorCode errorCode;
344 
345     /*
346      * expect:
347      * TRUE  -> a-b should be empty, that is, b should contain all of a
348      * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
349      */
350     if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
351         return TRUE;
352     }
353 
354     /* clone a to aa because a is const */
355     aa=uset_open(1, 0);
356     if(aa==NULL) {
357         /* unusual problem - out of memory? */
358         return FALSE;
359     }
360     uset_addAll(aa, a);
361 
362     /* compute the set in question */
363     if(expect) {
364         /* a-b */
365         uset_removeAll(aa, b);
366     } else {
367         /* a&b */
368         uset_retainAll(aa, b);
369     }
370 
371     /* aa is not empty because of the initial tests above; show its contents */
372     errorCode=U_ZERO_ERROR;
373     i=0;
374     for(;;) {
375         length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
376         if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
377             break; /* done */
378         }
379         if(U_FAILURE(errorCode)) {
380             log_err("error comparing %s with %s at difference item %d: %s\n",
381                 a_name, b_name, i, u_errorName(errorCode));
382             break;
383         }
384         if(length!=0) {
385             break; /* done with code points, got a string or -1 */
386         }
387 
388         if(diffIsError) {
389             if(expect) {
390                 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
391             } else {
392                 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
393             }
394         } else {
395             if(expect) {
396                 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
397             } else {
398                 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
399             }
400         }
401 
402         ++i;
403     }
404 
405     uset_close(aa);
406     return FALSE;
407 }
408 
409 static UBool
showAMinusB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)410 showAMinusB(const USet *a, const USet *b,
411             const char *a_name, const char *b_name,
412             UBool diffIsError) {
413     return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
414 }
415 
416 static UBool
showAIntersectB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)417 showAIntersectB(const USet *a, const USet *b,
418                 const char *a_name, const char *b_name,
419                 UBool diffIsError) {
420     return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
421 }
422 
423 static UBool
compareUSets(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)424 compareUSets(const USet *a, const USet *b,
425              const char *a_name, const char *b_name,
426              UBool diffIsError) {
427     /*
428      * Use an arithmetic & not a logical && so that both branches
429      * are always taken and all differences are shown.
430      */
431     return
432         showAMinusB(a, b, a_name, b_name, diffIsError) &
433         showAMinusB(b, a, b_name, a_name, diffIsError);
434 }
435 
436 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
TestLetterNumber()437 static void TestLetterNumber()
438 {
439     UChar i = 0x0000;
440 
441     log_verbose("Testing for isalpha\n");
442     for (i = 0x0041; i < 0x005B; i++) {
443         if (!u_isalpha(i))
444         {
445             log_err("Failed isLetter test at  %.4X\n", i);
446         }
447     }
448     for (i = 0x0660; i < 0x066A; i++) {
449         if (u_isalpha(i))
450         {
451             log_err("Failed isLetter test with numbers at %.4X\n", i);
452         }
453     }
454 
455     log_verbose("Testing for isdigit\n");
456     for (i = 0x0660; i < 0x066A; i++) {
457         if (!u_isdigit(i))
458         {
459             log_verbose("Failed isNumber test at %.4X\n", i);
460         }
461     }
462 
463     log_verbose("Testing for isalnum\n");
464     for (i = 0x0041; i < 0x005B; i++) {
465         if (!u_isalnum(i))
466         {
467             log_err("Failed isAlNum test at  %.4X\n", i);
468         }
469     }
470     for (i = 0x0660; i < 0x066A; i++) {
471         if (!u_isalnum(i))
472         {
473             log_err("Failed isAlNum test at  %.4X\n", i);
474         }
475     }
476 
477     {
478         /*
479          * The following checks work only starting from Unicode 4.0.
480          * Check the version number here.
481          */
482         static UVersionInfo u401={ 4, 0, 1, 0 };
483         UVersionInfo version;
484         u_getUnicodeVersion(version);
485         if(version[0]<4 || 0==memcmp(version, u401, 4)) {
486             return;
487         }
488     }
489 
490     {
491         /*
492          * Sanity check:
493          * Verify that exactly the digit characters have decimal digit values.
494          * This assumption is used in the implementation of u_digit()
495          * (which checks nt=de)
496          * compared with the parallel java.lang.Character.digit()
497          * (which checks Nd).
498          *
499          * This was not true in Unicode 3.2 and earlier.
500          * Unicode 4.0 fixed discrepancies.
501          * Unicode 4.0.1 re-introduced problems in this area due to an
502          * unintentionally incomplete last-minute change.
503          */
504         U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
505         U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
506 
507         USet *digits, *decimalValues;
508         UErrorCode errorCode;
509 
510         U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
511         U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
512         errorCode=U_ZERO_ERROR;
513         digits=uset_openPattern(digitsPattern, 6, &errorCode);
514         decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
515 
516         if(U_SUCCESS(errorCode)) {
517             compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
518         }
519 
520         uset_close(digits);
521         uset_close(decimalValues);
522     }
523 }
524 
/*
 * Calls propFn for each of the sampleCharsLength code points in
 * sampleChars and logs an error for every one whose result differs
 * from expected. propName is only used in the error message.
 */
static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
                                const UChar32 *sampleChars, int32_t sampleCharsLength,
                                UBool expected) {
    int32_t index = 0;

    while (index < sampleCharsLength) {
        const UChar32 c = sampleChars[index++];
        const UBool actual = propFn(c);

        if (actual == expected) {
            continue;
        }
        log_err("error: character property function %s(U+%04x)=%d is wrong\n",
                propName, c, actual);
    }
}
537 
538 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
TestMisc()539 static void TestMisc()
540 {
541     static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
542     static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
543     static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
544     static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
545     static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
546     static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
547 /*    static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
548     static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
549     static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
550     static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
551     static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
552 
553     static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
554 
555     uint32_t mask;
556 
557     int32_t i;
558     char icuVersion[U_MAX_VERSION_STRING_LENGTH];
559     UVersionInfo realVersion;
560 
561     memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
562 
563     testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
564     testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
565 
566     testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
567                         sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
568     testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
569                         sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
570 
571     testSampleCharProps(u_isWhitespace, "u_isWhitespace",
572                         sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), TRUE);
573     testSampleCharProps(u_isWhitespace, "u_isWhitespace",
574                         sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), FALSE);
575 
576     testSampleCharProps(u_isdefined, "u_isdefined",
577                         sampleDefined, UPRV_LENGTHOF(sampleDefined), TRUE);
578     testSampleCharProps(u_isdefined, "u_isdefined",
579                         sampleUndefined, UPRV_LENGTHOF(sampleUndefined), FALSE);
580 
581     testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), TRUE);
582     testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), FALSE);
583 
584     testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), TRUE);
585     testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), FALSE);
586 
587     for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
588         if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
589             log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
590                     sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
591         }
592     }
593 
594     /* Tests the ICU version #*/
595     u_getVersion(realVersion);
596     u_versionToString(realVersion, icuVersion);
597     if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
598     {
599         log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
600     }
601 #if defined(ICU_VERSION)
602     /* test only happens where we have configure.in with VERSION - sanity check. */
603     if(strcmp(U_ICU_VERSION, ICU_VERSION))
604     {
605         log_err("ICU version mismatch: Header says %s, build environment says %s.\n",  U_ICU_VERSION, ICU_VERSION);
606     }
607 #endif
608 
609     /* test U_GC_... */
610     if(
611         U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
612         U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
613         U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
614         U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
615         U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
616         U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
617     ) {
618         log_err("error: U_GET_GC_MASK does not work properly\n");
619     }
620 
621     mask=0;
622     mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
623 
624     mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
625     mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
626     mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
627     mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
628     mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
629 
630     mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
631     mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
632     mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
633 
634     mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
635     mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
636     mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
637 
638     mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
639     mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
640     mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
641 
642     mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
643     mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
644     mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
645     mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
646 
647     mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
648     mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
649     mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
650     mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
651     mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
652 
653     mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
654     mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
655     mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
656     mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
657 
658     mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
659     mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
660 
661     if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
662         log_err("error: problems with U_GC_XX_MASK constants\n");
663     }
664 
665     mask=0;
666     mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
667     mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
668     mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
669     mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
670     mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
671     mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
672     mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
673 
674     if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
675         log_err("error: problems with U_GC_Y_MASK constants\n");
676     }
677     {
678         static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
679         for(i=0; i<10; i++){
680             if(digit[i]!=u_forDigit(i,10)){
681                 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
682             }
683         }
684     }
685 
686     /* test u_digit() */
687     {
688         static const struct {
689             UChar32 c;
690             int8_t radix, value;
691         } data[]={
692             /* base 16 */
693             { 0x0031, 16, 1 },
694             { 0x0038, 16, 8 },
695             { 0x0043, 16, 12 },
696             { 0x0066, 16, 15 },
697             { 0x00e4, 16, -1 },
698             { 0x0662, 16, 2 },
699             { 0x06f5, 16, 5 },
700             { 0xff13, 16, 3 },
701             { 0xff41, 16, 10 },
702 
703             /* base 8 */
704             { 0x0031, 8, 1 },
705             { 0x0038, 8, -1 },
706             { 0x0043, 8, -1 },
707             { 0x0066, 8, -1 },
708             { 0x00e4, 8, -1 },
709             { 0x0662, 8, 2 },
710             { 0x06f5, 8, 5 },
711             { 0xff13, 8, 3 },
712             { 0xff41, 8, -1 },
713 
714             /* base 36 */
715             { 0x5a, 36, 35 },
716             { 0x7a, 36, 35 },
717             { 0xff3a, 36, 35 },
718             { 0xff5a, 36, 35 },
719 
720             /* wrong radix values */
721             { 0x0031, 1, -1 },
722             { 0xff3a, 37, -1 }
723         };
724 
725         for(i=0; i<UPRV_LENGTHOF(data); ++i) {
726             if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
727                 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
728                         data[i].c,
729                         data[i].radix,
730                         u_digit(data[i].c, data[i].radix),
731                         data[i].value);
732             }
733         }
734     }
735 }
736 
737 /* test C/POSIX-style functions --------------------------------------------- */
738 
/*
 * bit flags
 *
 * One bit per C/POSIX-style classification function. The bit positions
 * follow the order of the posixClasses[] table below: bit n belongs to
 * posixClasses[n]. TestPOSIX() relies on this by shifting a mask left
 * once per table entry.
 */
#define ISAL     1      /* u_isalpha */
#define ISLO     2      /* u_islower */
#define ISUP     4      /* u_isupper */

#define ISDI     8      /* u_isdigit */
#define ISXD  0x10      /* u_isxdigit */

#define ISAN  0x20      /* u_isalnum */

#define ISPU  0x40      /* u_ispunct */
#define ISGR  0x80      /* u_isgraph */
#define ISPR 0x100      /* u_isprint */

#define ISSP 0x200      /* u_isspace */
#define ISBL 0x400      /* u_isblank */
#define ISCN 0x800      /* u_iscntrl */

/* C/POSIX-style functions, in the same order as the bit flags */
typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);

/*
 * The functions under test together with the names used in error messages.
 * Keep this table in the same order as the bit flags above.
 */
static const struct {
    IsPOSIXClass *fn;   /* function under test */
    const char *name;   /* name without the "u_" prefix, for messages */
} posixClasses[]={
    { u_isalpha, "isalpha" },
    { u_islower, "islower" },
    { u_isupper, "isupper" },
    { u_isdigit, "isdigit" },
    { u_isxdigit, "isxdigit" },
    { u_isalnum, "isalnum" },
    { u_ispunct, "ispunct" },
    { u_isgraph, "isgraph" },
    { u_isprint, "isprint" },
    { u_isspace, "isspace" },
    { u_isblank, "isblank" },
    { u_iscntrl, "iscntrl" }
};
777 
/*
 * Expected results for TestPOSIX(): for each code point c, posixResults
 * is the OR of the IS.. bit flags of all classification functions that
 * are expected to return TRUE for c; every function whose flag is absent
 * is expected to return FALSE. The columns are aligned so that each flag
 * always appears in the same position.
 */
static const struct {
    UChar32 c;              /* code point to classify */
    uint32_t posixResults;  /* set of IS.. flags expected to be TRUE */
} posixData[]={
    { 0x0008,                                                        ISCN },    /* backspace */
    { 0x0009,                                              ISSP|ISBL|ISCN },    /* TAB */
    { 0x000a,                                              ISSP|     ISCN },    /* LF */
    { 0x000c,                                              ISSP|     ISCN },    /* FF */
    { 0x000d,                                              ISSP|     ISCN },    /* CR */
    { 0x0020,                                         ISPR|ISSP|ISBL      },    /* space */
    { 0x0021,                               ISPU|ISGR|ISPR                },    /* ! */
    { 0x0033,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* 3 */
    { 0x0040,                               ISPU|ISGR|ISPR                },    /* @ */
    { 0x0041, ISAL|     ISUP|     ISXD|ISAN|     ISGR|ISPR                },    /* A */
    { 0x007a, ISAL|ISLO|               ISAN|     ISGR|ISPR                },    /* z */
    { 0x007b,                               ISPU|ISGR|ISPR                },    /* { */
    { 0x0085,                                              ISSP|     ISCN },    /* NEL */
    { 0x00a0,                                         ISPR|ISSP|ISBL      },    /* NBSP */
    { 0x00a4,                                    ISGR|ISPR                },    /* currency sign */
    { 0x00e4, ISAL|ISLO|               ISAN|     ISGR|ISPR                },    /* a-umlaut */
    { 0x0300,                                    ISGR|ISPR                },    /* combining grave */
    { 0x0600,                                                        ISCN },    /* arabic number sign */
    { 0x0627, ISAL|                    ISAN|     ISGR|ISPR                },    /* alef */
    { 0x0663,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* arabic 3 */
    { 0x2002,                                         ISPR|ISSP|ISBL      },    /* en space */
    { 0x2007,                                         ISPR|ISSP|ISBL      },    /* figure space */
    { 0x2009,                                         ISPR|ISSP|ISBL      },    /* thin space */
    { 0x200b,                                                        ISCN },    /* ZWSP */
  /*{ 0x200b,                                         ISPR|ISSP           },*/    /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
    { 0x200e,                                                        ISCN },    /* LRM */
    { 0x2028,                                         ISPR|ISSP|     ISCN },    /* LS */
    { 0x2029,                                         ISPR|ISSP|     ISCN },    /* PS */
    { 0x20ac,                                    ISGR|ISPR                },    /* Euro */
    { 0xff15,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth 5 */
    { 0xff25, ISAL|     ISUP|     ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth E */
    { 0xff35, ISAL|     ISUP|          ISAN|     ISGR|ISPR                },    /* fullwidth U */
    { 0xff45, ISAL|ISLO|          ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth e */
    { 0xff55, ISAL|ISLO|               ISAN|     ISGR|ISPR                }     /* fullwidth u */
};
817 
818 static void
TestPOSIX()819 TestPOSIX() {
820     uint32_t mask;
821     int32_t cl, i;
822     UBool expect;
823 
824     mask=1;
825     for(cl=0; cl<12; ++cl) {
826         for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
827             expect=(UBool)((posixData[i].posixResults&mask)!=0);
828             if(posixClasses[cl].fn(posixData[i].c)!=expect) {
829                 log_err("u_%s(U+%04x)=%s is wrong\n",
830                     posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
831             }
832         }
833         mask<<=1;
834     }
835 }
836 
837 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
TestControlPrint()838 static void TestControlPrint()
839 {
840     const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
841     const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
842     const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
843     const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
844     UChar32 c;
845 
846     testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), TRUE);
847     testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), FALSE);
848 
849     testSampleCharProps(u_isprint, "u_isprint",
850                         samplePrintable, UPRV_LENGTHOF(samplePrintable), TRUE);
851     testSampleCharProps(u_isprint, "u_isprint",
852                         sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), FALSE);
853 
854     /* test all ISO 8 controls */
855     for(c=0; c<=0x9f; ++c) {
856         if(c==0x20) {
857             /* skip ASCII graphic characters and continue with DEL */
858             c=0x7f;
859         }
860         if(!u_iscntrl(c)) {
861             log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
862         }
863         if(!u_isISOControl(c)) {
864             log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
865         }
866         if(u_isprint(c)) {
867             log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
868         }
869     }
870 
871     /* test all Latin-1 graphic characters */
872     for(c=0x20; c<=0xff; ++c) {
873         if(c==0x7f) {
874             c=0xa0;
875         } else if(c==0xad) {
876             /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
877             ++c;
878         }
879         if(!u_isprint(c)) {
880             log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
881         }
882     }
883 }
884 
885 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
TestIdentifier()886 static void TestIdentifier()
887 {
888     const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
889     const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
890     const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
891     const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
892     const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
893     const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
894     const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
895     const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
896     const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
897     const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
898 
899     testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
900                         sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
901     testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
902                         sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), FALSE);
903 
904     testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
905                         sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), TRUE);
906     testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
907                         sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), FALSE);
908 
909     /* IDPart should imply IDStart */
910     testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
911                         sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
912 
913     testSampleCharProps(u_isIDStart, "u_isIDStart",
914                         sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
915     testSampleCharProps(u_isIDStart, "u_isIDStart",
916                         sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), FALSE);
917 
918     testSampleCharProps(u_isIDPart, "u_isIDPart",
919                         sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), TRUE);
920     testSampleCharProps(u_isIDPart, "u_isIDPart",
921                         sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), FALSE);
922 
923     /* IDPart should imply IDStart */
924     testSampleCharProps(u_isIDPart, "u_isIDPart",
925                         sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
926 
927     testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
928                         sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), TRUE);
929     testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
930                         sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), FALSE);
931 }
932 
/* for each line of UnicodeData.txt, check some of the properties */

/*
 * Context object that TestUnicodeData() hands to parseUCDFile() and that
 * unicodeDataLineFn() receives back through its void *context parameter.
 * It carries the NFC and NFKC normalizer instances used by the line function.
 * When normalization is configured out (UCONFIG_NO_NORMALIZATION),
 * only a placeholder member remains so that the struct is not empty.
 */
typedef struct UnicodeDataContext {
#if UCONFIG_NO_NORMALIZATION
    const void *dummy;          /* placeholder, not read by unicodeDataLineFn() */
#else
    const UNormalizer2 *nfc;    /* set from unorm2_getNFCInstance() */
    const UNormalizer2 *nfkc;   /* set from unorm2_getNFKCInstance() */
#endif
} UnicodeDataContext;
942 
943 /*
944  * ### TODO
945  * This test fails incorrectly if the First or Last code point of a repetitive area
946  * is overridden, which is allowed and is encouraged for the PUAs.
947  * Currently, this means that both area First/Last and override lines are
948  * tested against the properties from the API,
949  * and the area boundary will not match and cause an error.
950  *
951  * This function should detect area boundaries and skip them for the test of individual
952  * code points' properties.
953  * Then it should check that the areas contain all the same properties except where overridden.
954  * For this, it would have had to set a flag for which code points were listed explicitly.
955  */
956 static void U_CALLCONV
unicodeDataLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)957 unicodeDataLineFn(void *context,
958                   char *fields[][2], int32_t fieldCount,
959                   UErrorCode *pErrorCode)
960 {
961     char buffer[100];
962     const char *d;
963     char *end;
964     uint32_t value;
965     UChar32 c;
966     int32_t i;
967     int8_t type;
968     int32_t dt;
969     UChar dm[32], s[32];
970     int32_t dmLength, length;
971 
972 #if !UCONFIG_NO_NORMALIZATION
973     const UNormalizer2 *nfc, *nfkc;
974 #endif
975 
976     /* get the character code, field 0 */
977     c=strtoul(fields[0][0], &end, 16);
978     if(end<=fields[0][0] || end!=fields[0][1]) {
979         log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
980         return;
981     }
982     if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
983         log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
984         return;
985     }
986 
987     /* get general category, field 2 */
988     *fields[2][1]=0;
989     type = (int8_t)tagValues[MakeProp(fields[2][0])];
990     if(u_charType(c)!=type) {
991         log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
992     }
993     if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
994         log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
995     }
996 
997     /* get canonical combining class, field 3 */
998     value=strtoul(fields[3][0], &end, 10);
999     if(end<=fields[3][0] || end!=fields[3][1]) {
1000         log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1001         return;
1002     }
1003     if(value>255) {
1004         log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1005         return;
1006     }
1007 #if !UCONFIG_NO_NORMALIZATION
1008     if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1009         log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1010     }
1011     nfkc=((UnicodeDataContext *)context)->nfkc;
1012     if(value!=unorm2_getCombiningClass(nfkc, c)) {
1013         log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1014     }
1015 #endif
1016 
1017     /* get BiDi category, field 4 */
1018     *fields[4][1]=0;
1019     i=MakeDir(fields[4][0]);
1020     if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1021         log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1022     }
1023 
1024     /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1025     d=NULL;
1026     if(fields[5][0]==fields[5][1]) {
1027         /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1028         if(c==0xac00 || c==0xd7a3) {
1029             dt=U_DT_CANONICAL;
1030         } else {
1031             dt=U_DT_NONE;
1032         }
1033     } else {
1034         d=fields[5][0];
1035         *fields[5][1]=0;
1036         dt=UCHAR_INVALID_CODE;
1037         if(*d=='<') {
1038             end=strchr(++d, '>');
1039             if(end!=NULL) {
1040                 *end=0;
1041                 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1042                 d=u_skipWhitespace(end+1);
1043             }
1044         } else {
1045             dt=U_DT_CANONICAL;
1046         }
1047     }
1048     if(dt>U_DT_NONE) {
1049         if(c==0xac00) {
1050             dm[0]=0x1100;
1051             dm[1]=0x1161;
1052             dm[2]=0;
1053             dmLength=2;
1054         } else if(c==0xd7a3) {
1055             dm[0]=0xd788;
1056             dm[1]=0x11c2;
1057             dm[2]=0;
1058             dmLength=2;
1059         } else {
1060             dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1061         }
1062     } else {
1063         dmLength=-1;
1064     }
1065     if(dt<0 || U_FAILURE(*pErrorCode)) {
1066         log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1067         return;
1068     }
1069 #if !UCONFIG_NO_NORMALIZATION
1070     i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1071     if(i!=dt) {
1072         log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1073     }
1074     /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1075     length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1076     if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1077         log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1078                 "or the Decomposition_Mapping is different (%s)\n",
1079                 c, length, dmLength, u_errorName(*pErrorCode));
1080         return;
1081     }
1082     /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1083     if(dt!=U_DT_CANONICAL) {
1084         dmLength=-1;
1085     }
1086     nfc=((UnicodeDataContext *)context)->nfc;
1087     length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1088     if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1089         log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1090                 "or the Decomposition_Mapping is different (%s)\n",
1091                 c, length, dmLength, u_errorName(*pErrorCode));
1092         return;
1093     }
1094     /* recompose */
1095     if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1096         UChar32 a, b, composite;
1097         i=0;
1098         U16_NEXT(dm, i, dmLength, a);
1099         U16_NEXT(dm, i, dmLength, b);
1100         /* i==dmLength */
1101         composite=unorm2_composePair(nfc, a, b);
1102         if(composite!=c) {
1103             log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1104                     (long)c, (long)a, (long)b, (long)composite);
1105         }
1106         /*
1107          * Note: NFKC has fewer round-trip mappings than NFC,
1108          * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1109          */
1110     }
1111 #endif
1112 
1113     /* get ISO Comment, field 11 */
1114     *fields[11][1]=0;
1115     i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1116     if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1117         log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1118             c, u_errorName(*pErrorCode),
1119             U_FAILURE(*pErrorCode) ? buffer : "[error]",
1120             fields[11][0]);
1121     }
1122 
1123     /* get uppercase mapping, field 12 */
1124     if(fields[12][0]!=fields[12][1]) {
1125         value=strtoul(fields[12][0], &end, 16);
1126         if(end!=fields[12][1]) {
1127             log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1128             return;
1129         }
1130         if((UChar32)value!=u_toupper(c)) {
1131             log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1132         }
1133     } else {
1134         /* no case mapping: the API must map the code point to itself */
1135         if(c!=u_toupper(c)) {
1136             log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1137         }
1138     }
1139 
1140     /* get lowercase mapping, field 13 */
1141     if(fields[13][0]!=fields[13][1]) {
1142         value=strtoul(fields[13][0], &end, 16);
1143         if(end!=fields[13][1]) {
1144             log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1145             return;
1146         }
1147         if((UChar32)value!=u_tolower(c)) {
1148             log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1149         }
1150     } else {
1151         /* no case mapping: the API must map the code point to itself */
1152         if(c!=u_tolower(c)) {
1153             log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1154         }
1155     }
1156 
1157     /* get titlecase mapping, field 14 */
1158     if(fields[14][0]!=fields[14][1]) {
1159         value=strtoul(fields[14][0], &end, 16);
1160         if(end!=fields[14][1]) {
1161             log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1162             return;
1163         }
1164         if((UChar32)value!=u_totitle(c)) {
1165             log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1166         }
1167     } else {
1168         /* no case mapping: the API must map the code point to itself */
1169         if(c!=u_totitle(c)) {
1170             log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1171         }
1172     }
1173 }
1174 
1175 static UBool U_CALLCONV
enumTypeRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1176 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1177     static const UChar32 test[][2]={
1178         {0x41, U_UPPERCASE_LETTER},
1179         {0x308, U_NON_SPACING_MARK},
1180         {0xfffe, U_GENERAL_OTHER_TYPES},
1181         {0xe0041, U_FORMAT_CHAR},
1182         {0xeffff, U_UNASSIGNED}
1183     };
1184 
1185     int32_t i, count;
1186 
1187     if(0!=strcmp((const char *)context, "a1")) {
1188         log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1189         return FALSE;
1190     }
1191 
1192     count=UPRV_LENGTHOF(test);
1193     for(i=0; i<count; ++i) {
1194         if(start<=test[i][0] && test[i][0]<limit) {
1195             if(type!=(UCharCategory)test[i][1]) {
1196                 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1197                         start, limit, (long)type, test[i][0], test[i][1]);
1198             }
1199             /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1200             return i==(count-1) ? FALSE : TRUE;
1201         }
1202     }
1203 
1204     if(start>test[count-1][0]) {
1205         log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1206                 start, limit, (long)type);
1207         return FALSE;
1208     }
1209 
1210     return TRUE;
1211 }
1212 
1213 static UBool U_CALLCONV
enumDefaultsRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1214 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1215     /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1216     static const int32_t defaultBidi[][2]={ /* { limit, class } */
1217         { 0x0590, U_LEFT_TO_RIGHT },
1218         { 0x0600, U_RIGHT_TO_LEFT },
1219         { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1220         { 0x08A0, U_RIGHT_TO_LEFT },
1221         { 0x0900, U_RIGHT_TO_LEFT_ARABIC },  /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1222         { 0x20A0, U_LEFT_TO_RIGHT },
1223         { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR },  /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
1224         { 0xFB1D, U_LEFT_TO_RIGHT },
1225         { 0xFB50, U_RIGHT_TO_LEFT },
1226         { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1227         { 0xFE70, U_LEFT_TO_RIGHT },
1228         { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1229         { 0x10800, U_LEFT_TO_RIGHT },
1230         { 0x11000, U_RIGHT_TO_LEFT },
1231         { 0x1E800, U_LEFT_TO_RIGHT },  /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1232         { 0x1EE00, U_RIGHT_TO_LEFT },
1233         { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC },  /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1234         { 0x1F000, U_RIGHT_TO_LEFT },
1235         { 0x110000, U_LEFT_TO_RIGHT }
1236     };
1237 
1238     UChar32 c;
1239     int32_t i;
1240     UCharDirection shouldBeDir;
1241 
1242     /*
1243      * LineBreak.txt specifies:
1244      *   #  - Assigned characters that are not listed explicitly are given the value
1245      *   #    "AL".
1246      *   #  - Unassigned characters are given the value "XX".
1247      *
1248      * PUA characters are listed explicitly with "XX".
1249      * Verify that no assigned character has "XX".
1250      */
1251     if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1252         c=start;
1253         while(c<limit) {
1254             if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1255                 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1256             }
1257             ++c;
1258         }
1259     }
1260 
1261     /*
1262      * Verify default Bidi classes.
1263      * For recent Unicode versions, see UCD.html.
1264      *
1265      * For older Unicode versions:
1266      * See table 3-7 "Bidirectional Character Types" in UAX #9.
1267      * http://www.unicode.org/reports/tr9/
1268      *
1269      * See also DerivedBidiClass.txt for Cn code points!
1270      *
1271      * Unicode 4.0.1/Public Review Issue #28 (http://www.unicode.org/review/resolved-pri.html)
1272      * changed some default values.
1273      * In particular, non-characters and unassigned Default Ignorable Code Points
1274      * change from L to BN.
1275      *
1276      * UCD.html version 4.0.1 does not yet reflect these changes.
1277      */
1278     if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1279         /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1280         c=start;
1281         for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
1282             if((int32_t)c<defaultBidi[i][0]) {
1283                 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1284                     if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1285                         shouldBeDir=U_BOUNDARY_NEUTRAL;
1286                     } else {
1287                         shouldBeDir=(UCharDirection)defaultBidi[i][1];
1288                     }
1289 
1290                     if( u_charDirection(c)!=shouldBeDir ||
1291                         u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1292                     ) {
1293                         log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1294                             c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1295                     }
1296                     ++c;
1297                 }
1298             }
1299         }
1300     }
1301 
1302     return TRUE;
1303 }
1304 
1305 /* tests for several properties */
TestUnicodeData()1306 static void TestUnicodeData()
1307 {
1308     UVersionInfo expectVersionArray;
1309     UVersionInfo versionArray;
1310     char *fields[15][2];
1311     UErrorCode errorCode;
1312     UChar32 c;
1313     int8_t type;
1314 
1315     UnicodeDataContext context;
1316 
1317     u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1318     u_getUnicodeVersion(versionArray);
1319     if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1320     {
1321         log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1322         versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1323     }
1324 
1325 #if defined(ICU_UNICODE_VERSION)
1326     /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1327     if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1328     {
1329          log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1330     }
1331 #endif
1332 
1333     if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1334         log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1335     }
1336 
1337     errorCode=U_ZERO_ERROR;
1338 #if !UCONFIG_NO_NORMALIZATION
1339     context.nfc=unorm2_getNFCInstance(&errorCode);
1340     context.nfkc=unorm2_getNFKCInstance(&errorCode);
1341     if(U_FAILURE(errorCode)) {
1342         log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1343         return;
1344     }
1345 #endif
1346     parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1347     if(U_FAILURE(errorCode)) {
1348         return; /* if we couldn't parse UnicodeData.txt, we should return */
1349     }
1350 
1351     /* sanity check on repeated properties */
1352     for(c=0xfffe; c<=0x10ffff;) {
1353         type=u_charType(c);
1354         if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1355             log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1356         }
1357         if(type!=U_UNASSIGNED) {
1358             log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1359         }
1360         if((c&0xffff)==0xfffe) {
1361             ++c;
1362         } else {
1363             c+=0xffff;
1364         }
1365     }
1366 
1367     /* test that PUA is not "unassigned" */
1368     for(c=0xe000; c<=0x10fffd;) {
1369         type=u_charType(c);
1370         if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1371             log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1372         }
1373         if(type==U_UNASSIGNED) {
1374             log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1375         } else if(type!=U_PRIVATE_USE_CHAR) {
1376             log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1377         }
1378         if(c==0xf8ff) {
1379             c=0xf0000;
1380         } else if(c==0xffffd) {
1381             c=0x100000;
1382         } else {
1383             ++c;
1384         }
1385     }
1386 
1387     /* test u_enumCharTypes() */
1388     u_enumCharTypes(enumTypeRange, "a1");
1389 
1390     /* check default properties */
1391     u_enumCharTypes(enumDefaultsRange, NULL);
1392 }
1393 
TestCodeUnit()1394 static void TestCodeUnit(){
1395     const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1396 
1397     int32_t i;
1398 
1399     for(i=0; i<(int32_t)(sizeof(codeunit)/sizeof(codeunit[0])); i++){
1400         UChar c=codeunit[i];
1401         if(i<4){
1402             if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1403                 log_err("ERROR: U+%04x is a single", c);
1404             }
1405 
1406         }
1407         if(i >= 4 && i< 8){
1408             if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1409                 log_err("ERROR: U+%04x is a first surrogate", c);
1410             }
1411         }
1412         if(i >= 8 && i< 12){
1413             if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1414                 log_err("ERROR: U+%04x is a second surrogate", c);
1415             }
1416         }
1417     }
1418 
1419 }
1420 
TestCodePoint()1421 static void TestCodePoint(){
1422     const UChar32 codePoint[]={
1423         /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1424         0xd800,
1425         0xdbff,
1426         0xdc00,
1427         0xdfff,
1428         0xdc04,
1429         0xd821,
1430         /*not a surrogate, valid, isUnicodeChar , not Error*/
1431         0x20ac,
1432         0xd7ff,
1433         0xe000,
1434         0xe123,
1435         0x0061,
1436         0xe065,
1437         0x20402,
1438         0x24506,
1439         0x23456,
1440         0x20402,
1441         0x10402,
1442         0x23456,
1443         /*not a surrogate, not valid, isUnicodeChar, isError */
1444         0x0015,
1445         0x009f,
1446         /*not a surrogate, not valid, not isUnicodeChar, isError */
1447         0xffff,
1448         0xfffe,
1449     };
1450     int32_t i;
1451     for(i=0; i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0])); i++){
1452         UChar32 c=codePoint[i];
1453         if(i<6){
1454             if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){
1455                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1456             }
1457             if(UTF_IS_VALID(c)){
1458                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1459             }
1460             if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1461                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1462             }
1463             if(UTF_IS_ERROR(c)){
1464                 log_err("ERROR: isError() failed for U+%04x\n", c);
1465             }
1466         }else if(i >=6 && i<18){
1467             if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1468                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1469             }
1470             if(!UTF_IS_VALID(c)){
1471                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1472             }
1473             if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1474                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1475             }
1476             if(UTF_IS_ERROR(c)){
1477                 log_err("ERROR: isError() failed for U+%04x\n", c);
1478             }
1479         }else if(i >=18 && i<20){
1480             if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1481                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1482             }
1483             if(UTF_IS_VALID(c)){
1484                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1485             }
1486             if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1487                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1488             }
1489             if(!UTF_IS_ERROR(c)){
1490                 log_err("ERROR: isError() failed for U+%04x\n", c);
1491             }
1492         }
1493         else if(i >=18 && i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){
1494             if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1495                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1496             }
1497             if(UTF_IS_VALID(c)){
1498                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1499             }
1500             if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1501                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1502             }
1503             if(!UTF_IS_ERROR(c)){
1504                 log_err("ERROR: isError() failed for U+%04x\n", c);
1505             }
1506         }
1507     }
1508 
1509     if(
1510         !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1511         !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1512         U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1513         U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1514     ) {
1515         log_err("error with U_IS_BMP()\n");
1516     }
1517 
1518     if(
1519         U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1520         U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1521         U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1522         !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1523     ) {
1524         log_err("error with U_IS_SUPPLEMENTARY()\n");
1525     }
1526 }
1527 
TestCharLength()1528 static void TestCharLength()
1529 {
1530     const int32_t codepoint[]={
1531         1, 0x0061,
1532         1, 0xe065,
1533         1, 0x20ac,
1534         2, 0x20402,
1535         2, 0x23456,
1536         2, 0x24506,
1537         2, 0x20402,
1538         2, 0x10402,
1539         1, 0xd7ff,
1540         1, 0xe000
1541     };
1542 
1543     int32_t i;
1544     UBool multiple;
1545     for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){
1546         UChar32 c=codepoint[i+1];
1547         if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){
1548             log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1549         }
1550         multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1551         if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1552             log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1553         }
1554     }
1555 }
1556 
1557 /*internal functions ----*/
MakeProp(char * str)1558 static int32_t MakeProp(char* str)
1559 {
1560     int32_t result = 0;
1561     char* matchPosition =0;
1562 
1563     matchPosition = strstr(tagStrings, str);
1564     if (matchPosition == 0)
1565     {
1566         log_err("unrecognized type letter ");
1567         log_err(str);
1568     }
1569     else
1570         result = (int32_t)((matchPosition - tagStrings) / 2);
1571     return result;
1572 }
1573 
MakeDir(char * str)1574 static int32_t MakeDir(char* str)
1575 {
1576     int32_t pos = 0;
1577     for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
1578         if (strcmp(str, dirStrings[pos]) == 0) {
1579             return pos;
1580         }
1581     }
1582     return -1;
1583 }
1584 
1585 /* test u_charName() -------------------------------------------------------- */
1586 
/*
 * Expected character names, one row per code point.
 *   name    - expected for U_UNICODE_CHAR_NAME
 *   oldName - expected for U_UNICODE_10_CHAR_NAME ("" = none)
 *   extName - expected for U_EXTENDED_CHAR_NAME
 *   alias   - expected for U_CHAR_NAME_ALIAS (NULL = none; the tests treat NULL like "")
 */
static const struct {
    uint32_t code;
    const char *name, *oldName, *extName, *alias;
} names[]={
    { 0x0061,
      "LATIN SMALL LETTER A", "",
      "LATIN SMALL LETTER A",
      NULL },
    { 0x01a2,
      "LATIN CAPITAL LETTER OI", "",
      "LATIN CAPITAL LETTER OI",
      "LATIN CAPITAL LETTER GHA" },
    { 0x0284,
      "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "",
      "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
      NULL },
    { 0x0fd0,
      "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
      "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
      "TIBETAN MARK BKA- SHOG GI MGO RGYAN" },
    { 0x3401,
      "CJK UNIFIED IDEOGRAPH-3401", "",
      "CJK UNIFIED IDEOGRAPH-3401",
      NULL },
    { 0x7fed,
      "CJK UNIFIED IDEOGRAPH-7FED", "",
      "CJK UNIFIED IDEOGRAPH-7FED",
      NULL },
    { 0xac00,
      "HANGUL SYLLABLE GA", "",
      "HANGUL SYLLABLE GA",
      NULL },
    { 0xd7a3,
      "HANGUL SYLLABLE HIH", "",
      "HANGUL SYLLABLE HIH",
      NULL },
    { 0xd800,
      "", "",
      "<lead surrogate-D800>",
      NULL },
    { 0xdc00,
      "", "",
      "<trail surrogate-DC00>",
      NULL },
    { 0xff08,
      "FULLWIDTH LEFT PARENTHESIS", "",
      "FULLWIDTH LEFT PARENTHESIS",
      NULL },
    { 0xffe5,
      "FULLWIDTH YEN SIGN", "",
      "FULLWIDTH YEN SIGN",
      NULL },
    { 0xffff,
      "", "",
      "<noncharacter-FFFF>",
      NULL },
    { 0x1d0c5,
      "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
      "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
      "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS" },
    { 0x23456,
      "CJK UNIFIED IDEOGRAPH-23456", "",
      "CJK UNIFIED IDEOGRAPH-23456",
      NULL }
};
1614 
1615 static UBool
enumCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1616 enumCharNamesFn(void *context,
1617                 UChar32 code, UCharNameChoice nameChoice,
1618                 const char *name, int32_t length) {
1619     int32_t *pCount=(int32_t *)context;
1620     const char *expected;
1621     int i;
1622 
1623     if(length<=0 || length!=(int32_t)strlen(name)) {
1624         /* should not be called with an empty string or invalid length */
1625         log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1626         return TRUE;
1627     }
1628 
1629     ++*pCount;
1630     for(i=0; i<sizeof(names)/sizeof(names[0]); ++i) {
1631         if(code==(UChar32)names[i].code) {
1632             switch (nameChoice) {
1633                 case U_EXTENDED_CHAR_NAME:
1634                     if(0!=strcmp(name, names[i].extName)) {
1635                         log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1636                     }
1637                     break;
1638                 case U_UNICODE_CHAR_NAME:
1639                     if(0!=strcmp(name, names[i].name)) {
1640                         log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1641                     }
1642                     break;
1643                 case U_UNICODE_10_CHAR_NAME:
1644                     expected=names[i].oldName;
1645                     if(expected[0]==0 || 0!=strcmp(name, expected)) {
1646                         log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1647                     }
1648                     break;
1649                 case U_CHAR_NAME_ALIAS:
1650                     expected=names[i].alias;
1651                     if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1652                         log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1653                     }
1654                     break;
1655                 case U_CHAR_NAME_CHOICE_COUNT:
1656                     break;
1657             }
1658             break;
1659         }
1660     }
1661     return TRUE;
1662 }
1663 
/*
 * State carried through u_enumCharNames() by enumExtCharNamesFn().
 */
struct enumExtCharNamesContext {
    uint32_t length;    /* callback counter; its address is passed on to enumCharNamesFn(), which increments it */
    int32_t last;       /* code point of the previous callback; TestCharNames() starts it at -1 */
};
1668 
1669 static UBool
enumExtCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1670 enumExtCharNamesFn(void *context,
1671                 UChar32 code, UCharNameChoice nameChoice,
1672                 const char *name, int32_t length) {
1673     struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1674 
1675     if (ecncp->last != (int32_t) code - 1) {
1676         if (ecncp->last < 0) {
1677             log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1678         } else {
1679             log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1680         }
1681     }
1682     ecncp->last = (int32_t) code;
1683 
1684     if (!*name) {
1685         log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1686     }
1687 
1688     return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1689 }
1690 
1691 /**
1692  * This can be made more efficient by moving it into putil.c and having
1693  * it directly access the ebcdic translation tables.
1694  * TODO: If we get this method in putil.c, then delete it from here.
1695  */
1696 static UChar
u_charToUChar(char c)1697 u_charToUChar(char c) {
1698     UChar uc;
1699     u_charsToUChars(&c, &uc, 1);
1700     return uc;
1701 }
1702 
1703 static void
TestCharNames()1704 TestCharNames() {
1705     static char name[80];
1706     UErrorCode errorCode=U_ZERO_ERROR;
1707     struct enumExtCharNamesContext extContext;
1708     const char *expected;
1709     int32_t length;
1710     UChar32 c;
1711     int32_t i;
1712 
1713     log_verbose("Testing uprv_getMaxCharNameLength()\n");
1714     length=uprv_getMaxCharNameLength();
1715     if(length==0) {
1716         /* no names data available */
1717         return;
1718     }
1719     if(length<83) { /* Unicode 3.2 max char name length */
1720         log_err("uprv_getMaxCharNameLength()=%d is too short");
1721     }
1722     /* ### TODO same tests for max ISO comment length as for max name length */
1723 
1724     log_verbose("Testing u_charName()\n");
1725     for(i=0; i<(int32_t)(sizeof(names)/sizeof(names[0])); ++i) {
1726         /* modern Unicode character name */
1727         length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1728         if(U_FAILURE(errorCode)) {
1729             log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1730             return;
1731         }
1732         if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1733             log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1734         }
1735 
1736         /* find the modern name */
1737         if (*names[i].name) {
1738             c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1739             if(U_FAILURE(errorCode)) {
1740                 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1741                 return;
1742             }
1743             if(c!=(UChar32)names[i].code) {
1744                 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1745             }
1746         }
1747 
1748         /* Unicode 1.0 character name */
1749         length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1750         if(U_FAILURE(errorCode)) {
1751             log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1752             return;
1753         }
1754         if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1755             log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1756         }
1757 
1758         /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1759         if(names[i].oldName[0]!=0 /* && length>0 */) {
1760             c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1761             if(U_FAILURE(errorCode)) {
1762                 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1763                 return;
1764             }
1765             if(c!=(UChar32)names[i].code) {
1766                 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1767             }
1768         }
1769 
1770         /* Unicode character name alias */
1771         length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1772         if(U_FAILURE(errorCode)) {
1773             log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1774             return;
1775         }
1776         expected=names[i].alias;
1777         if(expected==NULL) {
1778             expected="";
1779         }
1780         if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1781             log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1782                     names[i].code, name, length, expected);
1783         }
1784 
1785         /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1786         if(expected[0]!=0 /* && length>0 */) {
1787             c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1788             if(U_FAILURE(errorCode)) {
1789                 log_err("u_charFromName(%s - alias) error %s\n",
1790                         expected, u_errorName(errorCode));
1791                 return;
1792             }
1793             if(c!=(UChar32)names[i].code) {
1794                 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1795                         expected, c, names[i].code);
1796             }
1797         }
1798     }
1799 
1800     /* test u_enumCharNames() */
1801     length=0;
1802     errorCode=U_ZERO_ERROR;
1803     u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1804     if(U_FAILURE(errorCode) || length<94140) {
1805         log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1806     }
1807 
1808     extContext.length = 0;
1809     extContext.last = -1;
1810     errorCode=U_ZERO_ERROR;
1811     u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1812     if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1813         log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1814     }
1815 
1816     /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1817     if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1818         log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1819     }
1820 
1821     /* Test getCharNameCharacters */
1822     if(!getTestOption(QUICK_OPTION)) {
1823         enum { BUFSIZE = 256 };
1824         UErrorCode ec = U_ZERO_ERROR;
1825         char buf[BUFSIZE];
1826         int32_t maxLength;
1827         UChar32 cp;
1828         UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1829         int32_t l1, l2;
1830         UBool map[256];
1831         UBool ok;
1832 
1833         USet* set = uset_open(1, 0); /* empty set */
1834         USet* dumb = uset_open(1, 0); /* empty set */
1835 
1836         /*
1837          * uprv_getCharNameCharacters() will likely return more lowercase
1838          * letters than actual character names contain because
1839          * it includes all the characters in lowercased names of
1840          * general categories, for the full possible set of extended names.
1841          */
1842         {
1843             USetAdder sa={
1844                 NULL,
1845                 uset_add,
1846                 uset_addRange,
1847                 uset_addString,
1848                 NULL /* don't need remove() */
1849             };
1850             sa.set=set;
1851             uprv_getCharNameCharacters(&sa);
1852         }
1853 
1854         /* build set the dumb (but sure-fire) way */
1855         for (i=0; i<256; ++i) {
1856             map[i] = FALSE;
1857         }
1858 
1859         maxLength=0;
1860         for (cp=0; cp<0x110000; ++cp) {
1861             int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1862                                      buf, BUFSIZE, &ec);
1863             if (U_FAILURE(ec)) {
1864                 log_err("FAIL: u_charName failed when it shouldn't\n");
1865                 uset_close(set);
1866                 uset_close(dumb);
1867                 return;
1868             }
1869             if(len>maxLength) {
1870                 maxLength=len;
1871             }
1872 
1873             for (i=0; i<len; ++i) {
1874                 if (!map[(uint8_t) buf[i]]) {
1875                     uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1876                     map[(uint8_t) buf[i]] = TRUE;
1877                 }
1878             }
1879 
1880             /* test for leading/trailing whitespace */
1881             if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1882                 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1883             }
1884         }
1885 
1886         if(map[(uint8_t)'\t']) {
1887             log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1888         }
1889 
1890         length=uprv_getMaxCharNameLength();
1891         if(length!=maxLength) {
1892             log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1893                     length, maxLength);
1894         }
1895 
1896         /* compare the sets.  Where is my uset_equals?!! */
1897         ok=TRUE;
1898         for(i=0; i<256; ++i) {
1899             if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1900                 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1901                     /* ignore lowercase a-z that are in set but not in dumb */
1902                     ok=TRUE;
1903                 } else {
1904                     ok=FALSE;
1905                     break;
1906                 }
1907             }
1908         }
1909 
1910         l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1911         l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1912         if (U_FAILURE(ec)) {
1913             log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1914             uset_close(set);
1915             uset_close(dumb);
1916             return;
1917         }
1918 
1919         if (l1 >= BUFSIZE) {
1920             l1 = BUFSIZE-1;
1921             pat[l1] = 0;
1922         }
1923         if (l2 >= BUFSIZE) {
1924             l2 = BUFSIZE-1;
1925             dumbPat[l2] = 0;
1926         }
1927 
1928         if (!ok) {
1929             log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1930                     aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1931         } else if(getTestOption(VERBOSITY_OPTION)) {
1932             log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
1933         }
1934 
1935         uset_close(set);
1936         uset_close(dumb);
1937     }
1938 
1939     /* ### TODO: test error cases and other interesting things */
1940 }
1941 
1942 static void
TestUCharFromNameUnderflow()1943 TestUCharFromNameUnderflow() {
1944     // Ticket #10889: Underflow crash when there is no dash.
1945     UErrorCode errorCode=U_ZERO_ERROR;
1946     UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<NO BREAK SPACE>", &errorCode);
1947     if(U_SUCCESS(errorCode)) {
1948         log_err("u_charFromName(<NO BREAK SPACE>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1949     }
1950 
1951     // Test related edge cases.
1952     errorCode=U_ZERO_ERROR;
1953     c=u_charFromName(U_EXTENDED_CHAR_NAME, "<-00a0>", &errorCode);
1954     if(U_SUCCESS(errorCode)) {
1955         log_err("u_charFromName(<-00a0>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1956     }
1957 
1958     errorCode=U_ZERO_ERROR;
1959     c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control->", &errorCode);
1960     if(U_SUCCESS(errorCode)) {
1961         log_err("u_charFromName(<control->) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1962     }
1963 
1964     errorCode=U_ZERO_ERROR;
1965     c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control-111111>", &errorCode);
1966     if(U_SUCCESS(errorCode)) {
1967         log_err("u_charFromName(<control-111111>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1968     }
1969 }
1970 
1971 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
1972 
1973 static void
TestMirroring()1974 TestMirroring() {
1975     USet *set;
1976     UErrorCode errorCode;
1977 
1978     UChar32 start, end, c2, c3;
1979     int32_t i;
1980 
1981     U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1982 
1983     U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1984 
1985     log_verbose("Testing u_isMirrored()\n");
1986     if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
1987          !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
1988         )
1989     ) {
1990         log_err("u_isMirrored() does not work correctly\n");
1991     }
1992 
1993     log_verbose("Testing u_charMirror()\n");
1994     if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
1995          u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
1996          u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
1997          /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
1998          u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
1999          )
2000     ) {
2001         log_err("u_charMirror() does not work correctly\n");
2002     }
2003 
2004     /* verify that Bidi_Mirroring_Glyph roundtrips */
2005     errorCode=U_ZERO_ERROR;
2006     set=uset_openPattern(mirroredPattern, 17, &errorCode);
2007 
2008     if (U_FAILURE(errorCode)) {
2009         log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
2010     } else {
2011         for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2012             do {
2013                 c2=u_charMirror(start);
2014                 c3=u_charMirror(c2);
2015                 if(c3!=start) {
2016                     log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2017                 }
2018                 c3=u_getBidiPairedBracket(start);
2019                 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2020                     if(c3!=start) {
2021                         log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2022                                 (long)start);
2023                     }
2024                 } else {
2025                     if(c3!=c2) {
2026                         log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2027                                 (long)start, (long)c2);
2028                     }
2029                 }
2030             } while(++start<=end);
2031         }
2032     }
2033 
2034     uset_close(set);
2035 }
2036 
2037 
/* One expected script run for the uscript_nextRun() tests. */
struct RunTestData
{
    const char *runText;    /* text of the run; TestUScriptRunAPI() passes it through u_unescape() */
    UScriptCode runCode;    /* script code that CheckScriptRuns() expects for this run */
};

typedef struct RunTestData RunTestData;
2045 
2046 static void
CheckScriptRuns(UScriptRun * scriptRun,int32_t * runStarts,const RunTestData * testData,int32_t nRuns,const char * prefix)2047 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2048                 const char *prefix)
2049 {
2050     int32_t run, runStart, runLimit;
2051     UScriptCode runCode;
2052 
2053     /* iterate over all the runs */
2054     run = 0;
2055     while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2056         if (runStart != runStarts[run]) {
2057             log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2058                 prefix, run, runStarts[run], runStart);
2059         }
2060 
2061         if (runLimit != runStarts[run + 1]) {
2062             log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2063                 prefix, run, runStarts[run + 1], runLimit);
2064         }
2065 
2066         if (runCode != testData[run].runCode) {
2067             log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2068                 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2069         }
2070 
2071         run += 1;
2072 
2073         /* stop when we've seen all the runs we expect to see */
2074         if (run >= nRuns) {
2075             break;
2076         }
2077     }
2078 
2079     /* Complain if we didn't see then number of runs we expected */
2080     if (run != nRuns) {
2081         log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2082     }
2083 }
2084 
2085 static void
TestUScriptRunAPI()2086 TestUScriptRunAPI()
2087 {
2088     static const RunTestData testData1[] = {
2089         {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2090         {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2091         {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2092         {"English (", USCRIPT_LATIN},
2093         {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2094         {") ", USCRIPT_LATIN},
2095         {"\\u6F22\\u5B75", USCRIPT_HAN},
2096         {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2097         {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2098         {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2099     };
2100 
2101     static const RunTestData testData2[] = {
2102        {"((((((((((abc))))))))))", USCRIPT_LATIN}
2103     };
2104 
2105     static const struct {
2106       const RunTestData *testData;
2107       int32_t nRuns;
2108     } testDataEntries[] = {
2109         {testData1, UPRV_LENGTHOF(testData1)},
2110         {testData2, UPRV_LENGTHOF(testData2)}
2111     };
2112 
2113     static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
2114     int32_t testEntry;
2115 
2116     for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2117         UChar testString[1024];
2118         int32_t runStarts[256];
2119         int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2120         const RunTestData *testData = testDataEntries[testEntry].testData;
2121 
2122         int32_t run, stringLimit;
2123         UScriptRun *scriptRun = NULL;
2124         UErrorCode err;
2125 
2126         /*
2127          * Fill in the test string and the runStarts array.
2128          */
2129         stringLimit = 0;
2130         for (run = 0; run < nTestRuns; run += 1) {
2131             runStarts[run] = stringLimit;
2132             stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2133             /*stringLimit -= 1;*/
2134         }
2135 
2136         /* The limit of the last run */
2137         runStarts[nTestRuns] = stringLimit;
2138 
2139         /*
2140          * Make sure that calling uscript_OpenRun with a NULL text pointer
2141          * and a non-zero text length returns the correct error.
2142          */
2143         err = U_ZERO_ERROR;
2144         scriptRun = uscript_openRun(NULL, stringLimit, &err);
2145 
2146         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2147             log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2148         }
2149 
2150         if (scriptRun != NULL) {
2151             log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2152             uscript_closeRun(scriptRun);
2153         }
2154 
2155         /*
2156          * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2157          * and a zero text length returns the correct error.
2158          */
2159         err = U_ZERO_ERROR;
2160         scriptRun = uscript_openRun(testString, 0, &err);
2161 
2162         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2163             log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2164         }
2165 
2166         if (scriptRun != NULL) {
2167             log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2168             uscript_closeRun(scriptRun);
2169         }
2170 
2171         /*
2172          * Make sure that calling uscript_openRun with a NULL text pointer
2173          * and a zero text length doesn't return an error.
2174          */
2175         err = U_ZERO_ERROR;
2176         scriptRun = uscript_openRun(NULL, 0, &err);
2177 
2178         if (U_FAILURE(err)) {
2179             log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2180         }
2181 
2182         /* Make sure that the empty iterator doesn't find any runs */
2183         if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2184             log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2185         }
2186 
2187         /*
2188          * Make sure that calling uscript_setRunText with a NULL text pointer
2189          * and a non-zero text length returns the correct error.
2190          */
2191         err = U_ZERO_ERROR;
2192         uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2193 
2194         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2195             log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2196         }
2197 
2198         /*
2199          * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2200          * and a zero text length returns the correct error.
2201          */
2202         err = U_ZERO_ERROR;
2203         uscript_setRunText(scriptRun, testString, 0, &err);
2204 
2205         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2206             log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2207         }
2208 
2209         /*
2210          * Now call uscript_setRunText on the empty iterator
2211          * and make sure that it works.
2212          */
2213         err = U_ZERO_ERROR;
2214         uscript_setRunText(scriptRun, testString, stringLimit, &err);
2215 
2216         if (U_FAILURE(err)) {
2217             log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2218         } else {
2219             CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2220         }
2221 
2222         uscript_closeRun(scriptRun);
2223 
2224         /*
2225          * Now open an interator over the testString
2226          * using uscript_openRun and make sure that it works
2227          */
2228         scriptRun = uscript_openRun(testString, stringLimit, &err);
2229 
2230         if (U_FAILURE(err)) {
2231             log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2232         } else {
2233             CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2234         }
2235 
2236         /* Now reset the iterator, and make sure
2237          * that it still works.
2238          */
2239         uscript_resetRun(scriptRun);
2240 
2241         CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2242 
2243         /* Close the iterator */
2244         uscript_closeRun(scriptRun);
2245     }
2246 }
2247 
2248 /* test additional, non-core properties */
2249 static void
TestAdditionalProperties()2250 TestAdditionalProperties() {
2251     /* test data for u_charAge() */
2252     static const struct {
2253         UChar32 c;
2254         UVersionInfo version;
2255     } charAges[]={
2256         {0x41,    { 1, 1, 0, 0 }},
2257         {0xffff,  { 1, 1, 0, 0 }},
2258         {0x20ab,  { 2, 0, 0, 0 }},
2259         {0x2fffe, { 2, 0, 0, 0 }},
2260         {0x20ac,  { 2, 1, 0, 0 }},
2261         {0xfb1d,  { 3, 0, 0, 0 }},
2262         {0x3f4,   { 3, 1, 0, 0 }},
2263         {0x10300, { 3, 1, 0, 0 }},
2264         {0x220,   { 3, 2, 0, 0 }},
2265         {0xff60,  { 3, 2, 0, 0 }}
2266     };
2267 
2268     /* test data for u_hasBinaryProperty() */
2269     static const int32_t
2270     props[][3]={ /* code point, property, value */
2271         { 0x0627, UCHAR_ALPHABETIC, TRUE },
2272         { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2273         { 0x2028, UCHAR_ALPHABETIC, FALSE },
2274 
2275         { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2276         { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2277 
2278         { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2279         { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2280 
2281         { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2282         { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2283 
2284         /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2285         { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2286         { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2287         { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2288         { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2289 
2290         { 0x058a, UCHAR_DASH, TRUE },
2291         { 0x007e, UCHAR_DASH, FALSE },
2292 
2293         { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2294         { 0x3000, UCHAR_DIACRITIC, FALSE },
2295 
2296         { 0x0e46, UCHAR_EXTENDER, TRUE },
2297         { 0x0020, UCHAR_EXTENDER, FALSE },
2298 
2299 #if !UCONFIG_NO_NORMALIZATION
2300         { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2301         { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2302         { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2303 
2304         { 0x110a, UCHAR_NFD_INERT, TRUE },      /* Jamo L */
2305         { 0x0308, UCHAR_NFD_INERT, FALSE },
2306 
2307         { 0x1164, UCHAR_NFKD_INERT, TRUE },     /* Jamo V */
2308         { 0x1d79d, UCHAR_NFKD_INERT, FALSE },   /* math compat version of xi */
2309 
2310         { 0x0021, UCHAR_NFC_INERT, TRUE },      /* ! */
2311         { 0x0061, UCHAR_NFC_INERT, FALSE },     /* a */
2312         { 0x00e4, UCHAR_NFC_INERT, FALSE },     /* a-umlaut */
2313         { 0x0102, UCHAR_NFC_INERT, FALSE },     /* a-breve */
2314         { 0xac1c, UCHAR_NFC_INERT, FALSE },     /* Hangul LV */
2315         { 0xac1d, UCHAR_NFC_INERT, TRUE },      /* Hangul LVT */
2316 
2317         { 0x1d79d, UCHAR_NFKC_INERT, FALSE },   /* math compat version of xi */
2318         { 0x2a6d6, UCHAR_NFKC_INERT, TRUE },    /* Han, last of CJK ext. B */
2319 
2320         { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2321         { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2322         { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2323         { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2324         { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2325         { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2326 #endif
2327 
2328         { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2329         { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2330         { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2331 
2332         { 0x30fb, UCHAR_HYPHEN, TRUE },
2333         { 0xfe58, UCHAR_HYPHEN, FALSE },
2334 
2335         { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2336         { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2337         { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2338 
2339         { 0x2172, UCHAR_ID_START, TRUE },
2340         { 0x007a, UCHAR_ID_START, TRUE },
2341         { 0x0039, UCHAR_ID_START, FALSE },
2342 
2343         { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2344         { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2345         { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2346 
2347         { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2348         { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2349 
2350         { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2351         { 0x0345, UCHAR_LOWERCASE, TRUE },
2352         { 0x0030, UCHAR_LOWERCASE, FALSE },
2353 
2354         { 0x1d7a9, UCHAR_MATH, TRUE },
2355         { 0x2135, UCHAR_MATH, TRUE },
2356         { 0x0062, UCHAR_MATH, FALSE },
2357 
2358         { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2359         { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2360         { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2361 
2362         { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2363         { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2364         { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2365 
2366         { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2367         { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2368 
2369         { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2370         { 0x2162, UCHAR_UPPERCASE, TRUE },
2371         { 0x0345, UCHAR_UPPERCASE, FALSE },
2372 
2373         { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2374         { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2375         { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2376 
2377         { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2378         { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2379         { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2380 
2381         { 0x16ee, UCHAR_XID_START, TRUE },
2382         { 0x23456, UCHAR_XID_START, TRUE },
2383         { 0x1d1aa, UCHAR_XID_START, FALSE },
2384 
2385         /*
2386          * Version break:
2387          * The following properties are only supported starting with the
2388          * Unicode version indicated in the second field.
2389          */
2390         { -1, 0x320, 0 },
2391 
2392         { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2393         { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2394         { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2395 
2396         { 0x0149, UCHAR_DEPRECATED, TRUE },         /* changed in Unicode 5.2 */
2397         { 0x0341, UCHAR_DEPRECATED, FALSE },        /* changed in Unicode 5.2 */
2398         { 0xe0001, UCHAR_DEPRECATED, TRUE },        /* changed from Unicode 5 to 5.1 */
2399         { 0xe0100, UCHAR_DEPRECATED, FALSE },
2400 
2401         { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2402         { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2403         { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2404         { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE },     /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2405 
2406         { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2407         { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2408         { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE },    /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2409         { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2410 
2411         { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2412         { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2413 
2414         { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2415         { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2416 
2417         { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2418         { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2419 
2420         { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2421         { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2422 
2423         { 0x2e9b, UCHAR_RADICAL, TRUE },
2424         { 0x4e00, UCHAR_RADICAL, FALSE },
2425 
2426         { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2427         { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2428 
2429         { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2430         { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2431 
2432         { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2433 
2434         { 0x002e, UCHAR_S_TERM, TRUE },
2435         { 0x0061, UCHAR_S_TERM, FALSE },
2436 
2437         { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2438         { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2439         { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2440         { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2441 
2442         /* enum/integer type properties */
2443 
2444         /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2445         /* test default Bidi classes for unassigned code points */
2446         { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2447         { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2448         { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2449         { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2450         { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2451         { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2452         { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2453         { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2454         { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2455         { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2456         { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2457 
2458         { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2459         { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2460         { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2461         { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2462         { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2463         { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2464         { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2465 
2466         { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2467         { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2468         { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2469         { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2470         { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2471         { 0x1CBF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2472         { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2473         { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2474         { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2475         { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2476         { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2477 
2478         /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2479         { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2480 
2481         { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2482         { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2483         { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2484         { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2485         { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2486         { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2487         { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2488         { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2489         { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2490 
2491         { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2492         { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2493         { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2494         { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2495         { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2496         { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2497         { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2498         { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2499         { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2500         { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2501         { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2502         { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2503         { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2504         { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2505         { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2506         { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2507         { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2508 
2509         /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2510         { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2511         { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER },     /* changed in Unicode 5.2 */
2512 
2513         { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2514         { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2515         { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2516         { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2517         { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2518 
2519         { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2520         { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2521         { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2522         { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2523         { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2524         { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2525         { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2526         { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2527 
2528         /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2529         { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2530         { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2531         { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2532         { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2533         { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2534         { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2535         { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2536         { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2537         { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2538         { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2539         { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2540         { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2541         { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2542         { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2543         { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2544 
2545         /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2546 
2547         /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */
2548 
2549         { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2550         { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2551         { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2552         { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2553         { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2554         { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2555         { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2556 
2557         { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2558         { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2559         { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2560         { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2561 
2562         { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2563         { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2564         { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2565         { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2566         { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2567         { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2568 
2569         { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2570         { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2571         { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2572         { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2573 
2574         { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2575         { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2576         { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2577         { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2578         { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2579         { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2580         { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2581 
2582         { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2583         { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2584         { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2585         { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2586 
2587         { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2588         { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2589         { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2590         { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2591 
2592         { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2593         { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2594         { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2595         { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2596         { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2597 
2598         { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2599 
2600         { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2601 
2602         { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2603         { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2604         { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2605 
2606         { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2607         { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2608         { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2609         { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2610         { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2611 
2612         { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2613         { 0x2c8e,  UCHAR_BLOCK, UBLOCK_COPTIC },
2614         { 0xfe17,  UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2615 
2616         { 0x1a00,  UCHAR_SCRIPT, USCRIPT_BUGINESE },
2617         { 0x2cea,  UCHAR_SCRIPT, USCRIPT_COPTIC },
2618         { 0xa82b,  UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2619         { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2620 
2621         { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2622         { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2623         { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2624         { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2625         { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2626         { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2627 
2628         { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2629         { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2630         { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2631         { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2632 
2633         { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2634         { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2635         { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2636         { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2637 
2638         { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2639         { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2640         { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2641         { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2642 
2643         { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2644 
2645         /* unassigned code points in new default Bidi R blocks */
2646         { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2647         { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2648 
2649         /* test some script codes >127 */
2650         { 0xa6e6,  UCHAR_SCRIPT, USCRIPT_BAMUM },
2651         { 0xa4d0,  UCHAR_SCRIPT, USCRIPT_LISU },
2652         { 0x10a7f,  UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2653 
2654         { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2655 
2656         /* value changed in Unicode 6.0 */
2657         { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2658 
2659         { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2660 
2661         /* unassigned code points in new/changed default Bidi AL blocks */
2662         { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2663         { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2664 
2665         { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2666 
2667         /* unassigned code points in the currency symbols block now default to ET */
2668         { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2669         { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2670 
2671         /* new property in Unicode 6.3 */
2672         { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2673         { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2674         { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2675         { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2676         { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2677         { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2678 
2679         { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2680 
2681         /* new character range with Joining_Group values */
2682         { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2683         { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2684         { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2685         { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2686         { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2687 
2688         /* undefined UProperty values */
2689         { 0x61, 0x4a7, 0 },
2690         { 0x234bc, 0x15ed, 0 }
2691     };
2692 
2693     UVersionInfo version;
2694     UChar32 c;
2695     int32_t i, result, uVersion;
2696     UProperty which;
2697 
2698     /* what is our Unicode version? */
2699     u_getUnicodeVersion(version);
2700     uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2701 
2702     u_charAge(0x20, version);
2703     if(version[0]==0) {
2704         /* no additional properties available */
2705         log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2706         return;
2707     }
2708 
2709     /* test u_charAge() */
2710     for(i=0; i<sizeof(charAges)/sizeof(charAges[0]); ++i) {
2711         u_charAge(charAges[i].c, version);
2712         if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2713             log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2714                 charAges[i].c,
2715                 version[0], version[1], version[2], version[3],
2716                 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2717         }
2718     }
2719 
2720     if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2721         u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2722         u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 ||   /* j2478 */
2723         u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2724         u_getIntPropertyMinValue(0x2345)!=0
2725     ) {
2726         log_err("error: u_getIntPropertyMinValue() wrong\n");
2727     }
2728     if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2729         log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2730     }
2731     if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2732         log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2733     }
2734     if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2735         log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2736     }
2737     if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2738         log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2739     }
2740     if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2741         log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2742     }
2743     if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2744         log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2745     }
2746     if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2747         log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2748     }
2749     if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2750         log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2751     }
2752     if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2753         log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2754     }
2755     if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2756         log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2757     }
2758     if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2759         log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2760     }
2761     if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2762         log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2763     }
2764     if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2765         log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2766     }
2767     if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2768         log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2769     }
2770     /*JB#2410*/
2771     if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2772         log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2773     }
2774     if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2775         log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2776     }
2777     if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) !=  (int32_t) (U_JG_COUNT -1)) {
2778         log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2779     }
2780     if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2781         log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2782     }
2783     if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2784         log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2785     }
2786 
2787     /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2788     for(i=0; i<sizeof(props)/sizeof(props[0]); ++i) {
2789         const char *whichName;
2790 
2791         if(props[i][0]<0) {
2792             /* Unicode version break */
2793             if(uVersion<props[i][1]) {
2794                 break; /* do not test properties that are not yet supported */
2795             } else {
2796                 continue; /* skip this row */
2797             }
2798         }
2799 
2800         c=(UChar32)props[i][0];
2801         which=(UProperty)props[i][1];
2802         whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2803 
2804         if(which<UCHAR_INT_START) {
2805             result=u_hasBinaryProperty(c, which);
2806             if(result!=props[i][2]) {
2807                 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2808                         c, whichName, result, i);
2809             }
2810         }
2811 
2812         result=u_getIntPropertyValue(c, which);
2813         if(result!=props[i][2]) {
2814             log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2815                     c, whichName, result, props[i][2], i);
2816         }
2817 
2818         /* test separate functions, too */
2819         switch((UProperty)props[i][1]) {
2820         case UCHAR_ALPHABETIC:
2821             if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2822                 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2823                         props[i][0], result, i);
2824             }
2825             break;
2826         case UCHAR_LOWERCASE:
2827             if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2828                 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2829                         props[i][0], result, i);
2830             }
2831             break;
2832         case UCHAR_UPPERCASE:
2833             if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2834                 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2835                         props[i][0], result, i);
2836             }
2837             break;
2838         case UCHAR_WHITE_SPACE:
2839             if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2840                 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2841                         props[i][0], result, i);
2842             }
2843             break;
2844         default:
2845             break;
2846         }
2847     }
2848 }
2849 
2850 static void
TestNumericProperties(void)2851 TestNumericProperties(void) {
2852     /* see UnicodeData.txt, DerivedNumericValues.txt */
2853     static const struct {
2854         UChar32 c;
2855         int32_t type;
2856         double numValue;
2857     } values[]={
2858         { 0x0F33, U_NT_NUMERIC, -1./2. },
2859         { 0x0C66, U_NT_DECIMAL, 0 },
2860         { 0x96f6, U_NT_NUMERIC, 0 },
2861         { 0xa833, U_NT_NUMERIC, 1./16. },
2862         { 0x2152, U_NT_NUMERIC, 1./10. },
2863         { 0x2151, U_NT_NUMERIC, 1./9. },
2864         { 0x1245f, U_NT_NUMERIC, 1./8. },
2865         { 0x2150, U_NT_NUMERIC, 1./7. },
2866         { 0x2159, U_NT_NUMERIC, 1./6. },
2867         { 0x09f6, U_NT_NUMERIC, 3./16. },
2868         { 0x2155, U_NT_NUMERIC, 1./5. },
2869         { 0x00BD, U_NT_NUMERIC, 1./2. },
2870         { 0x0031, U_NT_DECIMAL, 1. },
2871         { 0x4e00, U_NT_NUMERIC, 1. },
2872         { 0x58f1, U_NT_NUMERIC, 1. },
2873         { 0x10320, U_NT_NUMERIC, 1. },
2874         { 0x0F2B, U_NT_NUMERIC, 3./2. },
2875         { 0x00B2, U_NT_DIGIT, 2. },
2876         { 0x5f10, U_NT_NUMERIC, 2. },
2877         { 0x1813, U_NT_DECIMAL, 3. },
2878         { 0x5f0e, U_NT_NUMERIC, 3. },
2879         { 0x2173, U_NT_NUMERIC, 4. },
2880         { 0x8086, U_NT_NUMERIC, 4. },
2881         { 0x278E, U_NT_DIGIT, 5. },
2882         { 0x1D7F2, U_NT_DECIMAL, 6. },
2883         { 0x247A, U_NT_DIGIT, 7. },
2884         { 0x7396, U_NT_NUMERIC, 9. },
2885         { 0x1372, U_NT_NUMERIC, 10. },
2886         { 0x216B, U_NT_NUMERIC, 12. },
2887         { 0x16EE, U_NT_NUMERIC, 17. },
2888         { 0x249A, U_NT_NUMERIC, 19. },
2889         { 0x303A, U_NT_NUMERIC, 30. },
2890         { 0x5345, U_NT_NUMERIC, 30. },
2891         { 0x32B2, U_NT_NUMERIC, 37. },
2892         { 0x1375, U_NT_NUMERIC, 40. },
2893         { 0x10323, U_NT_NUMERIC, 50. },
2894         { 0x0BF1, U_NT_NUMERIC, 100. },
2895         { 0x964c, U_NT_NUMERIC, 100. },
2896         { 0x217E, U_NT_NUMERIC, 500. },
2897         { 0x2180, U_NT_NUMERIC, 1000. },
2898         { 0x4edf, U_NT_NUMERIC, 1000. },
2899         { 0x2181, U_NT_NUMERIC, 5000. },
2900         { 0x137C, U_NT_NUMERIC, 10000. },
2901         { 0x4e07, U_NT_NUMERIC, 10000. },
2902         { 0x12432, U_NT_NUMERIC, 216000. },
2903         { 0x12433, U_NT_NUMERIC, 432000. },
2904         { 0x4ebf, U_NT_NUMERIC, 100000000. },
2905         { 0x5146, U_NT_NUMERIC, 1000000000000. },
2906         { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
2907         { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
2908         { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
2909         { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
2910         { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
2911         { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
2912         { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
2913         { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
2914     };
2915 
2916     double nv;
2917     UChar32 c;
2918     int32_t i, type;
2919 
2920     for(i=0; i<UPRV_LENGTHOF(values); ++i) {
2921         c=values[i].c;
2922         type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
2923         nv=u_getNumericValue(c);
2924 
2925         if(type!=values[i].type) {
2926             log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
2927         }
2928         if(0.000001 <= fabs(nv - values[i].numValue)) {
2929             log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
2930         }
2931     }
2932 }
2933 
2934 /**
2935  * Test the property names and property value names API.
2936  */
2937 static void
TestPropertyNames(void)2938 TestPropertyNames(void) {
2939     int32_t p, v, choice=0, rev;
2940     UBool atLeastSomething = FALSE;
2941 
2942     for (p=0; ; ++p) {
2943         UProperty propEnum = (UProperty)p;
2944         UBool sawProp = FALSE;
2945         if(p > 10 && !atLeastSomething) {
2946           log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
2947           return;
2948         }
2949 
2950         for (choice=0; ; ++choice) {
2951             const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
2952             if (name) {
2953                 if (!sawProp)
2954                     log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
2955                 log_verbose("%d=\"%s\"", choice, name);
2956                 sawProp = TRUE;
2957                 atLeastSomething = TRUE;
2958 
2959                 /* test reverse mapping */
2960                 rev = u_getPropertyEnum(name);
2961                 if (rev != p) {
2962                     log_err("Property round-trip failure: %d -> %s -> %d\n",
2963                             p, name, rev);
2964                 }
2965             }
2966             if (!name && choice>0) break;
2967         }
2968         if (sawProp) {
2969             /* looks like a valid property; check the values */
2970             const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
2971             int32_t max = 0;
2972             if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
2973                 max = 255;
2974             } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
2975                 /* it's far too slow to iterate all the way up to
2976                    the real max, U_GC_P_MASK */
2977                 max = U_GC_NL_MASK;
2978             } else if (p == UCHAR_BLOCK) {
2979                 /* UBlockCodes, unlike other values, start at 1 */
2980                 max = 1;
2981             }
2982             log_verbose("\n");
2983             for (v=-1; ; ++v) {
2984                 UBool sawValue = FALSE;
2985                 for (choice=0; ; ++choice) {
2986                     const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
2987                     if (vname) {
2988                         if (!sawValue) log_verbose(" %s, value %d:", pname, v);
2989                         log_verbose("%d=\"%s\"", choice, vname);
2990                         sawValue = TRUE;
2991 
2992                         /* test reverse mapping */
2993                         rev = u_getPropertyValueEnum(propEnum, vname);
2994                         if (rev != v) {
2995                             log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
2996                                     pname, v, vname, rev);
2997                         }
2998                     }
2999                     if (!vname && choice>0) break;
3000                 }
3001                 if (sawValue) {
3002                     log_verbose("\n");
3003                 }
3004                 if (!sawValue && v>=max) break;
3005             }
3006         }
3007         if (!sawProp) {
3008             if (p>=UCHAR_STRING_LIMIT) {
3009                 break;
3010             } else if (p>=UCHAR_DOUBLE_LIMIT) {
3011                 p = UCHAR_STRING_START - 1;
3012             } else if (p>=UCHAR_MASK_LIMIT) {
3013                 p = UCHAR_DOUBLE_START - 1;
3014             } else if (p>=UCHAR_INT_LIMIT) {
3015                 p = UCHAR_MASK_START - 1;
3016             } else if (p>=UCHAR_BINARY_LIMIT) {
3017                 p = UCHAR_INT_START - 1;
3018             }
3019         }
3020     }
3021 }
3022 
3023 /**
3024  * Test the property values API.  See JB#2410.
3025  */
3026 static void
TestPropertyValues(void)3027 TestPropertyValues(void) {
3028     int32_t i, p, min, max;
3029     UErrorCode ec;
3030 
3031     /* Min should be 0 for everything. */
3032     /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3033     for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
3034         UProperty propEnum = (UProperty)p;
3035         min = u_getIntPropertyMinValue(propEnum);
3036         if (min != 0) {
3037             if (p == UCHAR_BLOCK) {
3038                 /* This is okay...for now.  See JB#2487.
3039                    TODO Update this for JB#2487. */
3040             } else {
3041                 const char* name;
3042                 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3043                 if (name == NULL)
3044                     name = "<ERROR>";
3045                 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3046                         name, min);
3047             }
3048         }
3049     }
3050 
3051     if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3052         u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3053         log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3054     }
3055 
3056     /* Max should be -1 for invalid properties. */
3057     max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
3058     if (max != -1) {
3059         log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3060                 max);
3061     }
3062 
3063     /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
3064     for (i=0; i<2; ++i) {
3065         int32_t script;
3066         const char* desc;
3067         ec = U_ZERO_ERROR;
3068         switch (i) {
3069         case 0:
3070             script = uscript_getScript(-1, &ec);
3071             desc = "uscript_getScript(-1)";
3072             break;
3073         case 1:
3074             script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3075             desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3076             break;
3077         default:
3078             log_err("Internal test error. Too many scripts\n");
3079             return;
3080         }
3081         /* We don't explicitly test ec.  It should be U_FAILURE but it
3082            isn't documented as such. */
3083         if (script != (int32_t)USCRIPT_INVALID_CODE) {
3084             log_err("FAIL: %s = %d, exp. 0\n",
3085                     desc, script);
3086         }
3087     }
3088 }
3089 
3090 /* various tests for consistency of UCD data and API behavior */
3091 static void
TestConsistency()3092 TestConsistency() {
3093     char buffer[300];
3094     USet *set1, *set2, *set3, *set4;
3095     UErrorCode errorCode;
3096 
3097     UChar32 start, end;
3098     int32_t i, length;
3099 
3100     U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3101     U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3102     U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3103     U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3104     U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3105 
3106     U_STRING_DECL(mathBlocksPattern,
3107         "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3108         214);
3109     U_STRING_DECL(mathPattern, "[:Math:]", 8);
3110     U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3111     U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3112     U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3113 
3114     U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3115     U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3116     U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3117     U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3118     U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3119 
3120     U_STRING_INIT(mathBlocksPattern,
3121         "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3122         214);
3123     U_STRING_INIT(mathPattern, "[:Math:]", 8);
3124     U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3125     U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3126     U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3127 
3128     /*
3129      * It used to be that UCD.html and its precursors said
3130      * "Those dashes used to mark connections between pieces of words,
3131      *  plus the Katakana middle dot."
3132      *
3133      * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3134      * but not from Hyphen.
3135      * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3136      * Therefore, do not show errors when testing the Hyphen property.
3137      */
3138     log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3139                 "known to the UTC and not considered errors.\n");
3140 
3141     errorCode=U_ZERO_ERROR;
3142     set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3143     set2=uset_openPattern(dashPattern, 8, &errorCode);
3144     if(U_SUCCESS(errorCode)) {
3145         /* remove the Katakana middle dot(s) from set1 */
3146         uset_remove(set1, 0x30fb);
3147         uset_remove(set1, 0xff65); /* halfwidth variant */
3148         showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3149     } else {
3150         log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3151     }
3152 
3153     /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3154     set3=uset_openPattern(formatPattern, 6, &errorCode);
3155     set4=uset_openPattern(alphaPattern, 14, &errorCode);
3156     if(U_SUCCESS(errorCode)) {
3157         showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3158         showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3159         showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3160     } else {
3161         log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3162     }
3163 
3164     uset_close(set1);
3165     uset_close(set2);
3166     uset_close(set3);
3167     uset_close(set4);
3168 
3169     /*
3170      * Check that each lowercase character has "small" in its name
3171      * and not "capital".
3172      * There are some such characters, some of which seem odd.
3173      * Use the verbose flag to see these notices.
3174      */
3175     errorCode=U_ZERO_ERROR;
3176     set1=uset_openPattern(lowerPattern, 13, &errorCode);
3177     if(U_SUCCESS(errorCode)) {
3178         for(i=0;; ++i) {
3179             length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3180             if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3181                 break; /* done */
3182             }
3183             if(U_FAILURE(errorCode)) {
3184                 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3185                         i, u_errorName(errorCode));
3186                 break;
3187             }
3188             if(length!=0) {
3189                 break; /* done with code points, got a string or -1 */
3190             }
3191 
3192             while(start<=end) {
3193                 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3194                 if(U_FAILURE(errorCode)) {
3195                     log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3196                     errorCode=U_ZERO_ERROR;
3197                 }
3198                 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3199                     strstr(buffer, "SMALL CAPITAL")==NULL
3200                 ) {
3201                     log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3202                 }
3203                 ++start;
3204             }
3205         }
3206     } else {
3207         log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3208     }
3209     uset_close(set1);
3210 
3211     /* verify that all assigned characters in Math blocks are exactly Math characters */
3212     errorCode=U_ZERO_ERROR;
3213     set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3214     set2=uset_openPattern(mathPattern, 8, &errorCode);
3215     set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3216     if(U_SUCCESS(errorCode)) {
3217         uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3218         uset_complement(set3);      /* assigned characters */
3219         uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3220         compareUSets(set1, set2,
3221                      "[assigned Math block chars]", "[math blocks]&[:Math:]",
3222                      TRUE);
3223     } else {
3224         log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3225     }
3226     uset_close(set1);
3227     uset_close(set2);
3228     uset_close(set3);
3229 
3230     /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3231     errorCode=U_ZERO_ERROR;
3232     set1=uset_openPattern(unknownPattern, 14, &errorCode);
3233     set2=uset_openPattern(reservedPattern, 20, &errorCode);
3234     if(U_SUCCESS(errorCode)) {
3235         compareUSets(set1, set2,
3236                      "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3237                      TRUE);
3238     } else {
3239         log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3240     }
3241     uset_close(set1);
3242     uset_close(set2);
3243 }
3244 
3245 /*
3246  * Starting with ICU4C 3.4, the core Unicode properties files
3247  * (uprops.icu, ucase.icu, ubidi.icu, unorm.icu)
3248  * are hardcoded in the common DLL and therefore not included
3249  * in the data package any more.
 * Tests requiring these files are disabled so that
3251  * we need not jump through hoops (like adding snapshots of these files
3252  * to testdata).
3253  * See Jitterbug 4497.
3254  */
3255 #define HARDCODED_DATA_4497 1
3256 
3257 /* API coverage for ucase.c */
TestUCase()3258 static void TestUCase() {
3259 #if !HARDCODED_DATA_4497
3260     UDataMemory *pData;
3261     UCaseProps *csp;
3262     const UCaseProps *ccsp;
3263     UErrorCode errorCode;
3264 
3265     /* coverage for ucase_openBinary() */
3266     errorCode=U_ZERO_ERROR;
3267     pData=udata_open(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, &errorCode);
3268     if(U_FAILURE(errorCode)) {
3269         log_data_err("unable to open " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
3270                     u_errorName(errorCode));
3271         return;
3272     }
3273 
3274     csp=ucase_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
3275     if(U_FAILURE(errorCode)) {
3276         log_err("ucase_openBinary() fails for the contents of " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
3277                 u_errorName(errorCode));
3278         udata_close(pData);
3279         return;
3280     }
3281 
3282     if(UCASE_LOWER!=ucase_getType(csp, 0xdf)) { /* verify islower(sharp s) */
3283         log_err("ucase_openBinary() does not seem to return working UCaseProps\n");
3284     }
3285 
3286     ucase_close(csp);
3287     udata_close(pData);
3288 
3289     /* coverage for ucase_getDummy() */
3290     errorCode=U_ZERO_ERROR;
3291     ccsp=ucase_getDummy(&errorCode);
3292     if(ucase_tolower(ccsp, 0x41)!=0x41) {
3293         log_err("ucase_tolower(dummy, A)!=A\n");
3294     }
3295 #endif
3296 }
3297 
3298 /* API coverage for ubidi_props.c */
TestUBiDiProps()3299 static void TestUBiDiProps() {
3300 #if !HARDCODED_DATA_4497
3301     UDataMemory *pData;
3302     UBiDiProps *bdp;
3303     const UBiDiProps *cbdp;
3304     UErrorCode errorCode;
3305 
3306     /* coverage for ubidi_openBinary() */
3307     errorCode=U_ZERO_ERROR;
3308     pData=udata_open(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &errorCode);
3309     if(U_FAILURE(errorCode)) {
3310         log_data_err("unable to open " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3311                     u_errorName(errorCode));
3312         return;
3313     }
3314 
3315     bdp=ubidi_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
3316     if(U_FAILURE(errorCode)) {
3317         log_err("ubidi_openBinary() fails for the contents of " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3318                 u_errorName(errorCode));
3319         udata_close(pData);
3320         return;
3321     }
3322 
3323     if(0x2215!=ubidi_getMirror(bdp, 0x29F5)) { /* verify some data */
3324         log_err("ubidi_openBinary() does not seem to return working UBiDiProps\n");
3325     }
3326 
3327     ubidi_closeProps(bdp);
3328     udata_close(pData);
3329 
3330     /* coverage for ubidi_getDummy() */
3331     errorCode=U_ZERO_ERROR;
3332     cbdp=ubidi_getDummy(&errorCode);
3333     if(ubidi_getClass(cbdp, 0x20)!=0) {
3334         log_err("ubidi_getClass(dummy, space)!=0\n");
3335     }
3336 #endif
3337 }
3338 
3339 /* test case folding, compare return values with CaseFolding.txt ------------ */
3340 
/* bit flags recording which case foldings of a character have been tested already */
enum {
    CF_SIMPLE=1<<0,                         /* simple folding */
    CF_FULL=1<<1,                           /* full (string) folding */
    CF_TURKIC=1<<2,                         /* Turkic folding */
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC      /* all of the above */
};
3348 
3349 static void
testFold(UChar32 c,int which,UChar32 simple,UChar32 turkic,const UChar * full,int32_t fullLength,const UChar * turkicFull,int32_t turkicFullLength)3350 testFold(UChar32 c, int which,
3351          UChar32 simple, UChar32 turkic,
3352          const UChar *full, int32_t fullLength,
3353          const UChar *turkicFull, int32_t turkicFullLength) {
3354     UChar s[2], t[32];
3355     UChar32 c2;
3356     int32_t length, length2;
3357 
3358     UErrorCode errorCode=U_ZERO_ERROR;
3359 
3360     length=0;
3361     U16_APPEND_UNSAFE(s, length, c);
3362 
3363     if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3364         log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3365     }
3366     if((which&CF_FULL)!=0) {
3367         length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
3368         if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3369             log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3370         }
3371     }
3372     if((which&CF_TURKIC)!=0) {
3373         if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3374             log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3375         }
3376 
3377         length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3378         if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3379             log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3380         }
3381     }
3382 }
3383 
3384 /* test that c case-folds to itself */
3385 static void
testFoldToSelf(UChar32 c,int which)3386 testFoldToSelf(UChar32 c, int which) {
3387     UChar s[2];
3388     int32_t length;
3389 
3390     length=0;
3391     U16_APPEND_UNSAFE(s, length, c);
3392     testFold(c, which, c, c, s, length, s, length);
3393 }
3394 
3395 struct CaseFoldingData {
3396     USet *notSeen;
3397     UChar32 prev, prevSimple;
3398     UChar prevFull[32];
3399     int32_t prevFullLength;
3400     int which;
3401 };
3402 typedef struct CaseFoldingData CaseFoldingData;
3403 
3404 static void U_CALLCONV
caseFoldingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)3405 caseFoldingLineFn(void *context,
3406                   char *fields[][2], int32_t fieldCount,
3407                   UErrorCode *pErrorCode) {
3408     CaseFoldingData *pData=(CaseFoldingData *)context;
3409     char *end;
3410     UChar full[32];
3411     UChar32 c, prev, simple;
3412     int32_t count;
3413     int which;
3414     char status;
3415 
3416     /* get code point */
3417     const char *s=u_skipWhitespace(fields[0][0]);
3418     if(0==strncmp(s, "0000..10FFFF", 12)) {
3419         /*
3420          * Ignore the line
3421          * # @missing: 0000..10FFFF; C; <code point>
3422          * because maps-to-self is already our default, and this line breaks this parser.
3423          */
3424         return;
3425     }
3426     c=(UChar32)strtoul(s, &end, 16);
3427     end=(char *)u_skipWhitespace(end);
3428     if(end<=fields[0][0] || end!=fields[0][1]) {
3429         log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3430         *pErrorCode=U_PARSE_ERROR;
3431         return;
3432     }
3433 
3434     /* get the status of this mapping */
3435     status=*u_skipWhitespace(fields[1][0]);
3436     if(status!='C' && status!='S' && status!='F' && status!='T') {
3437         log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3438         *pErrorCode=U_PARSE_ERROR;
3439         return;
3440     }
3441 
3442     /* get the mapping */
3443     count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3444     if(U_FAILURE(*pErrorCode)) {
3445         log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3446         return;
3447     }
3448 
3449     /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3450     if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3451         simple=c;
3452     }
3453 
3454     if(c!=(prev=pData->prev)) {
3455         /*
3456          * Test remaining mappings for the previous code point.
3457          * If a turkic folding was not mentioned, then it should fold the same
3458          * as the regular simple case folding.
3459          */
3460         UChar prevString[2];
3461         int32_t length;
3462 
3463         length=0;
3464         U16_APPEND_UNSAFE(prevString, length, prev);
3465         testFold(prev, (~pData->which)&CF_ALL,
3466                  prev, pData->prevSimple,
3467                  prevString, length,
3468                  pData->prevFull, pData->prevFullLength);
3469         pData->prev=pData->prevSimple=c;
3470         length=0;
3471         U16_APPEND_UNSAFE(pData->prevFull, length, c);
3472         pData->prevFullLength=length;
3473         pData->which=0;
3474     }
3475 
3476     /*
3477      * Turn the status into a bit set of case foldings to test.
3478      * Remember non-Turkic case foldings as defaults for Turkic mode.
3479      */
3480     switch(status) {
3481     case 'C':
3482         which=CF_SIMPLE|CF_FULL;
3483         pData->prevSimple=simple;
3484         u_memcpy(pData->prevFull, full, count);
3485         pData->prevFullLength=count;
3486         break;
3487     case 'S':
3488         which=CF_SIMPLE;
3489         pData->prevSimple=simple;
3490         break;
3491     case 'F':
3492         which=CF_FULL;
3493         u_memcpy(pData->prevFull, full, count);
3494         pData->prevFullLength=count;
3495         break;
3496     case 'T':
3497         which=CF_TURKIC;
3498         break;
3499     default:
3500         which=0;
3501         break; /* won't happen because of test above */
3502     }
3503 
3504     testFold(c, which, simple, simple, full, count, full, count);
3505 
3506     /* remember which case foldings of c have been tested */
3507     pData->which|=which;
3508 
3509     /* remove c from the set of ones not mentioned in CaseFolding.txt */
3510     uset_remove(pData->notSeen, c);
3511 }
3512 
3513 static void
TestCaseFolding()3514 TestCaseFolding() {
3515     CaseFoldingData data={ NULL };
3516     char *fields[3][2];
3517     UErrorCode errorCode;
3518 
3519     static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3520 
3521     errorCode=U_ZERO_ERROR;
3522     /* test BMP & plane 1 - nothing interesting above */
3523     data.notSeen=uset_open(0, 0x1ffff);
3524     data.prevFullLength=1; /* length of full case folding of U+0000 */
3525 
3526     parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3527     if(U_SUCCESS(errorCode)) {
3528         int32_t i, start, end;
3529 
3530         /* add a pseudo-last line to finish testing of the actual last one */
3531         fields[0][0]=lastLine;
3532         fields[0][1]=lastLine+6;
3533         fields[1][0]=lastLine+7;
3534         fields[1][1]=lastLine+9;
3535         fields[2][0]=lastLine+10;
3536         fields[2][1]=lastLine+17;
3537         caseFoldingLineFn(&data, fields, 3, &errorCode);
3538 
3539         /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3540         for(i=0;
3541             0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3542                 U_SUCCESS(errorCode);
3543             ++i
3544         ) {
3545             do {
3546                 testFoldToSelf(start, CF_ALL);
3547             } while(++start<=end);
3548         }
3549     }
3550 
3551     uset_close(data.notSeen);
3552 }
3553