1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationtest.cpp
9 *
10 * created on: 2012apr27
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/coll.h"
19 #include "unicode/errorcode.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/normalizer2.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/std_string.h"
24 #include "unicode/strenum.h"
25 #include "unicode/tblcoll.h"
26 #include "unicode/uiter.h"
27 #include "unicode/uniset.h"
28 #include "unicode/unistr.h"
29 #include "unicode/usetiter.h"
30 #include "unicode/ustring.h"
31 #include "charstr.h"
32 #include "cmemory.h"
33 #include "collation.h"
34 #include "collationdata.h"
35 #include "collationfcd.h"
36 #include "collationiterator.h"
37 #include "collationroot.h"
38 #include "collationrootelements.h"
39 #include "collationruleparser.h"
40 #include "collationweights.h"
41 #include "cstring.h"
42 #include "intltest.h"
43 #include "normalizer2impl.h"
44 #include "ucbuf.h"
45 #include "uhash.h"
46 #include "uitercollationiterator.h"
47 #include "utf16collationiterator.h"
48 #include "utf8collationiterator.h"
49 #include "uvectr32.h"
50 #include "uvectr64.h"
51 #include "writesrc.h"
52 
53 class CodePointIterator;
54 
55 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
56 
57 class CollationTest : public IntlTest {
58 public:
CollationTest()59     CollationTest()
60             : fcd(NULL), nfd(NULL),
61               fileLineNumber(0),
62               coll(NULL) {}
63 
~CollationTest()64     ~CollationTest() {
65         delete coll;
66     }
67 
68     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
69 
70     void TestMinMax();
71     void TestImplicits();
72     void TestNulTerminated();
73     void TestIllegalUTF8();
74     void TestShortFCDData();
75     void TestFCD();
76     void TestCollationWeights();
77     void TestRootElements();
78     void TestTailoredElements();
79     void TestDataDriven();
80 
81 private:
82     void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
83     void checkAllocWeights(CollationWeights &cw,
84                            uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
85                            int32_t someLength, int32_t minCount);
86 
87     static UnicodeString printSortKey(const uint8_t *p, int32_t length);
88     static UnicodeString printCollationKey(const CollationKey &key);
89 
90     // Helpers & fields for data-driven test.
isCROrLF(UChar c)91     static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)92     static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)93     static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
skipSpaces(int32_t i)94     int32_t skipSpaces(int32_t i) {
95         while(isSpace(fileLine[i])) { ++i; }
96         return i;
97     }
98 
99     UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
100     void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
101     Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
102     void parseAndSetAttribute(IcuTestErrorCode &errorCode);
103     void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
104     void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
105     void setRootCollator(IcuTestErrorCode &errorCode);
106     void setLocaleCollator(IcuTestErrorCode &errorCode);
107 
108     UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
109 
110     UBool getSortKeyParts(const UChar *s, int32_t length,
111                           CharString &dest, int32_t partSize,
112                           IcuTestErrorCode &errorCode);
113     UBool getCollationKey(const char *norm, const UnicodeString &line,
114                           const UChar *s, int32_t length,
115                           CollationKey &key, IcuTestErrorCode &errorCode);
116     UBool getMergedCollationKey(const UChar *s, int32_t length,
117                                 CollationKey &key, IcuTestErrorCode &errorCode);
118     UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
119                           const UnicodeString &prevString, const UnicodeString &s,
120                           UCollationResult expectedOrder, Collation::Level expectedLevel,
121                           IcuTestErrorCode &errorCode);
122     void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
123 
124     const Normalizer2 *fcd, *nfd;
125     UnicodeString fileLine;
126     int32_t fileLineNumber;
127     UnicodeString fileTestName;
128     Collator *coll;
129 };
130 
createCollationTest()131 extern IntlTest *createCollationTest() {
132     return new CollationTest();
133 }
134 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)135 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
136     if(exec) {
137         logln("TestSuite CollationTest: ");
138     }
139     TESTCASE_AUTO_BEGIN;
140     TESTCASE_AUTO(TestMinMax);
141     TESTCASE_AUTO(TestImplicits);
142     TESTCASE_AUTO(TestNulTerminated);
143     TESTCASE_AUTO(TestIllegalUTF8);
144     TESTCASE_AUTO(TestShortFCDData);
145     TESTCASE_AUTO(TestFCD);
146     TESTCASE_AUTO(TestCollationWeights);
147     TESTCASE_AUTO(TestRootElements);
148     TESTCASE_AUTO(TestTailoredElements);
149     TESTCASE_AUTO(TestDataDriven);
150     TESTCASE_AUTO_END;
151 }
152 
TestMinMax()153 void CollationTest::TestMinMax() {
154     IcuTestErrorCode errorCode(*this, "TestMinMax");
155 
156     setRootCollator(errorCode);
157     if(errorCode.isFailure()) {
158         errorCode.reset();
159         return;
160     }
161     RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
162     if(rbc == NULL) {
163         errln("the root collator is not a RuleBasedCollator");
164         return;
165     }
166 
167     static const UChar s[2] = { 0xfffe, 0xffff };
168     UVector64 ces(errorCode);
169     rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
170     errorCode.assertSuccess();
171     if(ces.size() != 2) {
172         errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
173         return;
174     }
175     int64_t ce = ces.elementAti(0);
176     int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
177     if(ce != expected) {
178         errln("CE(U+fffe)=%04lx != 02..", (long)ce);
179     }
180 
181     ce = ces.elementAti(1);
182     expected = Collation::makeCE(Collation::MAX_PRIMARY);
183     if(ce != expected) {
184         errln("CE(U+ffff)=%04lx != max..", (long)ce);
185     }
186 }
187 
TestImplicits()188 void CollationTest::TestImplicits() {
189     IcuTestErrorCode errorCode(*this, "TestImplicits");
190 
191     const CollationData *cd = CollationRoot::getData(errorCode);
192     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
193         return;
194     }
195 
196     // Implicit primary weights should be assigned for the following sets,
197     // and sort in ascending order by set and then code point.
198     // See http://www.unicode.org/reports/tr10/#Implicit_Weights
199 
200     // core Han Unified Ideographs
201     UnicodeSet coreHan("[\\p{unified_ideograph}&"
202                             "[\\p{Block=CJK_Unified_Ideographs}"
203                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
204                        errorCode);
205     // all other Unified Han ideographs
206     UnicodeSet otherHan("[\\p{unified ideograph}-"
207                             "[\\p{Block=CJK_Unified_Ideographs}"
208                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
209                         errorCode);
210     UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
211     unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
212 
213     // Starting with CLDR 26/ICU 54, the root Han order may instead be
214     // the Unihan radical-stroke order.
215     // The tests should pass either way, so we only test the order of a small set of Han characters
216     // whose radical-stroke order is the same as their code point order.
217     UnicodeSet someHanInCPOrder(
218             "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
219             "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
220             errorCode);
221     UnicodeSet inOrder(someHanInCPOrder);
222     inOrder.addAll(unassigned).freeze();
223     if(errorCode.errIfFailureAndReset("UnicodeSet")) {
224         return;
225     }
226     const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
227     UChar32 prev = 0;
228     uint32_t prevPrimary = 0;
229     UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
230     for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
231         LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
232         while(iter->next()) {
233             UChar32 c = iter->getCodepoint();
234             UnicodeString s(c);
235             ci.setText(s.getBuffer(), s.getBuffer() + s.length());
236             int64_t ce = ci.nextCE(errorCode);
237             int64_t ce2 = ci.nextCE(errorCode);
238             if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
239                 return;
240             }
241             if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
242                 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
243                 continue;
244             }
245             if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
246                 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
247                       (long)c, (long)(ce & 0xffffffff));
248                 continue;
249             }
250             uint32_t primary = (uint32_t)(ce >> 32);
251             if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
252                 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
253                       (long)c, (long)primary, (long)prev, (long)prevPrimary);
254             }
255             prev = c;
256             prevPrimary = primary;
257         }
258     }
259 }
260 
TestNulTerminated()261 void CollationTest::TestNulTerminated() {
262     IcuTestErrorCode errorCode(*this, "TestNulTerminated");
263     const CollationData *data = CollationRoot::getData(errorCode);
264     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
265         return;
266     }
267 
268     static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
269 
270     UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
271     UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
272     for(int32_t i = 0;; ++i) {
273         int64_t ce1 = ci1.nextCE(errorCode);
274         int64_t ce2 = ci2.nextCE(errorCode);
275         if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
276             return;
277         }
278         if(ce1 != ce2) {
279             errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
280             break;
281         }
282         if(ce1 == Collation::NO_CE) { break; }
283     }
284 }
285 
TestIllegalUTF8()286 void CollationTest::TestIllegalUTF8() {
287     IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
288 
289     setRootCollator(errorCode);
290     if(errorCode.isFailure()) {
291         errorCode.reset();
292         return;
293     }
294     coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
295 
296     static const char *strings[] = {
297         // string with U+FFFD == illegal byte sequence
298         u8"a\uFFFDz", "a\x80z",  // trail byte
299         u8"a\uFFFD\uFFFDz", "a\xc1\x81z",  // non-shortest form
300         u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z",  // non-shortest form
301         u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
302         u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
303         u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz",  // non-shortest form
304         u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
305     };
306 
307     for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
308         StringPiece fffd(strings[i]);
309         StringPiece illegal(strings[i + 1]);
310         UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
311         if(order != UCOL_EQUAL) {
312             errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
313                   (int)i, order);
314         }
315     }
316 }
317 
318 namespace {
319 
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)320 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
321     for(UChar32 c = 0x10000; c < 0x110000;) {
322         UChar32 next = c + 0x400;
323         if(src.containsSome(c, next - 1)) {
324             dest.add(U16_LEAD(c));
325         }
326         c = next;
327     }
328 }
329 
330 }  // namespace
331 
TestShortFCDData()332 void CollationTest::TestShortFCDData() {
333     // See CollationFCD class comments.
334     IcuTestErrorCode errorCode(*this, "TestShortFCDData");
335     UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
336     errorCode.assertSuccess();
337     expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
338     addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
339     UnicodeSet lccc;  // actual
340     for(UChar32 c = 0; c <= 0xffff; ++c) {
341         if(CollationFCD::hasLccc(c)) { lccc.add(c); }
342     }
343     UnicodeSet diff(expectedLccc);
344     diff.removeAll(lccc);
345     diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
346     UnicodeString empty("[]");
347     UnicodeString diffString;
348     diff.toPattern(diffString, TRUE);
349     assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
350     diff = lccc;
351     diff.removeAll(expectedLccc);
352     diff.toPattern(diffString, TRUE);
353     assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
354 
355     UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
356     if (errorCode.isSuccess()) {
357         addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
358         addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
359         UnicodeSet tccc;  // actual
360         for(UChar32 c = 0; c <= 0xffff; ++c) {
361             if(CollationFCD::hasTccc(c)) { tccc.add(c); }
362         }
363         diff = expectedTccc;
364         diff.removeAll(tccc);
365         diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
366         assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
367         diff = tccc;
368         diff.removeAll(expectedTccc);
369         diff.toPattern(diffString, TRUE);
370         assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
371     }
372 }
373 
374 class CodePointIterator {
375 public:
CodePointIterator(const UChar32 * cp,int32_t length)376     CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()377     void resetToStart() { pos = 0; }
next()378     UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()379     UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const380     int32_t getLength() const { return length; }
getIndex() const381     int getIndex() const { return (int)pos; }
382 private:
383     const UChar32 *cp;
384     int32_t length;
385     int32_t pos;
386 };
387 
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)388 void CollationTest::checkFCD(const char *name,
389                              CollationIterator &ci, CodePointIterator &cpi) {
390     IcuTestErrorCode errorCode(*this, "checkFCD");
391 
392     // Iterate forward to the limit.
393     for(;;) {
394         UChar32 c1 = ci.nextCodePoint(errorCode);
395         UChar32 c2 = cpi.next();
396         if(c1 != c2) {
397             errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
398                   name, (long)c1, (long)c2, cpi.getIndex());
399             return;
400         }
401         if(c1 < 0) { break; }
402     }
403 
404     // Iterate backward most of the way.
405     for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
406         UChar32 c1 = ci.previousCodePoint(errorCode);
407         UChar32 c2 = cpi.previous();
408         if(c1 != c2) {
409             errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
410                   name, (long)c1, (long)c2, cpi.getIndex());
411             return;
412         }
413     }
414 
415     // Forward again.
416     for(;;) {
417         UChar32 c1 = ci.nextCodePoint(errorCode);
418         UChar32 c2 = cpi.next();
419         if(c1 != c2) {
420             errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
421                   name, (long)c1, (long)c2, cpi.getIndex());
422             return;
423         }
424         if(c1 < 0) { break; }
425     }
426 
427     // Iterate backward to the start.
428     for(;;) {
429         UChar32 c1 = ci.previousCodePoint(errorCode);
430         UChar32 c2 = cpi.previous();
431         if(c1 != c2) {
432             errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
433                   name, (long)c1, (long)c2, cpi.getIndex());
434             return;
435         }
436         if(c1 < 0) { break; }
437     }
438 }
439 
TestFCD()440 void CollationTest::TestFCD() {
441     IcuTestErrorCode errorCode(*this, "TestFCD");
442     const CollationData *data = CollationRoot::getData(errorCode);
443     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
444         return;
445     }
446 
447     // Input string, not FCD, NUL-terminated.
448     static const UChar s[] = {
449         0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
450         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
451         0x327, 0x308,  // ccc=202, 230
452         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
453         U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
454         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
455         0xac01,
456         0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
457         U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
458         0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
459         0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
460         0x4e00, 0xf81,
461         0
462     };
463     // Expected code points.
464     static const UChar32 cp[] = {
465         0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
466         0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
467         0x1D15F, 0x1D16D,
468         0xac01,
469         0x63, 0x327, 0x1D165, 0x1D16D,
470         0x61,
471         0xf71, 0xf71, 0xf72, 0xf74, 0x301,
472         0x4e00, 0xf71, 0xf80
473     };
474 
475     FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
476     if(errorCode.errIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
477         return;
478     }
479     CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
480     checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
481 
482     cpi.resetToStart();
483     std::string utf8;
484     UnicodeString(s).toUTF8String(utf8);
485     FCDUTF8CollationIterator u8ci(data, FALSE,
486                                   reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
487     if(errorCode.errIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
488         return;
489     }
490     checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
491 
492     cpi.resetToStart();
493     UCharIterator iter;
494     uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1);  // -1: without the terminating NUL
495     FCDUIterCollationIterator uici(data, FALSE, iter, 0);
496     if(errorCode.errIfFailureAndReset("FCDUIterCollationIterator constructor")) {
497         return;
498     }
499     checkFCD("FCDUIterCollationIterator", uici, cpi);
500 }
501 
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)502 void CollationTest::checkAllocWeights(CollationWeights &cw,
503                                       uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
504                                       int32_t someLength, int32_t minCount) {
505     if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
506         errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
507               (long)lowerLimit, (long)upperLimit, (long)n);
508         return;
509     }
510     uint32_t previous = lowerLimit;
511     int32_t count = 0;  // number of weights that have someLength
512     for(int32_t i = 0; i < n; ++i) {
513         uint32_t w = cw.nextWeight();
514         if(w == 0xffffffff) {
515             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
516                   "returns only %ld weights",
517                   (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
518             return;
519         }
520         if(!(previous < w && w < upperLimit)) {
521             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
522                   "number %ld -> %lx not between %lx and %lx",
523                   (long)lowerLimit, (long)upperLimit, (long)n,
524                   (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
525             return;
526         }
527         if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
528     }
529     if(count < minCount) {
530         errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
531               "returns only %ld < %ld weights of length %d",
532               (long)lowerLimit, (long)upperLimit, (long)n,
533               (long)count, (long)minCount, (int)someLength);
534     }
535 }
536 
TestCollationWeights()537 void CollationTest::TestCollationWeights() {
538     CollationWeights cw;
539 
540     // Non-compressible primaries use 254 second bytes 02..FF.
541     logln("CollationWeights.initForPrimary(non-compressible)");
542     cw.initForPrimary(FALSE);
543     // Expect 1 weight 11 and 254 weights 12xx.
544     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
545     checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
546     // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
547     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
548     // Expect 254 two-byte weights from the ranges 10ff and 11xx.
549     checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
550     // Expect 254^2=64516 three-byte weights.
551     // During computation, there should be 3 three-byte ranges
552     // 10ffff, 11xxxx, 120202.
553     // The middle one should be split 64515:1,
554     // and the newly-split-off range and the last ranged lengthened.
555     checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
556     // Expect weights 1102 & 1103.
557     checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
558     // Expect weights 102102 & 102103.
559     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
560 
561     // Compressible primaries use 251 second bytes 04..FE.
562     logln("CollationWeights.initForPrimary(compressible)");
563     cw.initForPrimary(TRUE);
564     // Expect 1 weight 11 and 251 weights 12xx.
565     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
566     checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
567     // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
568     checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
569     // Expect weights 1104 & 1105.
570     checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
571     // Expect weights 102102 & 102103.
572     checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
573 
574     // Secondary and tertiary weights use only bytes 3 & 4.
575     logln("CollationWeights.initForSecondary()");
576     cw.initForSecondary();
577     // Expect weights fbxx and all four fc..ff.
578     checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
579 
580     logln("CollationWeights.initForTertiary()");
581     cw.initForTertiary();
582     // Expect weights 3dxx and both 3e & 3f.
583     checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
584 }
585 
586 namespace {
587 
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)588 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
589                 uint32_t p, uint32_t s, uint32_t ctq) {
590     uint32_t p1 = p >> 24;
591     uint32_t p2 = (p >> 16) & 0xff;
592     uint32_t p3 = (p >> 8) & 0xff;
593     uint32_t p4 = p & 0xff;
594     uint32_t s1 = s >> 8;
595     uint32_t s2 = s & 0xff;
596     // ctq = Case, Tertiary, Quaternary
597     uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
598     uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
599     uint32_t t1 = t >> 8;
600     uint32_t t2 = t & 0xff;
601     uint32_t q = ctq & Collation::QUATERNARY_MASK;
602     // No leading zero bytes.
603     if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
604         return FALSE;
605     }
606     // No intermediate zero bytes.
607     if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
608         return FALSE;
609     }
610     if(p2 != 0 && p3 == 0 && p4 != 0) {
611         return FALSE;
612     }
613     // Minimum & maximum lead bytes.
614     if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
615             s1 == Collation::LEVEL_SEPARATOR_BYTE ||
616             t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
617         return FALSE;
618     }
619     if(c > 2) {
620         return FALSE;
621     }
622     // The valid byte range for the second primary byte depends on compressibility.
623     if(p2 != 0) {
624         if(data.isCompressibleLeadByte(p1)) {
625             if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
626                     Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
627                 return FALSE;
628             }
629         } else {
630             if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
631                 return FALSE;
632             }
633         }
634     }
635     // Other bytes just need to avoid the level separator.
636     // Trailing zeros are ok.
637     U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
638     if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
639             s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
640         return FALSE;
641     }
642     // Well-formed CEs.
643     if(p == 0) {
644         if(s == 0) {
645             if(t == 0) {
646                 // Completely ignorable CE.
647                 // Quaternary CEs are not supported.
648                 if(c != 0 || q != 0) {
649                     return FALSE;
650                 }
651             } else {
652                 // Tertiary CE.
653                 if(t < re.getTertiaryBoundary() || c != 2) {
654                     return FALSE;
655                 }
656             }
657         } else {
658             // Secondary CE.
659             if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
660                 return FALSE;
661             }
662         }
663     } else {
664         // Primary CE.
665         if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
666                 s >= re.getSecondaryBoundary()) {
667             return FALSE;
668         }
669         if(t == 0 || t >= re.getTertiaryBoundary()) {
670             return FALSE;
671         }
672     }
673     return TRUE;
674 }
675 
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)676 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
677     uint32_t p = (uint32_t)(ce >> 32);
678     uint32_t secTer = (uint32_t)ce;
679     return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
680 }
681 
682 class RootElementsIterator {
683 public:
RootElementsIterator(const CollationData & root)684     RootElementsIterator(const CollationData &root)
685             : data(root),
686               elements(root.rootElements), length(root.rootElementsLength),
687               pri(0), secTer(0),
688               index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
689 
next()690     UBool next() {
691         if(index >= length) { return FALSE; }
692         uint32_t p = elements[index];
693         if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
694         if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
695             ++index;
696             secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
697             return TRUE;
698         }
699         if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
700             // End of a range, enumerate the primaries in the range.
701             int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
702             p &= 0xffffff00;
703             if(pri == p) {
704                 // Finished the range, return the next CE after it.
705                 ++index;
706                 return next();
707             }
708             U_ASSERT(pri < p);
709             // Return the next primary in this range.
710             UBool isCompressible = data.isCompressiblePrimary(pri);
711             if((pri & 0xffff) == 0) {
712                 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
713             } else {
714                 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
715             }
716             return TRUE;
717         }
718         // Simple primary CE.
719         ++index;
720         pri = p;
721         // Does this have an explicit below-common sec/ter unit,
722         // or does it imply a common one?
723         if(index == length) {
724             secTer = Collation::COMMON_SEC_AND_TER_CE;
725         } else {
726             secTer = elements[index];
727             if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
728                 // No sec/ter delta.
729                 secTer = Collation::COMMON_SEC_AND_TER_CE;
730             } else {
731                 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
732                 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
733                     // Implied sec/ter.
734                     secTer = Collation::COMMON_SEC_AND_TER_CE;
735                 } else {
736                     // Explicit sec/ter below common/common.
737                     ++index;
738                 }
739             }
740         }
741         return TRUE;
742     }
743 
getPrimary() const744     uint32_t getPrimary() const { return pri; }
getSecTer() const745     uint32_t getSecTer() const { return secTer; }
746 
747 private:
748     const CollationData &data;
749     const uint32_t *elements;
750     int32_t length;
751 
752     uint32_t pri;
753     uint32_t secTer;
754     int32_t index;
755 };
756 
757 }  // namespace
758 
TestRootElements()759 void CollationTest::TestRootElements() {
760     IcuTestErrorCode errorCode(*this, "TestRootElements");
761     const CollationData *root = CollationRoot::getData(errorCode);
762     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
763         return;
764     }
765     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
766     RootElementsIterator iter(*root);
767 
768     // We check each root CE for validity,
769     // and we also verify that there is a tailoring gap between each two CEs.
770     CollationWeights cw1c;  // compressible primary weights
771     CollationWeights cw1u;  // uncompressible primary weights
772     CollationWeights cw2;
773     CollationWeights cw3;
774 
775     cw1c.initForPrimary(TRUE);
776     cw1u.initForPrimary(FALSE);
777     cw2.initForSecondary();
778     cw3.initForTertiary();
779 
780     // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
781     // nor the special merge-separator CE for U+FFFE.
782     uint32_t prevPri = 0;
783     uint32_t prevSec = 0;
784     uint32_t prevTer = 0;
785     while(iter.next()) {
786         uint32_t pri = iter.getPrimary();
787         uint32_t secTer = iter.getSecTer();
788         // CollationRootElements CEs must have 0 case and quaternary bits.
789         if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
790             errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
791                   (long)pri, (long)secTer);
792         }
793         uint32_t sec = secTer >> 16;
794         uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
795         uint32_t ctq = ter;
796         if(pri == 0 && sec == 0 && ter != 0) {
797             // Tertiary CEs must have uppercase bits,
798             // but they are not stored in the CollationRootElements.
799             ctq |= 0x8000;
800         }
801         if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
802             errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
803         } else {
804             if(pri != prevPri) {
805                 uint32_t newWeight = 0;
806                 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
807                     // There is currently no tailoring gap after primary ignorables,
808                     // and we forbid tailoring after U+FFFD and U+FFFF.
809                 } else if(root->isCompressiblePrimary(prevPri)) {
810                     if(!cw1c.allocWeights(prevPri, pri, 1)) {
811                         errln("no primary/compressible tailoring gap between %08lx and %08lx",
812                               (long)prevPri, (long)pri);
813                     } else {
814                         newWeight = cw1c.nextWeight();
815                     }
816                 } else {
817                     if(!cw1u.allocWeights(prevPri, pri, 1)) {
818                         errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
819                               (long)prevPri, (long)pri);
820                     } else {
821                         newWeight = cw1u.nextWeight();
822                     }
823                 }
824                 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
825                     errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
826                           (long)prevPri, (long)newWeight, (long)pri);
827                 }
828             } else if(sec != prevSec) {
829                 uint32_t lowerLimit =
830                     prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
831                 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
832                     errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
833                 } else {
834                     uint32_t newWeight = cw2.nextWeight();
835                     if(!(prevSec < newWeight && newWeight < sec)) {
836                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
837                               (long)lowerLimit, (long)newWeight, (long)sec);
838                     }
839                 }
840             } else if(ter != prevTer) {
841                 uint32_t lowerLimit =
842                     prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
843                 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
844                     errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
845                 } else {
846                     uint32_t newWeight = cw3.nextWeight();
847                     if(!(prevTer < newWeight && newWeight < ter)) {
848                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
849                               (long)lowerLimit, (long)newWeight, (long)ter);
850                     }
851                 }
852             } else {
853                 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
854             }
855         }
856         prevPri = pri;
857         prevSec = sec;
858         prevTer = ter;
859     }
860 }
861 
TestTailoredElements()862 void CollationTest::TestTailoredElements() {
863     IcuTestErrorCode errorCode(*this, "TestTailoredElements");
864     const CollationData *root = CollationRoot::getData(errorCode);
865     if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
866         return;
867     }
868     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
869 
870     UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
871     if(errorCode.errIfFailureAndReset("failed to create a hash table")) {
872         return;
873     }
874     uhash_setKeyDeleter(prevLocales, uprv_free);
875     // TestRootElements() tests the root collator which does not have tailorings.
876     uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
877     uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
878     uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
879 
880     UVector64 ces(errorCode);
881     LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
882     U_ASSERT(locales.isValid());
883     const char *localeID = "root";
884     do {
885         Locale locale(localeID);
886         LocalPointer<StringEnumeration> types(
887                 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
888         errorCode.assertSuccess();
889         const char *type;  // first: default type
890         while((type = types->next(NULL, errorCode)) != NULL) {
891             if(strncmp(type, "private-", 8) == 0) {
892                 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
893                         localeID, type);
894             }
895             Locale localeWithType(locale);
896             localeWithType.setKeywordValue("collation", type, errorCode);
897             errorCode.assertSuccess();
898             LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
899             if(errorCode.errIfFailureAndReset("Collator::createInstance(%s)",
900                                               localeWithType.getName())) {
901                 continue;
902             }
903             Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
904             if(uhash_geti(prevLocales, actual.getName()) != 0) {
905                 continue;
906             }
907             uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
908             errorCode.assertSuccess();
909             logln("TestTailoredElements(): requested %s -> actual %s",
910                   localeWithType.getName(), actual.getName());
911             RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
912             if(rbc == NULL) {
913                 continue;
914             }
915             // Note: It would be better to get tailored strings such that we can
916             // identify the prefix, and only get the CEs for the prefix+string,
917             // not also for the prefix.
918             // There is currently no API for that.
919             // It would help in an unusual case where a contraction starting in the prefix
920             // extends past its end, and we do not see the intended mapping.
921             // For example, for a mapping p|st, if there is also a contraction ps,
922             // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
923             LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
924             errorCode.assertSuccess();
925             UnicodeSetIterator iter(*tailored);
926             while(iter.next()) {
927                 const UnicodeString &s = iter.getString();
928                 ces.removeAllElements();
929                 rbc->internalGetCEs(s, ces, errorCode);
930                 errorCode.assertSuccess();
931                 for(int32_t i = 0; i < ces.size(); ++i) {
932                     int64_t ce = ces.elementAti(i);
933                     if(!isValidCE(rootElements, *root, ce)) {
934                         errln("invalid tailored CE %016llx at CE index %d from string:",
935                               (long long)ce, (int)i);
936                         infoln(prettify(s));
937                     }
938                 }
939             }
940         }
941     } while((localeID = locales->next(NULL, errorCode)) != NULL);
942     uhash_close(prevLocales);
943 }
944 
printSortKey(const uint8_t * p,int32_t length)945 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
946     UnicodeString s;
947     for(int32_t i = 0; i < length; ++i) {
948         if(i > 0) { s.append((UChar)0x20); }
949         uint8_t b = p[i];
950         if(b == 0) {
951             s.append((UChar)0x2e);  // period
952         } else if(b == 1) {
953             s.append((UChar)0x7c);  // vertical bar
954         } else {
955             appendHex(b, 2, s);
956         }
957     }
958     return s;
959 }
960 
printCollationKey(const CollationKey & key)961 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
962     int32_t length;
963     const uint8_t *p = key.getByteArray(length);
964     return printSortKey(p, length);
965 }
966 
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)967 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
968     for(;;) {
969         int32_t lineLength;
970         const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
971         if(line == NULL || errorCode.isFailure()) {
972             fileLine.remove();
973             return FALSE;
974         }
975         ++fileLineNumber;
976         // Strip trailing CR/LF, comments, and spaces.
977         const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
978         if(comment != NULL) {
979             lineLength = (int32_t)(comment - line);
980         } else {
981             while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
982         }
983         while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
984         if(lineLength != 0) {
985             fileLine.setTo(FALSE, line, lineLength);
986             return TRUE;
987         }
988         // Empty line, continue.
989     }
990 }
991 
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)992 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
993                                 UErrorCode &errorCode) {
994     int32_t length = fileLine.length();
995     int32_t i;
996     for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
997     int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
998     if(pipeIndex >= 0) {
999         prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1000         if(prefix.isEmpty()) {
1001             errln("empty prefix on line %d", (int)fileLineNumber);
1002             infoln(fileLine);
1003             errorCode = U_PARSE_ERROR;
1004             return;
1005         }
1006         start = pipeIndex + 1;
1007     } else {
1008         prefix.remove();
1009     }
1010     s = fileLine.tempSubStringBetween(start, i).unescape();
1011     if(s.isEmpty()) {
1012         errln("empty string on line %d", (int)fileLineNumber);
1013         infoln(fileLine);
1014         errorCode = U_PARSE_ERROR;
1015         return;
1016     }
1017     start = i;
1018 }
1019 
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1020 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1021     Collation::Level relation;
1022     int32_t start;
1023     if(fileLine[0] == 0x3c) {  // <
1024         UChar second = fileLine[1];
1025         start = 2;
1026         switch(second) {
1027         case 0x31:  // <1
1028             relation = Collation::PRIMARY_LEVEL;
1029             break;
1030         case 0x32:  // <2
1031             relation = Collation::SECONDARY_LEVEL;
1032             break;
1033         case 0x33:  // <3
1034             relation = Collation::TERTIARY_LEVEL;
1035             break;
1036         case 0x34:  // <4
1037             relation = Collation::QUATERNARY_LEVEL;
1038             break;
1039         case 0x63:  // <c
1040             relation = Collation::CASE_LEVEL;
1041             break;
1042         case 0x69:  // <i
1043             relation = Collation::IDENTICAL_LEVEL;
1044             break;
1045         default:  // just <
1046             relation = Collation::NO_LEVEL;
1047             start = 1;
1048             break;
1049         }
1050     } else if(fileLine[0] == 0x3d) {  // =
1051         relation = Collation::ZERO_LEVEL;
1052         start = 1;
1053     } else {
1054         start = 0;
1055     }
1056     if(start == 0 || !isSpace(fileLine[start])) {
1057         errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1058         infoln(fileLine);
1059         errorCode.set(U_PARSE_ERROR);
1060         return Collation::NO_LEVEL;
1061     }
1062     start = skipSpaces(start);
1063     UnicodeString prefix;
1064     parseString(start, prefix, s, errorCode);
1065     if(errorCode.isSuccess() && !prefix.isEmpty()) {
1066         errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1067         infoln(fileLine);
1068         errorCode.set(U_PARSE_ERROR);
1069         return Collation::NO_LEVEL;
1070     }
1071     if(start < fileLine.length()) {
1072         errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1073         infoln(fileLine);
1074         errorCode.set(U_PARSE_ERROR);
1075         return Collation::NO_LEVEL;
1076     }
1077     return relation;
1078 }
1079 
// Table of attribute names and the collation attributes they select.
// parseAndSetAttribute() searches it linearly for the text before the '='
// and reports "invalid attribute name" if there is no match.
static const struct {
    const char *name;  // attribute name as written in the test file
    UColAttribute attr;  // attribute passed to Collator::setAttribute()
} attributes[] = {
    { "backwards", UCOL_FRENCH_COLLATION },
    { "alternate", UCOL_ALTERNATE_HANDLING },
    { "caseFirst", UCOL_CASE_FIRST },
    { "caseLevel", UCOL_CASE_LEVEL },
    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
    { "strength", UCOL_STRENGTH },
    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
    { "numeric", UCOL_NUMERIC_COLLATION }
};
1093 
// Table of attribute value names and the collation attribute values they select.
// parseAndSetAttribute() searches it linearly for the text after the '='
// and reports "invalid attribute value name" if there is no match.
// The table does not record which values are legal for which attribute;
// parseAndSetAttribute() reports a failure from setAttribute() as an
// "illegal attribute=value combination".
static const struct {
    const char *name;  // value name as written in the test file
    UColAttributeValue value;  // value passed to Collator::setAttribute()
} attributeValues[] = {
    { "default", UCOL_DEFAULT },
    { "primary", UCOL_PRIMARY },
    { "secondary", UCOL_SECONDARY },
    { "tertiary", UCOL_TERTIARY },
    { "quaternary", UCOL_QUATERNARY },
    { "identical", UCOL_IDENTICAL },
    { "off", UCOL_OFF },
    { "on", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};
1111 
parseAndSetAttribute(IcuTestErrorCode & errorCode)1112 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1113     // Parse attributes even if the Collator could not be created,
1114     // in order to report syntax errors.
1115     int32_t start = skipSpaces(1);
1116     int32_t equalPos = fileLine.indexOf((UChar)0x3d);
1117     if(equalPos < 0) {
1118         if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1119             parseAndSetReorderCodes(start + 7, errorCode);
1120             return;
1121         }
1122         errln("missing '=' on line %d", (int)fileLineNumber);
1123         infoln(fileLine);
1124         errorCode.set(U_PARSE_ERROR);
1125         return;
1126     }
1127 
1128     UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1129     UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1130     if(attrString == UNICODE_STRING("maxVariable", 11)) {
1131         UColReorderCode max;
1132         if(valueString == UNICODE_STRING("space", 5)) {
1133             max = UCOL_REORDER_CODE_SPACE;
1134         } else if(valueString == UNICODE_STRING("punct", 5)) {
1135             max = UCOL_REORDER_CODE_PUNCTUATION;
1136         } else if(valueString == UNICODE_STRING("symbol", 6)) {
1137             max = UCOL_REORDER_CODE_SYMBOL;
1138         } else if(valueString == UNICODE_STRING("currency", 8)) {
1139             max = UCOL_REORDER_CODE_CURRENCY;
1140         } else {
1141             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1142             infoln(fileLine);
1143             errorCode.set(U_PARSE_ERROR);
1144             return;
1145         }
1146         if(coll != NULL) {
1147             coll->setMaxVariable(max, errorCode);
1148             if(errorCode.isFailure()) {
1149                 errln("setMaxVariable() failed on line %d: %s",
1150                       (int)fileLineNumber, errorCode.errorName());
1151                 infoln(fileLine);
1152                 return;
1153             }
1154         }
1155         fileLine.remove();
1156         return;
1157     }
1158 
1159     UColAttribute attr;
1160     for(int32_t i = 0;; ++i) {
1161         if(i == UPRV_LENGTHOF(attributes)) {
1162             errln("invalid attribute name on line %d", (int)fileLineNumber);
1163             infoln(fileLine);
1164             errorCode.set(U_PARSE_ERROR);
1165             return;
1166         }
1167         if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1168             attr = attributes[i].attr;
1169             break;
1170         }
1171     }
1172 
1173     UColAttributeValue value;
1174     for(int32_t i = 0;; ++i) {
1175         if(i == UPRV_LENGTHOF(attributeValues)) {
1176             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1177             infoln(fileLine);
1178             errorCode.set(U_PARSE_ERROR);
1179             return;
1180         }
1181         if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1182             value = attributeValues[i].value;
1183             break;
1184         }
1185     }
1186 
1187     if(coll != NULL) {
1188         coll->setAttribute(attr, value, errorCode);
1189         if(errorCode.isFailure()) {
1190             errln("illegal attribute=value combination on line %d: %s",
1191                   (int)fileLineNumber, errorCode.errorName());
1192             infoln(fileLine);
1193             return;
1194         }
1195     }
1196     fileLine.remove();
1197 }
1198 
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1199 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1200     UVector32 reorderCodes(errorCode);
1201     while(start < fileLine.length()) {
1202         start = skipSpaces(start);
1203         int32_t limit = start;
1204         while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1205         CharString name;
1206         name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1207         int32_t code = CollationRuleParser::getReorderCode(name.data());
1208         if(code < 0) {
1209             if(uprv_stricmp(name.data(), "default") == 0) {
1210                 code = UCOL_REORDER_CODE_DEFAULT;  // -1
1211             } else {
1212                 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1213                 infoln(fileLine);
1214                 errorCode.set(U_PARSE_ERROR);
1215                 return;
1216             }
1217         }
1218         reorderCodes.addElement(code, errorCode);
1219         start = limit;
1220     }
1221     if(coll != NULL) {
1222         coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1223         if(errorCode.isFailure()) {
1224             errln("setReorderCodes() failed on line %d: %s",
1225                   (int)fileLineNumber, errorCode.errorName());
1226             infoln(fileLine);
1227             return;
1228         }
1229     }
1230     fileLine.remove();
1231 }
1232 
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1233 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1234     UnicodeString rules;
1235     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1236         rules.append(fileLine.unescape());
1237     }
1238     if(errorCode.isFailure()) { return; }
1239     logln(rules);
1240 
1241     UParseError parseError;
1242     UnicodeString reason;
1243     delete coll;
1244     coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1245     if(coll == NULL) {
1246         errln("unable to allocate a new collator");
1247         errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1248         return;
1249     }
1250     if(errorCode.isFailure()) {
1251         dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1252         infoln(UnicodeString("  reason: ") + reason);
1253         if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1254         if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1255             infoln(UnicodeString("  snippet: ...") +
1256                 parseError.preContext + "(!)" + parseError.postContext + "...");
1257         }
1258         delete coll;
1259         coll = NULL;
1260         errorCode.reset();
1261     } else {
1262         assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1263                      UnicodeString(), reason);
1264     }
1265 }
1266 
setRootCollator(IcuTestErrorCode & errorCode)1267 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1268     if(errorCode.isFailure()) { return; }
1269     delete coll;
1270     coll = Collator::createInstance(Locale::getRoot(), errorCode);
1271     if(errorCode.isFailure()) {
1272         dataerrln("unable to create a root collator");
1273         return;
1274     }
1275 }
1276 
setLocaleCollator(IcuTestErrorCode & errorCode)1277 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1278     if(errorCode.isFailure()) { return; }
1279     delete coll;
1280     coll = NULL;
1281     int32_t at = fileLine.indexOf((UChar)0x40, 9);  // @ is not invariant
1282     if(at >= 0) {
1283         fileLine.setCharAt(at, (UChar)0x2a);  // *
1284     }
1285     CharString localeID;
1286     localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1287     if(at >= 0) {
1288         localeID.data()[at - 9] = '@';
1289     }
1290     Locale locale(localeID.data());
1291     if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1292         errln("invalid language tag on line %d", (int)fileLineNumber);
1293         infoln(fileLine);
1294         if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1295         return;
1296     }
1297 
1298     logln("creating a collator for locale ID %s", locale.getName());
1299     coll = Collator::createInstance(locale, errorCode);
1300     if(errorCode.isFailure()) {
1301         dataerrln("unable to create a collator for locale %s on line %d",
1302                   locale.getName(), (int)fileLineNumber);
1303         infoln(fileLine);
1304         delete coll;
1305         coll = NULL;
1306         errorCode.reset();
1307     }
1308 }
1309 
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1310 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1311     if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1312     // In some sequences with Tibetan composite vowel signs,
1313     // even if the string passes the FCD check,
1314     // those composites must be decomposed.
1315     // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1316     int32_t index = 0;
1317     while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1318         if(++index < s.length()) {
1319             UChar c = s[index];
1320             if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1321         }
1322     }
1323     return FALSE;
1324 }
1325 
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1326 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1327                                      CharString &dest, int32_t partSize,
1328                                      IcuTestErrorCode &errorCode) {
1329     if(errorCode.isFailure()) { return FALSE; }
1330     uint8_t part[32];
1331     U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1332     UCharIterator iter;
1333     uiter_setString(&iter, s, length);
1334     uint32_t state[2] = { 0, 0 };
1335     for(;;) {
1336         int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1337         UBool done = partLength < partSize;
1338         if(done) {
1339             // At the end, append the next byte as well which should be 00.
1340             ++partLength;
1341         }
1342         dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1343         if(done) {
1344             return errorCode.isSuccess();
1345         }
1346     }
1347 }
1348 
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1349 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1350                                      const UChar *s, int32_t length,
1351                                      CollationKey &key, IcuTestErrorCode &errorCode) {
1352     if(errorCode.isFailure()) { return FALSE; }
1353     coll->getCollationKey(s, length, key, errorCode);
1354     if(errorCode.isFailure()) {
1355         infoln(fileTestName);
1356         errln("Collator(%s).getCollationKey() failed: %s",
1357               norm, errorCode.errorName());
1358         infoln(line);
1359         return FALSE;
1360     }
1361     int32_t keyLength;
1362     const uint8_t *keyBytes = key.getByteArray(keyLength);
1363     if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1364         infoln(fileTestName);
1365         errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1366               norm);
1367         infoln(line);
1368         infoln(printCollationKey(key));
1369         return FALSE;
1370     }
1371 
1372     int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1373     if(numLevels < UCOL_IDENTICAL) {
1374         ++numLevels;
1375     } else {
1376         numLevels = 5;
1377     }
1378     if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1379         ++numLevels;
1380     }
1381     errorCode.assertSuccess();
1382     int32_t numLevelSeparators = 0;
1383     for(int32_t i = 0; i < (keyLength - 1); ++i) {
1384         uint8_t b = keyBytes[i];
1385         if(b == 0) {
1386             infoln(fileTestName);
1387             errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1388             infoln(line);
1389             infoln(printCollationKey(key));
1390             return FALSE;
1391         }
1392         if(b == 1) { ++numLevelSeparators; }
1393     }
1394     if(numLevelSeparators != (numLevels - 1)) {
1395         infoln(fileTestName);
1396         errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1397               norm, (int)numLevelSeparators, (int)numLevels);
1398         infoln(line);
1399         infoln(printCollationKey(key));
1400         return FALSE;
1401     }
1402 
1403     // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1404     static const int32_t partSizes[] = { 32, 3, 1 };
1405     for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1406         int32_t partSize = partSizes[psi];
1407         CharString parts;
1408         if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1409             infoln(fileTestName);
1410             errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1411                   norm, (int)partSize, errorCode.errorName());
1412             infoln(line);
1413             return FALSE;
1414         }
1415         if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1416             infoln(fileTestName);
1417             errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1418                   norm, (int)partSize);
1419             infoln(line);
1420             infoln(printCollationKey(key));
1421             infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1422             return FALSE;
1423         }
1424     }
1425     return TRUE;
1426 }
1427 
/**
 * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
 * Leaves key unchanged if s does not contain U+FFFE.
 * Each segment's sort key comes from coll->getCollationKey(); the second and later
 * segment keys are merged into the accumulated key with ucol_mergeSortkeys().
 *
 * @param s the input text
 * @param length the length of s, or negative if s is NUL-terminated (u_strlen() is used)
 * @param key receives the merged key
 * @param errorCode ICU error code; passed to getCollationKey() for each segment
 * @return TRUE if the key was successfully changed;
 *         FALSE if errorCode had already failed or s contains no U+FFFE
 */
UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
                                           CollationKey &key, IcuTestErrorCode &errorCode) {
    if(errorCode.isFailure()) { return FALSE; }
    // Accumulated merged key bytes; reallocated below when more capacity is needed.
    LocalMemory<uint8_t> mergedKey;
    int32_t mergedKeyLength = 0;
    int32_t mergedKeyCapacity = 0;
    int32_t sLength = (length >= 0) ? length : u_strlen(s);
    // Start of the current segment; still 0 at the end means no U+FFFE was seen.
    int32_t segmentStart = 0;
    for(int32_t i = 0;;) {
        if(i == sLength) {
            if(segmentStart == 0) {
                // s does not contain any U+FFFE.
                return FALSE;
            }
            // Otherwise fall through to process the final segment.
        } else if(s[i] != 0xfffe) {
            // Not a separator: keep scanning.
            ++i;
            continue;
        }
        // Get the sort key for another segment and merge it into mergedKey.
        CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
        CollationKey key2;
        coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
        int32_t key1Length, key2Length;
        const uint8_t *key1Bytes = key1.getByteArray(key1Length);
        const uint8_t *key2Bytes = key2.getByteArray(key2Length);
        uint8_t *dest;
        int32_t minCapacity = key1Length + key2Length;
        // NOTE(review): one byte less is needed when merging two keys, presumably
        // because the merged key has a single terminator -- confirm against
        // ucol_mergeSortkeys().
        if(key1Length > 0) { --minCapacity; }
        if(minCapacity <= mergedKeyCapacity) {
            dest = mergedKey.getAlias();
        } else {
            // Grow: at least 200 bytes, else double the capacity if that suffices,
            // else exactly what is needed.
            if(minCapacity <= 200) {
                mergedKeyCapacity = 200;
            } else if(minCapacity <= 2 * mergedKeyCapacity) {
                mergedKeyCapacity *= 2;
            } else {
                mergedKeyCapacity = minCapacity;
            }
            // key1 holds its own copy of the previous bytes (see above),
            // so the old buffer is not needed after this call.
            dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
        }
        U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
        if(key1Length == 0) {
            // key2 is the sort key for the first segment.
            uprv_memcpy(dest, key2Bytes, key2Length);
            mergedKeyLength = key2Length;
        } else {
            mergedKeyLength =
                ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
                                   dest, mergedKeyCapacity);
        }
        if(i == sLength) { break; }
        // Skip the U+FFFE; the next segment starts right after it.
        segmentStart = ++i;
    }
    key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
    return TRUE;
}
1489 
1490 namespace {
1491 
1492 /**
1493  * Replaces unpaired surrogates with U+FFFD.
1494  * Returns s if no replacement was made, otherwise buffer.
1495  */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1496 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1497     int32_t i = 0;
1498     while(i < s.length()) {
1499         UChar32 c = s.char32At(i);
1500         if(U_IS_SURROGATE(c)) {
1501             if(buffer.length() < i) {
1502                 buffer.append(s, buffer.length(), i - buffer.length());
1503             }
1504             buffer.append((UChar)0xfffd);
1505         }
1506         i += U16_LENGTH(c);
1507     }
1508     if(buffer.isEmpty()) {
1509         return s;
1510     }
1511     if(buffer.length() < i) {
1512         buffer.append(s, buffer.length(), i - buffer.length());
1513     }
1514     return buffer;
1515 }
1516 
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1517 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1518                            UCollationResult order, UBool collHasCaseLevel) {
1519     if(order == UCOL_EQUAL) {
1520         return Collation::NO_LEVEL;
1521     }
1522     int32_t prevKeyLength;
1523     const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1524     int32_t keyLength;
1525     const uint8_t *bytes = key.getByteArray(keyLength);
1526     int32_t level = Collation::PRIMARY_LEVEL;
1527     for(int32_t i = 0;; ++i) {
1528         uint8_t b = prevBytes[i];
1529         if(b != bytes[i]) { break; }
1530         if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1531             ++level;
1532             if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1533                 ++level;
1534             }
1535         }
1536     }
1537     return level;
1538 }
1539 
1540 }
1541 
/**
 * Checks that coll orders prevString and s as expected, through each of the
 * comparison APIs in turn: compare() on UnicodeStrings, compare() on
 * NUL-terminated UChar strings, compareUTF8(), internalCompareUTF8() on
 * NUL-terminated UTF-8, compare() on UCharIterators, CollationKey::compareTo(),
 * and finally the merged sort keys of U+FFFE-separated segments.
 * Every comparison except the UCharIterator one is tried in both directions.
 *
 * The first failing check is logged together with both file lines and both
 * sort keys, and ends the function.
 *
 * @param norm          label for the collator configuration, used in error messages
 * @param prevFileLine  the test file line for prevString, used in error output;
 *                      the line for s is taken from the fileLine member
 * @param prevString    the previous string
 * @param s             the current string
 * @param expectedOrder the expected result of comparing prevString with s
 * @param expectedLevel the expected level of the first sort key difference;
 *                      not checked if Collation::NO_LEVEL
 * @param errorCode     in/out error code
 * @return TRUE if all checks passed, FALSE after the first failure
 */
UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
                                     const UnicodeString &prevString, const UnicodeString &s,
                                     UCollationResult expectedOrder, Collation::Level expectedLevel,
                                     IcuTestErrorCode &errorCode) {
    if(errorCode.isFailure()) { return FALSE; }

    // Get the sort keys first, for error debug output.
    CollationKey prevKey;
    if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
                        prevKey, errorCode)) {
        return FALSE;
    }
    CollationKey key;
    if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }

    // compare(UnicodeString), in both directions.
    UCollationResult order = coll->compare(prevString, s, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    order = coll->compare(s, prevString, errorCode);
    if(order != -expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // Test NUL-termination if the strings do not contain NUL characters.
    UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
    if(!containNUL) {
        // A length of -1 makes compare() look for the terminating NUL itself.
        order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
        if(order != expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
        if(order != -expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

    // compare(UTF-16) treats unpaired surrogates like unassigned code points.
    // Unpaired surrogates cannot be converted to UTF-8.
    // Create valid UTF-16 strings if necessary, and use those for
    // both the expected compare() result and for the input to compare(UTF-8).
    UnicodeString prevBuffer, sBuffer;
    const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
    const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
    std::string prevUTF8, sUTF8;
    UnicodeString(prevValid).toUTF8String(prevUTF8);
    UnicodeString(sValid).toUTF8String(sUTF8);
    UCollationResult expectedUTF8Order;
    // surrogatesToFFFD() returns its input object if it made no replacement,
    // so identical addresses mean that both strings were used unchanged.
    if(&prevValid == &prevString && &sValid == &s) {
        expectedUTF8Order = expectedOrder;
    } else {
        expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
    }

    order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
    if(order != expectedUTF8Order || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
    if(order != -expectedUTF8Order || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // Test NUL-termination if the strings do not contain NUL characters.
    if(!containNUL) {
        order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
        if(order != expectedUTF8Order || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
        if(order != -expectedUTF8Order || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

    // compare(UCharIterator), on the original UTF-16 strings; one direction only.
    UCharIterator leftIter;
    UCharIterator rightIter;
    uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
    uiter_setString(&rightIter, s.getBuffer(), s.length());
    order = coll->compare(leftIter, rightIter, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
              "wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }

    // The sort keys must compare the same way as the strings.
    order = prevKey.compareTo(key, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // The first sort key difference must be on the expected level, if one was specified.
    UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
    int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
    if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
        if(level != expectedLevel) {
            infoln(fileTestName);
            errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
                  (int)fileLineNumber, norm, order, level, expectedLevel);
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

    // If either string contains U+FFFE, then their sort keys must compare the same as
    // the merged sort keys of each string's between-FFFE segments.
    //
    // It is not required that
    //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
    // only that those two methods yield the same order.
    //
    // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
    // From here on, prevKey and key hold the merged keys where they were replaced.
    if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
                getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
            errorCode.isFailure()) {
        order = prevKey.compareTo(key, errorCode);
        if(order != expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
                "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
                (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        // The merged keys are checked against the level found for the regular keys,
        // not against expectedLevel directly.
        int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
        if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
            if(mergedLevel != level) {
                infoln(fileTestName);
                errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
                    "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
                    (int)fileLineNumber, norm, order, mergedLevel, level);
                infoln(prevFileLine);
                infoln(fileLine);
                infoln(printCollationKey(prevKey));
                infoln(printCollationKey(key));
                return FALSE;
            }
        }
    }
    return TRUE;
}
1754 
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1755 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1756     if(errorCode.isFailure()) { return; }
1757     UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1758     UnicodeString prevString, s;
1759     prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1760     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1761         // Parse the line even if it will be ignored (when we do not have a Collator)
1762         // in order to report syntax issues.
1763         Collation::Level relation = parseRelationAndString(s, errorCode);
1764         if(errorCode.isFailure()) {
1765             errorCode.reset();
1766             break;
1767         }
1768         if(coll == NULL) {
1769             // We were unable to create the Collator but continue with tests.
1770             // Ignore test data for this Collator.
1771             // The next Collator creation might work.
1772             continue;
1773         }
1774         UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1775         Collation::Level expectedLevel = relation;
1776         s.getTerminatedBuffer();  // Ensure NUL-termination.
1777         UBool isOk = TRUE;
1778         if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1779             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1780             isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1781                                    expectedOrder, expectedLevel, errorCode);
1782         }
1783         if(isOk) {
1784             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1785             isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1786                                    expectedOrder, expectedLevel, errorCode);
1787         }
1788         if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1789             UnicodeString pn = nfd->normalize(prevString, errorCode);
1790             UnicodeString n = nfd->normalize(s, errorCode);
1791             pn.getTerminatedBuffer();
1792             n.getTerminatedBuffer();
1793             errorCode.assertSuccess();
1794             isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1795                                    expectedOrder, expectedLevel, errorCode);
1796         }
1797         if(!isOk) {
1798             errorCode.reset();  // already reported
1799         }
1800         prevFileLine = fileLine;
1801         prevString = s;
1802         prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1803     }
1804 }
1805 
TestDataDriven()1806 void CollationTest::TestDataDriven() {
1807     IcuTestErrorCode errorCode(*this, "TestDataDriven");
1808 
1809     fcd = Normalizer2Factory::getFCDInstance(errorCode);
1810     nfd = Normalizer2::getNFDInstance(errorCode);
1811     if(errorCode.errDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1812         return;
1813     }
1814 
1815     CharString path(getSourceTestData(errorCode), errorCode);
1816     path.appendPathPart("collationtest.txt", errorCode);
1817     const char *codePage = "UTF-8";
1818     LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1819     if(errorCode.errIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1820         return;
1821     }
1822     // Read a new line if necessary.
1823     // Sub-parsers leave the first line set that they do not handle.
1824     while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1825         if(!isSectionStarter(fileLine[0])) {
1826             errln("syntax error on line %d", (int)fileLineNumber);
1827             infoln(fileLine);
1828             return;
1829         }
1830         if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1831             fileTestName = fileLine;
1832             logln(fileLine);
1833             fileLine.remove();
1834         } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1835             setRootCollator(errorCode);
1836             fileLine.remove();
1837         } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1838             setLocaleCollator(errorCode);
1839             fileLine.remove();
1840         } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1841             buildTailoring(f.getAlias(), errorCode);
1842         } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1843             parseAndSetAttribute(errorCode);
1844         } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1845             checkCompareStrings(f.getAlias(), errorCode);
1846         } else {
1847             errln("syntax error on line %d", (int)fileLineNumber);
1848             infoln(fileLine);
1849             return;
1850         }
1851     }
1852 }
1853 
1854 #endif  // !UCONFIG_NO_COLLATION
1855