1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2015, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationtest.cpp
7 *
8 * created on: 2012apr27
9 * created by: Markus W. Scherer
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/coll.h"
17 #include "unicode/errorcode.h"
18 #include "unicode/localpointer.h"
19 #include "unicode/normalizer2.h"
20 #include "unicode/sortkey.h"
21 #include "unicode/std_string.h"
22 #include "unicode/strenum.h"
23 #include "unicode/tblcoll.h"
24 #include "unicode/uiter.h"
25 #include "unicode/uniset.h"
26 #include "unicode/unistr.h"
27 #include "unicode/usetiter.h"
28 #include "unicode/ustring.h"
29 #include "charstr.h"
30 #include "cmemory.h"
31 #include "collation.h"
32 #include "collationdata.h"
33 #include "collationfcd.h"
34 #include "collationiterator.h"
35 #include "collationroot.h"
36 #include "collationrootelements.h"
37 #include "collationruleparser.h"
38 #include "collationweights.h"
39 #include "cstring.h"
40 #include "intltest.h"
41 #include "normalizer2impl.h"
42 #include "ucbuf.h"
43 #include "uhash.h"
44 #include "uitercollationiterator.h"
45 #include "utf16collationiterator.h"
46 #include "utf8collationiterator.h"
47 #include "uvectr32.h"
48 #include "uvectr64.h"
49 #include "writesrc.h"
50 
51 class CodePointIterator;
52 
53 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
54 
55 class CollationTest : public IntlTest {
56 public:
CollationTest()57     CollationTest()
58             : fcd(NULL), nfd(NULL),
59               fileLineNumber(0),
60               coll(NULL) {}
61 
~CollationTest()62     ~CollationTest() {
63         delete coll;
64     }
65 
66     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
67 
68     void TestMinMax();
69     void TestImplicits();
70     void TestNulTerminated();
71     void TestIllegalUTF8();
72     void TestShortFCDData();
73     void TestFCD();
74     void TestCollationWeights();
75     void TestRootElements();
76     void TestTailoredElements();
77     void TestDataDriven();
78 
79 private:
80     void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
81     void checkAllocWeights(CollationWeights &cw,
82                            uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
83                            int32_t someLength, int32_t minCount);
84 
85     static UnicodeString printSortKey(const uint8_t *p, int32_t length);
86     static UnicodeString printCollationKey(const CollationKey &key);
87 
88     // Helpers & fields for data-driven test.
isCROrLF(UChar c)89     static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)90     static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)91     static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
skipSpaces(int32_t i)92     int32_t skipSpaces(int32_t i) {
93         while(isSpace(fileLine[i])) { ++i; }
94         return i;
95     }
96 
97     UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
98     void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
99     Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
100     void parseAndSetAttribute(IcuTestErrorCode &errorCode);
101     void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
102     void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
103     void setRootCollator(IcuTestErrorCode &errorCode);
104     void setLocaleCollator(IcuTestErrorCode &errorCode);
105 
106     UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
107 
108     UBool getSortKeyParts(const UChar *s, int32_t length,
109                           CharString &dest, int32_t partSize,
110                           IcuTestErrorCode &errorCode);
111     UBool getCollationKey(const char *norm, const UnicodeString &line,
112                           const UChar *s, int32_t length,
113                           CollationKey &key, IcuTestErrorCode &errorCode);
114     UBool getMergedCollationKey(const UChar *s, int32_t length,
115                                 CollationKey &key, IcuTestErrorCode &errorCode);
116     UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
117                           const UnicodeString &prevString, const UnicodeString &s,
118                           UCollationResult expectedOrder, Collation::Level expectedLevel,
119                           IcuTestErrorCode &errorCode);
120     void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
121 
122     const Normalizer2 *fcd, *nfd;
123     UnicodeString fileLine;
124     int32_t fileLineNumber;
125     UnicodeString fileTestName;
126     Collator *coll;
127 };
128 
createCollationTest()129 extern IntlTest *createCollationTest() {
130     return new CollationTest();
131 }
132 
/**
 * Test dispatch entry point for this suite.
 * Logs the suite banner when exec is true, then hands over to the TESTCASE_AUTO
 * macros, which are defined outside this file (presumably in intltest.h) and
 * presumably map index to one of the listed test methods and report its name
 * through the name out-parameter.
 * NOTE(review): the order of the TESTCASE_AUTO lines presumably determines each
 * test's index -- confirm before reordering.
 * The fourth parameter (par) is unused.
 */
void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
    if(exec) {
        logln("TestSuite CollationTest: ");
    }
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestMinMax);
    TESTCASE_AUTO(TestImplicits);
    TESTCASE_AUTO(TestNulTerminated);
    TESTCASE_AUTO(TestIllegalUTF8);
    TESTCASE_AUTO(TestShortFCDData);
    TESTCASE_AUTO(TestFCD);
    TESTCASE_AUTO(TestCollationWeights);
    TESTCASE_AUTO(TestRootElements);
    TESTCASE_AUTO(TestTailoredElements);
    TESTCASE_AUTO(TestDataDriven);
    TESTCASE_AUTO_END;
}
150 
TestMinMax()151 void CollationTest::TestMinMax() {
152     IcuTestErrorCode errorCode(*this, "TestMinMax");
153 
154     setRootCollator(errorCode);
155     if(errorCode.isFailure()) {
156         errorCode.reset();
157         return;
158     }
159     RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
160     if(rbc == NULL) {
161         errln("the root collator is not a RuleBasedCollator");
162         return;
163     }
164 
165     static const UChar s[2] = { 0xfffe, 0xffff };
166     UVector64 ces(errorCode);
167     rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
168     errorCode.assertSuccess();
169     if(ces.size() != 2) {
170         errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
171         return;
172     }
173     int64_t ce = ces.elementAti(0);
174     int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
175     if(ce != expected) {
176         errln("CE(U+fffe)=%04lx != 02..", (long)ce);
177     }
178 
179     ce = ces.elementAti(1);
180     expected = Collation::makeCE(Collation::MAX_PRIMARY);
181     if(ce != expected) {
182         errln("CE(U+ffff)=%04lx != max..", (long)ce);
183     }
184 }
185 
TestImplicits()186 void CollationTest::TestImplicits() {
187     IcuTestErrorCode errorCode(*this, "TestImplicits");
188 
189     const CollationData *cd = CollationRoot::getData(errorCode);
190     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
191         return;
192     }
193 
194     // Implicit primary weights should be assigned for the following sets,
195     // and sort in ascending order by set and then code point.
196     // See http://www.unicode.org/reports/tr10/#Implicit_Weights
197 
198     // core Han Unified Ideographs
199     UnicodeSet coreHan("[\\p{unified_ideograph}&"
200                             "[\\p{Block=CJK_Unified_Ideographs}"
201                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
202                        errorCode);
203     // all other Unified Han ideographs
204     UnicodeSet otherHan("[\\p{unified ideograph}-"
205                             "[\\p{Block=CJK_Unified_Ideographs}"
206                             "\\p{Block=CJK_Compatibility_Ideographs}]]",
207                         errorCode);
208     UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
209     unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
210 
211     // Starting with CLDR 26/ICU 54, the root Han order may instead be
212     // the Unihan radical-stroke order.
213     // The tests should pass either way, so we only test the order of a small set of Han characters
214     // whose radical-stroke order is the same as their code point order.
215     UnicodeSet someHanInCPOrder(
216             "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
217             "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
218             errorCode);
219     UnicodeSet inOrder(someHanInCPOrder);
220     inOrder.addAll(unassigned).freeze();
221     if(errorCode.logIfFailureAndReset("UnicodeSet")) {
222         return;
223     }
224     const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
225     UChar32 prev = 0;
226     uint32_t prevPrimary = 0;
227     UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
228     for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
229         LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
230         while(iter->next()) {
231             UChar32 c = iter->getCodepoint();
232             UnicodeString s(c);
233             ci.setText(s.getBuffer(), s.getBuffer() + s.length());
234             int64_t ce = ci.nextCE(errorCode);
235             int64_t ce2 = ci.nextCE(errorCode);
236             if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
237                 return;
238             }
239             if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
240                 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
241                 continue;
242             }
243             if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
244                 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
245                       (long)c, (long)(ce & 0xffffffff));
246                 continue;
247             }
248             uint32_t primary = (uint32_t)(ce >> 32);
249             if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
250                 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
251                       (long)c, (long)primary, (long)prev, (long)prevPrimary);
252             }
253             prev = c;
254             prevPrimary = primary;
255         }
256     }
257 }
258 
TestNulTerminated()259 void CollationTest::TestNulTerminated() {
260     IcuTestErrorCode errorCode(*this, "TestNulTerminated");
261     const CollationData *data = CollationRoot::getData(errorCode);
262     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
263         return;
264     }
265 
266     static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
267 
268     UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
269     UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
270     for(int32_t i = 0;; ++i) {
271         int64_t ce1 = ci1.nextCE(errorCode);
272         int64_t ce2 = ci2.nextCE(errorCode);
273         if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
274             return;
275         }
276         if(ce1 != ce2) {
277             errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
278             break;
279         }
280         if(ce1 == Collation::NO_CE) { break; }
281     }
282 }
283 
TestIllegalUTF8()284 void CollationTest::TestIllegalUTF8() {
285     IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
286 
287     setRootCollator(errorCode);
288     if(errorCode.isFailure()) {
289         errorCode.reset();
290         return;
291     }
292     coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
293 
294     static const char *strings[] = {
295         // U+FFFD
296         "a\xef\xbf\xbdz",
297         // illegal byte sequences
298         "a\x80z",  // trail byte
299         "a\xc1\x81z",  // non-shortest form
300         "a\xe0\x82\x83z",  // non-shortest form
301         "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
302         "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
303         "a\xf0\x8f\xbf\xbfz",  // non-shortest form
304         "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
305     };
306 
307     StringPiece fffd(strings[0]);
308     for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
309         StringPiece illegal(strings[i]);
310         UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
311         if(order != UCOL_EQUAL) {
312             errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
313                   (int)i, order);
314         }
315     }
316 }
317 
318 namespace {
319 
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)320 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
321     for(UChar32 c = 0x10000; c < 0x110000;) {
322         UChar32 next = c + 0x400;
323         if(src.containsSome(c, next - 1)) {
324             dest.add(U16_LEAD(c));
325         }
326         c = next;
327     }
328 }
329 
330 }  // namespace
331 
TestShortFCDData()332 void CollationTest::TestShortFCDData() {
333     // See CollationFCD class comments.
334     IcuTestErrorCode errorCode(*this, "TestShortFCDData");
335     UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
336     errorCode.assertSuccess();
337     expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
338     addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
339     UnicodeSet lccc;  // actual
340     for(UChar32 c = 0; c <= 0xffff; ++c) {
341         if(CollationFCD::hasLccc(c)) { lccc.add(c); }
342     }
343     UnicodeSet diff(expectedLccc);
344     diff.removeAll(lccc);
345     diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
346     UnicodeString empty("[]");
347     UnicodeString diffString;
348     diff.toPattern(diffString, TRUE);
349     assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
350     diff = lccc;
351     diff.removeAll(expectedLccc);
352     diff.toPattern(diffString, TRUE);
353     assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
354 
355     UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
356     if (errorCode.isSuccess()) {
357         addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
358         addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
359         UnicodeSet tccc;  // actual
360         for(UChar32 c = 0; c <= 0xffff; ++c) {
361             if(CollationFCD::hasTccc(c)) { tccc.add(c); }
362         }
363         diff = expectedTccc;
364         diff.removeAll(tccc);
365         diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
366         assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
367         diff = tccc;
368         diff.removeAll(expectedTccc);
369         diff.toPattern(diffString, TRUE);
370         assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
371     }
372 }
373 
374 class CodePointIterator {
375 public:
CodePointIterator(const UChar32 * cp,int32_t length)376     CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()377     void resetToStart() { pos = 0; }
next()378     UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()379     UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const380     int32_t getLength() const { return length; }
getIndex() const381     int getIndex() const { return (int)pos; }
382 private:
383     const UChar32 *cp;
384     int32_t length;
385     int32_t pos;
386 };
387 
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)388 void CollationTest::checkFCD(const char *name,
389                              CollationIterator &ci, CodePointIterator &cpi) {
390     IcuTestErrorCode errorCode(*this, "checkFCD");
391 
392     // Iterate forward to the limit.
393     for(;;) {
394         UChar32 c1 = ci.nextCodePoint(errorCode);
395         UChar32 c2 = cpi.next();
396         if(c1 != c2) {
397             errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
398                   name, (long)c1, (long)c2, cpi.getIndex());
399             return;
400         }
401         if(c1 < 0) { break; }
402     }
403 
404     // Iterate backward most of the way.
405     for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
406         UChar32 c1 = ci.previousCodePoint(errorCode);
407         UChar32 c2 = cpi.previous();
408         if(c1 != c2) {
409             errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
410                   name, (long)c1, (long)c2, cpi.getIndex());
411             return;
412         }
413     }
414 
415     // Forward again.
416     for(;;) {
417         UChar32 c1 = ci.nextCodePoint(errorCode);
418         UChar32 c2 = cpi.next();
419         if(c1 != c2) {
420             errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
421                   name, (long)c1, (long)c2, cpi.getIndex());
422             return;
423         }
424         if(c1 < 0) { break; }
425     }
426 
427     // Iterate backward to the start.
428     for(;;) {
429         UChar32 c1 = ci.previousCodePoint(errorCode);
430         UChar32 c2 = cpi.previous();
431         if(c1 != c2) {
432             errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
433                   name, (long)c1, (long)c2, cpi.getIndex());
434             return;
435         }
436         if(c1 < 0) { break; }
437     }
438 }
439 
/**
 * Runs the same non-FCD input text through three FCD-checking collation iterators
 * (UTF-16, UTF-8 when U_HAVE_STD_STRING is set, and UCharIterator-based)
 * and uses checkFCD() to verify that each one returns the code points listed in cp[].
 * The shared reference iterator cpi is reset to the start before each further check.
 */
void CollationTest::TestFCD() {
    IcuTestErrorCode errorCode(*this, "TestFCD");
    const CollationData *data = CollationRoot::getData(errorCode);
    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
        return;
    }

    // Input string, not FCD, NUL-terminated.
    static const UChar s[] = {
        0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
        U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
        0x327, 0x308,  // ccc=202, 230
        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
        U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
        0xac01,
        0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
        0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
        0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
        0x4e00, 0xf81,
        0
    };
    // Expected code points.
    static const UChar32 cp[] = {
        0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
        0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
        0x1D15F, 0x1D16D,
        0xac01,
        0x63, 0x327, 0x1D165, 0x1D16D,
        0x61,
        0xf71, 0xf71, 0xf72, 0xf74, 0x301,
        0x4e00, 0xf71, 0xf80
    };

    // UTF-16 iterator over the NUL-terminated string (limit NULL).
    FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
    // NOTE(review): the constructor above is not passed errorCode, so this check
    // can presumably only catch a failure set earlier -- confirm.
    if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
        return;
    }
    CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
    checkFCD("FCDUTF16CollationIterator", u16ci, cpi);

#if U_HAVE_STD_STRING
    // Same text converted to UTF-8; length -1 is passed for the NUL-terminated buffer.
    cpi.resetToStart();
    std::string utf8;
    UnicodeString(s).toUTF8String(utf8);
    FCDUTF8CollationIterator u8ci(data, FALSE,
                                  reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
    if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
        return;
    }
    checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
#endif

    // Same text again through a UCharIterator.
    cpi.resetToStart();
    UCharIterator iter;
    uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1);  // -1: without the terminating NUL
    FCDUIterCollationIterator uici(data, FALSE, iter, 0);
    if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
        return;
    }
    checkFCD("FCDUIterCollationIterator", uici, cpi);
}
503 
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)504 void CollationTest::checkAllocWeights(CollationWeights &cw,
505                                       uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
506                                       int32_t someLength, int32_t minCount) {
507     if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
508         errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
509               (long)lowerLimit, (long)upperLimit, (long)n);
510         return;
511     }
512     uint32_t previous = lowerLimit;
513     int32_t count = 0;  // number of weights that have someLength
514     for(int32_t i = 0; i < n; ++i) {
515         uint32_t w = cw.nextWeight();
516         if(w == 0xffffffff) {
517             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
518                   "returns only %ld weights",
519                   (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
520             return;
521         }
522         if(!(previous < w && w < upperLimit)) {
523             errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
524                   "number %ld -> %lx not between %lx and %lx",
525                   (long)lowerLimit, (long)upperLimit, (long)n,
526                   (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
527             return;
528         }
529         if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
530     }
531     if(count < minCount) {
532         errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
533               "returns only %ld < %ld weights of length %d",
534               (long)lowerLimit, (long)upperLimit, (long)n,
535               (long)count, (long)minCount, (int)someLength);
536     }
537 }
538 
/**
 * Exercises CollationWeights allocation for primary (non-compressible and compressible),
 * secondary and tertiary weights.
 * Each checkAllocWeights() call passes: lower limit, upper limit, number of weights to allocate,
 * a weight length to count, and the minimum number of weights expected to have that length.
 * The single allocator cw is re-initialized before each group of checks.
 */
void CollationTest::TestCollationWeights() {
    CollationWeights cw;

    // Non-compressible primaries use 254 second bytes 02..FF.
    logln("CollationWeights.initForPrimary(non-compressible)");
    cw.initForPrimary(FALSE);
    // Expect 1 weight 11 and 254 weights 12xx.
    checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
    checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
    // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
    checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
    // Expect 254 two-byte weights from the ranges 10ff and 11xx.
    checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
    // Expect 254^2=64516 three-byte weights.
    // During computation, there should be 3 three-byte ranges
    // 10ffff, 11xxxx, 120202.
    // The middle one should be split 64515:1,
    // and the newly-split-off range and the last range lengthened.
    checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
    // Expect weights 1102 & 1103.
    checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
    // Expect weights 102102 & 102103.
    checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);

    // Compressible primaries use 251 second bytes 04..FE.
    logln("CollationWeights.initForPrimary(compressible)");
    cw.initForPrimary(TRUE);
    // Expect 1 weight 11 and 251 weights 12xx.
    checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
    checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
    // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
    checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
    // Expect weights 1104 & 1105.
    checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
    // Expect weights 102102 & 102103.
    checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);

    // Secondary and tertiary weights use only bytes 3 & 4.
    logln("CollationWeights.initForSecondary()");
    cw.initForSecondary();
    // Expect weights fbxx and all four fc..ff.
    checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);

    logln("CollationWeights.initForTertiary()");
    cw.initForTertiary();
    // Expect weights 3dxx and both 3e & 3f.
    checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
}
587 
588 namespace {
589 
/**
 * Checks whether a collation element, given as separate weights, is well-formed.
 *
 * @param re   root elements; supplies the secondary and tertiary boundaries
 *             and the last common secondary weight
 * @param data collation data; used to ask whether a primary lead byte is compressible
 * @param p    32-bit primary weight
 * @param s    16-bit secondary weight
 * @param ctq  16 bits holding the case bits, tertiary weight and quaternary bits
 * @return TRUE if none of the validity checks below fails
 */
UBool isValidCE(const CollationRootElements &re, const CollationData &data,
                uint32_t p, uint32_t s, uint32_t ctq) {
    // Split the weights into their individual bytes, most significant first.
    uint32_t p1 = p >> 24;
    uint32_t p2 = (p >> 16) & 0xff;
    uint32_t p3 = (p >> 8) & 0xff;
    uint32_t p4 = p & 0xff;
    uint32_t s1 = s >> 8;
    uint32_t s2 = s & 0xff;
    // ctq = Case, Tertiary, Quaternary
    uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
    uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
    uint32_t t1 = t >> 8;
    uint32_t t2 = t & 0xff;
    uint32_t q = ctq & Collation::QUATERNARY_MASK;
    // No leading zero bytes.
    if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
        return FALSE;
    }
    // No intermediate zero bytes.
    if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
        return FALSE;
    }
    if(p2 != 0 && p3 == 0 && p4 != 0) {
        return FALSE;
    }
    // Minimum & maximum lead bytes.
    if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
            s1 == Collation::LEVEL_SEPARATOR_BYTE ||
            t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
        return FALSE;
    }
    // Only the case values 0, 1 and 2 are accepted.
    if(c > 2) {
        return FALSE;
    }
    // The valid byte range for the second primary byte depends on compressibility.
    if(p2 != 0) {
        if(data.isCompressibleLeadByte(p1)) {
            if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
                    Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
                return FALSE;
            }
        } else {
            if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
                return FALSE;
            }
        }
    }
    // Other bytes just need to avoid the level separator.
    // Trailing zeros are ok.
    U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
    if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
            s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
        return FALSE;
    }
    // Well-formed CEs.
    if(p == 0) {
        if(s == 0) {
            if(t == 0) {
                // Completely ignorable CE.
                // Quaternary CEs are not supported.
                if(c != 0 || q != 0) {
                    return FALSE;
                }
            } else {
                // Tertiary CE.
                // Must be at or above the tertiary boundary and have case value 2.
                if(t < re.getTertiaryBoundary() || c != 2) {
                    return FALSE;
                }
            }
        } else {
            // Secondary CE.
            // Secondary at or above its boundary; tertiary non-zero and below its boundary.
            if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
                return FALSE;
            }
        }
    } else {
        // Primary CE.
        // Secondary must be non-zero, below the secondary boundary,
        // and outside the range (COMMON_WEIGHT16, last common secondary].
        if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
                s >= re.getSecondaryBoundary()) {
            return FALSE;
        }
        // Tertiary must be non-zero and below the tertiary boundary.
        if(t == 0 || t >= re.getTertiaryBoundary()) {
            return FALSE;
        }
    }
    return TRUE;
}
677 
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)678 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
679     uint32_t p = (uint32_t)(ce >> 32);
680     uint32_t secTer = (uint32_t)ce;
681     return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
682 }
683 
/**
 * Enumerates the CEs encoded in a CollationData's rootElements table,
 * one (primary, sec/ter) pair per successful next() call.
 * Iteration starts at the index stored in
 * elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX], with primary and sec/ter both 0.
 * Holds a reference to the CollationData, which must therefore outlive the iterator.
 */
class RootElementsIterator {
public:
    RootElementsIterator(const CollationData &root)
            : data(root),
              elements(root.rootElements), length(root.rootElementsLength),
              pri(0), secTer(0),
              index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}

    /**
     * Advances to the next CE.
     * @return FALSE at the end of the table or at the PRIMARY_SENTINEL element;
     *         TRUE otherwise, with the CE available via getPrimary() and getSecTer()
     */
    UBool next() {
        if(index >= length) { return FALSE; }
        uint32_t p = elements[index];
        if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
        if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
            // Sec/ter element: keep the current primary, replace only the sec/ter unit.
            ++index;
            secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
            return TRUE;
        }
        if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
            // End of a range, enumerate the primaries in the range.
            int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
            p &= 0xffffff00;  // strip the low byte to get the range-end primary
            if(pri == p) {
                // Finished the range, return the next CE after it.
                ++index;
                return next();
            }
            U_ASSERT(pri < p);
            // Return the next primary in this range.
            // index is deliberately not advanced here, so that the same range-end
            // element is examined again on the following call.
            UBool isCompressible = data.isCompressiblePrimary(pri);
            if((pri & 0xffff) == 0) {
                pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
            } else {
                pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
            }
            return TRUE;
        }
        // Simple primary CE.
        ++index;
        pri = p;
        // Does this have an explicit below-common sec/ter unit,
        // or does it imply a common one?
        if(index == length) {
            secTer = Collation::COMMON_SEC_AND_TER_CE;
        } else {
            secTer = elements[index];
            if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
                // No sec/ter delta.
                secTer = Collation::COMMON_SEC_AND_TER_CE;
            } else {
                secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
                if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
                    // Implied sec/ter.
                    // The following element is left in place for the next call.
                    secTer = Collation::COMMON_SEC_AND_TER_CE;
                } else {
                    // Explicit sec/ter below common/common.
                    ++index;
                }
            }
        }
        return TRUE;
    }

    /** Primary weight of the current CE. */
    uint32_t getPrimary() const { return pri; }
    /** Secondary and tertiary weights of the current CE, with the delta flag removed. */
    uint32_t getSecTer() const { return secTer; }

private:
    const CollationData &data;
    const uint32_t *elements;  // root.rootElements
    int32_t length;            // root.rootElementsLength

    uint32_t pri;
    uint32_t secTer;
    int32_t index;             // index of the next element to examine
};
758 
759 }  // namespace
760 
TestRootElements()761 void CollationTest::TestRootElements() {
762     IcuTestErrorCode errorCode(*this, "TestRootElements");
763     const CollationData *root = CollationRoot::getData(errorCode);
764     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
765         return;
766     }
767     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
768     RootElementsIterator iter(*root);
769 
770     // We check each root CE for validity,
771     // and we also verify that there is a tailoring gap between each two CEs.
772     CollationWeights cw1c;  // compressible primary weights
773     CollationWeights cw1u;  // uncompressible primary weights
774     CollationWeights cw2;
775     CollationWeights cw3;
776 
777     cw1c.initForPrimary(TRUE);
778     cw1u.initForPrimary(FALSE);
779     cw2.initForSecondary();
780     cw3.initForTertiary();
781 
782     // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
783     // nor the special merge-separator CE for U+FFFE.
784     uint32_t prevPri = 0;
785     uint32_t prevSec = 0;
786     uint32_t prevTer = 0;
787     while(iter.next()) {
788         uint32_t pri = iter.getPrimary();
789         uint32_t secTer = iter.getSecTer();
790         // CollationRootElements CEs must have 0 case and quaternary bits.
791         if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
792             errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
793                   (long)pri, (long)secTer);
794         }
795         uint32_t sec = secTer >> 16;
796         uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
797         uint32_t ctq = ter;
798         if(pri == 0 && sec == 0 && ter != 0) {
799             // Tertiary CEs must have uppercase bits,
800             // but they are not stored in the CollationRootElements.
801             ctq |= 0x8000;
802         }
803         if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
804             errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
805         } else {
806             if(pri != prevPri) {
807                 uint32_t newWeight = 0;
808                 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
809                     // There is currently no tailoring gap after primary ignorables,
810                     // and we forbid tailoring after U+FFFD and U+FFFF.
811                 } else if(root->isCompressiblePrimary(prevPri)) {
812                     if(!cw1c.allocWeights(prevPri, pri, 1)) {
813                         errln("no primary/compressible tailoring gap between %08lx and %08lx",
814                               (long)prevPri, (long)pri);
815                     } else {
816                         newWeight = cw1c.nextWeight();
817                     }
818                 } else {
819                     if(!cw1u.allocWeights(prevPri, pri, 1)) {
820                         errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
821                               (long)prevPri, (long)pri);
822                     } else {
823                         newWeight = cw1u.nextWeight();
824                     }
825                 }
826                 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
827                     errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
828                           (long)prevPri, (long)newWeight, (long)pri);
829                 }
830             } else if(sec != prevSec) {
831                 uint32_t lowerLimit =
832                     prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
833                 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
834                     errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
835                 } else {
836                     uint32_t newWeight = cw2.nextWeight();
837                     if(!(prevSec < newWeight && newWeight < sec)) {
838                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
839                               (long)lowerLimit, (long)newWeight, (long)sec);
840                     }
841                 }
842             } else if(ter != prevTer) {
843                 uint32_t lowerLimit =
844                     prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
845                 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
846                     errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
847                 } else {
848                     uint32_t newWeight = cw3.nextWeight();
849                     if(!(prevTer < newWeight && newWeight < ter)) {
850                         errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
851                               (long)lowerLimit, (long)newWeight, (long)ter);
852                     }
853                 }
854             } else {
855                 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
856             }
857         }
858         prevPri = pri;
859         prevSec = sec;
860         prevTer = ter;
861     }
862 }
863 
TestTailoredElements()864 void CollationTest::TestTailoredElements() {
865     IcuTestErrorCode errorCode(*this, "TestTailoredElements");
866     const CollationData *root = CollationRoot::getData(errorCode);
867     if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
868         return;
869     }
870     CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
871 
872     UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
873     if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
874         return;
875     }
876     uhash_setKeyDeleter(prevLocales, uprv_free);
877     // TestRootElements() tests the root collator which does not have tailorings.
878     uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
879     uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
880     uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
881 
882     UVector64 ces(errorCode);
883     LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
884     U_ASSERT(locales.isValid());
885     const char *localeID = "root";
886     do {
887         Locale locale(localeID);
888         LocalPointer<StringEnumeration> types(
889                 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
890         errorCode.assertSuccess();
891         const char *type;  // first: default type
892         while((type = types->next(NULL, errorCode)) != NULL) {
893             if(strncmp(type, "private-", 8) == 0) {
894                 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
895                         localeID, type);
896             }
897             Locale localeWithType(locale);
898             localeWithType.setKeywordValue("collation", type, errorCode);
899             errorCode.assertSuccess();
900             LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
901             if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
902                                               localeWithType.getName())) {
903                 continue;
904             }
905             Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
906             if(uhash_geti(prevLocales, actual.getName()) != 0) {
907                 continue;
908             }
909             uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
910             errorCode.assertSuccess();
911             logln("TestTailoredElements(): requested %s -> actual %s",
912                   localeWithType.getName(), actual.getName());
913             RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
914             if(rbc == NULL) {
915                 continue;
916             }
917             // Note: It would be better to get tailored strings such that we can
918             // identify the prefix, and only get the CEs for the prefix+string,
919             // not also for the prefix.
920             // There is currently no API for that.
921             // It would help in an unusual case where a contraction starting in the prefix
922             // extends past its end, and we do not see the intended mapping.
923             // For example, for a mapping p|st, if there is also a contraction ps,
924             // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
925             LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
926             errorCode.assertSuccess();
927             UnicodeSetIterator iter(*tailored);
928             while(iter.next()) {
929                 const UnicodeString &s = iter.getString();
930                 ces.removeAllElements();
931                 rbc->internalGetCEs(s, ces, errorCode);
932                 errorCode.assertSuccess();
933                 for(int32_t i = 0; i < ces.size(); ++i) {
934                     int64_t ce = ces.elementAti(i);
935                     if(!isValidCE(rootElements, *root, ce)) {
936                         errln("invalid tailored CE %016llx at CE index %d from string:",
937                               (long long)ce, (int)i);
938                         infoln(prettify(s));
939                     }
940                 }
941             }
942         }
943     } while((localeID = locales->next(NULL, errorCode)) != NULL);
944     uhash_close(prevLocales);
945 }
946 
printSortKey(const uint8_t * p,int32_t length)947 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
948     UnicodeString s;
949     for(int32_t i = 0; i < length; ++i) {
950         if(i > 0) { s.append((UChar)0x20); }
951         uint8_t b = p[i];
952         if(b == 0) {
953             s.append((UChar)0x2e);  // period
954         } else if(b == 1) {
955             s.append((UChar)0x7c);  // vertical bar
956         } else {
957             appendHex(b, 2, s);
958         }
959     }
960     return s;
961 }
962 
printCollationKey(const CollationKey & key)963 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
964     int32_t length;
965     const uint8_t *p = key.getByteArray(length);
966     return printSortKey(p, length);
967 }
968 
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)969 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
970     for(;;) {
971         int32_t lineLength;
972         const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
973         if(line == NULL || errorCode.isFailure()) {
974             fileLine.remove();
975             return FALSE;
976         }
977         ++fileLineNumber;
978         // Strip trailing CR/LF, comments, and spaces.
979         const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
980         if(comment != NULL) {
981             lineLength = (int32_t)(comment - line);
982         } else {
983             while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
984         }
985         while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
986         if(lineLength != 0) {
987             fileLine.setTo(FALSE, line, lineLength);
988             return TRUE;
989         }
990         // Empty line, continue.
991     }
992 }
993 
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)994 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
995                                 UErrorCode &errorCode) {
996     int32_t length = fileLine.length();
997     int32_t i;
998     for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
999     int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
1000     if(pipeIndex >= 0) {
1001         prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1002         if(prefix.isEmpty()) {
1003             errln("empty prefix on line %d", (int)fileLineNumber);
1004             infoln(fileLine);
1005             errorCode = U_PARSE_ERROR;
1006             return;
1007         }
1008         start = pipeIndex + 1;
1009     } else {
1010         prefix.remove();
1011     }
1012     s = fileLine.tempSubStringBetween(start, i).unescape();
1013     if(s.isEmpty()) {
1014         errln("empty string on line %d", (int)fileLineNumber);
1015         infoln(fileLine);
1016         errorCode = U_PARSE_ERROR;
1017         return;
1018     }
1019     start = i;
1020 }
1021 
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1022 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1023     Collation::Level relation;
1024     int32_t start;
1025     if(fileLine[0] == 0x3c) {  // <
1026         UChar second = fileLine[1];
1027         start = 2;
1028         switch(second) {
1029         case 0x31:  // <1
1030             relation = Collation::PRIMARY_LEVEL;
1031             break;
1032         case 0x32:  // <2
1033             relation = Collation::SECONDARY_LEVEL;
1034             break;
1035         case 0x33:  // <3
1036             relation = Collation::TERTIARY_LEVEL;
1037             break;
1038         case 0x34:  // <4
1039             relation = Collation::QUATERNARY_LEVEL;
1040             break;
1041         case 0x63:  // <c
1042             relation = Collation::CASE_LEVEL;
1043             break;
1044         case 0x69:  // <i
1045             relation = Collation::IDENTICAL_LEVEL;
1046             break;
1047         default:  // just <
1048             relation = Collation::NO_LEVEL;
1049             start = 1;
1050             break;
1051         }
1052     } else if(fileLine[0] == 0x3d) {  // =
1053         relation = Collation::ZERO_LEVEL;
1054         start = 1;
1055     } else {
1056         start = 0;
1057     }
1058     if(start == 0 || !isSpace(fileLine[start])) {
1059         errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1060         infoln(fileLine);
1061         errorCode.set(U_PARSE_ERROR);
1062         return Collation::NO_LEVEL;
1063     }
1064     start = skipSpaces(start);
1065     UnicodeString prefix;
1066     parseString(start, prefix, s, errorCode);
1067     if(errorCode.isSuccess() && !prefix.isEmpty()) {
1068         errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1069         infoln(fileLine);
1070         errorCode.set(U_PARSE_ERROR);
1071         return Collation::NO_LEVEL;
1072     }
1073     if(start < fileLine.length()) {
1074         errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1075         infoln(fileLine);
1076         errorCode.set(U_PARSE_ERROR);
1077         return Collation::NO_LEVEL;
1078     }
1079     return relation;
1080 }
1081 
// Attribute names accepted on the left side of an "attribute=value" test file line,
// with the collation attribute that each one selects.
// Looked up by exact string comparison in parseAndSetAttribute().
static const struct {
    const char *name;
    UColAttribute attr;
} attributes[] = {
    { "backwards", UCOL_FRENCH_COLLATION },
    { "alternate", UCOL_ALTERNATE_HANDLING },
    { "caseFirst", UCOL_CASE_FIRST },
    { "caseLevel", UCOL_CASE_LEVEL },
    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
    { "strength", UCOL_STRENGTH },
    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
    { "numeric", UCOL_NUMERIC_COLLATION }
};
1095 
// Value names accepted on the right side of an "attribute=value" test file line,
// with the attribute value that each one selects.
// This table is not specific to an attribute: parseAndSetAttribute() accepts any
// name listed here and leaves it to Collator::setAttribute() to reject
// an attribute/value combination.
static const struct {
    const char *name;
    UColAttributeValue value;
} attributeValues[] = {
    { "default", UCOL_DEFAULT },
    { "primary", UCOL_PRIMARY },
    { "secondary", UCOL_SECONDARY },
    { "tertiary", UCOL_TERTIARY },
    { "quaternary", UCOL_QUATERNARY },
    { "identical", UCOL_IDENTICAL },
    { "off", UCOL_OFF },
    { "on", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};
1113 
parseAndSetAttribute(IcuTestErrorCode & errorCode)1114 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1115     // Parse attributes even if the Collator could not be created,
1116     // in order to report syntax errors.
1117     int32_t start = skipSpaces(1);
1118     int32_t equalPos = fileLine.indexOf(0x3d);
1119     if(equalPos < 0) {
1120         if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1121             parseAndSetReorderCodes(start + 7, errorCode);
1122             return;
1123         }
1124         errln("missing '=' on line %d", (int)fileLineNumber);
1125         infoln(fileLine);
1126         errorCode.set(U_PARSE_ERROR);
1127         return;
1128     }
1129 
1130     UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1131     UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1132     if(attrString == UNICODE_STRING("maxVariable", 11)) {
1133         UColReorderCode max;
1134         if(valueString == UNICODE_STRING("space", 5)) {
1135             max = UCOL_REORDER_CODE_SPACE;
1136         } else if(valueString == UNICODE_STRING("punct", 5)) {
1137             max = UCOL_REORDER_CODE_PUNCTUATION;
1138         } else if(valueString == UNICODE_STRING("symbol", 6)) {
1139             max = UCOL_REORDER_CODE_SYMBOL;
1140         } else if(valueString == UNICODE_STRING("currency", 8)) {
1141             max = UCOL_REORDER_CODE_CURRENCY;
1142         } else {
1143             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1144             infoln(fileLine);
1145             errorCode.set(U_PARSE_ERROR);
1146             return;
1147         }
1148         if(coll != NULL) {
1149             coll->setMaxVariable(max, errorCode);
1150             if(errorCode.isFailure()) {
1151                 errln("setMaxVariable() failed on line %d: %s",
1152                       (int)fileLineNumber, errorCode.errorName());
1153                 infoln(fileLine);
1154                 return;
1155             }
1156         }
1157         fileLine.remove();
1158         return;
1159     }
1160 
1161     UColAttribute attr;
1162     for(int32_t i = 0;; ++i) {
1163         if(i == UPRV_LENGTHOF(attributes)) {
1164             errln("invalid attribute name on line %d", (int)fileLineNumber);
1165             infoln(fileLine);
1166             errorCode.set(U_PARSE_ERROR);
1167             return;
1168         }
1169         if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1170             attr = attributes[i].attr;
1171             break;
1172         }
1173     }
1174 
1175     UColAttributeValue value;
1176     for(int32_t i = 0;; ++i) {
1177         if(i == UPRV_LENGTHOF(attributeValues)) {
1178             errln("invalid attribute value name on line %d", (int)fileLineNumber);
1179             infoln(fileLine);
1180             errorCode.set(U_PARSE_ERROR);
1181             return;
1182         }
1183         if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1184             value = attributeValues[i].value;
1185             break;
1186         }
1187     }
1188 
1189     if(coll != NULL) {
1190         coll->setAttribute(attr, value, errorCode);
1191         if(errorCode.isFailure()) {
1192             errln("illegal attribute=value combination on line %d: %s",
1193                   (int)fileLineNumber, errorCode.errorName());
1194             infoln(fileLine);
1195             return;
1196         }
1197     }
1198     fileLine.remove();
1199 }
1200 
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1201 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1202     UVector32 reorderCodes(errorCode);
1203     while(start < fileLine.length()) {
1204         start = skipSpaces(start);
1205         int32_t limit = start;
1206         while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1207         CharString name;
1208         name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1209         int32_t code = CollationRuleParser::getReorderCode(name.data());
1210         if(code < 0) {
1211             if(uprv_stricmp(name.data(), "default") == 0) {
1212                 code = UCOL_REORDER_CODE_DEFAULT;  // -1
1213             } else {
1214                 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1215                 infoln(fileLine);
1216                 errorCode.set(U_PARSE_ERROR);
1217                 return;
1218             }
1219         }
1220         reorderCodes.addElement(code, errorCode);
1221         start = limit;
1222     }
1223     if(coll != NULL) {
1224         coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1225         if(errorCode.isFailure()) {
1226             errln("setReorderCodes() failed on line %d: %s",
1227                   (int)fileLineNumber, errorCode.errorName());
1228             infoln(fileLine);
1229             return;
1230         }
1231     }
1232     fileLine.remove();
1233 }
1234 
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1235 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1236     UnicodeString rules;
1237     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1238         rules.append(fileLine.unescape());
1239     }
1240     if(errorCode.isFailure()) { return; }
1241     logln(rules);
1242 
1243     UParseError parseError;
1244     UnicodeString reason;
1245     delete coll;
1246     coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1247     if(coll == NULL) {
1248         errln("unable to allocate a new collator");
1249         errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1250         return;
1251     }
1252     if(errorCode.isFailure()) {
1253         dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1254         infoln(UnicodeString("  reason: ") + reason);
1255         if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1256         if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1257             infoln(UnicodeString("  snippet: ...") +
1258                 parseError.preContext + "(!)" + parseError.postContext + "...");
1259         }
1260         delete coll;
1261         coll = NULL;
1262         errorCode.reset();
1263     } else {
1264         assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1265                      UnicodeString(), reason);
1266     }
1267 }
1268 
setRootCollator(IcuTestErrorCode & errorCode)1269 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1270     if(errorCode.isFailure()) { return; }
1271     delete coll;
1272     coll = Collator::createInstance(Locale::getRoot(), errorCode);
1273     if(errorCode.isFailure()) {
1274         dataerrln("unable to create a root collator");
1275         return;
1276     }
1277 }
1278 
setLocaleCollator(IcuTestErrorCode & errorCode)1279 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1280     if(errorCode.isFailure()) { return; }
1281     delete coll;
1282     coll = NULL;
1283     int32_t at = fileLine.indexOf((UChar)0x40, 9);  // @ is not invariant
1284     if(at >= 0) {
1285         fileLine.setCharAt(at, (UChar)0x2a);  // *
1286     }
1287     CharString localeID;
1288     localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1289     if(at >= 0) {
1290         localeID.data()[at - 9] = '@';
1291     }
1292     Locale locale(localeID.data());
1293     if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1294         errln("invalid language tag on line %d", (int)fileLineNumber);
1295         infoln(fileLine);
1296         if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1297         return;
1298     }
1299 
1300     logln("creating a collator for locale ID %s", locale.getName());
1301     coll = Collator::createInstance(locale, errorCode);
1302     if(errorCode.isFailure()) {
1303         dataerrln("unable to create a collator for locale %s on line %d",
1304                   locale.getName(), (int)fileLineNumber);
1305         infoln(fileLine);
1306         delete coll;
1307         coll = NULL;
1308         errorCode.reset();
1309     }
1310 }
1311 
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1312 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1313     if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1314     // In some sequences with Tibetan composite vowel signs,
1315     // even if the string passes the FCD check,
1316     // those composites must be decomposed.
1317     // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1318     int32_t index = 0;
1319     while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1320         if(++index < s.length()) {
1321             UChar c = s[index];
1322             if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1323         }
1324     }
1325     return FALSE;
1326 }
1327 
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1328 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1329                                      CharString &dest, int32_t partSize,
1330                                      IcuTestErrorCode &errorCode) {
1331     if(errorCode.isFailure()) { return FALSE; }
1332     uint8_t part[32];
1333     U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1334     UCharIterator iter;
1335     uiter_setString(&iter, s, length);
1336     uint32_t state[2] = { 0, 0 };
1337     for(;;) {
1338         int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1339         UBool done = partLength < partSize;
1340         if(done) {
1341             // At the end, append the next byte as well which should be 00.
1342             ++partLength;
1343         }
1344         dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1345         if(done) {
1346             return errorCode.isSuccess();
1347         }
1348     }
1349 }
1350 
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1351 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1352                                      const UChar *s, int32_t length,
1353                                      CollationKey &key, IcuTestErrorCode &errorCode) {
1354     if(errorCode.isFailure()) { return FALSE; }
1355     coll->getCollationKey(s, length, key, errorCode);
1356     if(errorCode.isFailure()) {
1357         infoln(fileTestName);
1358         errln("Collator(%s).getCollationKey() failed: %s",
1359               norm, errorCode.errorName());
1360         infoln(line);
1361         return FALSE;
1362     }
1363     int32_t keyLength;
1364     const uint8_t *keyBytes = key.getByteArray(keyLength);
1365     if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1366         infoln(fileTestName);
1367         errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1368               norm);
1369         infoln(line);
1370         infoln(printCollationKey(key));
1371         return FALSE;
1372     }
1373 
1374     int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1375     if(numLevels < UCOL_IDENTICAL) {
1376         ++numLevels;
1377     } else {
1378         numLevels = 5;
1379     }
1380     if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1381         ++numLevels;
1382     }
1383     errorCode.assertSuccess();
1384     int32_t numLevelSeparators = 0;
1385     for(int32_t i = 0; i < (keyLength - 1); ++i) {
1386         uint8_t b = keyBytes[i];
1387         if(b == 0) {
1388             infoln(fileTestName);
1389             errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1390             infoln(line);
1391             infoln(printCollationKey(key));
1392             return FALSE;
1393         }
1394         if(b == 1) { ++numLevelSeparators; }
1395     }
1396     if(numLevelSeparators != (numLevels - 1)) {
1397         infoln(fileTestName);
1398         errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1399               norm, (int)numLevelSeparators, (int)numLevels);
1400         infoln(line);
1401         infoln(printCollationKey(key));
1402         return FALSE;
1403     }
1404 
1405     // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1406     static const int32_t partSizes[] = { 32, 3, 1 };
1407     for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1408         int32_t partSize = partSizes[psi];
1409         CharString parts;
1410         if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1411             infoln(fileTestName);
1412             errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1413                   norm, (int)partSize, errorCode.errorName());
1414             infoln(line);
1415             return FALSE;
1416         }
1417         if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1418             infoln(fileTestName);
1419             errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1420                   norm, (int)partSize);
1421             infoln(line);
1422             infoln(printCollationKey(key));
1423             infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1424             return FALSE;
1425         }
1426     }
1427     return TRUE;
1428 }
1429 
1430 /**
1431  * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1432  * Leaves key unchanged if s does not contain U+FFFE.
1433  * @return TRUE if the key was successfully changed
1434  */
getMergedCollationKey(const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1435 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1436                                            CollationKey &key, IcuTestErrorCode &errorCode) {
1437     if(errorCode.isFailure()) { return FALSE; }
1438     LocalMemory<uint8_t> mergedKey;
1439     int32_t mergedKeyLength = 0;
1440     int32_t mergedKeyCapacity = 0;
1441     int32_t sLength = (length >= 0) ? length : u_strlen(s);
1442     int32_t segmentStart = 0;
1443     for(int32_t i = 0;;) {
1444         if(i == sLength) {
1445             if(segmentStart == 0) {
1446                 // s does not contain any U+FFFE.
1447                 return FALSE;
1448             }
1449         } else if(s[i] != 0xfffe) {
1450             ++i;
1451             continue;
1452         }
1453         // Get the sort key for another segment and merge it into mergedKey.
1454         CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
1455         CollationKey key2;
1456         coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1457         int32_t key1Length, key2Length;
1458         const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1459         const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1460         uint8_t *dest;
1461         int32_t minCapacity = key1Length + key2Length;
1462         if(key1Length > 0) { --minCapacity; }
1463         if(minCapacity <= mergedKeyCapacity) {
1464             dest = mergedKey.getAlias();
1465         } else {
1466             if(minCapacity <= 200) {
1467                 mergedKeyCapacity = 200;
1468             } else if(minCapacity <= 2 * mergedKeyCapacity) {
1469                 mergedKeyCapacity *= 2;
1470             } else {
1471                 mergedKeyCapacity = minCapacity;
1472             }
1473             dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1474         }
1475         U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1476         if(key1Length == 0) {
1477             // key2 is the sort key for the first segment.
1478             uprv_memcpy(dest, key2Bytes, key2Length);
1479             mergedKeyLength = key2Length;
1480         } else {
1481             mergedKeyLength =
1482                 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1483                                    dest, mergedKeyCapacity);
1484         }
1485         if(i == sLength) { break; }
1486         segmentStart = ++i;
1487     }
1488     key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1489     return TRUE;
1490 }
1491 
1492 namespace {
1493 
1494 /**
1495  * Replaces unpaired surrogates with U+FFFD.
1496  * Returns s if no replacement was made, otherwise buffer.
1497  */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1498 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1499     int32_t i = 0;
1500     while(i < s.length()) {
1501         UChar32 c = s.char32At(i);
1502         if(U_IS_SURROGATE(c)) {
1503             if(buffer.length() < i) {
1504                 buffer.append(s, buffer.length(), i - buffer.length());
1505             }
1506             buffer.append((UChar)0xfffd);
1507         }
1508         i += U16_LENGTH(c);
1509     }
1510     if(buffer.isEmpty()) {
1511         return s;
1512     }
1513     if(buffer.length() < i) {
1514         buffer.append(s, buffer.length(), i - buffer.length());
1515     }
1516     return buffer;
1517 }
1518 
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1519 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1520                            UCollationResult order, UBool collHasCaseLevel) {
1521     if(order == UCOL_EQUAL) {
1522         return Collation::NO_LEVEL;
1523     }
1524     int32_t prevKeyLength;
1525     const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1526     int32_t keyLength;
1527     const uint8_t *bytes = key.getByteArray(keyLength);
1528     int32_t level = Collation::PRIMARY_LEVEL;
1529     for(int32_t i = 0;; ++i) {
1530         uint8_t b = prevBytes[i];
1531         if(b != bytes[i]) { break; }
1532         if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1533             ++level;
1534             if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1535                 ++level;
1536             }
1537         }
1538     }
1539     return level;
1540 }
1541 
1542 }
1543 
checkCompareTwo(const char * norm,const UnicodeString & prevFileLine,const UnicodeString & prevString,const UnicodeString & s,UCollationResult expectedOrder,Collation::Level expectedLevel,IcuTestErrorCode & errorCode)1544 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1545                                      const UnicodeString &prevString, const UnicodeString &s,
1546                                      UCollationResult expectedOrder, Collation::Level expectedLevel,
1547                                      IcuTestErrorCode &errorCode) {
1548     if(errorCode.isFailure()) { return FALSE; }
1549 
1550     // Get the sort keys first, for error debug output.
1551     CollationKey prevKey;
1552     if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1553                         prevKey, errorCode)) {
1554         return FALSE;
1555     }
1556     CollationKey key;
1557     if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1558 
1559     UCollationResult order = coll->compare(prevString, s, errorCode);
1560     if(order != expectedOrder || errorCode.isFailure()) {
1561         infoln(fileTestName);
1562         errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1563               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1564         infoln(prevFileLine);
1565         infoln(fileLine);
1566         infoln(printCollationKey(prevKey));
1567         infoln(printCollationKey(key));
1568         return FALSE;
1569     }
1570     order = coll->compare(s, prevString, errorCode);
1571     if(order != -expectedOrder || errorCode.isFailure()) {
1572         infoln(fileTestName);
1573         errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1574               (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1575         infoln(prevFileLine);
1576         infoln(fileLine);
1577         infoln(printCollationKey(prevKey));
1578         infoln(printCollationKey(key));
1579         return FALSE;
1580     }
1581     // Test NUL-termination if the strings do not contain NUL characters.
1582     UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1583     if(!containNUL) {
1584         order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1585         if(order != expectedOrder || errorCode.isFailure()) {
1586             infoln(fileTestName);
1587             errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1588                   (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1589             infoln(prevFileLine);
1590             infoln(fileLine);
1591             infoln(printCollationKey(prevKey));
1592             infoln(printCollationKey(key));
1593             return FALSE;
1594         }
1595         order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1596         if(order != -expectedOrder || errorCode.isFailure()) {
1597             infoln(fileTestName);
1598             errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1599                   (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1600             infoln(prevFileLine);
1601             infoln(fileLine);
1602             infoln(printCollationKey(prevKey));
1603             infoln(printCollationKey(key));
1604             return FALSE;
1605         }
1606     }
1607 
1608 #if U_HAVE_STD_STRING
1609     // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1610     // Unpaired surrogates cannot be converted to UTF-8.
1611     // Create valid UTF-16 strings if necessary, and use those for
1612     // both the expected compare() result and for the input to compare(UTF-8).
1613     UnicodeString prevBuffer, sBuffer;
1614     const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1615     const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1616     std::string prevUTF8, sUTF8;
1617     UnicodeString(prevValid).toUTF8String(prevUTF8);
1618     UnicodeString(sValid).toUTF8String(sUTF8);
1619     UCollationResult expectedUTF8Order;
1620     if(&prevValid == &prevString && &sValid == &s) {
1621         expectedUTF8Order = expectedOrder;
1622     } else {
1623         expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1624     }
1625 
1626     order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1627     if(order != expectedUTF8Order || errorCode.isFailure()) {
1628         infoln(fileTestName);
1629         errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1630               (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1631         infoln(prevFileLine);
1632         infoln(fileLine);
1633         infoln(printCollationKey(prevKey));
1634         infoln(printCollationKey(key));
1635         return FALSE;
1636     }
1637     order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1638     if(order != -expectedUTF8Order || errorCode.isFailure()) {
1639         infoln(fileTestName);
1640         errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1641               (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1642         infoln(prevFileLine);
1643         infoln(fileLine);
1644         infoln(printCollationKey(prevKey));
1645         infoln(printCollationKey(key));
1646         return FALSE;
1647     }
1648     // Test NUL-termination if the strings do not contain NUL characters.
1649     if(!containNUL) {
1650         order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1651         if(order != expectedUTF8Order || errorCode.isFailure()) {
1652             infoln(fileTestName);
1653             errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1654                   (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1655             infoln(prevFileLine);
1656             infoln(fileLine);
1657             infoln(printCollationKey(prevKey));
1658             infoln(printCollationKey(key));
1659             return FALSE;
1660         }
1661         order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1662         if(order != -expectedUTF8Order || errorCode.isFailure()) {
1663             infoln(fileTestName);
1664             errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1665                   (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1666             infoln(prevFileLine);
1667             infoln(fileLine);
1668             infoln(printCollationKey(prevKey));
1669             infoln(printCollationKey(key));
1670             return FALSE;
1671         }
1672     }
1673 #endif
1674 
1675     UCharIterator leftIter;
1676     UCharIterator rightIter;
1677     uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1678     uiter_setString(&rightIter, s.getBuffer(), s.length());
1679     order = coll->compare(leftIter, rightIter, errorCode);
1680     if(order != expectedOrder || errorCode.isFailure()) {
1681         infoln(fileTestName);
1682         errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1683               "wrong order: %d != %d (%s)",
1684               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1685         infoln(prevFileLine);
1686         infoln(fileLine);
1687         infoln(printCollationKey(prevKey));
1688         infoln(printCollationKey(key));
1689         return FALSE;
1690     }
1691 
1692     order = prevKey.compareTo(key, errorCode);
1693     if(order != expectedOrder || errorCode.isFailure()) {
1694         infoln(fileTestName);
1695         errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1696               (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1697         infoln(prevFileLine);
1698         infoln(fileLine);
1699         infoln(printCollationKey(prevKey));
1700         infoln(printCollationKey(key));
1701         return FALSE;
1702     }
1703     UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1704     int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1705     if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1706         if(level != expectedLevel) {
1707             infoln(fileTestName);
1708             errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1709                   (int)fileLineNumber, norm, order, level, expectedLevel);
1710             infoln(prevFileLine);
1711             infoln(fileLine);
1712             infoln(printCollationKey(prevKey));
1713             infoln(printCollationKey(key));
1714             return FALSE;
1715         }
1716     }
1717 
1718     // If either string contains U+FFFE, then their sort keys must compare the same as
1719     // the merged sort keys of each string's between-FFFE segments.
1720     //
1721     // It is not required that
1722     //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1723     // only that those two methods yield the same order.
1724     //
1725     // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1726     if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1727                 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1728             errorCode.isFailure()) {
1729         order = prevKey.compareTo(key, errorCode);
1730         if(order != expectedOrder || errorCode.isFailure()) {
1731             infoln(fileTestName);
1732             errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1733                 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1734                 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1735             infoln(prevFileLine);
1736             infoln(fileLine);
1737             infoln(printCollationKey(prevKey));
1738             infoln(printCollationKey(key));
1739             return FALSE;
1740         }
1741         int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1742         if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1743             if(mergedLevel != level) {
1744                 infoln(fileTestName);
1745                 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1746                     "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1747                     (int)fileLineNumber, norm, order, mergedLevel, level);
1748                 infoln(prevFileLine);
1749                 infoln(fileLine);
1750                 infoln(printCollationKey(prevKey));
1751                 infoln(printCollationKey(key));
1752                 return FALSE;
1753             }
1754         }
1755     }
1756     return TRUE;
1757 }
1758 
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1759 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1760     if(errorCode.isFailure()) { return; }
1761     UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1762     UnicodeString prevString, s;
1763     prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1764     while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1765         // Parse the line even if it will be ignored (when we do not have a Collator)
1766         // in order to report syntax issues.
1767         Collation::Level relation = parseRelationAndString(s, errorCode);
1768         if(errorCode.isFailure()) {
1769             errorCode.reset();
1770             break;
1771         }
1772         if(coll == NULL) {
1773             // We were unable to create the Collator but continue with tests.
1774             // Ignore test data for this Collator.
1775             // The next Collator creation might work.
1776             continue;
1777         }
1778         UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1779         Collation::Level expectedLevel = relation;
1780         s.getTerminatedBuffer();  // Ensure NUL-termination.
1781         UBool isOk = TRUE;
1782         if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1783             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1784             isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1785                                    expectedOrder, expectedLevel, errorCode);
1786         }
1787         if(isOk) {
1788             coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1789             isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1790                                    expectedOrder, expectedLevel, errorCode);
1791         }
1792         if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1793             UnicodeString pn = nfd->normalize(prevString, errorCode);
1794             UnicodeString n = nfd->normalize(s, errorCode);
1795             pn.getTerminatedBuffer();
1796             n.getTerminatedBuffer();
1797             errorCode.assertSuccess();
1798             isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1799                                    expectedOrder, expectedLevel, errorCode);
1800         }
1801         if(!isOk) {
1802             errorCode.reset();  // already reported
1803         }
1804         prevFileLine = fileLine;
1805         prevString = s;
1806         prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1807     }
1808 }
1809 
TestDataDriven()1810 void CollationTest::TestDataDriven() {
1811     IcuTestErrorCode errorCode(*this, "TestDataDriven");
1812 
1813     fcd = Normalizer2Factory::getFCDInstance(errorCode);
1814     nfd = Normalizer2::getNFDInstance(errorCode);
1815     if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1816         return;
1817     }
1818 
1819     CharString path(getSourceTestData(errorCode), errorCode);
1820     path.appendPathPart("collationtest.txt", errorCode);
1821     const char *codePage = "UTF-8";
1822     LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1823     if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1824         return;
1825     }
1826     // Read a new line if necessary.
1827     // Sub-parsers leave the first line set that they do not handle.
1828     while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1829         if(!isSectionStarter(fileLine[0])) {
1830             errln("syntax error on line %d", (int)fileLineNumber);
1831             infoln(fileLine);
1832             return;
1833         }
1834         if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1835             fileTestName = fileLine;
1836             logln(fileLine);
1837             fileLine.remove();
1838         } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1839             setRootCollator(errorCode);
1840             fileLine.remove();
1841         } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1842             setLocaleCollator(errorCode);
1843             fileLine.remove();
1844         } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1845             buildTailoring(f.getAlias(), errorCode);
1846         } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1847             parseAndSetAttribute(errorCode);
1848         } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1849             checkCompareStrings(f.getAlias(), errorCode);
1850         } else {
1851             errln("syntax error on line %d", (int)fileLineNumber);
1852             infoln(fileLine);
1853             return;
1854         }
1855     }
1856 }
1857 
1858 #endif  // !UCONFIG_NO_COLLATION
1859