1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationtest.cpp
9 *
10 * created on: 2012apr27
11 * created by: Markus W. Scherer
12 */
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/coll.h"
19 #include "unicode/errorcode.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/normalizer2.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/std_string.h"
24 #include "unicode/strenum.h"
25 #include "unicode/tblcoll.h"
26 #include "unicode/uiter.h"
27 #include "unicode/uniset.h"
28 #include "unicode/unistr.h"
29 #include "unicode/usetiter.h"
30 #include "unicode/ustring.h"
31 #include "charstr.h"
32 #include "cmemory.h"
33 #include "collation.h"
34 #include "collationdata.h"
35 #include "collationfcd.h"
36 #include "collationiterator.h"
37 #include "collationroot.h"
38 #include "collationrootelements.h"
39 #include "collationruleparser.h"
40 #include "collationweights.h"
41 #include "cstring.h"
42 #include "intltest.h"
43 #include "normalizer2impl.h"
44 #include "ucbuf.h"
45 #include "uhash.h"
46 #include "uitercollationiterator.h"
47 #include "utf16collationiterator.h"
48 #include "utf8collationiterator.h"
49 #include "uvectr32.h"
50 #include "uvectr64.h"
51 #include "writesrc.h"
52
53 class CodePointIterator;
54
55 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
56
57 class CollationTest : public IntlTest {
58 public:
CollationTest()59 CollationTest()
60 : fcd(NULL), nfd(NULL),
61 fileLineNumber(0),
62 coll(NULL) {}
63
~CollationTest()64 ~CollationTest() {
65 delete coll;
66 }
67
68 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
69
70 void TestMinMax();
71 void TestImplicits();
72 void TestNulTerminated();
73 void TestIllegalUTF8();
74 void TestShortFCDData();
75 void TestFCD();
76 void TestCollationWeights();
77 void TestRootElements();
78 void TestTailoredElements();
79 void TestDataDriven();
80
81 private:
82 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
83 void checkAllocWeights(CollationWeights &cw,
84 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
85 int32_t someLength, int32_t minCount);
86
87 static UnicodeString printSortKey(const uint8_t *p, int32_t length);
88 static UnicodeString printCollationKey(const CollationKey &key);
89
90 // Helpers & fields for data-driven test.
isCROrLF(UChar c)91 static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)92 static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)93 static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; } // %*@
skipSpaces(int32_t i)94 int32_t skipSpaces(int32_t i) {
95 while(isSpace(fileLine[i])) { ++i; }
96 return i;
97 }
98
99 UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
100 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
101 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
102 void parseAndSetAttribute(IcuTestErrorCode &errorCode);
103 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
104 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
105 void setRootCollator(IcuTestErrorCode &errorCode);
106 void setLocaleCollator(IcuTestErrorCode &errorCode);
107
108 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
109
110 UBool getSortKeyParts(const UChar *s, int32_t length,
111 CharString &dest, int32_t partSize,
112 IcuTestErrorCode &errorCode);
113 UBool getCollationKey(const char *norm, const UnicodeString &line,
114 const UChar *s, int32_t length,
115 CollationKey &key, IcuTestErrorCode &errorCode);
116 UBool getMergedCollationKey(const UChar *s, int32_t length,
117 CollationKey &key, IcuTestErrorCode &errorCode);
118 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
119 const UnicodeString &prevString, const UnicodeString &s,
120 UCollationResult expectedOrder, Collation::Level expectedLevel,
121 IcuTestErrorCode &errorCode);
122 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
123
124 const Normalizer2 *fcd, *nfd;
125 UnicodeString fileLine;
126 int32_t fileLineNumber;
127 UnicodeString fileTestName;
128 Collator *coll;
129 };
130
// Factory function: returns a newly allocated CollationTest through its IntlTest base pointer.
// NOTE(review): presumably the caller takes ownership of the returned object -- confirm at the call site.
extern IntlTest *createCollationTest() {
    return new CollationTest();
}
134
// Test dispatcher for this suite.
// When exec is true, the suite name is logged first.
// The TESTCASE_AUTO* macros are defined outside this file; presumably they map
// `index` to the listed test method, set `name`, and run the test when `exec` is true.
// The last parameter is unused here.
void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
    if(exec) {
        logln("TestSuite CollationTest: ");
    }
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestMinMax);
    TESTCASE_AUTO(TestImplicits);
    TESTCASE_AUTO(TestNulTerminated);
    TESTCASE_AUTO(TestIllegalUTF8);
    TESTCASE_AUTO(TestShortFCDData);
    TESTCASE_AUTO(TestFCD);
    TESTCASE_AUTO(TestCollationWeights);
    TESTCASE_AUTO(TestRootElements);
    TESTCASE_AUTO(TestTailoredElements);
    TESTCASE_AUTO(TestDataDriven);
    TESTCASE_AUTO_END;
}
152
TestMinMax()153 void CollationTest::TestMinMax() {
154 IcuTestErrorCode errorCode(*this, "TestMinMax");
155
156 setRootCollator(errorCode);
157 if(errorCode.isFailure()) {
158 errorCode.reset();
159 return;
160 }
161 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
162 if(rbc == NULL) {
163 errln("the root collator is not a RuleBasedCollator");
164 return;
165 }
166
167 static const UChar s[2] = { 0xfffe, 0xffff };
168 UVector64 ces(errorCode);
169 rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
170 errorCode.assertSuccess();
171 if(ces.size() != 2) {
172 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
173 return;
174 }
175 int64_t ce = ces.elementAti(0);
176 int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
177 if(ce != expected) {
178 errln("CE(U+fffe)=%04lx != 02..", (long)ce);
179 }
180
181 ce = ces.elementAti(1);
182 expected = Collation::makeCE(Collation::MAX_PRIMARY);
183 if(ce != expected) {
184 errln("CE(U+ffff)=%04lx != max..", (long)ce);
185 }
186 }
187
TestImplicits()188 void CollationTest::TestImplicits() {
189 IcuTestErrorCode errorCode(*this, "TestImplicits");
190
191 const CollationData *cd = CollationRoot::getData(errorCode);
192 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
193 return;
194 }
195
196 // Implicit primary weights should be assigned for the following sets,
197 // and sort in ascending order by set and then code point.
198 // See http://www.unicode.org/reports/tr10/#Implicit_Weights
199
200 // core Han Unified Ideographs
201 UnicodeSet coreHan("[\\p{unified_ideograph}&"
202 "[\\p{Block=CJK_Unified_Ideographs}"
203 "\\p{Block=CJK_Compatibility_Ideographs}]]",
204 errorCode);
205 // all other Unified Han ideographs
206 UnicodeSet otherHan("[\\p{unified ideograph}-"
207 "[\\p{Block=CJK_Unified_Ideographs}"
208 "\\p{Block=CJK_Compatibility_Ideographs}]]",
209 errorCode);
210 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
211 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
212
213 // Starting with CLDR 26/ICU 54, the root Han order may instead be
214 // the Unihan radical-stroke order.
215 // The tests should pass either way, so we only test the order of a small set of Han characters
216 // whose radical-stroke order is the same as their code point order.
217 UnicodeSet someHanInCPOrder(
218 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
219 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
220 errorCode);
221 UnicodeSet inOrder(someHanInCPOrder);
222 inOrder.addAll(unassigned).freeze();
223 if(errorCode.errIfFailureAndReset("UnicodeSet")) {
224 return;
225 }
226 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
227 UChar32 prev = 0;
228 uint32_t prevPrimary = 0;
229 UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
230 for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
231 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
232 while(iter->next()) {
233 UChar32 c = iter->getCodepoint();
234 UnicodeString s(c);
235 ci.setText(s.getBuffer(), s.getBuffer() + s.length());
236 int64_t ce = ci.nextCE(errorCode);
237 int64_t ce2 = ci.nextCE(errorCode);
238 if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
239 return;
240 }
241 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
242 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
243 continue;
244 }
245 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
246 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
247 (long)c, (long)(ce & 0xffffffff));
248 continue;
249 }
250 uint32_t primary = (uint32_t)(ce >> 32);
251 if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
252 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
253 (long)c, (long)primary, (long)prev, (long)prevPrimary);
254 }
255 prev = c;
256 prevPrimary = primary;
257 }
258 }
259 }
260
TestNulTerminated()261 void CollationTest::TestNulTerminated() {
262 IcuTestErrorCode errorCode(*this, "TestNulTerminated");
263 const CollationData *data = CollationRoot::getData(errorCode);
264 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
265 return;
266 }
267
268 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
269
270 UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
271 UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
272 for(int32_t i = 0;; ++i) {
273 int64_t ce1 = ci1.nextCE(errorCode);
274 int64_t ce2 = ci2.nextCE(errorCode);
275 if(errorCode.errIfFailureAndReset("CollationIterator.nextCE()")) {
276 return;
277 }
278 if(ce1 != ce2) {
279 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
280 break;
281 }
282 if(ce1 == Collation::NO_CE) { break; }
283 }
284 }
285
TestIllegalUTF8()286 void CollationTest::TestIllegalUTF8() {
287 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
288
289 setRootCollator(errorCode);
290 if(errorCode.isFailure()) {
291 errorCode.reset();
292 return;
293 }
294 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
295
296 static const char *strings[] = {
297 // string with U+FFFD == illegal byte sequence
298 u8"a\uFFFDz", "a\x80z", // trail byte
299 u8"a\uFFFD\uFFFDz", "a\xc1\x81z", // non-shortest form
300 u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z", // non-shortest form
301 u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z", // lead surrogate: would be U+D800
302 u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
303 u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz", // non-shortest form
304 u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z" // out of range: would be U+110000
305 };
306
307 for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
308 StringPiece fffd(strings[i]);
309 StringPiece illegal(strings[i + 1]);
310 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
311 if(order != UCOL_EQUAL) {
312 errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
313 (int)i, order);
314 }
315 }
316 }
317
318 namespace {
319
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)320 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
321 for(UChar32 c = 0x10000; c < 0x110000;) {
322 UChar32 next = c + 0x400;
323 if(src.containsSome(c, next - 1)) {
324 dest.add(U16_LEAD(c));
325 }
326 c = next;
327 }
328 }
329
330 } // namespace
331
TestShortFCDData()332 void CollationTest::TestShortFCDData() {
333 // See CollationFCD class comments.
334 IcuTestErrorCode errorCode(*this, "TestShortFCDData");
335 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
336 errorCode.assertSuccess();
337 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates
338 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
339 UnicodeSet lccc; // actual
340 for(UChar32 c = 0; c <= 0xffff; ++c) {
341 if(CollationFCD::hasLccc(c)) { lccc.add(c); }
342 }
343 UnicodeSet diff(expectedLccc);
344 diff.removeAll(lccc);
345 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
346 UnicodeString empty("[]");
347 UnicodeString diffString;
348 diff.toPattern(diffString, TRUE);
349 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
350 diff = lccc;
351 diff.removeAll(expectedLccc);
352 diff.toPattern(diffString, TRUE);
353 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
354
355 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
356 if (errorCode.isSuccess()) {
357 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
358 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
359 UnicodeSet tccc; // actual
360 for(UChar32 c = 0; c <= 0xffff; ++c) {
361 if(CollationFCD::hasTccc(c)) { tccc.add(c); }
362 }
363 diff = expectedTccc;
364 diff.removeAll(tccc);
365 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
366 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
367 diff = tccc;
368 diff.removeAll(expectedTccc);
369 diff.toPattern(diffString, TRUE);
370 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
371 }
372 }
373
374 class CodePointIterator {
375 public:
CodePointIterator(const UChar32 * cp,int32_t length)376 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()377 void resetToStart() { pos = 0; }
next()378 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()379 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const380 int32_t getLength() const { return length; }
getIndex() const381 int getIndex() const { return (int)pos; }
382 private:
383 const UChar32 *cp;
384 int32_t length;
385 int32_t pos;
386 };
387
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)388 void CollationTest::checkFCD(const char *name,
389 CollationIterator &ci, CodePointIterator &cpi) {
390 IcuTestErrorCode errorCode(*this, "checkFCD");
391
392 // Iterate forward to the limit.
393 for(;;) {
394 UChar32 c1 = ci.nextCodePoint(errorCode);
395 UChar32 c2 = cpi.next();
396 if(c1 != c2) {
397 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
398 name, (long)c1, (long)c2, cpi.getIndex());
399 return;
400 }
401 if(c1 < 0) { break; }
402 }
403
404 // Iterate backward most of the way.
405 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
406 UChar32 c1 = ci.previousCodePoint(errorCode);
407 UChar32 c2 = cpi.previous();
408 if(c1 != c2) {
409 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
410 name, (long)c1, (long)c2, cpi.getIndex());
411 return;
412 }
413 }
414
415 // Forward again.
416 for(;;) {
417 UChar32 c1 = ci.nextCodePoint(errorCode);
418 UChar32 c2 = cpi.next();
419 if(c1 != c2) {
420 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
421 name, (long)c1, (long)c2, cpi.getIndex());
422 return;
423 }
424 if(c1 < 0) { break; }
425 }
426
427 // Iterate backward to the start.
428 for(;;) {
429 UChar32 c1 = ci.previousCodePoint(errorCode);
430 UChar32 c2 = cpi.previous();
431 if(c1 != c2) {
432 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
433 name, (long)c1, (long)c2, cpi.getIndex());
434 return;
435 }
436 if(c1 < 0) { break; }
437 }
438 }
439
TestFCD()440 void CollationTest::TestFCD() {
441 IcuTestErrorCode errorCode(*this, "TestFCD");
442 const CollationData *data = CollationRoot::getData(errorCode);
443 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
444 return;
445 }
446
447 // Input string, not FCD, NUL-terminated.
448 static const UChar s[] = {
449 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
450 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
451 0x327, 0x308, // ccc=202, 230
452 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
453 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
454 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
455 0xac01,
456 0xe7, // Character with tccc!=0 decomposed together with mis-ordered sequence.
457 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
458 0xe1, // Character with tccc!=0 decomposed together with decomposed sequence.
459 0xf73, 0xf75, // Tibetan composite vowels must be decomposed.
460 0x4e00, 0xf81,
461 0
462 };
463 // Expected code points.
464 static const UChar32 cp[] = {
465 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
466 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
467 0x1D15F, 0x1D16D,
468 0xac01,
469 0x63, 0x327, 0x1D165, 0x1D16D,
470 0x61,
471 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
472 0x4e00, 0xf71, 0xf80
473 };
474
475 FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
476 if(errorCode.errIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
477 return;
478 }
479 CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
480 checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
481
482 cpi.resetToStart();
483 std::string utf8;
484 UnicodeString(s).toUTF8String(utf8);
485 FCDUTF8CollationIterator u8ci(data, FALSE,
486 reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
487 if(errorCode.errIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
488 return;
489 }
490 checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
491
492 cpi.resetToStart();
493 UCharIterator iter;
494 uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1); // -1: without the terminating NUL
495 FCDUIterCollationIterator uici(data, FALSE, iter, 0);
496 if(errorCode.errIfFailureAndReset("FCDUIterCollationIterator constructor")) {
497 return;
498 }
499 checkFCD("FCDUIterCollationIterator", uici, cpi);
500 }
501
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)502 void CollationTest::checkAllocWeights(CollationWeights &cw,
503 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
504 int32_t someLength, int32_t minCount) {
505 if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
506 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
507 (long)lowerLimit, (long)upperLimit, (long)n);
508 return;
509 }
510 uint32_t previous = lowerLimit;
511 int32_t count = 0; // number of weights that have someLength
512 for(int32_t i = 0; i < n; ++i) {
513 uint32_t w = cw.nextWeight();
514 if(w == 0xffffffff) {
515 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
516 "returns only %ld weights",
517 (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
518 return;
519 }
520 if(!(previous < w && w < upperLimit)) {
521 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
522 "number %ld -> %lx not between %lx and %lx",
523 (long)lowerLimit, (long)upperLimit, (long)n,
524 (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
525 return;
526 }
527 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
528 }
529 if(count < minCount) {
530 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
531 "returns only %ld < %ld weights of length %d",
532 (long)lowerLimit, (long)upperLimit, (long)n,
533 (long)count, (long)minCount, (int)someLength);
534 }
535 }
536
TestCollationWeights()537 void CollationTest::TestCollationWeights() {
538 CollationWeights cw;
539
540 // Non-compressible primaries use 254 second bytes 02..FF.
541 logln("CollationWeights.initForPrimary(non-compressible)");
542 cw.initForPrimary(FALSE);
543 // Expect 1 weight 11 and 254 weights 12xx.
544 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
545 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
546 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
547 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
548 // Expect 254 two-byte weights from the ranges 10ff and 11xx.
549 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
550 // Expect 254^2=64516 three-byte weights.
551 // During computation, there should be 3 three-byte ranges
552 // 10ffff, 11xxxx, 120202.
553 // The middle one should be split 64515:1,
554 // and the newly-split-off range and the last ranged lengthened.
555 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
556 // Expect weights 1102 & 1103.
557 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
558 // Expect weights 102102 & 102103.
559 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
560
561 // Compressible primaries use 251 second bytes 04..FE.
562 logln("CollationWeights.initForPrimary(compressible)");
563 cw.initForPrimary(TRUE);
564 // Expect 1 weight 11 and 251 weights 12xx.
565 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
566 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
567 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
568 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
569 // Expect weights 1104 & 1105.
570 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
571 // Expect weights 102102 & 102103.
572 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
573
574 // Secondary and tertiary weights use only bytes 3 & 4.
575 logln("CollationWeights.initForSecondary()");
576 cw.initForSecondary();
577 // Expect weights fbxx and all four fc..ff.
578 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
579
580 logln("CollationWeights.initForTertiary()");
581 cw.initForTertiary();
582 // Expect weights 3dxx and both 3e & 3f.
583 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
584 }
585
586 namespace {
587
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)588 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
589 uint32_t p, uint32_t s, uint32_t ctq) {
590 uint32_t p1 = p >> 24;
591 uint32_t p2 = (p >> 16) & 0xff;
592 uint32_t p3 = (p >> 8) & 0xff;
593 uint32_t p4 = p & 0xff;
594 uint32_t s1 = s >> 8;
595 uint32_t s2 = s & 0xff;
596 // ctq = Case, Tertiary, Quaternary
597 uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
598 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
599 uint32_t t1 = t >> 8;
600 uint32_t t2 = t & 0xff;
601 uint32_t q = ctq & Collation::QUATERNARY_MASK;
602 // No leading zero bytes.
603 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
604 return FALSE;
605 }
606 // No intermediate zero bytes.
607 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
608 return FALSE;
609 }
610 if(p2 != 0 && p3 == 0 && p4 != 0) {
611 return FALSE;
612 }
613 // Minimum & maximum lead bytes.
614 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
615 s1 == Collation::LEVEL_SEPARATOR_BYTE ||
616 t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
617 return FALSE;
618 }
619 if(c > 2) {
620 return FALSE;
621 }
622 // The valid byte range for the second primary byte depends on compressibility.
623 if(p2 != 0) {
624 if(data.isCompressibleLeadByte(p1)) {
625 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
626 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
627 return FALSE;
628 }
629 } else {
630 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
631 return FALSE;
632 }
633 }
634 }
635 // Other bytes just need to avoid the level separator.
636 // Trailing zeros are ok.
637 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
638 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
639 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
640 return FALSE;
641 }
642 // Well-formed CEs.
643 if(p == 0) {
644 if(s == 0) {
645 if(t == 0) {
646 // Completely ignorable CE.
647 // Quaternary CEs are not supported.
648 if(c != 0 || q != 0) {
649 return FALSE;
650 }
651 } else {
652 // Tertiary CE.
653 if(t < re.getTertiaryBoundary() || c != 2) {
654 return FALSE;
655 }
656 }
657 } else {
658 // Secondary CE.
659 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
660 return FALSE;
661 }
662 }
663 } else {
664 // Primary CE.
665 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
666 s >= re.getSecondaryBoundary()) {
667 return FALSE;
668 }
669 if(t == 0 || t >= re.getTertiaryBoundary()) {
670 return FALSE;
671 }
672 }
673 return TRUE;
674 }
675
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)676 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
677 uint32_t p = (uint32_t)(ce >> 32);
678 uint32_t secTer = (uint32_t)ce;
679 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
680 }
681
682 class RootElementsIterator {
683 public:
RootElementsIterator(const CollationData & root)684 RootElementsIterator(const CollationData &root)
685 : data(root),
686 elements(root.rootElements), length(root.rootElementsLength),
687 pri(0), secTer(0),
688 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
689
next()690 UBool next() {
691 if(index >= length) { return FALSE; }
692 uint32_t p = elements[index];
693 if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
694 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
695 ++index;
696 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
697 return TRUE;
698 }
699 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
700 // End of a range, enumerate the primaries in the range.
701 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
702 p &= 0xffffff00;
703 if(pri == p) {
704 // Finished the range, return the next CE after it.
705 ++index;
706 return next();
707 }
708 U_ASSERT(pri < p);
709 // Return the next primary in this range.
710 UBool isCompressible = data.isCompressiblePrimary(pri);
711 if((pri & 0xffff) == 0) {
712 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
713 } else {
714 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
715 }
716 return TRUE;
717 }
718 // Simple primary CE.
719 ++index;
720 pri = p;
721 // Does this have an explicit below-common sec/ter unit,
722 // or does it imply a common one?
723 if(index == length) {
724 secTer = Collation::COMMON_SEC_AND_TER_CE;
725 } else {
726 secTer = elements[index];
727 if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
728 // No sec/ter delta.
729 secTer = Collation::COMMON_SEC_AND_TER_CE;
730 } else {
731 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
732 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
733 // Implied sec/ter.
734 secTer = Collation::COMMON_SEC_AND_TER_CE;
735 } else {
736 // Explicit sec/ter below common/common.
737 ++index;
738 }
739 }
740 }
741 return TRUE;
742 }
743
getPrimary() const744 uint32_t getPrimary() const { return pri; }
getSecTer() const745 uint32_t getSecTer() const { return secTer; }
746
747 private:
748 const CollationData &data;
749 const uint32_t *elements;
750 int32_t length;
751
752 uint32_t pri;
753 uint32_t secTer;
754 int32_t index;
755 };
756
757 } // namespace
758
TestRootElements()759 void CollationTest::TestRootElements() {
760 IcuTestErrorCode errorCode(*this, "TestRootElements");
761 const CollationData *root = CollationRoot::getData(errorCode);
762 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
763 return;
764 }
765 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
766 RootElementsIterator iter(*root);
767
768 // We check each root CE for validity,
769 // and we also verify that there is a tailoring gap between each two CEs.
770 CollationWeights cw1c; // compressible primary weights
771 CollationWeights cw1u; // uncompressible primary weights
772 CollationWeights cw2;
773 CollationWeights cw3;
774
775 cw1c.initForPrimary(TRUE);
776 cw1u.initForPrimary(FALSE);
777 cw2.initForSecondary();
778 cw3.initForTertiary();
779
780 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
781 // nor the special merge-separator CE for U+FFFE.
782 uint32_t prevPri = 0;
783 uint32_t prevSec = 0;
784 uint32_t prevTer = 0;
785 while(iter.next()) {
786 uint32_t pri = iter.getPrimary();
787 uint32_t secTer = iter.getSecTer();
788 // CollationRootElements CEs must have 0 case and quaternary bits.
789 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
790 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
791 (long)pri, (long)secTer);
792 }
793 uint32_t sec = secTer >> 16;
794 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
795 uint32_t ctq = ter;
796 if(pri == 0 && sec == 0 && ter != 0) {
797 // Tertiary CEs must have uppercase bits,
798 // but they are not stored in the CollationRootElements.
799 ctq |= 0x8000;
800 }
801 if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
802 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
803 } else {
804 if(pri != prevPri) {
805 uint32_t newWeight = 0;
806 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
807 // There is currently no tailoring gap after primary ignorables,
808 // and we forbid tailoring after U+FFFD and U+FFFF.
809 } else if(root->isCompressiblePrimary(prevPri)) {
810 if(!cw1c.allocWeights(prevPri, pri, 1)) {
811 errln("no primary/compressible tailoring gap between %08lx and %08lx",
812 (long)prevPri, (long)pri);
813 } else {
814 newWeight = cw1c.nextWeight();
815 }
816 } else {
817 if(!cw1u.allocWeights(prevPri, pri, 1)) {
818 errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
819 (long)prevPri, (long)pri);
820 } else {
821 newWeight = cw1u.nextWeight();
822 }
823 }
824 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
825 errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
826 (long)prevPri, (long)newWeight, (long)pri);
827 }
828 } else if(sec != prevSec) {
829 uint32_t lowerLimit =
830 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
831 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
832 errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
833 } else {
834 uint32_t newWeight = cw2.nextWeight();
835 if(!(prevSec < newWeight && newWeight < sec)) {
836 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
837 (long)lowerLimit, (long)newWeight, (long)sec);
838 }
839 }
840 } else if(ter != prevTer) {
841 uint32_t lowerLimit =
842 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
843 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
844 errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
845 } else {
846 uint32_t newWeight = cw3.nextWeight();
847 if(!(prevTer < newWeight && newWeight < ter)) {
848 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
849 (long)lowerLimit, (long)newWeight, (long)ter);
850 }
851 }
852 } else {
853 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
854 }
855 }
856 prevPri = pri;
857 prevSec = sec;
858 prevTer = ter;
859 }
860 }
861
TestTailoredElements()862 void CollationTest::TestTailoredElements() {
863 IcuTestErrorCode errorCode(*this, "TestTailoredElements");
864 const CollationData *root = CollationRoot::getData(errorCode);
865 if(errorCode.errDataIfFailureAndReset("CollationRoot::getData()")) {
866 return;
867 }
868 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
869
870 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
871 if(errorCode.errIfFailureAndReset("failed to create a hash table")) {
872 return;
873 }
874 uhash_setKeyDeleter(prevLocales, uprv_free);
875 // TestRootElements() tests the root collator which does not have tailorings.
876 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
877 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
878 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
879
880 UVector64 ces(errorCode);
881 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
882 U_ASSERT(locales.isValid());
883 const char *localeID = "root";
884 do {
885 Locale locale(localeID);
886 LocalPointer<StringEnumeration> types(
887 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
888 errorCode.assertSuccess();
889 const char *type; // first: default type
890 while((type = types->next(NULL, errorCode)) != NULL) {
891 if(strncmp(type, "private-", 8) == 0) {
892 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
893 localeID, type);
894 }
895 Locale localeWithType(locale);
896 localeWithType.setKeywordValue("collation", type, errorCode);
897 errorCode.assertSuccess();
898 LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
899 if(errorCode.errIfFailureAndReset("Collator::createInstance(%s)",
900 localeWithType.getName())) {
901 continue;
902 }
903 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
904 if(uhash_geti(prevLocales, actual.getName()) != 0) {
905 continue;
906 }
907 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
908 errorCode.assertSuccess();
909 logln("TestTailoredElements(): requested %s -> actual %s",
910 localeWithType.getName(), actual.getName());
911 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
912 if(rbc == NULL) {
913 continue;
914 }
915 // Note: It would be better to get tailored strings such that we can
916 // identify the prefix, and only get the CEs for the prefix+string,
917 // not also for the prefix.
918 // There is currently no API for that.
919 // It would help in an unusual case where a contraction starting in the prefix
920 // extends past its end, and we do not see the intended mapping.
921 // For example, for a mapping p|st, if there is also a contraction ps,
922 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
923 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
924 errorCode.assertSuccess();
925 UnicodeSetIterator iter(*tailored);
926 while(iter.next()) {
927 const UnicodeString &s = iter.getString();
928 ces.removeAllElements();
929 rbc->internalGetCEs(s, ces, errorCode);
930 errorCode.assertSuccess();
931 for(int32_t i = 0; i < ces.size(); ++i) {
932 int64_t ce = ces.elementAti(i);
933 if(!isValidCE(rootElements, *root, ce)) {
934 errln("invalid tailored CE %016llx at CE index %d from string:",
935 (long long)ce, (int)i);
936 infoln(prettify(s));
937 }
938 }
939 }
940 }
941 } while((localeID = locales->next(NULL, errorCode)) != NULL);
942 uhash_close(prevLocales);
943 }
944
printSortKey(const uint8_t * p,int32_t length)945 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
946 UnicodeString s;
947 for(int32_t i = 0; i < length; ++i) {
948 if(i > 0) { s.append((UChar)0x20); }
949 uint8_t b = p[i];
950 if(b == 0) {
951 s.append((UChar)0x2e); // period
952 } else if(b == 1) {
953 s.append((UChar)0x7c); // vertical bar
954 } else {
955 appendHex(b, 2, s);
956 }
957 }
958 return s;
959 }
960
printCollationKey(const CollationKey & key)961 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
962 int32_t length;
963 const uint8_t *p = key.getByteArray(length);
964 return printSortKey(p, length);
965 }
966
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)967 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
968 for(;;) {
969 int32_t lineLength;
970 const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
971 if(line == NULL || errorCode.isFailure()) {
972 fileLine.remove();
973 return FALSE;
974 }
975 ++fileLineNumber;
976 // Strip trailing CR/LF, comments, and spaces.
977 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
978 if(comment != NULL) {
979 lineLength = (int32_t)(comment - line);
980 } else {
981 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
982 }
983 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
984 if(lineLength != 0) {
985 fileLine.setTo(FALSE, line, lineLength);
986 return TRUE;
987 }
988 // Empty line, continue.
989 }
990 }
991
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)992 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
993 UErrorCode &errorCode) {
994 int32_t length = fileLine.length();
995 int32_t i;
996 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
997 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|'
998 if(pipeIndex >= 0) {
999 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1000 if(prefix.isEmpty()) {
1001 errln("empty prefix on line %d", (int)fileLineNumber);
1002 infoln(fileLine);
1003 errorCode = U_PARSE_ERROR;
1004 return;
1005 }
1006 start = pipeIndex + 1;
1007 } else {
1008 prefix.remove();
1009 }
1010 s = fileLine.tempSubStringBetween(start, i).unescape();
1011 if(s.isEmpty()) {
1012 errln("empty string on line %d", (int)fileLineNumber);
1013 infoln(fileLine);
1014 errorCode = U_PARSE_ERROR;
1015 return;
1016 }
1017 start = i;
1018 }
1019
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1020 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1021 Collation::Level relation;
1022 int32_t start;
1023 if(fileLine[0] == 0x3c) { // <
1024 UChar second = fileLine[1];
1025 start = 2;
1026 switch(second) {
1027 case 0x31: // <1
1028 relation = Collation::PRIMARY_LEVEL;
1029 break;
1030 case 0x32: // <2
1031 relation = Collation::SECONDARY_LEVEL;
1032 break;
1033 case 0x33: // <3
1034 relation = Collation::TERTIARY_LEVEL;
1035 break;
1036 case 0x34: // <4
1037 relation = Collation::QUATERNARY_LEVEL;
1038 break;
1039 case 0x63: // <c
1040 relation = Collation::CASE_LEVEL;
1041 break;
1042 case 0x69: // <i
1043 relation = Collation::IDENTICAL_LEVEL;
1044 break;
1045 default: // just <
1046 relation = Collation::NO_LEVEL;
1047 start = 1;
1048 break;
1049 }
1050 } else if(fileLine[0] == 0x3d) { // =
1051 relation = Collation::ZERO_LEVEL;
1052 start = 1;
1053 } else {
1054 start = 0;
1055 }
1056 if(start == 0 || !isSpace(fileLine[start])) {
1057 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1058 infoln(fileLine);
1059 errorCode.set(U_PARSE_ERROR);
1060 return Collation::NO_LEVEL;
1061 }
1062 start = skipSpaces(start);
1063 UnicodeString prefix;
1064 parseString(start, prefix, s, errorCode);
1065 if(errorCode.isSuccess() && !prefix.isEmpty()) {
1066 errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1067 infoln(fileLine);
1068 errorCode.set(U_PARSE_ERROR);
1069 return Collation::NO_LEVEL;
1070 }
1071 if(start < fileLine.length()) {
1072 errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1073 infoln(fileLine);
1074 errorCode.set(U_PARSE_ERROR);
1075 return Collation::NO_LEVEL;
1076 }
1077 return relation;
1078 }
1079
/**
 * Attribute names accepted on "name=value" settings lines, each paired with
 * the UColAttribute that parseAndSetAttribute() passes to setAttribute().
 * Names are matched exactly against the text before the '='.
 */
static const struct {
    const char *name;
    UColAttribute attr;
} attributes[] = {
    { "backwards", UCOL_FRENCH_COLLATION },
    { "alternate", UCOL_ALTERNATE_HANDLING },
    { "caseFirst", UCOL_CASE_FIRST },
    { "caseLevel", UCOL_CASE_LEVEL },
    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
    { "strength", UCOL_STRENGTH },
    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
    { "numeric", UCOL_NUMERIC_COLLATION }
};
1093
/**
 * Attribute value names accepted on "name=value" settings lines, each paired
 * with the UColAttributeValue that parseAndSetAttribute() passes to setAttribute().
 * Names are matched exactly against the text after the '='.
 * The table does not record which values are legal for which attribute;
 * an illegal combination is reported when setAttribute() fails.
 */
static const struct {
    const char *name;
    UColAttributeValue value;
} attributeValues[] = {
    { "default", UCOL_DEFAULT },
    { "primary", UCOL_PRIMARY },
    { "secondary", UCOL_SECONDARY },
    { "tertiary", UCOL_TERTIARY },
    { "quaternary", UCOL_QUATERNARY },
    { "identical", UCOL_IDENTICAL },
    { "off", UCOL_OFF },
    { "on", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};
1111
parseAndSetAttribute(IcuTestErrorCode & errorCode)1112 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1113 // Parse attributes even if the Collator could not be created,
1114 // in order to report syntax errors.
1115 int32_t start = skipSpaces(1);
1116 int32_t equalPos = fileLine.indexOf((UChar)0x3d);
1117 if(equalPos < 0) {
1118 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1119 parseAndSetReorderCodes(start + 7, errorCode);
1120 return;
1121 }
1122 errln("missing '=' on line %d", (int)fileLineNumber);
1123 infoln(fileLine);
1124 errorCode.set(U_PARSE_ERROR);
1125 return;
1126 }
1127
1128 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1129 UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1130 if(attrString == UNICODE_STRING("maxVariable", 11)) {
1131 UColReorderCode max;
1132 if(valueString == UNICODE_STRING("space", 5)) {
1133 max = UCOL_REORDER_CODE_SPACE;
1134 } else if(valueString == UNICODE_STRING("punct", 5)) {
1135 max = UCOL_REORDER_CODE_PUNCTUATION;
1136 } else if(valueString == UNICODE_STRING("symbol", 6)) {
1137 max = UCOL_REORDER_CODE_SYMBOL;
1138 } else if(valueString == UNICODE_STRING("currency", 8)) {
1139 max = UCOL_REORDER_CODE_CURRENCY;
1140 } else {
1141 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1142 infoln(fileLine);
1143 errorCode.set(U_PARSE_ERROR);
1144 return;
1145 }
1146 if(coll != NULL) {
1147 coll->setMaxVariable(max, errorCode);
1148 if(errorCode.isFailure()) {
1149 errln("setMaxVariable() failed on line %d: %s",
1150 (int)fileLineNumber, errorCode.errorName());
1151 infoln(fileLine);
1152 return;
1153 }
1154 }
1155 fileLine.remove();
1156 return;
1157 }
1158
1159 UColAttribute attr;
1160 for(int32_t i = 0;; ++i) {
1161 if(i == UPRV_LENGTHOF(attributes)) {
1162 errln("invalid attribute name on line %d", (int)fileLineNumber);
1163 infoln(fileLine);
1164 errorCode.set(U_PARSE_ERROR);
1165 return;
1166 }
1167 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1168 attr = attributes[i].attr;
1169 break;
1170 }
1171 }
1172
1173 UColAttributeValue value;
1174 for(int32_t i = 0;; ++i) {
1175 if(i == UPRV_LENGTHOF(attributeValues)) {
1176 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1177 infoln(fileLine);
1178 errorCode.set(U_PARSE_ERROR);
1179 return;
1180 }
1181 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1182 value = attributeValues[i].value;
1183 break;
1184 }
1185 }
1186
1187 if(coll != NULL) {
1188 coll->setAttribute(attr, value, errorCode);
1189 if(errorCode.isFailure()) {
1190 errln("illegal attribute=value combination on line %d: %s",
1191 (int)fileLineNumber, errorCode.errorName());
1192 infoln(fileLine);
1193 return;
1194 }
1195 }
1196 fileLine.remove();
1197 }
1198
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1199 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1200 UVector32 reorderCodes(errorCode);
1201 while(start < fileLine.length()) {
1202 start = skipSpaces(start);
1203 int32_t limit = start;
1204 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1205 CharString name;
1206 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1207 int32_t code = CollationRuleParser::getReorderCode(name.data());
1208 if(code < 0) {
1209 if(uprv_stricmp(name.data(), "default") == 0) {
1210 code = UCOL_REORDER_CODE_DEFAULT; // -1
1211 } else {
1212 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1213 infoln(fileLine);
1214 errorCode.set(U_PARSE_ERROR);
1215 return;
1216 }
1217 }
1218 reorderCodes.addElement(code, errorCode);
1219 start = limit;
1220 }
1221 if(coll != NULL) {
1222 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1223 if(errorCode.isFailure()) {
1224 errln("setReorderCodes() failed on line %d: %s",
1225 (int)fileLineNumber, errorCode.errorName());
1226 infoln(fileLine);
1227 return;
1228 }
1229 }
1230 fileLine.remove();
1231 }
1232
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1233 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1234 UnicodeString rules;
1235 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1236 rules.append(fileLine.unescape());
1237 }
1238 if(errorCode.isFailure()) { return; }
1239 logln(rules);
1240
1241 UParseError parseError;
1242 UnicodeString reason;
1243 delete coll;
1244 coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1245 if(coll == NULL) {
1246 errln("unable to allocate a new collator");
1247 errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1248 return;
1249 }
1250 if(errorCode.isFailure()) {
1251 dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1252 infoln(UnicodeString(" reason: ") + reason);
1253 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseError.offset); }
1254 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1255 infoln(UnicodeString(" snippet: ...") +
1256 parseError.preContext + "(!)" + parseError.postContext + "...");
1257 }
1258 delete coll;
1259 coll = NULL;
1260 errorCode.reset();
1261 } else {
1262 assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1263 UnicodeString(), reason);
1264 }
1265 }
1266
setRootCollator(IcuTestErrorCode & errorCode)1267 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1268 if(errorCode.isFailure()) { return; }
1269 delete coll;
1270 coll = Collator::createInstance(Locale::getRoot(), errorCode);
1271 if(errorCode.isFailure()) {
1272 dataerrln("unable to create a root collator");
1273 return;
1274 }
1275 }
1276
setLocaleCollator(IcuTestErrorCode & errorCode)1277 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1278 if(errorCode.isFailure()) { return; }
1279 delete coll;
1280 coll = NULL;
1281 int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant
1282 if(at >= 0) {
1283 fileLine.setCharAt(at, (UChar)0x2a); // *
1284 }
1285 CharString localeID;
1286 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1287 if(at >= 0) {
1288 localeID.data()[at - 9] = '@';
1289 }
1290 Locale locale(localeID.data());
1291 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1292 errln("invalid language tag on line %d", (int)fileLineNumber);
1293 infoln(fileLine);
1294 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1295 return;
1296 }
1297
1298 logln("creating a collator for locale ID %s", locale.getName());
1299 coll = Collator::createInstance(locale, errorCode);
1300 if(errorCode.isFailure()) {
1301 dataerrln("unable to create a collator for locale %s on line %d",
1302 locale.getName(), (int)fileLineNumber);
1303 infoln(fileLine);
1304 delete coll;
1305 coll = NULL;
1306 errorCode.reset();
1307 }
1308 }
1309
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1310 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1311 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1312 // In some sequences with Tibetan composite vowel signs,
1313 // even if the string passes the FCD check,
1314 // those composites must be decomposed.
1315 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1316 int32_t index = 0;
1317 while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1318 if(++index < s.length()) {
1319 UChar c = s[index];
1320 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1321 }
1322 }
1323 return FALSE;
1324 }
1325
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1326 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1327 CharString &dest, int32_t partSize,
1328 IcuTestErrorCode &errorCode) {
1329 if(errorCode.isFailure()) { return FALSE; }
1330 uint8_t part[32];
1331 U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1332 UCharIterator iter;
1333 uiter_setString(&iter, s, length);
1334 uint32_t state[2] = { 0, 0 };
1335 for(;;) {
1336 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1337 UBool done = partLength < partSize;
1338 if(done) {
1339 // At the end, append the next byte as well which should be 00.
1340 ++partLength;
1341 }
1342 dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1343 if(done) {
1344 return errorCode.isSuccess();
1345 }
1346 }
1347 }
1348
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1349 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1350 const UChar *s, int32_t length,
1351 CollationKey &key, IcuTestErrorCode &errorCode) {
1352 if(errorCode.isFailure()) { return FALSE; }
1353 coll->getCollationKey(s, length, key, errorCode);
1354 if(errorCode.isFailure()) {
1355 infoln(fileTestName);
1356 errln("Collator(%s).getCollationKey() failed: %s",
1357 norm, errorCode.errorName());
1358 infoln(line);
1359 return FALSE;
1360 }
1361 int32_t keyLength;
1362 const uint8_t *keyBytes = key.getByteArray(keyLength);
1363 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1364 infoln(fileTestName);
1365 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1366 norm);
1367 infoln(line);
1368 infoln(printCollationKey(key));
1369 return FALSE;
1370 }
1371
1372 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1373 if(numLevels < UCOL_IDENTICAL) {
1374 ++numLevels;
1375 } else {
1376 numLevels = 5;
1377 }
1378 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1379 ++numLevels;
1380 }
1381 errorCode.assertSuccess();
1382 int32_t numLevelSeparators = 0;
1383 for(int32_t i = 0; i < (keyLength - 1); ++i) {
1384 uint8_t b = keyBytes[i];
1385 if(b == 0) {
1386 infoln(fileTestName);
1387 errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1388 infoln(line);
1389 infoln(printCollationKey(key));
1390 return FALSE;
1391 }
1392 if(b == 1) { ++numLevelSeparators; }
1393 }
1394 if(numLevelSeparators != (numLevels - 1)) {
1395 infoln(fileTestName);
1396 errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1397 norm, (int)numLevelSeparators, (int)numLevels);
1398 infoln(line);
1399 infoln(printCollationKey(key));
1400 return FALSE;
1401 }
1402
1403 // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1404 static const int32_t partSizes[] = { 32, 3, 1 };
1405 for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1406 int32_t partSize = partSizes[psi];
1407 CharString parts;
1408 if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1409 infoln(fileTestName);
1410 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1411 norm, (int)partSize, errorCode.errorName());
1412 infoln(line);
1413 return FALSE;
1414 }
1415 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1416 infoln(fileTestName);
1417 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1418 norm, (int)partSize);
1419 infoln(line);
1420 infoln(printCollationKey(key));
1421 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1422 return FALSE;
1423 }
1424 }
1425 return TRUE;
1426 }
1427
1428 /**
1429 * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1430 * Leaves key unchanged if s does not contain U+FFFE.
1431 * @return TRUE if the key was successfully changed
1432 */
getMergedCollationKey(const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1433 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1434 CollationKey &key, IcuTestErrorCode &errorCode) {
1435 if(errorCode.isFailure()) { return FALSE; }
1436 LocalMemory<uint8_t> mergedKey;
1437 int32_t mergedKeyLength = 0;
1438 int32_t mergedKeyCapacity = 0;
1439 int32_t sLength = (length >= 0) ? length : u_strlen(s);
1440 int32_t segmentStart = 0;
1441 for(int32_t i = 0;;) {
1442 if(i == sLength) {
1443 if(segmentStart == 0) {
1444 // s does not contain any U+FFFE.
1445 return FALSE;
1446 }
1447 } else if(s[i] != 0xfffe) {
1448 ++i;
1449 continue;
1450 }
1451 // Get the sort key for another segment and merge it into mergedKey.
1452 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the bytes
1453 CollationKey key2;
1454 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1455 int32_t key1Length, key2Length;
1456 const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1457 const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1458 uint8_t *dest;
1459 int32_t minCapacity = key1Length + key2Length;
1460 if(key1Length > 0) { --minCapacity; }
1461 if(minCapacity <= mergedKeyCapacity) {
1462 dest = mergedKey.getAlias();
1463 } else {
1464 if(minCapacity <= 200) {
1465 mergedKeyCapacity = 200;
1466 } else if(minCapacity <= 2 * mergedKeyCapacity) {
1467 mergedKeyCapacity *= 2;
1468 } else {
1469 mergedKeyCapacity = minCapacity;
1470 }
1471 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1472 }
1473 U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1474 if(key1Length == 0) {
1475 // key2 is the sort key for the first segment.
1476 uprv_memcpy(dest, key2Bytes, key2Length);
1477 mergedKeyLength = key2Length;
1478 } else {
1479 mergedKeyLength =
1480 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1481 dest, mergedKeyCapacity);
1482 }
1483 if(i == sLength) { break; }
1484 segmentStart = ++i;
1485 }
1486 key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1487 return TRUE;
1488 }
1489
1490 namespace {
1491
1492 /**
1493 * Replaces unpaired surrogates with U+FFFD.
1494 * Returns s if no replacement was made, otherwise buffer.
1495 */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1496 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1497 int32_t i = 0;
1498 while(i < s.length()) {
1499 UChar32 c = s.char32At(i);
1500 if(U_IS_SURROGATE(c)) {
1501 if(buffer.length() < i) {
1502 buffer.append(s, buffer.length(), i - buffer.length());
1503 }
1504 buffer.append((UChar)0xfffd);
1505 }
1506 i += U16_LENGTH(c);
1507 }
1508 if(buffer.isEmpty()) {
1509 return s;
1510 }
1511 if(buffer.length() < i) {
1512 buffer.append(s, buffer.length(), i - buffer.length());
1513 }
1514 return buffer;
1515 }
1516
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1517 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1518 UCollationResult order, UBool collHasCaseLevel) {
1519 if(order == UCOL_EQUAL) {
1520 return Collation::NO_LEVEL;
1521 }
1522 int32_t prevKeyLength;
1523 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1524 int32_t keyLength;
1525 const uint8_t *bytes = key.getByteArray(keyLength);
1526 int32_t level = Collation::PRIMARY_LEVEL;
1527 for(int32_t i = 0;; ++i) {
1528 uint8_t b = prevBytes[i];
1529 if(b != bytes[i]) { break; }
1530 if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1531 ++level;
1532 if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1533 ++level;
1534 }
1535 }
1536 }
1537 return level;
1538 }
1539
1540 }
1541
/**
 * Compares two strings with coll through every comparison API exercised by this
 * test and verifies that each one yields the expected order:
 * compare(UnicodeString), compare(NUL-terminated UTF-16), compareUTF8(),
 * internalCompareUTF8(NUL-terminated), compare(UCharIterator),
 * CollationKey::compareTo(), and the comparison of keys merged from
 * U+FFFE-separated segments.
 * Where a reversed-argument call is made, it must yield -expectedOrder.
 * The first mismatch is reported with errln()/infoln(), including both
 * file lines and both sort keys, and ends the check.
 *
 * @param norm label for the normalization mode, used only in messages
 * @param prevFileLine the test file line of prevString, used only in messages
 * @param prevString the previous (left) string; its getBuffer() is passed with
 *                   length -1 when neither string contains a NUL
 * @param s the current (right) string, used the same way
 * @param expectedOrder the expected result of comparing prevString with s
 * @param expectedLevel the expected level of the first sort key difference,
 *                      or Collation::NO_LEVEL to skip the level check
 * @param errorCode ICU error code; a failure counts as a mismatch
 * @return TRUE if all checks passed
 */
UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
                                     const UnicodeString &prevString, const UnicodeString &s,
                                     UCollationResult expectedOrder, Collation::Level expectedLevel,
                                     IcuTestErrorCode &errorCode) {
    if(errorCode.isFailure()) { return FALSE; }

    // Get the sort keys first, for error debug output.
    CollationKey prevKey;
    if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
                        prevKey, errorCode)) {
        return FALSE;
    }
    CollationKey key;
    if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }

    // compare(UnicodeString, UnicodeString), in both directions.
    UCollationResult order = coll->compare(prevString, s, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    order = coll->compare(s, prevString, errorCode);
    if(order != -expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // Test NUL-termination if the strings do not contain NUL characters.
    UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
    if(!containNUL) {
        // Length -1: the buffers are passed as NUL-terminated strings.
        order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
        if(order != expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
        if(order != -expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

    // compare(UTF-16) treats unpaired surrogates like unassigned code points.
    // Unpaired surrogates cannot be converted to UTF-8.
    // Create valid UTF-16 strings if necessary, and use those for
    // both the expected compare() result and for the input to compare(UTF-8).
    UnicodeString prevBuffer, sBuffer;
    const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
    const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
    std::string prevUTF8, sUTF8;
    UnicodeString(prevValid).toUTF8String(prevUTF8);
    UnicodeString(sValid).toUTF8String(sUTF8);
    UCollationResult expectedUTF8Order;
    // surrogatesToFFFD() returns its input object itself when nothing was replaced.
    if(&prevValid == &prevString && &sValid == &s) {
        expectedUTF8Order = expectedOrder;
    } else {
        expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
    }

    // compareUTF8(), in both directions.
    order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
    if(order != expectedUTF8Order || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
    if(order != -expectedUTF8Order || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // Test NUL-termination if the strings do not contain NUL characters.
    if(!containNUL) {
        order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
        if(order != expectedUTF8Order || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
        if(order != -expectedUTF8Order || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

    // compare(UCharIterator, UCharIterator), in one direction only.
    UCharIterator leftIter;
    UCharIterator rightIter;
    uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
    uiter_setString(&rightIter, s.getBuffer(), s.length());
    order = coll->compare(leftIter, rightIter, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
              "wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }

    // The sort keys must compare the same way as the strings.
    order = prevKey.compareTo(key, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // If a specific level is expected, the keys must first differ at that level.
    UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
    int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
    if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
        if(level != expectedLevel) {
            infoln(fileTestName);
            errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
                  (int)fileLineNumber, norm, order, level, expectedLevel);
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

    // If either string contains U+FFFE, then their sort keys must compare the same as
    // the merged sort keys of each string's between-FFFE segments.
    //
    // It is not required that
    // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
    // only that those two methods yield the same order.
    //
    // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
    // Note: on success these calls overwrite prevKey and key with the merged keys.
    if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
                getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
            errorCode.isFailure()) {
        order = prevKey.compareTo(key, errorCode);
        if(order != expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
                  "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        // The merged keys must differ at the same level as the unmerged keys did.
        int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
        if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
            if(mergedLevel != level) {
                infoln(fileTestName);
                errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
                      "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
                      (int)fileLineNumber, norm, order, mergedLevel, level);
                infoln(prevFileLine);
                infoln(fileLine);
                infoln(printCollationKey(prevKey));
                infoln(printCollationKey(key));
                return FALSE;
            }
        }
    }
    return TRUE;
}
1754
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1755 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1756 if(errorCode.isFailure()) { return; }
1757 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1758 UnicodeString prevString, s;
1759 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1760 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1761 // Parse the line even if it will be ignored (when we do not have a Collator)
1762 // in order to report syntax issues.
1763 Collation::Level relation = parseRelationAndString(s, errorCode);
1764 if(errorCode.isFailure()) {
1765 errorCode.reset();
1766 break;
1767 }
1768 if(coll == NULL) {
1769 // We were unable to create the Collator but continue with tests.
1770 // Ignore test data for this Collator.
1771 // The next Collator creation might work.
1772 continue;
1773 }
1774 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1775 Collation::Level expectedLevel = relation;
1776 s.getTerminatedBuffer(); // Ensure NUL-termination.
1777 UBool isOk = TRUE;
1778 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1779 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1780 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1781 expectedOrder, expectedLevel, errorCode);
1782 }
1783 if(isOk) {
1784 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1785 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1786 expectedOrder, expectedLevel, errorCode);
1787 }
1788 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1789 UnicodeString pn = nfd->normalize(prevString, errorCode);
1790 UnicodeString n = nfd->normalize(s, errorCode);
1791 pn.getTerminatedBuffer();
1792 n.getTerminatedBuffer();
1793 errorCode.assertSuccess();
1794 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1795 expectedOrder, expectedLevel, errorCode);
1796 }
1797 if(!isOk) {
1798 errorCode.reset(); // already reported
1799 }
1800 prevFileLine = fileLine;
1801 prevString = s;
1802 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1803 }
1804 }
1805
TestDataDriven()1806 void CollationTest::TestDataDriven() {
1807 IcuTestErrorCode errorCode(*this, "TestDataDriven");
1808
1809 fcd = Normalizer2Factory::getFCDInstance(errorCode);
1810 nfd = Normalizer2::getNFDInstance(errorCode);
1811 if(errorCode.errDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1812 return;
1813 }
1814
1815 CharString path(getSourceTestData(errorCode), errorCode);
1816 path.appendPathPart("collationtest.txt", errorCode);
1817 const char *codePage = "UTF-8";
1818 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1819 if(errorCode.errIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1820 return;
1821 }
1822 // Read a new line if necessary.
1823 // Sub-parsers leave the first line set that they do not handle.
1824 while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1825 if(!isSectionStarter(fileLine[0])) {
1826 errln("syntax error on line %d", (int)fileLineNumber);
1827 infoln(fileLine);
1828 return;
1829 }
1830 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1831 fileTestName = fileLine;
1832 logln(fileLine);
1833 fileLine.remove();
1834 } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1835 setRootCollator(errorCode);
1836 fileLine.remove();
1837 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1838 setLocaleCollator(errorCode);
1839 fileLine.remove();
1840 } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1841 buildTailoring(f.getAlias(), errorCode);
1842 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // %
1843 parseAndSetAttribute(errorCode);
1844 } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1845 checkCompareStrings(f.getAlias(), errorCode);
1846 } else {
1847 errln("syntax error on line %d", (int)fileLineNumber);
1848 infoln(fileLine);
1849 return;
1850 }
1851 }
1852 }
1853
1854 #endif // !UCONFIG_NO_COLLATION
1855