1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationtest.cpp
7 *
8 * created on: 2012apr27
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "unicode/coll.h"
17 #include "unicode/errorcode.h"
18 #include "unicode/localpointer.h"
19 #include "unicode/normalizer2.h"
20 #include "unicode/sortkey.h"
21 #include "unicode/std_string.h"
22 #include "unicode/strenum.h"
23 #include "unicode/tblcoll.h"
24 #include "unicode/uiter.h"
25 #include "unicode/uniset.h"
26 #include "unicode/unistr.h"
27 #include "unicode/usetiter.h"
28 #include "unicode/ustring.h"
29 #include "charstr.h"
30 #include "cmemory.h"
31 #include "collation.h"
32 #include "collationdata.h"
33 #include "collationfcd.h"
34 #include "collationiterator.h"
35 #include "collationroot.h"
36 #include "collationrootelements.h"
37 #include "collationruleparser.h"
38 #include "collationweights.h"
39 #include "cstring.h"
40 #include "intltest.h"
41 #include "normalizer2impl.h"
42 #include "ucbuf.h"
43 #include "uhash.h"
44 #include "uitercollationiterator.h"
45 #include "utf16collationiterator.h"
46 #include "utf8collationiterator.h"
47 #include "uvectr32.h"
48 #include "uvectr64.h"
49 #include "writesrc.h"
50
51 class CodePointIterator;
52
53 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
54
55 class CollationTest : public IntlTest {
56 public:
CollationTest()57 CollationTest()
58 : fcd(NULL), nfd(NULL),
59 fileLineNumber(0),
60 coll(NULL) {}
61
~CollationTest()62 ~CollationTest() {
63 delete coll;
64 }
65
66 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
67
68 void TestMinMax();
69 void TestImplicits();
70 void TestNulTerminated();
71 void TestIllegalUTF8();
72 void TestShortFCDData();
73 void TestFCD();
74 void TestCollationWeights();
75 void TestRootElements();
76 void TestTailoredElements();
77 void TestDataDriven();
78
79 private:
80 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
81 void checkAllocWeights(CollationWeights &cw,
82 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
83 int32_t someLength, int32_t minCount);
84
85 static UnicodeString printSortKey(const uint8_t *p, int32_t length);
86 static UnicodeString printCollationKey(const CollationKey &key);
87
88 // Helpers & fields for data-driven test.
isCROrLF(UChar c)89 static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
isSpace(UChar c)90 static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
isSectionStarter(UChar c)91 static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; } // %*@
skipSpaces(int32_t i)92 int32_t skipSpaces(int32_t i) {
93 while(isSpace(fileLine[i])) { ++i; }
94 return i;
95 }
96
97 UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
98 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
99 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
100 void parseAndSetAttribute(IcuTestErrorCode &errorCode);
101 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
102 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
103 void setRootCollator(IcuTestErrorCode &errorCode);
104 void setLocaleCollator(IcuTestErrorCode &errorCode);
105
106 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
107
108 UBool getSortKeyParts(const UChar *s, int32_t length,
109 CharString &dest, int32_t partSize,
110 IcuTestErrorCode &errorCode);
111 UBool getCollationKey(const char *norm, const UnicodeString &line,
112 const UChar *s, int32_t length,
113 CollationKey &key, IcuTestErrorCode &errorCode);
114 UBool getMergedCollationKey(const UChar *s, int32_t length,
115 CollationKey &key, IcuTestErrorCode &errorCode);
116 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
117 const UnicodeString &prevString, const UnicodeString &s,
118 UCollationResult expectedOrder, Collation::Level expectedLevel,
119 IcuTestErrorCode &errorCode);
120 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
121
122 const Normalizer2 *fcd, *nfd;
123 UnicodeString fileLine;
124 int32_t fileLineNumber;
125 UnicodeString fileTestName;
126 Collator *coll;
127 };
128
createCollationTest()129 extern IntlTest *createCollationTest() {
130 return new CollationTest();
131 }
132
/**
 * Test dispatcher for this suite.
 * Logs a suite header when exec is set, then hands control to the
 * TESTCASE_AUTO* macros, which are given each test method in a fixed order.
 *
 * NOTE(review): the TESTCASE_AUTO* macros are defined outside this file;
 * presumably they map index to one test, store its name in `name`, and run it
 * when exec is TRUE -- confirm against the test framework header.
 * The fourth parameter is unused here.
 */
void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
    if(exec) {
        logln("TestSuite CollationTest: ");
    }
    // Keep this list in sync with the Test*() declarations in the class;
    // the macros depend on the order of these lines.
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestMinMax);
    TESTCASE_AUTO(TestImplicits);
    TESTCASE_AUTO(TestNulTerminated);
    TESTCASE_AUTO(TestIllegalUTF8);
    TESTCASE_AUTO(TestShortFCDData);
    TESTCASE_AUTO(TestFCD);
    TESTCASE_AUTO(TestCollationWeights);
    TESTCASE_AUTO(TestRootElements);
    TESTCASE_AUTO(TestTailoredElements);
    TESTCASE_AUTO(TestDataDriven);
    TESTCASE_AUTO_END;
}
150
TestMinMax()151 void CollationTest::TestMinMax() {
152 IcuTestErrorCode errorCode(*this, "TestMinMax");
153
154 setRootCollator(errorCode);
155 if(errorCode.isFailure()) {
156 errorCode.reset();
157 return;
158 }
159 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
160 if(rbc == NULL) {
161 errln("the root collator is not a RuleBasedCollator");
162 return;
163 }
164
165 static const UChar s[2] = { 0xfffe, 0xffff };
166 UVector64 ces(errorCode);
167 rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
168 errorCode.assertSuccess();
169 if(ces.size() != 2) {
170 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
171 return;
172 }
173 int64_t ce = ces.elementAti(0);
174 int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
175 if(ce != expected) {
176 errln("CE(U+fffe)=%04lx != 02..", (long)ce);
177 }
178
179 ce = ces.elementAti(1);
180 expected = Collation::makeCE(Collation::MAX_PRIMARY);
181 if(ce != expected) {
182 errln("CE(U+ffff)=%04lx != max..", (long)ce);
183 }
184 }
185
TestImplicits()186 void CollationTest::TestImplicits() {
187 IcuTestErrorCode errorCode(*this, "TestImplicits");
188
189 const CollationData *cd = CollationRoot::getData(errorCode);
190 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
191 return;
192 }
193
194 // Implicit primary weights should be assigned for the following sets,
195 // and sort in ascending order by set and then code point.
196 // See http://www.unicode.org/reports/tr10/#Implicit_Weights
197
198 // core Han Unified Ideographs
199 UnicodeSet coreHan("[\\p{unified_ideograph}&"
200 "[\\p{Block=CJK_Unified_Ideographs}"
201 "\\p{Block=CJK_Compatibility_Ideographs}]]",
202 errorCode);
203 // all other Unified Han ideographs
204 UnicodeSet otherHan("[\\p{unified ideograph}-"
205 "[\\p{Block=CJK_Unified_Ideographs}"
206 "\\p{Block=CJK_Compatibility_Ideographs}]]",
207 errorCode);
208 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
209 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
210
211 // Starting with CLDR 26/ICU 54, the root Han order may instead be
212 // the Unihan radical-stroke order.
213 // The tests should pass either way, so we only test the order of a small set of Han characters
214 // whose radical-stroke order is the same as their code point order.
215 UnicodeSet someHanInCPOrder(
216 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
217 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
218 errorCode);
219 UnicodeSet inOrder(someHanInCPOrder);
220 inOrder.addAll(unassigned).freeze();
221 if(errorCode.logIfFailureAndReset("UnicodeSet")) {
222 return;
223 }
224 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
225 UChar32 prev = 0;
226 uint32_t prevPrimary = 0;
227 UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
228 for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
229 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
230 while(iter->next()) {
231 UChar32 c = iter->getCodepoint();
232 UnicodeString s(c);
233 ci.setText(s.getBuffer(), s.getBuffer() + s.length());
234 int64_t ce = ci.nextCE(errorCode);
235 int64_t ce2 = ci.nextCE(errorCode);
236 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
237 return;
238 }
239 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
240 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
241 continue;
242 }
243 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
244 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
245 (long)c, (long)(ce & 0xffffffff));
246 continue;
247 }
248 uint32_t primary = (uint32_t)(ce >> 32);
249 if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
250 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
251 (long)c, (long)primary, (long)prev, (long)prevPrimary);
252 }
253 prev = c;
254 prevPrimary = primary;
255 }
256 }
257 }
258
TestNulTerminated()259 void CollationTest::TestNulTerminated() {
260 IcuTestErrorCode errorCode(*this, "TestNulTerminated");
261 const CollationData *data = CollationRoot::getData(errorCode);
262 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
263 return;
264 }
265
266 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
267
268 UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
269 UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
270 for(int32_t i = 0;; ++i) {
271 int64_t ce1 = ci1.nextCE(errorCode);
272 int64_t ce2 = ci2.nextCE(errorCode);
273 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
274 return;
275 }
276 if(ce1 != ce2) {
277 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
278 break;
279 }
280 if(ce1 == Collation::NO_CE) { break; }
281 }
282 }
283
TestIllegalUTF8()284 void CollationTest::TestIllegalUTF8() {
285 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
286
287 setRootCollator(errorCode);
288 if(errorCode.isFailure()) {
289 errorCode.reset();
290 return;
291 }
292 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
293
294 static const char *strings[] = {
295 // U+FFFD
296 "a\xef\xbf\xbdz",
297 // illegal byte sequences
298 "a\x80z", // trail byte
299 "a\xc1\x81z", // non-shortest form
300 "a\xe0\x82\x83z", // non-shortest form
301 "a\xed\xa0\x80z", // lead surrogate: would be U+D800
302 "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
303 "a\xf0\x8f\xbf\xbfz", // non-shortest form
304 "a\xf4\x90\x80\x80z" // out of range: would be U+110000
305 };
306
307 StringPiece fffd(strings[0]);
308 for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
309 StringPiece illegal(strings[i]);
310 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
311 if(order != UCOL_EQUAL) {
312 errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
313 (int)i, order);
314 }
315 }
316 }
317
318 namespace {
319
addLeadSurrogatesForSupplementary(const UnicodeSet & src,UnicodeSet & dest)320 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
321 for(UChar32 c = 0x10000; c < 0x110000;) {
322 UChar32 next = c + 0x400;
323 if(src.containsSome(c, next - 1)) {
324 dest.add(U16_LEAD(c));
325 }
326 c = next;
327 }
328 }
329
330 } // namespace
331
TestShortFCDData()332 void CollationTest::TestShortFCDData() {
333 // See CollationFCD class comments.
334 IcuTestErrorCode errorCode(*this, "TestShortFCDData");
335 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
336 errorCode.assertSuccess();
337 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates
338 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
339 UnicodeSet lccc; // actual
340 for(UChar32 c = 0; c <= 0xffff; ++c) {
341 if(CollationFCD::hasLccc(c)) { lccc.add(c); }
342 }
343 UnicodeSet diff(expectedLccc);
344 diff.removeAll(lccc);
345 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
346 UnicodeString empty("[]");
347 UnicodeString diffString;
348 diff.toPattern(diffString, TRUE);
349 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
350 diff = lccc;
351 diff.removeAll(expectedLccc);
352 diff.toPattern(diffString, TRUE);
353 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
354
355 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
356 if (errorCode.isSuccess()) {
357 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
358 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
359 UnicodeSet tccc; // actual
360 for(UChar32 c = 0; c <= 0xffff; ++c) {
361 if(CollationFCD::hasTccc(c)) { tccc.add(c); }
362 }
363 diff = expectedTccc;
364 diff.removeAll(tccc);
365 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
366 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
367 diff = tccc;
368 diff.removeAll(expectedTccc);
369 diff.toPattern(diffString, TRUE);
370 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
371 }
372 }
373
374 class CodePointIterator {
375 public:
CodePointIterator(const UChar32 * cp,int32_t length)376 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
resetToStart()377 void resetToStart() { pos = 0; }
next()378 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
previous()379 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
getLength() const380 int32_t getLength() const { return length; }
getIndex() const381 int getIndex() const { return (int)pos; }
382 private:
383 const UChar32 *cp;
384 int32_t length;
385 int32_t pos;
386 };
387
checkFCD(const char * name,CollationIterator & ci,CodePointIterator & cpi)388 void CollationTest::checkFCD(const char *name,
389 CollationIterator &ci, CodePointIterator &cpi) {
390 IcuTestErrorCode errorCode(*this, "checkFCD");
391
392 // Iterate forward to the limit.
393 for(;;) {
394 UChar32 c1 = ci.nextCodePoint(errorCode);
395 UChar32 c2 = cpi.next();
396 if(c1 != c2) {
397 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
398 name, (long)c1, (long)c2, cpi.getIndex());
399 return;
400 }
401 if(c1 < 0) { break; }
402 }
403
404 // Iterate backward most of the way.
405 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
406 UChar32 c1 = ci.previousCodePoint(errorCode);
407 UChar32 c2 = cpi.previous();
408 if(c1 != c2) {
409 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
410 name, (long)c1, (long)c2, cpi.getIndex());
411 return;
412 }
413 }
414
415 // Forward again.
416 for(;;) {
417 UChar32 c1 = ci.nextCodePoint(errorCode);
418 UChar32 c2 = cpi.next();
419 if(c1 != c2) {
420 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
421 name, (long)c1, (long)c2, cpi.getIndex());
422 return;
423 }
424 if(c1 < 0) { break; }
425 }
426
427 // Iterate backward to the start.
428 for(;;) {
429 UChar32 c1 = ci.previousCodePoint(errorCode);
430 UChar32 c2 = cpi.previous();
431 if(c1 != c2) {
432 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
433 name, (long)c1, (long)c2, cpi.getIndex());
434 return;
435 }
436 if(c1 < 0) { break; }
437 }
438 }
439
TestFCD()440 void CollationTest::TestFCD() {
441 IcuTestErrorCode errorCode(*this, "TestFCD");
442 const CollationData *data = CollationRoot::getData(errorCode);
443 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
444 return;
445 }
446
447 // Input string, not FCD, NUL-terminated.
448 static const UChar s[] = {
449 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
450 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
451 0x327, 0x308, // ccc=202, 230
452 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
453 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
454 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
455 0xac01,
456 0xe7, // Character with tccc!=0 decomposed together with mis-ordered sequence.
457 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
458 0xe1, // Character with tccc!=0 decomposed together with decomposed sequence.
459 0xf73, 0xf75, // Tibetan composite vowels must be decomposed.
460 0x4e00, 0xf81,
461 0
462 };
463 // Expected code points.
464 static const UChar32 cp[] = {
465 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
466 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
467 0x1D15F, 0x1D16D,
468 0xac01,
469 0x63, 0x327, 0x1D165, 0x1D16D,
470 0x61,
471 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
472 0x4e00, 0xf71, 0xf80
473 };
474
475 FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
476 if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
477 return;
478 }
479 CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
480 checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
481
482 #if U_HAVE_STD_STRING
483 cpi.resetToStart();
484 std::string utf8;
485 UnicodeString(s).toUTF8String(utf8);
486 FCDUTF8CollationIterator u8ci(data, FALSE,
487 reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
488 if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
489 return;
490 }
491 checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
492 #endif
493
494 cpi.resetToStart();
495 UCharIterator iter;
496 uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1); // -1: without the terminating NUL
497 FCDUIterCollationIterator uici(data, FALSE, iter, 0);
498 if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
499 return;
500 }
501 checkFCD("FCDUIterCollationIterator", uici, cpi);
502 }
503
checkAllocWeights(CollationWeights & cw,uint32_t lowerLimit,uint32_t upperLimit,int32_t n,int32_t someLength,int32_t minCount)504 void CollationTest::checkAllocWeights(CollationWeights &cw,
505 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
506 int32_t someLength, int32_t minCount) {
507 if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
508 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
509 (long)lowerLimit, (long)upperLimit, (long)n);
510 return;
511 }
512 uint32_t previous = lowerLimit;
513 int32_t count = 0; // number of weights that have someLength
514 for(int32_t i = 0; i < n; ++i) {
515 uint32_t w = cw.nextWeight();
516 if(w == 0xffffffff) {
517 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
518 "returns only %ld weights",
519 (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
520 return;
521 }
522 if(!(previous < w && w < upperLimit)) {
523 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
524 "number %ld -> %lx not between %lx and %lx",
525 (long)lowerLimit, (long)upperLimit, (long)n,
526 (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
527 return;
528 }
529 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
530 }
531 if(count < minCount) {
532 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
533 "returns only %ld < %ld weights of length %d",
534 (long)lowerLimit, (long)upperLimit, (long)n,
535 (long)count, (long)minCount, (int)someLength);
536 }
537 }
538
TestCollationWeights()539 void CollationTest::TestCollationWeights() {
540 CollationWeights cw;
541
542 // Non-compressible primaries use 254 second bytes 02..FF.
543 logln("CollationWeights.initForPrimary(non-compressible)");
544 cw.initForPrimary(FALSE);
545 // Expect 1 weight 11 and 254 weights 12xx.
546 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
547 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
548 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
549 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
550 // Expect 254 two-byte weights from the ranges 10ff and 11xx.
551 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
552 // Expect 254^2=64516 three-byte weights.
553 // During computation, there should be 3 three-byte ranges
554 // 10ffff, 11xxxx, 120202.
555 // The middle one should be split 64515:1,
556 // and the newly-split-off range and the last ranged lengthened.
557 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
558 // Expect weights 1102 & 1103.
559 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
560 // Expect weights 102102 & 102103.
561 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
562
563 // Compressible primaries use 251 second bytes 04..FE.
564 logln("CollationWeights.initForPrimary(compressible)");
565 cw.initForPrimary(TRUE);
566 // Expect 1 weight 11 and 251 weights 12xx.
567 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
568 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
569 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
570 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
571 // Expect weights 1104 & 1105.
572 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
573 // Expect weights 102102 & 102103.
574 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
575
576 // Secondary and tertiary weights use only bytes 3 & 4.
577 logln("CollationWeights.initForSecondary()");
578 cw.initForSecondary();
579 // Expect weights fbxx and all four fc..ff.
580 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
581
582 logln("CollationWeights.initForTertiary()");
583 cw.initForTertiary();
584 // Expect weights 3dxx and both 3e & 3f.
585 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
586 }
587
588 namespace {
589
isValidCE(const CollationRootElements & re,const CollationData & data,uint32_t p,uint32_t s,uint32_t ctq)590 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
591 uint32_t p, uint32_t s, uint32_t ctq) {
592 uint32_t p1 = p >> 24;
593 uint32_t p2 = (p >> 16) & 0xff;
594 uint32_t p3 = (p >> 8) & 0xff;
595 uint32_t p4 = p & 0xff;
596 uint32_t s1 = s >> 8;
597 uint32_t s2 = s & 0xff;
598 // ctq = Case, Tertiary, Quaternary
599 uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
600 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
601 uint32_t t1 = t >> 8;
602 uint32_t t2 = t & 0xff;
603 uint32_t q = ctq & Collation::QUATERNARY_MASK;
604 // No leading zero bytes.
605 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
606 return FALSE;
607 }
608 // No intermediate zero bytes.
609 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
610 return FALSE;
611 }
612 if(p2 != 0 && p3 == 0 && p4 != 0) {
613 return FALSE;
614 }
615 // Minimum & maximum lead bytes.
616 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
617 s1 == Collation::LEVEL_SEPARATOR_BYTE ||
618 t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
619 return FALSE;
620 }
621 if(c > 2) {
622 return FALSE;
623 }
624 // The valid byte range for the second primary byte depends on compressibility.
625 if(p2 != 0) {
626 if(data.isCompressibleLeadByte(p1)) {
627 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
628 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
629 return FALSE;
630 }
631 } else {
632 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
633 return FALSE;
634 }
635 }
636 }
637 // Other bytes just need to avoid the level separator.
638 // Trailing zeros are ok.
639 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
640 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
641 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
642 return FALSE;
643 }
644 // Well-formed CEs.
645 if(p == 0) {
646 if(s == 0) {
647 if(t == 0) {
648 // Completely ignorable CE.
649 // Quaternary CEs are not supported.
650 if(c != 0 || q != 0) {
651 return FALSE;
652 }
653 } else {
654 // Tertiary CE.
655 if(t < re.getTertiaryBoundary() || c != 2) {
656 return FALSE;
657 }
658 }
659 } else {
660 // Secondary CE.
661 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
662 return FALSE;
663 }
664 }
665 } else {
666 // Primary CE.
667 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
668 s >= re.getSecondaryBoundary()) {
669 return FALSE;
670 }
671 if(t == 0 || t >= re.getTertiaryBoundary()) {
672 return FALSE;
673 }
674 }
675 return TRUE;
676 }
677
isValidCE(const CollationRootElements & re,const CollationData & data,int64_t ce)678 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
679 uint32_t p = (uint32_t)(ce >> 32);
680 uint32_t secTer = (uint32_t)ce;
681 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
682 }
683
684 class RootElementsIterator {
685 public:
RootElementsIterator(const CollationData & root)686 RootElementsIterator(const CollationData &root)
687 : data(root),
688 elements(root.rootElements), length(root.rootElementsLength),
689 pri(0), secTer(0),
690 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
691
next()692 UBool next() {
693 if(index >= length) { return FALSE; }
694 uint32_t p = elements[index];
695 if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
696 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
697 ++index;
698 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
699 return TRUE;
700 }
701 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
702 // End of a range, enumerate the primaries in the range.
703 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
704 p &= 0xffffff00;
705 if(pri == p) {
706 // Finished the range, return the next CE after it.
707 ++index;
708 return next();
709 }
710 U_ASSERT(pri < p);
711 // Return the next primary in this range.
712 UBool isCompressible = data.isCompressiblePrimary(pri);
713 if((pri & 0xffff) == 0) {
714 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
715 } else {
716 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
717 }
718 return TRUE;
719 }
720 // Simple primary CE.
721 ++index;
722 pri = p;
723 // Does this have an explicit below-common sec/ter unit,
724 // or does it imply a common one?
725 if(index == length) {
726 secTer = Collation::COMMON_SEC_AND_TER_CE;
727 } else {
728 secTer = elements[index];
729 if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
730 // No sec/ter delta.
731 secTer = Collation::COMMON_SEC_AND_TER_CE;
732 } else {
733 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
734 if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
735 // Implied sec/ter.
736 secTer = Collation::COMMON_SEC_AND_TER_CE;
737 } else {
738 // Explicit sec/ter below common/common.
739 ++index;
740 }
741 }
742 }
743 return TRUE;
744 }
745
getPrimary() const746 uint32_t getPrimary() const { return pri; }
getSecTer() const747 uint32_t getSecTer() const { return secTer; }
748
749 private:
750 const CollationData &data;
751 const uint32_t *elements;
752 int32_t length;
753
754 uint32_t pri;
755 uint32_t secTer;
756 int32_t index;
757 };
758
759 } // namespace
760
TestRootElements()761 void CollationTest::TestRootElements() {
762 IcuTestErrorCode errorCode(*this, "TestRootElements");
763 const CollationData *root = CollationRoot::getData(errorCode);
764 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
765 return;
766 }
767 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
768 RootElementsIterator iter(*root);
769
770 // We check each root CE for validity,
771 // and we also verify that there is a tailoring gap between each two CEs.
772 CollationWeights cw1c; // compressible primary weights
773 CollationWeights cw1u; // uncompressible primary weights
774 CollationWeights cw2;
775 CollationWeights cw3;
776
777 cw1c.initForPrimary(TRUE);
778 cw1u.initForPrimary(FALSE);
779 cw2.initForSecondary();
780 cw3.initForTertiary();
781
782 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
783 // nor the special merge-separator CE for U+FFFE.
784 uint32_t prevPri = 0;
785 uint32_t prevSec = 0;
786 uint32_t prevTer = 0;
787 while(iter.next()) {
788 uint32_t pri = iter.getPrimary();
789 uint32_t secTer = iter.getSecTer();
790 // CollationRootElements CEs must have 0 case and quaternary bits.
791 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
792 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
793 (long)pri, (long)secTer);
794 }
795 uint32_t sec = secTer >> 16;
796 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
797 uint32_t ctq = ter;
798 if(pri == 0 && sec == 0 && ter != 0) {
799 // Tertiary CEs must have uppercase bits,
800 // but they are not stored in the CollationRootElements.
801 ctq |= 0x8000;
802 }
803 if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
804 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
805 } else {
806 if(pri != prevPri) {
807 uint32_t newWeight = 0;
808 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
809 // There is currently no tailoring gap after primary ignorables,
810 // and we forbid tailoring after U+FFFD and U+FFFF.
811 } else if(root->isCompressiblePrimary(prevPri)) {
812 if(!cw1c.allocWeights(prevPri, pri, 1)) {
813 errln("no primary/compressible tailoring gap between %08lx and %08lx",
814 (long)prevPri, (long)pri);
815 } else {
816 newWeight = cw1c.nextWeight();
817 }
818 } else {
819 if(!cw1u.allocWeights(prevPri, pri, 1)) {
820 errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
821 (long)prevPri, (long)pri);
822 } else {
823 newWeight = cw1u.nextWeight();
824 }
825 }
826 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
827 errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
828 (long)prevPri, (long)newWeight, (long)pri);
829 }
830 } else if(sec != prevSec) {
831 uint32_t lowerLimit =
832 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
833 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
834 errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
835 } else {
836 uint32_t newWeight = cw2.nextWeight();
837 if(!(prevSec < newWeight && newWeight < sec)) {
838 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
839 (long)lowerLimit, (long)newWeight, (long)sec);
840 }
841 }
842 } else if(ter != prevTer) {
843 uint32_t lowerLimit =
844 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
845 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
846 errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
847 } else {
848 uint32_t newWeight = cw3.nextWeight();
849 if(!(prevTer < newWeight && newWeight < ter)) {
850 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
851 (long)lowerLimit, (long)newWeight, (long)ter);
852 }
853 }
854 } else {
855 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
856 }
857 }
858 prevPri = pri;
859 prevSec = sec;
860 prevTer = ter;
861 }
862 }
863
TestTailoredElements()864 void CollationTest::TestTailoredElements() {
865 IcuTestErrorCode errorCode(*this, "TestTailoredElements");
866 const CollationData *root = CollationRoot::getData(errorCode);
867 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
868 return;
869 }
870 CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
871
872 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
873 if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
874 return;
875 }
876 uhash_setKeyDeleter(prevLocales, uprv_free);
877 // TestRootElements() tests the root collator which does not have tailorings.
878 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
879 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
880 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
881
882 UVector64 ces(errorCode);
883 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
884 U_ASSERT(locales.isValid());
885 const char *localeID = "root";
886 do {
887 Locale locale(localeID);
888 LocalPointer<StringEnumeration> types(
889 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
890 errorCode.assertSuccess();
891 const char *type; // first: default type
892 while((type = types->next(NULL, errorCode)) != NULL) {
893 if(strncmp(type, "private-", 8) == 0) {
894 errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
895 localeID, type);
896 }
897 Locale localeWithType(locale);
898 localeWithType.setKeywordValue("collation", type, errorCode);
899 errorCode.assertSuccess();
900 LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
901 if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
902 localeWithType.getName())) {
903 continue;
904 }
905 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
906 if(uhash_geti(prevLocales, actual.getName()) != 0) {
907 continue;
908 }
909 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
910 errorCode.assertSuccess();
911 logln("TestTailoredElements(): requested %s -> actual %s",
912 localeWithType.getName(), actual.getName());
913 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
914 if(rbc == NULL) {
915 continue;
916 }
917 // Note: It would be better to get tailored strings such that we can
918 // identify the prefix, and only get the CEs for the prefix+string,
919 // not also for the prefix.
920 // There is currently no API for that.
921 // It would help in an unusual case where a contraction starting in the prefix
922 // extends past its end, and we do not see the intended mapping.
923 // For example, for a mapping p|st, if there is also a contraction ps,
924 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
925 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
926 errorCode.assertSuccess();
927 UnicodeSetIterator iter(*tailored);
928 while(iter.next()) {
929 const UnicodeString &s = iter.getString();
930 ces.removeAllElements();
931 rbc->internalGetCEs(s, ces, errorCode);
932 errorCode.assertSuccess();
933 for(int32_t i = 0; i < ces.size(); ++i) {
934 int64_t ce = ces.elementAti(i);
935 if(!isValidCE(rootElements, *root, ce)) {
936 errln("invalid tailored CE %016llx at CE index %d from string:",
937 (long long)ce, (int)i);
938 infoln(prettify(s));
939 }
940 }
941 }
942 }
943 } while((localeID = locales->next(NULL, errorCode)) != NULL);
944 uhash_close(prevLocales);
945 }
946
printSortKey(const uint8_t * p,int32_t length)947 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
948 UnicodeString s;
949 for(int32_t i = 0; i < length; ++i) {
950 if(i > 0) { s.append((UChar)0x20); }
951 uint8_t b = p[i];
952 if(b == 0) {
953 s.append((UChar)0x2e); // period
954 } else if(b == 1) {
955 s.append((UChar)0x7c); // vertical bar
956 } else {
957 appendHex(b, 2, s);
958 }
959 }
960 return s;
961 }
962
printCollationKey(const CollationKey & key)963 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
964 int32_t length;
965 const uint8_t *p = key.getByteArray(length);
966 return printSortKey(p, length);
967 }
968
readNonEmptyLine(UCHARBUF * f,IcuTestErrorCode & errorCode)969 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
970 for(;;) {
971 int32_t lineLength;
972 const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
973 if(line == NULL || errorCode.isFailure()) {
974 fileLine.remove();
975 return FALSE;
976 }
977 ++fileLineNumber;
978 // Strip trailing CR/LF, comments, and spaces.
979 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
980 if(comment != NULL) {
981 lineLength = (int32_t)(comment - line);
982 } else {
983 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
984 }
985 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
986 if(lineLength != 0) {
987 fileLine.setTo(FALSE, line, lineLength);
988 return TRUE;
989 }
990 // Empty line, continue.
991 }
992 }
993
parseString(int32_t & start,UnicodeString & prefix,UnicodeString & s,UErrorCode & errorCode)994 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
995 UErrorCode &errorCode) {
996 int32_t length = fileLine.length();
997 int32_t i;
998 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
999 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|'
1000 if(pipeIndex >= 0) {
1001 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1002 if(prefix.isEmpty()) {
1003 errln("empty prefix on line %d", (int)fileLineNumber);
1004 infoln(fileLine);
1005 errorCode = U_PARSE_ERROR;
1006 return;
1007 }
1008 start = pipeIndex + 1;
1009 } else {
1010 prefix.remove();
1011 }
1012 s = fileLine.tempSubStringBetween(start, i).unescape();
1013 if(s.isEmpty()) {
1014 errln("empty string on line %d", (int)fileLineNumber);
1015 infoln(fileLine);
1016 errorCode = U_PARSE_ERROR;
1017 return;
1018 }
1019 start = i;
1020 }
1021
parseRelationAndString(UnicodeString & s,IcuTestErrorCode & errorCode)1022 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1023 Collation::Level relation;
1024 int32_t start;
1025 if(fileLine[0] == 0x3c) { // <
1026 UChar second = fileLine[1];
1027 start = 2;
1028 switch(second) {
1029 case 0x31: // <1
1030 relation = Collation::PRIMARY_LEVEL;
1031 break;
1032 case 0x32: // <2
1033 relation = Collation::SECONDARY_LEVEL;
1034 break;
1035 case 0x33: // <3
1036 relation = Collation::TERTIARY_LEVEL;
1037 break;
1038 case 0x34: // <4
1039 relation = Collation::QUATERNARY_LEVEL;
1040 break;
1041 case 0x63: // <c
1042 relation = Collation::CASE_LEVEL;
1043 break;
1044 case 0x69: // <i
1045 relation = Collation::IDENTICAL_LEVEL;
1046 break;
1047 default: // just <
1048 relation = Collation::NO_LEVEL;
1049 start = 1;
1050 break;
1051 }
1052 } else if(fileLine[0] == 0x3d) { // =
1053 relation = Collation::ZERO_LEVEL;
1054 start = 1;
1055 } else {
1056 start = 0;
1057 }
1058 if(start == 0 || !isSpace(fileLine[start])) {
1059 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1060 infoln(fileLine);
1061 errorCode.set(U_PARSE_ERROR);
1062 return Collation::NO_LEVEL;
1063 }
1064 start = skipSpaces(start);
1065 UnicodeString prefix;
1066 parseString(start, prefix, s, errorCode);
1067 if(errorCode.isSuccess() && !prefix.isEmpty()) {
1068 errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1069 infoln(fileLine);
1070 errorCode.set(U_PARSE_ERROR);
1071 return Collation::NO_LEVEL;
1072 }
1073 if(start < fileLine.length()) {
1074 errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1075 infoln(fileLine);
1076 errorCode.set(U_PARSE_ERROR);
1077 return Collation::NO_LEVEL;
1078 }
1079 return relation;
1080 }
1081
1082 static const struct {
1083 const char *name;
1084 UColAttribute attr;
1085 } attributes[] = {
1086 { "backwards", UCOL_FRENCH_COLLATION },
1087 { "alternate", UCOL_ALTERNATE_HANDLING },
1088 { "caseFirst", UCOL_CASE_FIRST },
1089 { "caseLevel", UCOL_CASE_LEVEL },
1090 // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1091 { "strength", UCOL_STRENGTH },
1092 // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1093 { "numeric", UCOL_NUMERIC_COLLATION }
1094 };
1095
1096 static const struct {
1097 const char *name;
1098 UColAttributeValue value;
1099 } attributeValues[] = {
1100 { "default", UCOL_DEFAULT },
1101 { "primary", UCOL_PRIMARY },
1102 { "secondary", UCOL_SECONDARY },
1103 { "tertiary", UCOL_TERTIARY },
1104 { "quaternary", UCOL_QUATERNARY },
1105 { "identical", UCOL_IDENTICAL },
1106 { "off", UCOL_OFF },
1107 { "on", UCOL_ON },
1108 { "shifted", UCOL_SHIFTED },
1109 { "non-ignorable", UCOL_NON_IGNORABLE },
1110 { "lower", UCOL_LOWER_FIRST },
1111 { "upper", UCOL_UPPER_FIRST }
1112 };
1113
parseAndSetAttribute(IcuTestErrorCode & errorCode)1114 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1115 // Parse attributes even if the Collator could not be created,
1116 // in order to report syntax errors.
1117 int32_t start = skipSpaces(1);
1118 int32_t equalPos = fileLine.indexOf(0x3d);
1119 if(equalPos < 0) {
1120 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1121 parseAndSetReorderCodes(start + 7, errorCode);
1122 return;
1123 }
1124 errln("missing '=' on line %d", (int)fileLineNumber);
1125 infoln(fileLine);
1126 errorCode.set(U_PARSE_ERROR);
1127 return;
1128 }
1129
1130 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1131 UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1132 if(attrString == UNICODE_STRING("maxVariable", 11)) {
1133 UColReorderCode max;
1134 if(valueString == UNICODE_STRING("space", 5)) {
1135 max = UCOL_REORDER_CODE_SPACE;
1136 } else if(valueString == UNICODE_STRING("punct", 5)) {
1137 max = UCOL_REORDER_CODE_PUNCTUATION;
1138 } else if(valueString == UNICODE_STRING("symbol", 6)) {
1139 max = UCOL_REORDER_CODE_SYMBOL;
1140 } else if(valueString == UNICODE_STRING("currency", 8)) {
1141 max = UCOL_REORDER_CODE_CURRENCY;
1142 } else {
1143 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1144 infoln(fileLine);
1145 errorCode.set(U_PARSE_ERROR);
1146 return;
1147 }
1148 if(coll != NULL) {
1149 coll->setMaxVariable(max, errorCode);
1150 if(errorCode.isFailure()) {
1151 errln("setMaxVariable() failed on line %d: %s",
1152 (int)fileLineNumber, errorCode.errorName());
1153 infoln(fileLine);
1154 return;
1155 }
1156 }
1157 fileLine.remove();
1158 return;
1159 }
1160
1161 UColAttribute attr;
1162 for(int32_t i = 0;; ++i) {
1163 if(i == UPRV_LENGTHOF(attributes)) {
1164 errln("invalid attribute name on line %d", (int)fileLineNumber);
1165 infoln(fileLine);
1166 errorCode.set(U_PARSE_ERROR);
1167 return;
1168 }
1169 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1170 attr = attributes[i].attr;
1171 break;
1172 }
1173 }
1174
1175 UColAttributeValue value;
1176 for(int32_t i = 0;; ++i) {
1177 if(i == UPRV_LENGTHOF(attributeValues)) {
1178 errln("invalid attribute value name on line %d", (int)fileLineNumber);
1179 infoln(fileLine);
1180 errorCode.set(U_PARSE_ERROR);
1181 return;
1182 }
1183 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1184 value = attributeValues[i].value;
1185 break;
1186 }
1187 }
1188
1189 if(coll != NULL) {
1190 coll->setAttribute(attr, value, errorCode);
1191 if(errorCode.isFailure()) {
1192 errln("illegal attribute=value combination on line %d: %s",
1193 (int)fileLineNumber, errorCode.errorName());
1194 infoln(fileLine);
1195 return;
1196 }
1197 }
1198 fileLine.remove();
1199 }
1200
parseAndSetReorderCodes(int32_t start,IcuTestErrorCode & errorCode)1201 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1202 UVector32 reorderCodes(errorCode);
1203 while(start < fileLine.length()) {
1204 start = skipSpaces(start);
1205 int32_t limit = start;
1206 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1207 CharString name;
1208 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1209 int32_t code = CollationRuleParser::getReorderCode(name.data());
1210 if(code < 0) {
1211 if(uprv_stricmp(name.data(), "default") == 0) {
1212 code = UCOL_REORDER_CODE_DEFAULT; // -1
1213 } else {
1214 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1215 infoln(fileLine);
1216 errorCode.set(U_PARSE_ERROR);
1217 return;
1218 }
1219 }
1220 reorderCodes.addElement(code, errorCode);
1221 start = limit;
1222 }
1223 if(coll != NULL) {
1224 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1225 if(errorCode.isFailure()) {
1226 errln("setReorderCodes() failed on line %d: %s",
1227 (int)fileLineNumber, errorCode.errorName());
1228 infoln(fileLine);
1229 return;
1230 }
1231 }
1232 fileLine.remove();
1233 }
1234
buildTailoring(UCHARBUF * f,IcuTestErrorCode & errorCode)1235 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1236 UnicodeString rules;
1237 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1238 rules.append(fileLine.unescape());
1239 }
1240 if(errorCode.isFailure()) { return; }
1241 logln(rules);
1242
1243 UParseError parseError;
1244 UnicodeString reason;
1245 delete coll;
1246 coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1247 if(coll == NULL) {
1248 errln("unable to allocate a new collator");
1249 errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1250 return;
1251 }
1252 if(errorCode.isFailure()) {
1253 dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1254 infoln(UnicodeString(" reason: ") + reason);
1255 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseError.offset); }
1256 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1257 infoln(UnicodeString(" snippet: ...") +
1258 parseError.preContext + "(!)" + parseError.postContext + "...");
1259 }
1260 delete coll;
1261 coll = NULL;
1262 errorCode.reset();
1263 } else {
1264 assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1265 UnicodeString(), reason);
1266 }
1267 }
1268
setRootCollator(IcuTestErrorCode & errorCode)1269 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1270 if(errorCode.isFailure()) { return; }
1271 delete coll;
1272 coll = Collator::createInstance(Locale::getRoot(), errorCode);
1273 if(errorCode.isFailure()) {
1274 dataerrln("unable to create a root collator");
1275 return;
1276 }
1277 }
1278
setLocaleCollator(IcuTestErrorCode & errorCode)1279 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1280 if(errorCode.isFailure()) { return; }
1281 delete coll;
1282 coll = NULL;
1283 int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant
1284 if(at >= 0) {
1285 fileLine.setCharAt(at, (UChar)0x2a); // *
1286 }
1287 CharString localeID;
1288 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1289 if(at >= 0) {
1290 localeID.data()[at - 9] = '@';
1291 }
1292 Locale locale(localeID.data());
1293 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1294 errln("invalid language tag on line %d", (int)fileLineNumber);
1295 infoln(fileLine);
1296 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1297 return;
1298 }
1299
1300 logln("creating a collator for locale ID %s", locale.getName());
1301 coll = Collator::createInstance(locale, errorCode);
1302 if(errorCode.isFailure()) {
1303 dataerrln("unable to create a collator for locale %s on line %d",
1304 locale.getName(), (int)fileLineNumber);
1305 infoln(fileLine);
1306 delete coll;
1307 coll = NULL;
1308 errorCode.reset();
1309 }
1310 }
1311
needsNormalization(const UnicodeString & s,UErrorCode & errorCode) const1312 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1313 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1314 // In some sequences with Tibetan composite vowel signs,
1315 // even if the string passes the FCD check,
1316 // those composites must be decomposed.
1317 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1318 int32_t index = 0;
1319 while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1320 if(++index < s.length()) {
1321 UChar c = s[index];
1322 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1323 }
1324 }
1325 return FALSE;
1326 }
1327
getSortKeyParts(const UChar * s,int32_t length,CharString & dest,int32_t partSize,IcuTestErrorCode & errorCode)1328 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1329 CharString &dest, int32_t partSize,
1330 IcuTestErrorCode &errorCode) {
1331 if(errorCode.isFailure()) { return FALSE; }
1332 uint8_t part[32];
1333 U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1334 UCharIterator iter;
1335 uiter_setString(&iter, s, length);
1336 uint32_t state[2] = { 0, 0 };
1337 for(;;) {
1338 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1339 UBool done = partLength < partSize;
1340 if(done) {
1341 // At the end, append the next byte as well which should be 00.
1342 ++partLength;
1343 }
1344 dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1345 if(done) {
1346 return errorCode.isSuccess();
1347 }
1348 }
1349 }
1350
getCollationKey(const char * norm,const UnicodeString & line,const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1351 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1352 const UChar *s, int32_t length,
1353 CollationKey &key, IcuTestErrorCode &errorCode) {
1354 if(errorCode.isFailure()) { return FALSE; }
1355 coll->getCollationKey(s, length, key, errorCode);
1356 if(errorCode.isFailure()) {
1357 infoln(fileTestName);
1358 errln("Collator(%s).getCollationKey() failed: %s",
1359 norm, errorCode.errorName());
1360 infoln(line);
1361 return FALSE;
1362 }
1363 int32_t keyLength;
1364 const uint8_t *keyBytes = key.getByteArray(keyLength);
1365 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1366 infoln(fileTestName);
1367 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1368 norm);
1369 infoln(line);
1370 infoln(printCollationKey(key));
1371 return FALSE;
1372 }
1373
1374 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1375 if(numLevels < UCOL_IDENTICAL) {
1376 ++numLevels;
1377 } else {
1378 numLevels = 5;
1379 }
1380 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1381 ++numLevels;
1382 }
1383 errorCode.assertSuccess();
1384 int32_t numLevelSeparators = 0;
1385 for(int32_t i = 0; i < (keyLength - 1); ++i) {
1386 uint8_t b = keyBytes[i];
1387 if(b == 0) {
1388 infoln(fileTestName);
1389 errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1390 infoln(line);
1391 infoln(printCollationKey(key));
1392 return FALSE;
1393 }
1394 if(b == 1) { ++numLevelSeparators; }
1395 }
1396 if(numLevelSeparators != (numLevels - 1)) {
1397 infoln(fileTestName);
1398 errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1399 norm, (int)numLevelSeparators, (int)numLevels);
1400 infoln(line);
1401 infoln(printCollationKey(key));
1402 return FALSE;
1403 }
1404
1405 // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1406 static const int32_t partSizes[] = { 32, 3, 1 };
1407 for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1408 int32_t partSize = partSizes[psi];
1409 CharString parts;
1410 if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1411 infoln(fileTestName);
1412 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1413 norm, (int)partSize, errorCode.errorName());
1414 infoln(line);
1415 return FALSE;
1416 }
1417 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1418 infoln(fileTestName);
1419 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1420 norm, (int)partSize);
1421 infoln(line);
1422 infoln(printCollationKey(key));
1423 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1424 return FALSE;
1425 }
1426 }
1427 return TRUE;
1428 }
1429
1430 /**
1431 * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1432 * Leaves key unchanged if s does not contain U+FFFE.
1433 * @return TRUE if the key was successfully changed
1434 */
getMergedCollationKey(const UChar * s,int32_t length,CollationKey & key,IcuTestErrorCode & errorCode)1435 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1436 CollationKey &key, IcuTestErrorCode &errorCode) {
1437 if(errorCode.isFailure()) { return FALSE; }
1438 LocalMemory<uint8_t> mergedKey;
1439 int32_t mergedKeyLength = 0;
1440 int32_t mergedKeyCapacity = 0;
1441 int32_t sLength = (length >= 0) ? length : u_strlen(s);
1442 int32_t segmentStart = 0;
1443 for(int32_t i = 0;;) {
1444 if(i == sLength) {
1445 if(segmentStart == 0) {
1446 // s does not contain any U+FFFE.
1447 return FALSE;
1448 }
1449 } else if(s[i] != 0xfffe) {
1450 ++i;
1451 continue;
1452 }
1453 // Get the sort key for another segment and merge it into mergedKey.
1454 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the bytes
1455 CollationKey key2;
1456 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1457 int32_t key1Length, key2Length;
1458 const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1459 const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1460 uint8_t *dest;
1461 int32_t minCapacity = key1Length + key2Length;
1462 if(key1Length > 0) { --minCapacity; }
1463 if(minCapacity <= mergedKeyCapacity) {
1464 dest = mergedKey.getAlias();
1465 } else {
1466 if(minCapacity <= 200) {
1467 mergedKeyCapacity = 200;
1468 } else if(minCapacity <= 2 * mergedKeyCapacity) {
1469 mergedKeyCapacity *= 2;
1470 } else {
1471 mergedKeyCapacity = minCapacity;
1472 }
1473 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1474 }
1475 U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1476 if(key1Length == 0) {
1477 // key2 is the sort key for the first segment.
1478 uprv_memcpy(dest, key2Bytes, key2Length);
1479 mergedKeyLength = key2Length;
1480 } else {
1481 mergedKeyLength =
1482 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1483 dest, mergedKeyCapacity);
1484 }
1485 if(i == sLength) { break; }
1486 segmentStart = ++i;
1487 }
1488 key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1489 return TRUE;
1490 }
1491
1492 namespace {
1493
1494 /**
1495 * Replaces unpaired surrogates with U+FFFD.
1496 * Returns s if no replacement was made, otherwise buffer.
1497 */
surrogatesToFFFD(const UnicodeString & s,UnicodeString & buffer)1498 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1499 int32_t i = 0;
1500 while(i < s.length()) {
1501 UChar32 c = s.char32At(i);
1502 if(U_IS_SURROGATE(c)) {
1503 if(buffer.length() < i) {
1504 buffer.append(s, buffer.length(), i - buffer.length());
1505 }
1506 buffer.append((UChar)0xfffd);
1507 }
1508 i += U16_LENGTH(c);
1509 }
1510 if(buffer.isEmpty()) {
1511 return s;
1512 }
1513 if(buffer.length() < i) {
1514 buffer.append(s, buffer.length(), i - buffer.length());
1515 }
1516 return buffer;
1517 }
1518
getDifferenceLevel(const CollationKey & prevKey,const CollationKey & key,UCollationResult order,UBool collHasCaseLevel)1519 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1520 UCollationResult order, UBool collHasCaseLevel) {
1521 if(order == UCOL_EQUAL) {
1522 return Collation::NO_LEVEL;
1523 }
1524 int32_t prevKeyLength;
1525 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1526 int32_t keyLength;
1527 const uint8_t *bytes = key.getByteArray(keyLength);
1528 int32_t level = Collation::PRIMARY_LEVEL;
1529 for(int32_t i = 0;; ++i) {
1530 uint8_t b = prevBytes[i];
1531 if(b != bytes[i]) { break; }
1532 if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1533 ++level;
1534 if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1535 ++level;
1536 }
1537 }
1538 }
1539 return level;
1540 }
1541
1542 }
1543
checkCompareTwo(const char * norm,const UnicodeString & prevFileLine,const UnicodeString & prevString,const UnicodeString & s,UCollationResult expectedOrder,Collation::Level expectedLevel,IcuTestErrorCode & errorCode)1544 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1545 const UnicodeString &prevString, const UnicodeString &s,
1546 UCollationResult expectedOrder, Collation::Level expectedLevel,
1547 IcuTestErrorCode &errorCode) {
1548 if(errorCode.isFailure()) { return FALSE; }
1549
1550 // Get the sort keys first, for error debug output.
1551 CollationKey prevKey;
1552 if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1553 prevKey, errorCode)) {
1554 return FALSE;
1555 }
1556 CollationKey key;
1557 if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1558
1559 UCollationResult order = coll->compare(prevString, s, errorCode);
1560 if(order != expectedOrder || errorCode.isFailure()) {
1561 infoln(fileTestName);
1562 errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1563 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1564 infoln(prevFileLine);
1565 infoln(fileLine);
1566 infoln(printCollationKey(prevKey));
1567 infoln(printCollationKey(key));
1568 return FALSE;
1569 }
1570 order = coll->compare(s, prevString, errorCode);
1571 if(order != -expectedOrder || errorCode.isFailure()) {
1572 infoln(fileTestName);
1573 errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1574 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1575 infoln(prevFileLine);
1576 infoln(fileLine);
1577 infoln(printCollationKey(prevKey));
1578 infoln(printCollationKey(key));
1579 return FALSE;
1580 }
1581 // Test NUL-termination if the strings do not contain NUL characters.
1582 UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1583 if(!containNUL) {
1584 order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1585 if(order != expectedOrder || errorCode.isFailure()) {
1586 infoln(fileTestName);
1587 errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1588 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1589 infoln(prevFileLine);
1590 infoln(fileLine);
1591 infoln(printCollationKey(prevKey));
1592 infoln(printCollationKey(key));
1593 return FALSE;
1594 }
1595 order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1596 if(order != -expectedOrder || errorCode.isFailure()) {
1597 infoln(fileTestName);
1598 errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1599 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1600 infoln(prevFileLine);
1601 infoln(fileLine);
1602 infoln(printCollationKey(prevKey));
1603 infoln(printCollationKey(key));
1604 return FALSE;
1605 }
1606 }
1607
1608 #if U_HAVE_STD_STRING
1609 // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1610 // Unpaired surrogates cannot be converted to UTF-8.
1611 // Create valid UTF-16 strings if necessary, and use those for
1612 // both the expected compare() result and for the input to compare(UTF-8).
1613 UnicodeString prevBuffer, sBuffer;
1614 const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1615 const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1616 std::string prevUTF8, sUTF8;
1617 UnicodeString(prevValid).toUTF8String(prevUTF8);
1618 UnicodeString(sValid).toUTF8String(sUTF8);
1619 UCollationResult expectedUTF8Order;
1620 if(&prevValid == &prevString && &sValid == &s) {
1621 expectedUTF8Order = expectedOrder;
1622 } else {
1623 expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1624 }
1625
1626 order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1627 if(order != expectedUTF8Order || errorCode.isFailure()) {
1628 infoln(fileTestName);
1629 errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1630 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1631 infoln(prevFileLine);
1632 infoln(fileLine);
1633 infoln(printCollationKey(prevKey));
1634 infoln(printCollationKey(key));
1635 return FALSE;
1636 }
1637 order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1638 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1639 infoln(fileTestName);
1640 errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1641 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1642 infoln(prevFileLine);
1643 infoln(fileLine);
1644 infoln(printCollationKey(prevKey));
1645 infoln(printCollationKey(key));
1646 return FALSE;
1647 }
1648 // Test NUL-termination if the strings do not contain NUL characters.
1649 if(!containNUL) {
1650 order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1651 if(order != expectedUTF8Order || errorCode.isFailure()) {
1652 infoln(fileTestName);
1653 errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1654 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1655 infoln(prevFileLine);
1656 infoln(fileLine);
1657 infoln(printCollationKey(prevKey));
1658 infoln(printCollationKey(key));
1659 return FALSE;
1660 }
1661 order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1662 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1663 infoln(fileTestName);
1664 errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1665 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1666 infoln(prevFileLine);
1667 infoln(fileLine);
1668 infoln(printCollationKey(prevKey));
1669 infoln(printCollationKey(key));
1670 return FALSE;
1671 }
1672 }
1673 #endif
1674
1675 UCharIterator leftIter;
1676 UCharIterator rightIter;
1677 uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1678 uiter_setString(&rightIter, s.getBuffer(), s.length());
1679 order = coll->compare(leftIter, rightIter, errorCode);
1680 if(order != expectedOrder || errorCode.isFailure()) {
1681 infoln(fileTestName);
1682 errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1683 "wrong order: %d != %d (%s)",
1684 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1685 infoln(prevFileLine);
1686 infoln(fileLine);
1687 infoln(printCollationKey(prevKey));
1688 infoln(printCollationKey(key));
1689 return FALSE;
1690 }
1691
1692 order = prevKey.compareTo(key, errorCode);
1693 if(order != expectedOrder || errorCode.isFailure()) {
1694 infoln(fileTestName);
1695 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1696 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1697 infoln(prevFileLine);
1698 infoln(fileLine);
1699 infoln(printCollationKey(prevKey));
1700 infoln(printCollationKey(key));
1701 return FALSE;
1702 }
1703 UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1704 int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1705 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1706 if(level != expectedLevel) {
1707 infoln(fileTestName);
1708 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1709 (int)fileLineNumber, norm, order, level, expectedLevel);
1710 infoln(prevFileLine);
1711 infoln(fileLine);
1712 infoln(printCollationKey(prevKey));
1713 infoln(printCollationKey(key));
1714 return FALSE;
1715 }
1716 }
1717
1718 // If either string contains U+FFFE, then their sort keys must compare the same as
1719 // the merged sort keys of each string's between-FFFE segments.
1720 //
1721 // It is not required that
1722 // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1723 // only that those two methods yield the same order.
1724 //
1725 // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1726 if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1727 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1728 errorCode.isFailure()) {
1729 order = prevKey.compareTo(key, errorCode);
1730 if(order != expectedOrder || errorCode.isFailure()) {
1731 infoln(fileTestName);
1732 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1733 "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1734 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1735 infoln(prevFileLine);
1736 infoln(fileLine);
1737 infoln(printCollationKey(prevKey));
1738 infoln(printCollationKey(key));
1739 return FALSE;
1740 }
1741 int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1742 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1743 if(mergedLevel != level) {
1744 infoln(fileTestName);
1745 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1746 "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1747 (int)fileLineNumber, norm, order, mergedLevel, level);
1748 infoln(prevFileLine);
1749 infoln(fileLine);
1750 infoln(printCollationKey(prevKey));
1751 infoln(printCollationKey(key));
1752 return FALSE;
1753 }
1754 }
1755 }
1756 return TRUE;
1757 }
1758
checkCompareStrings(UCHARBUF * f,IcuTestErrorCode & errorCode)1759 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1760 if(errorCode.isFailure()) { return; }
1761 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1762 UnicodeString prevString, s;
1763 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1764 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1765 // Parse the line even if it will be ignored (when we do not have a Collator)
1766 // in order to report syntax issues.
1767 Collation::Level relation = parseRelationAndString(s, errorCode);
1768 if(errorCode.isFailure()) {
1769 errorCode.reset();
1770 break;
1771 }
1772 if(coll == NULL) {
1773 // We were unable to create the Collator but continue with tests.
1774 // Ignore test data for this Collator.
1775 // The next Collator creation might work.
1776 continue;
1777 }
1778 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1779 Collation::Level expectedLevel = relation;
1780 s.getTerminatedBuffer(); // Ensure NUL-termination.
1781 UBool isOk = TRUE;
1782 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1783 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1784 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1785 expectedOrder, expectedLevel, errorCode);
1786 }
1787 if(isOk) {
1788 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1789 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1790 expectedOrder, expectedLevel, errorCode);
1791 }
1792 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1793 UnicodeString pn = nfd->normalize(prevString, errorCode);
1794 UnicodeString n = nfd->normalize(s, errorCode);
1795 pn.getTerminatedBuffer();
1796 n.getTerminatedBuffer();
1797 errorCode.assertSuccess();
1798 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1799 expectedOrder, expectedLevel, errorCode);
1800 }
1801 if(!isOk) {
1802 errorCode.reset(); // already reported
1803 }
1804 prevFileLine = fileLine;
1805 prevString = s;
1806 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1807 }
1808 }
1809
TestDataDriven()1810 void CollationTest::TestDataDriven() {
1811 IcuTestErrorCode errorCode(*this, "TestDataDriven");
1812
1813 fcd = Normalizer2Factory::getFCDInstance(errorCode);
1814 nfd = Normalizer2::getNFDInstance(errorCode);
1815 if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1816 return;
1817 }
1818
1819 CharString path(getSourceTestData(errorCode), errorCode);
1820 path.appendPathPart("collationtest.txt", errorCode);
1821 const char *codePage = "UTF-8";
1822 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1823 if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1824 return;
1825 }
1826 // Read a new line if necessary.
1827 // Sub-parsers leave the first line set that they do not handle.
1828 while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1829 if(!isSectionStarter(fileLine[0])) {
1830 errln("syntax error on line %d", (int)fileLineNumber);
1831 infoln(fileLine);
1832 return;
1833 }
1834 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1835 fileTestName = fileLine;
1836 logln(fileLine);
1837 fileLine.remove();
1838 } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1839 setRootCollator(errorCode);
1840 fileLine.remove();
1841 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1842 setLocaleCollator(errorCode);
1843 fileLine.remove();
1844 } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1845 buildTailoring(f.getAlias(), errorCode);
1846 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // %
1847 parseAndSetAttribute(errorCode);
1848 } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1849 checkCompareStrings(f.getAlias(), errorCode);
1850 } else {
1851 errln("syntax error on line %d", (int)fileLineNumber);
1852 infoln(fileLine);
1853 return;
1854 }
1855 }
1856 }
1857
1858 #endif // !UCONFIG_NO_COLLATION
1859