1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1999-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   12/09/99    aliu        Ported from Java.
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_COLLATION
16 
17 #include "thcoll.h"
18 #include "unicode/utypes.h"
19 #include "unicode/coll.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/sortkey.h"
22 #include "unicode/tblcoll.h"
23 #include "unicode/ustring.h"
24 #include "cmemory.h"
25 #include "cstring.h"
26 #include "filestrm.h"
27 #include "textfile.h"
28 
29 /**
30  * The TestDictionary test expects a file of this name, with this
31  * encoding, to be present in the directory $ICU/source/test/testdata.
32  */
33 //#define TEST_FILE           "th18057.txt"
34 
35 /**
36  * This is the most failures we show in TestDictionary.  If this number
37  * is < 0, we show all failures.
38  */
39 #define MAX_FAILURES_TO_SHOW -1
40 
CollationThaiTest()41 CollationThaiTest::CollationThaiTest() {
42     UErrorCode status = U_ZERO_ERROR;
43     coll = Collator::createInstance(Locale("th", "TH", ""), status);
44     if (coll && U_SUCCESS(status)) {
45         //coll->setStrength(Collator::TERTIARY);
46     } else {
47         delete coll;
48         coll = 0;
49     }
50 }
51 
~CollationThaiTest()52 CollationThaiTest::~CollationThaiTest() {
53     delete coll;
54 }
55 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)56 void CollationThaiTest::runIndexedTest(int32_t index, UBool exec, const char* &name,
57                                        char* /*par*/) {
58 
59     if((!coll) && exec) {
60       dataerrln(__FILE__ " cannot test - failed to create collator.");
61       name = "some test";
62       return;
63     }
64 
65     switch (index) {
66         TESTCASE(0,TestDictionary);
67         TESTCASE(1,TestCornerCases);
68         TESTCASE(2,TestNamesList);
69         TESTCASE(3,TestInvalidThai);
70         TESTCASE(4,TestReordering);
71         default: name = ""; break;
72     }
73 }
74 
75 /**
76  * Read the external names list, and confirms that the collator
77  * gets the same results when comparing lines one to another
78  * using regular and iterative comparison.
79  */
TestNamesList(void)80 void CollationThaiTest::TestNamesList(void) {
81     if (coll == 0) {
82         errln("Error: could not construct Thai collator");
83         return;
84     }
85 
86     UErrorCode ec = U_ZERO_ERROR;
87     TextFile names("TestNames_Thai.txt", "UTF16LE", ec);
88     if (U_FAILURE(ec)) {
89         logln("Can't open TestNames_Thai.txt: %s; skipping test",
90               u_errorName(ec));
91         return;
92     }
93 
94     //
95     // Loop through each word in the dictionary and compare it to the previous
96     // word.  They should be in sorted order.
97     //
98     UnicodeString lastWord, word;
99     //int32_t failed = 0;
100     int32_t wordCount = 0;
101     while (names.readLineSkippingComments(word, ec, FALSE) && U_SUCCESS(ec)) {
102 
103         // Show the first 8 words being compared, so we can see what's happening
104         ++wordCount;
105         if (wordCount <= 8) {
106             UnicodeString str;
107             logln((UnicodeString)"Word " + wordCount + ": " + IntlTest::prettify(word, str));
108         }
109 
110         if (lastWord.length() > 0) {
111             Collator::EComparisonResult result = coll->compare(lastWord, word);
112             doTest(coll, lastWord, word, result);
113         }
114         lastWord = word;
115     }
116 
117     assertSuccess("readLine", ec);
118 
119     logln((UnicodeString)"Words checked: " + wordCount);
120 }
121 
122 /**
123  * Read the external dictionary file, which is already in proper
124  * sorted order, and confirm that the collator compares each line as
125  * preceding the following line.
126  */
TestDictionary(void)127 void CollationThaiTest::TestDictionary(void) {
128     if (coll == 0) {
129         errln("Error: could not construct Thai collator");
130         return;
131     }
132 
133     UErrorCode ec = U_ZERO_ERROR;
134     TextFile riwords("riwords.txt", "UTF8", ec);
135     if (U_FAILURE(ec)) {
136         logln("Can't open riwords.txt: %s; skipping test",
137               u_errorName(ec));
138         return;
139     }
140 
141     //
142     // Loop through each word in the dictionary and compare it to the previous
143     // word.  They should be in sorted order.
144     //
145     UnicodeString lastWord, word;
146     int32_t failed = 0;
147     int32_t wordCount = 0;
148     while (riwords.readLineSkippingComments(word, ec, FALSE) && U_SUCCESS(ec)) {
149 
150         // Show the first 8 words being compared, so we can see what's happening
151         ++wordCount;
152         if (wordCount <= 8) {
153             UnicodeString str;
154             logln((UnicodeString)"Word " + wordCount + ": " + IntlTest::prettify(word, str));
155         }
156 
157         if (lastWord.length() > 0) {
158             int32_t result = coll->compare(lastWord, word);
159 
160             if (result > 0) {
161                 failed++;
162                 if (MAX_FAILURES_TO_SHOW < 0 || failed <= MAX_FAILURES_TO_SHOW) {
163                     UnicodeString str;
164                     UnicodeString msg =
165                         UnicodeString("--------------------------------------------\n")
166                         + riwords.getLineNumber()
167                         + " compare(" + IntlTest::prettify(lastWord, str);
168                     msg += UnicodeString(", ")
169                         + IntlTest::prettify(word, str) + ") returned " + result
170                         + ", expected -1\n";
171                     UErrorCode status = U_ZERO_ERROR;
172                     CollationKey k1, k2;
173                     coll->getCollationKey(lastWord, k1, status);
174                     coll->getCollationKey(word, k2, status);
175                     if (U_FAILURE(status)) {
176                         errln((UnicodeString)"Fail: getCollationKey returned " + u_errorName(status));
177                         return;
178                     }
179                     msg.append("key1: ").append(prettify(k1, str)).append("\n");
180                     msg.append("key2: ").append(prettify(k2, str));
181                     errln(msg);
182                 }
183             }
184         }
185         lastWord = word;
186     }
187 
188     assertSuccess("readLine", ec);
189 
190     if (failed != 0) {
191         if (failed > MAX_FAILURES_TO_SHOW) {
192             errln((UnicodeString)"Too many failures; only the first " +
193                   MAX_FAILURES_TO_SHOW + " failures were shown");
194         }
195         errln((UnicodeString)"Summary: " + failed + " of " + (riwords.getLineNumber() - 1) +
196               " comparisons failed");
197     }
198 
199     logln((UnicodeString)"Words checked: " + wordCount);
200 }
201 
202 /**
203  * Odd corner conditions taken from "How to Sort Thai Without Rewriting Sort",
204  * by Doug Cooper, http://seasrc.th.net/paper/thaisort.zip
205  */
TestCornerCases(void)206 void CollationThaiTest::TestCornerCases(void) {
207     const char* TESTS[] = {
208         // Shorter words precede longer
209         "\\u0e01",                               "<",    "\\u0e01\\u0e01",
210 
211         // Tone marks are considered after letters (i.e. are primary ignorable)
212         "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e49\\u0e32",
213 
214         // ditto for other over-marks
215         "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e32\\u0e4c",
216 
217         // commonly used mark-in-context order.
218         // In effect, marks are sorted after each syllable.
219         "\\u0e01\\u0e32\\u0e01\\u0e49\\u0e32",   "<",    "\\u0e01\\u0e48\\u0e32\\u0e01\\u0e49\\u0e32",
220 
221         // Hyphens and other punctuation follow whitespace but come before letters
222         "\\u0e01\\u0e32",                        "=",    "\\u0e01\\u0e32-",
223         "\\u0e01\\u0e32-",                       "<",    "\\u0e01\\u0e32\\u0e01\\u0e32",
224 
225         // Doubler follows an indentical word without the doubler
226         "\\u0e01\\u0e32",                        "=",    "\\u0e01\\u0e32\\u0e46",
227         "\\u0e01\\u0e32\\u0e46",                 "<",    "\\u0e01\\u0e32\\u0e01\\u0e32",
228 
229 
230         // \\u0e45 after either \\u0e24 or \\u0e26 is treated as a single
231         // combining character, similar to "c < ch" in traditional spanish.
232         // TODO: beef up this case
233         "\\u0e24\\u0e29\\u0e35",                 "<",    "\\u0e24\\u0e45\\u0e29\\u0e35",
234         "\\u0e26\\u0e29\\u0e35",                 "<",    "\\u0e26\\u0e45\\u0e29\\u0e35",
235 
236         // Vowels reorder, should compare \\u0e2d and \\u0e34
237         "\\u0e40\\u0e01\\u0e2d",                 "<",    "\\u0e40\\u0e01\\u0e34",
238 
239         // Tones are compared after the rest of the word (e.g. primary ignorable)
240         "\\u0e01\\u0e32\\u0e01\\u0e48\\u0e32",   "<",    "\\u0e01\\u0e49\\u0e32\\u0e01\\u0e32",
241 
242         // Periods are ignored entirely
243         "\\u0e01.\\u0e01.",                      "<",    "\\u0e01\\u0e32",
244     };
245     const int32_t TESTS_length = UPRV_LENGTHOF(TESTS);
246 
247     if (coll == 0) {
248         errln("Error: could not construct Thai collator");
249         return;
250     }
251     compareArray(*coll, TESTS, TESTS_length);
252 }
253 
254 //------------------------------------------------------------------------
255 // Internal utilities
256 //------------------------------------------------------------------------
257 
compareArray(Collator & c,const char * tests[],int32_t testsLength)258 void CollationThaiTest::compareArray(Collator& c, const char* tests[],
259                                      int32_t testsLength) {
260     for (int32_t i = 0; i < testsLength; i += 3) {
261 
262         Collator::EComparisonResult expect;
263         if (tests[i+1][0] == '<') {
264           expect = Collator::LESS;
265         } else if (tests[i+1][0] == '>') {
266           expect = Collator::GREATER;
267         } else if (tests[i+1][0] == '=') {
268           expect = Collator::EQUAL;
269         } else {
270             // expect = Integer.decode(tests[i+1]).intValue();
271             errln((UnicodeString)"Error: unknown operator " + tests[i+1]);
272             return;
273         }
274 
275         UnicodeString s1, s2;
276         parseChars(s1, tests[i]);
277         parseChars(s2, tests[i+2]);
278 
279         doTest(&c, s1, s2, expect);
280 #if 0
281         UErrorCode status = U_ZERO_ERROR;
282         int32_t result = c.compare(s1, s2);
283         if (sign(result) != sign(expect))
284         {
285             UnicodeString t1, t2;
286             errln(UnicodeString("") +
287                   i/3 + ": compare(" + IntlTest::prettify(s1, t1)
288                   + " , " + IntlTest::prettify(s2, t2)
289                   + ") got " + result + "; expected " + expect);
290 
291             CollationKey k1, k2;
292             c.getCollationKey(s1, k1, status);
293             c.getCollationKey(s2, k2, status);
294             if (U_FAILURE(status)) {
295                 errln((UnicodeString)"Fail: getCollationKey returned " + u_errorName(status));
296                 return;
297             }
298             errln((UnicodeString)"  key1: " + prettify(k1, t1) );
299             errln((UnicodeString)"  key2: " + prettify(k2, t2) );
300         }
301         else
302         {
303             // Collator.compare worked OK; now try the collation keys
304             CollationKey k1, k2;
305             c.getCollationKey(s1, k1, status);
306             c.getCollationKey(s2, k2, status);
307             if (U_FAILURE(status)) {
308                 errln((UnicodeString)"Fail: getCollationKey returned " + u_errorName(status));
309                 return;
310             }
311 
312             result = k1.compareTo(k2);
313             if (sign(result) != sign(expect)) {
314                 UnicodeString t1, t2;
315                 errln(UnicodeString("") +
316                       i/3 + ": key(" + IntlTest::prettify(s1, t1)
317                       + ").compareTo(key(" + IntlTest::prettify(s2, t2)
318                       + ")) got " + result + "; expected " + expect);
319 
320                 errln((UnicodeString)"  " + prettify(k1, t1) + " vs. " + prettify(k2, t2));
321             }
322         }
323 #endif
324     }
325 }
326 
sign(int32_t i)327 int8_t CollationThaiTest::sign(int32_t i) {
328     if (i < 0) return -1;
329     if (i > 0) return 1;
330     return 0;
331 }
332 
333 /**
334  * Set a UnicodeString corresponding to the given string.  Use
335  * UnicodeString and the default converter, unless we see the sequence
336  * "\\u", in which case we interpret the subsequent escape.
337  */
parseChars(UnicodeString & result,const char * chars)338 UnicodeString& CollationThaiTest::parseChars(UnicodeString& result,
339                                              const char* chars) {
340     return result = CharsToUnicodeString(chars);
341 }
342 
343 UCollator *thaiColl = NULL;
344 
345 U_CDECL_BEGIN
346 static int U_CALLCONV
StrCmp(const void * p1,const void * p2)347 StrCmp(const void *p1, const void *p2) {
348   return ucol_strcoll(thaiColl, *(UChar **) p1, -1,  *(UChar **)p2, -1);
349 }
350 U_CDECL_END
351 
352 
353 #define LINES 6
354 
TestInvalidThai(void)355 void CollationThaiTest::TestInvalidThai(void) {
356   const char *tests[LINES] = {
357     "\\u0E44\\u0E01\\u0E44\\u0E01",
358     "\\u0E44\\u0E01\\u0E01\\u0E44",
359     "\\u0E01\\u0E44\\u0E01\\u0E44",
360     "\\u0E01\\u0E01\\u0E44\\u0E44",
361     "\\u0E44\\u0E44\\u0E01\\u0E01",
362     "\\u0E01\\u0E44\\u0E44\\u0E01",
363   };
364 
365   UChar strings[LINES][20];
366 
367   UChar *toSort[LINES];
368 
369   int32_t i = 0, j = 0, len = 0;
370 
371   UErrorCode coll_status = U_ZERO_ERROR;
372   UnicodeString iteratorText;
373 
374   thaiColl = ucol_open ("th_TH", &coll_status);
375   if (U_FAILURE(coll_status)) {
376     errln("Error opening Thai collator: %s", u_errorName(coll_status));
377     return;
378   }
379 
380   CollationElementIterator* c = ((RuleBasedCollator *)coll)->createCollationElementIterator( iteratorText );
381 
382   for(i = 0; i < UPRV_LENGTHOF(tests); i++) {
383     len = u_unescape(tests[i], strings[i], 20);
384     strings[i][len] = 0;
385     toSort[i] = strings[i];
386   }
387 
388   qsort (toSort, LINES, sizeof (UChar *), StrCmp);
389 
390   for (i=0; i < LINES; i++)
391   {
392     logln("%i", i);
393       for (j=i+1; j < LINES; j++) {
394           if (ucol_strcoll (thaiColl, toSort[i], -1, toSort[j], -1) == UCOL_GREATER)
395           {
396               // inconsistency ordering found!
397             errln("Inconsistent ordering between strings %i and %i", i, j);
398           }
399       }
400       iteratorText.setTo(toSort[i]);
401       c->setText(iteratorText, coll_status);
402       backAndForth(*c);
403   }
404 
405 
406   ucol_close(thaiColl);
407   delete c;
408 }
409 
TestReordering(void)410 void CollationThaiTest::TestReordering(void) {
411   // Until UCA 4.1, the collation code swapped Thai/Lao prevowels with the following consonants,
412   // resulting in consonant+prevowel == prevowel+consonant.
413   // From UCA 5.0 on, there are order-reversing contractions for prevowel+consonant.
414   // From UCA 5.0 until UCA 6.1, there was a tertiary difference between
415   // consonant+prevowel and prevowel+consonant.
416   // In UCA 6.2, they compare equal again.
417   // The test was modified to using a collator with strength=secondary,
418   // ignoring possible tertiary differences.
419   const char *tests[] = {
420     "\\u0E41c\\u0301",       "=", "\\u0E41\\u0107", // composition
421     "\\u0E41\\U0001D7CE",    "<", "\\u0E41\\U0001D7CF", // supplementaries
422     "\\u0E41\\U0001D15F",    "=", "\\u0E41\\U0001D158\\U0001D165", // supplementary composition decomps to supplementary
423     "\\u0E41\\U0002F802",    "=", "\\u0E41\\u4E41", // supplementary composition decomps to BMP
424     "\\u0E41\\u0301",        "=", "\\u0E41\\u0301", // unsafe (just checking backwards iteration)
425     "\\u0E41\\u0301\\u0316", "=", "\\u0E41\\u0316\\u0301",
426 
427     "\\u0e24\\u0e41",        "=", "\\u0e41\\u0e24", // exiting contraction bug
428     "\\u0e3f\\u0e3f\\u0e24\\u0e41", "=", "\\u0e3f\\u0e3f\\u0e41\\u0e24",
429 
430     "abc\\u0E41c\\u0301",       "=", "abc\\u0E41\\u0107", // composition
431     "abc\\u0E41\\U0001D000",    "<", "abc\\u0E41\\U0001D001", // supplementaries
432     "abc\\u0E41\\U0001D15F",    "=", "abc\\u0E41\\U0001D158\\U0001D165", // supplementary composition decomps to supplementary
433     "abc\\u0E41\\U0002F802",    "=", "abc\\u0E41\\u4E41", // supplementary composition decomps to BMP
434     "abc\\u0E41\\u0301",        "=", "abc\\u0E41\\u0301", // unsafe (just checking backwards iteration)
435     "abc\\u0E41\\u0301\\u0316", "=", "abc\\u0E41\\u0316\\u0301",
436 
437     "\\u0E41c\\u0301abc",       "=", "\\u0E41\\u0107abc", // composition
438     "\\u0E41\\U0001D000abc",    "<", "\\u0E41\\U0001D001abc", // supplementaries
439     "\\u0E41\\U0001D15Fabc",    "=", "\\u0E41\\U0001D158\\U0001D165abc", // supplementary composition decomps to supplementary
440     "\\u0E41\\U0002F802abc",    "=", "\\u0E41\\u4E41abc", // supplementary composition decomps to BMP
441     "\\u0E41\\u0301abc",        "=", "\\u0E41\\u0301abc", // unsafe (just checking backwards iteration)
442     "\\u0E41\\u0301\\u0316abc", "=", "\\u0E41\\u0316\\u0301abc",
443 
444     "abc\\u0E41c\\u0301abc",       "=", "abc\\u0E41\\u0107abc", // composition
445     "abc\\u0E41\\U0001D000abc",    "<", "abc\\u0E41\\U0001D001abc", // supplementaries
446     "abc\\u0E41\\U0001D15Fabc",    "=", "abc\\u0E41\\U0001D158\\U0001D165abc", // supplementary composition decomps to supplementary
447     "abc\\u0E41\\U0002F802abc",    "=", "abc\\u0E41\\u4E41abc", // supplementary composition decomps to BMP
448     "abc\\u0E41\\u0301abc",        "=", "abc\\u0E41\\u0301abc", // unsafe (just checking backwards iteration)
449     "abc\\u0E41\\u0301\\u0316abc", "=", "abc\\u0E41\\u0316\\u0301abc",
450   };
451 
452   LocalPointer<Collator> coll2(coll->clone());
453   UErrorCode status = U_ZERO_ERROR;
454   coll2->setAttribute(UCOL_STRENGTH, UCOL_SECONDARY, status);
455   if(U_FAILURE(status)) {
456     errln("Unable to set the Thai collator clone to secondary strength");
457     return;
458   }
459   compareArray(*coll2, tests, UPRV_LENGTHOF(tests));
460 
461   const char *rule = "& c < ab";
462   const char *testcontraction[] = { "\\u0E41ab", ">", "\\u0E41c"}; // After UCA 4.1 Thai are normal so won't break a contraction
463   UnicodeString rules;
464   parseChars(rules, rule);
465   LocalPointer<RuleBasedCollator> rcoll(new RuleBasedCollator(rules, status), status);
466   if(U_SUCCESS(status)) {
467     compareArray(*rcoll, testcontraction, 3);
468   } else {
469     errln("Couldn't instantiate collator from rules");
470   }
471 
472 }
473 
474 
475 #endif /* #if !UCONFIG_NO_COLLATION */
476