1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (c) 2002-2014, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  ********************************************************************
6  *
7  * @author Mark E. Davis
8  * @author Vladimir Weinstein
9  */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_NORMALIZATION
14 
15 #include "intltest.h"
16 #include "cstring.h"
17 #include "canittst.h"
18 #include "unicode/caniter.h"
19 #include "unicode/normlzr.h"
20 #include "unicode/uchar.h"
21 #include "hash.h"
22 
/**
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The argument is parenthesized so that any array-valued expression
 * can be passed safely.
 */
#define ARRAY_LENGTH(array) ((int32_t)(sizeof(array) / sizeof((array)[0])))
24 
25 #define CASE(id,test) case id:                          \
26                           name = #test;                 \
27                           if (exec) {                   \
28                               logln(#test "---");       \
29                               logln((UnicodeString)""); \
30                               test();                   \
31                           }                             \
32                           break
33 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)34 void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec,
35                                          const char* &name, char* /*par*/) {
36     switch (index) {
37         CASE(0, TestBasic);
38         CASE(1, TestExhaustive);
39         CASE(2, TestAPI);
40       default: name = ""; break;
41     }
42 }
43 
/*
 * Disabled helper, kept for reference only (not compiled):
 * Convert Java-style strings with \u Unicode escapes into UnicodeString objects
static UnicodeString str(const char *input)
{
    UnicodeString str(input, ""); // Invariant conversion
    return str.unescape();
}
 */
52 
53 
CanonicalIteratorTest()54 CanonicalIteratorTest::CanonicalIteratorTest() :
55 nameTrans(NULL), hexTrans(NULL)
56 {
57 }
58 
~CanonicalIteratorTest()59 CanonicalIteratorTest::~CanonicalIteratorTest()
60 {
61 #if !UCONFIG_NO_TRANSLITERATION
62   if(nameTrans != NULL) {
63     delete(nameTrans);
64   }
65   if(hexTrans != NULL) {
66     delete(hexTrans);
67   }
68 #endif
69 }
70 
TestExhaustive()71 void CanonicalIteratorTest::TestExhaustive() {
72     UErrorCode status = U_ZERO_ERROR;
73     CanonicalIterator it("", status);
74     if (U_FAILURE(status)) {
75         dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
76         return;
77     }
78     UChar32 i = 0;
79     UnicodeString s;
80     // Test static and dynamic class IDs
81     if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
82         errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID");
83     }
84     for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) {
85         //for (i = 0xae00; i < 0xaf00; ++i) {
86 
87         if ((i % 0x100) == 0) {
88             logln("Testing U+%06X", i);
89         }
90 
91         // skip characters we know don't have decomps
92         int8_t type = u_charType(i);
93         if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR
94             || type == U_SURROGATE) continue;
95 
96         s = i;
97         characterTest(s, i, it);
98 
99         s += (UChar32)0x0345; //"\\u0345";
100         characterTest(s, i, it);
101     }
102 }
103 
TestBasic()104 void CanonicalIteratorTest::TestBasic() {
105 
106     UErrorCode status = U_ZERO_ERROR;
107 
108     static const char * const testArray[][2] = {
109         {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, "
110             "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, "
111             "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, "
112             "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"},
113         {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"},
114         {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"},
115     };
116 
117 #if 0
118     // This is not interesting for C/C++ as the data is already built beforehand
119     // check build
120     UnicodeSet ss = CanonicalIterator.getSafeStart();
121     logln("Safe Start: " + ss.toPattern(true));
122     ss = CanonicalIterator.getStarts('a');
123     expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
124         new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
125         + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
126             );
127 #endif
128 
129     // check permute
130     // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
131 
132     Hashtable *permutations = new Hashtable(FALSE, status);
133     permutations->setValueDeleter(uprv_deleteUObject);
134     UnicodeString toPermute("ABC");
135 
136     CanonicalIterator::permute(toPermute, FALSE, permutations, status);
137 
138     logln("testing permutation");
139 
140     expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA");
141 
142     delete permutations;
143 
144     // try samples
145     logln("testing samples");
146     Hashtable *set = new Hashtable(FALSE, status);
147     set->setValueDeleter(uprv_deleteUObject);
148     int32_t i = 0;
149     CanonicalIterator it("", status);
150     if(U_SUCCESS(status)) {
151       for (i = 0; i < ARRAY_LENGTH(testArray); ++i) {
152           //logln("Results for: " + name.transliterate(testArray[i]));
153           UnicodeString testStr = CharsToUnicodeString(testArray[i][0]);
154           it.setSource(testStr, status);
155           set->removeAll();
156           for (;;) {
157               //UnicodeString *result = new UnicodeString(it.next());
158               UnicodeString result(it.next());
159               if (result.isBogus()) {
160                   break;
161               }
162               set->put(result, new UnicodeString(result), status); // Add result to the table
163               //logln(++counter + ": " + hex.transliterate(result));
164               //logln(" = " + name.transliterate(result));
165           }
166           expectEqual(i + UnicodeString(": "), testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1]));
167 
168       }
169     } else {
170       dataerrln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status));
171     }
172     delete set;
173 }
174 
characterTest(UnicodeString & s,UChar32 ch,CanonicalIterator & it)175 void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it)
176 {
177     UErrorCode status = U_ZERO_ERROR;
178     UnicodeString decomp, comp;
179     UBool gotDecomp = FALSE;
180     UBool gotComp = FALSE;
181     UBool gotSource = FALSE;
182 
183     Normalizer::decompose(s, FALSE, 0, decomp, status);
184     Normalizer::compose(s, FALSE, 0, comp, status);
185 
186     // skip characters that don't have either decomp.
187     // need quick test for this!
188     if (s == decomp && s == comp) {
189         return;
190     }
191 
192     it.setSource(s, status);
193 
194     for (;;) {
195         UnicodeString item = it.next();
196         if (item.isBogus()) break;
197         if (item == s) gotSource = TRUE;
198         if (item == decomp) gotDecomp = TRUE;
199         if (item == comp) gotComp = TRUE;
200     }
201 
202     if (!gotSource || !gotDecomp || !gotComp) {
203         errln("FAIL CanonicalIterator: " + s + (int)ch);
204     }
205 }
206 
expectEqual(const UnicodeString & message,const UnicodeString & item,const UnicodeString & a,const UnicodeString & b)207 void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) {
208     if (!(a==b)) {
209         errln("FAIL: " + message + getReadable(item));
210         errln("\t" + getReadable(a));
211         errln("\t" + getReadable(b));
212     } else {
213         logln("Checked: " + message + getReadable(item));
214         logln("\t" + getReadable(a));
215         logln("\t" + getReadable(b));
216     }
217 }
218 
/**
 * Returns a bracketed, printable form of a string for log output.
 *
 * The result is "[" + hex-escaped text + "]"; in verbose mode the
 * character-name form followed by ";" is inserted before the hex form.
 * The transliterators are created on first use and cached in the members
 * nameTrans / hexTrans. When a transliterator cannot be created, the
 * corresponding part falls back to the untransliterated text.
 *
 * @param s string to display
 * @return the readable form, or an empty string when s is empty
 */
UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) {
    if (s.length() == 0) return "";

    UnicodeString result = "[";
    // set up for readable display
#if !UCONFIG_NO_TRANSLITERATION
    if (verbose) {
        if (nameTrans == NULL) {
            // Separate status per creation: a failure here must not prevent
            // the hex transliterator below from being created.
            UErrorCode nameStatus = U_ZERO_ERROR;
            nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, nameStatus);
        }
        UnicodeString sName = s;
        if (nameTrans != NULL) { // maybe there is no data and transliterator cannot be instantiated
            nameTrans->transliterate(sName);
        }
        result += sName;
        result += ";";
    }
#endif

    UnicodeString sHex = s;
#if !UCONFIG_NO_TRANSLITERATION
    if (hexTrans == NULL) {
        UErrorCode hexStatus = U_ZERO_ERROR;
        hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, hexStatus);
    }
    if (hexTrans != NULL) { // maybe there is no data and transliterator cannot be instantiated
        hexTrans->transliterate(sHex);
    }
#endif
    result += sHex;
    result += "]";
    return result;
    //return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]";
}
247 
248 U_CFUNC int U_CALLCONV
compareUnicodeStrings(const void * s1,const void * s2)249 compareUnicodeStrings(const void *s1, const void *s2) {
250   UnicodeString **st1 = (UnicodeString **)s1;
251   UnicodeString **st2 = (UnicodeString **)s2;
252 
253   return (*st1)->compare(**st2);
254 }
255 
256 
collectionToString(Hashtable * col)257 UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) {
258     UnicodeString result;
259 
260     // Iterate over the Hashtable, then qsort.
261 
262     UnicodeString **resArray = new UnicodeString*[col->count()];
263     int32_t i = 0;
264 
265     const UHashElement *ne = NULL;
266     int32_t el = UHASH_FIRST;
267     //Iterator it = basic.iterator();
268     ne = col->nextElement(el);
269     //while (it.hasNext())
270     while (ne != NULL) {
271       //String item = (String) it.next();
272       UnicodeString *item = (UnicodeString *)(ne->value.pointer);
273       resArray[i++] = item;
274       ne = col->nextElement(el);
275     }
276 
277     for(i = 0; i<col->count(); ++i) {
278       logln(*resArray[i]);
279     }
280 
281     qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings);
282 
283     result = *resArray[0];
284 
285     for(i = 1; i<col->count(); ++i) {
286       result += ", ";
287       result += *resArray[i];
288     }
289 
290 /*
291     Iterator it = col.iterator();
292     while (it.hasNext()) {
293         if (result.length() != 0) result.append(", ");
294         result.append(it.next().toString());
295     }
296 */
297 
298     delete [] resArray;
299 
300     return result;
301 }
302 
TestAPI()303 void CanonicalIteratorTest::TestAPI() {
304   UErrorCode status = U_ZERO_ERROR;
305   // Test reset and getSource
306   UnicodeString start("ljubav");
307   logln("Testing CanonicalIterator::getSource");
308   logln("Instantiating canonical iterator with string "+start);
309   CanonicalIterator can(start, status);
310   if (U_FAILURE(status)) {
311       dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
312       return;
313   }
314   UnicodeString source = can.getSource();
315   logln("CanonicalIterator::getSource returned "+source);
316   if(start != source) {
317     errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source);
318   }
319   logln("Testing CanonicalIterator::reset");
320   UnicodeString next = can.next();
321   logln("CanonicalIterator::next returned "+next);
322 
323   can.reset();
324 
325   UnicodeString afterReset = can.next();
326   logln("After reset, CanonicalIterator::next returned "+afterReset);
327 
328   if(next != afterReset) {
329     errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+").");
330   }
331 
332   logln("Testing getStaticClassID and getDynamicClassID");
333   if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
334       errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID");
335   }
336 }
337 
338 #endif /* #if !UCONFIG_NO_NORMALIZATION */
339