1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1999-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   11/10/99    aliu        Creation.
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_TRANSLITERATION
16 
17 #include "transtst.h"
18 #include "unicode/locid.h"
19 #include "unicode/dtfmtsym.h"
20 #include "unicode/normlzr.h"
21 #include "unicode/translit.h"
22 #include "unicode/uchar.h"
23 #include "unicode/unifilt.h"
24 #include "unicode/uniset.h"
25 #include "unicode/ustring.h"
26 #include "unicode/usetiter.h"
27 #include "unicode/uscript.h"
28 #include "unicode/utf16.h"
29 #include "cpdtrans.h"
30 #include "nultrans.h"
31 #include "rbt.h"
32 #include "rbt_pars.h"
33 #include "anytrans.h"
34 #include "esctrn.h"
35 #include "name2uni.h"
36 #include "nortrans.h"
37 #include "remtrans.h"
38 #include "titletrn.h"
39 #include "tolowtrn.h"
40 #include "toupptrn.h"
41 #include "unesctrn.h"
42 #include "uni2name.h"
43 #include "cstring.h"
44 #include "cmemory.h"
45 #include <stdio.h>
46 
47 /***********************************************************************
48 
49                      HOW TO USE THIS TEST FILE
50                                -or-
51                   How I developed on two platforms
52                 without losing (too much of) my mind
53 
54 
55 1. Add new tests by copying/pasting/changing existing tests.  On Java,
56    any public void method named Test...() taking no parameters becomes
57    a test.  On C++, you need to modify the header and add a line to
58    the runIndexedTest() dispatch method.
59 
60 2. Make liberal use of the expect() method; it is your friend.
61 
62 3. The tests in this file exactly match those in a sister file on the
63    other side.  The two files are:
64 
65    icu4j:  src/com/ibm/test/translit/TransliteratorTest.java
66    icu4c:  source/test/intltest/transtst.cpp
67 
68                   ==> THIS IS THE IMPORTANT PART <==
69 
70    When you add a test in this file, add it in TransliteratorTest.java
71    too.  Give it the same name and put it in the same relative place.
72    This makes maintenance a lot simpler for any poor soul who ends up
73    trying to synchronize the tests between icu4j and icu4c.
74 
75 4. If you MUST enter a test that is NOT paralleled in the sister file,
76    then add it in the special non-mirrored section.  These are
77    labeled
78 
79      "icu4j ONLY"
80 
81    or
82 
83      "icu4c ONLY"
84 
85    Make sure you document the reason the test is here and not there.
86 
87 
88 Thank you.
89 The Management
90 ***********************************************************************/
91 
92 // Define character constants thusly to be EBCDIC-friendly
93 enum {
94     LEFT_BRACE=((UChar)0x007B), /*{*/
95     PIPE      =((UChar)0x007C), /*|*/
96     ZERO      =((UChar)0x0030), /*0*/
97     UPPER_A   =((UChar)0x0041)  /*A*/
98 };
99 
TransliteratorTest()100 TransliteratorTest::TransliteratorTest()
101 :   DESERET_DEE((UChar32)0x10414),
102     DESERET_dee((UChar32)0x1043C)
103 {
104 }
105 
~TransliteratorTest()106 TransliteratorTest::~TransliteratorTest() {}
107 
108 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)109 TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
110                                    const char* &name, char* /*par*/) {
111     switch (index) {
112         TESTCASE(0,TestInstantiation);
113         TESTCASE(1,TestSimpleRules);
114         TESTCASE(2,TestRuleBasedInverse);
115         TESTCASE(3,TestKeyboard);
116         TESTCASE(4,TestKeyboard2);
117         TESTCASE(5,TestKeyboard3);
118         TESTCASE(6,TestArabic);
119         TESTCASE(7,TestCompoundKana);
120         TESTCASE(8,TestCompoundHex);
121         TESTCASE(9,TestFiltering);
122         TESTCASE(10,TestInlineSet);
123         TESTCASE(11,TestPatternQuoting);
124         TESTCASE(12,TestJ277);
125         TESTCASE(13,TestJ243);
126         TESTCASE(14,TestJ329);
127         TESTCASE(15,TestSegments);
128         TESTCASE(16,TestCursorOffset);
129         TESTCASE(17,TestArbitraryVariableValues);
130         TESTCASE(18,TestPositionHandling);
131         TESTCASE(19,TestHiraganaKatakana);
132         TESTCASE(20,TestCopyJ476);
133         TESTCASE(21,TestAnchors);
134         TESTCASE(22,TestInterIndic);
135         TESTCASE(23,TestFilterIDs);
136         TESTCASE(24,TestCaseMap);
137         TESTCASE(25,TestNameMap);
138         TESTCASE(26,TestLiberalizedID);
139         TESTCASE(27,TestCreateInstance);
140         TESTCASE(28,TestNormalizationTransliterator);
141         TESTCASE(29,TestCompoundRBT);
142         TESTCASE(30,TestCompoundFilter);
143         TESTCASE(31,TestRemove);
144         TESTCASE(32,TestToRules);
145         TESTCASE(33,TestContext);
146         TESTCASE(34,TestSupplemental);
147         TESTCASE(35,TestQuantifier);
148         TESTCASE(36,TestSTV);
149         TESTCASE(37,TestCompoundInverse);
150         TESTCASE(38,TestNFDChainRBT);
151         TESTCASE(39,TestNullInverse);
152         TESTCASE(40,TestAliasInverseID);
153         TESTCASE(41,TestCompoundInverseID);
154         TESTCASE(42,TestUndefinedVariable);
155         TESTCASE(43,TestEmptyContext);
156         TESTCASE(44,TestCompoundFilterID);
157         TESTCASE(45,TestPropertySet);
158         TESTCASE(46,TestNewEngine);
159         TESTCASE(47,TestQuantifiedSegment);
160         TESTCASE(48,TestDevanagariLatinRT);
161         TESTCASE(49,TestTeluguLatinRT);
162         TESTCASE(50,TestCompoundLatinRT);
163         TESTCASE(51,TestSanskritLatinRT);
164         TESTCASE(52,TestLocaleInstantiation);
165         TESTCASE(53,TestTitleAccents);
166         TESTCASE(54,TestLocaleResource);
167         TESTCASE(55,TestParseError);
168         TESTCASE(56,TestOutputSet);
169         TESTCASE(57,TestVariableRange);
170         TESTCASE(58,TestInvalidPostContext);
171         TESTCASE(59,TestIDForms);
172         TESTCASE(60,TestToRulesMark);
173         TESTCASE(61,TestEscape);
174         TESTCASE(62,TestAnchorMasking);
175         TESTCASE(63,TestDisplayName);
176         TESTCASE(64,TestSpecialCases);
177 #if !UCONFIG_NO_FILE_IO
178         TESTCASE(65,TestIncrementalProgress);
179 #endif
180         TESTCASE(66,TestSurrogateCasing);
181         TESTCASE(67,TestFunction);
182         TESTCASE(68,TestInvalidBackRef);
183         TESTCASE(69,TestMulticharStringSet);
184         TESTCASE(70,TestUserFunction);
185         TESTCASE(71,TestAnyX);
186         TESTCASE(72,TestSourceTargetSet);
187         TESTCASE(73,TestGurmukhiDevanagari);
188         TESTCASE(74,TestPatternWhiteSpace);
189         TESTCASE(75,TestAllCodepoints);
190         TESTCASE(76,TestBoilerplate);
191         TESTCASE(77,TestAlternateSyntax);
192         TESTCASE(78,TestBeginEnd);
193         TESTCASE(79,TestBeginEndToRules);
194         TESTCASE(80,TestRegisterAlias);
195         TESTCASE(81,TestRuleStripping);
196         TESTCASE(82,TestHalfwidthFullwidth);
197         TESTCASE(83,TestThai);
198         TESTCASE(84,TestAny);
199         TESTCASE(85,TestBasicTransliteratorEvenWithoutData);
200         default: name = ""; break;
201     }
202 }
203 
204 /**
205  * Make sure every system transliterator can be instantiated.
206  *
207  * ALSO test that the result of toRules() for each rule is a valid
208  * rule.  Do this here so we don't have to have another test that
209  * instantiates everything as well.
210  */
TestInstantiation()211 void TransliteratorTest::TestInstantiation() {
212     UErrorCode ec = U_ZERO_ERROR;
213     StringEnumeration* avail = Transliterator::getAvailableIDs(ec);
214     assertSuccess("getAvailableIDs()", ec);
215     assertTrue("getAvailableIDs()!=NULL", avail!=NULL);
216     int32_t n = Transliterator::countAvailableIDs();
217     assertTrue("getAvailableIDs().count()==countAvailableIDs()",
218                avail->count(ec) == n);
219     assertSuccess("count()", ec);
220     UnicodeString name;
221     for (int32_t i=0; i<n; ++i) {
222         const UnicodeString& id = *avail->snext(ec);
223         if (!assertSuccess("snext()", ec) ||
224             !assertTrue("snext()!=NULL", (&id)!=NULL, TRUE)) {
225             break;
226         }
227         UnicodeString id2 = Transliterator::getAvailableID(i);
228         if (id.length() < 1) {
229             errln(UnicodeString("FAIL: getAvailableID(") +
230                   i + ") returned empty string");
231             continue;
232         }
233         if (id != id2) {
234             errln(UnicodeString("FAIL: getAvailableID(") +
235                   i + ") != getAvailableIDs().snext()");
236             continue;
237         }
238         UParseError parseError;
239         UErrorCode status = U_ZERO_ERROR;
240         Transliterator* t = Transliterator::createInstance(id,
241                               UTRANS_FORWARD, parseError,status);
242         name.truncate(0);
243         Transliterator::getDisplayName(id, name);
244         if (t == 0) {
245 #if UCONFIG_NO_BREAK_ITERATION
246             // If UCONFIG_NO_BREAK_ITERATION is on, then only Thai should fail.
247             if (id.compare((UnicodeString)"Thai-Latn") != 0 &&
248                 id.compare((UnicodeString)"Thai-Latin") != 0)
249 #endif
250                 dataerrln(UnicodeString("FAIL: Couldn't create ") + id +
251                       /*", parse error " + parseError.code +*/
252                       ", line " + parseError.line +
253                       ", offset " + parseError.offset +
254                       ", pre-context " + prettify(parseError.preContext, TRUE) +
255                       ", post-context " +prettify(parseError.postContext,TRUE) +
256                       ", Error: " + u_errorName(status));
257                 // When createInstance fails, it deletes the failing
258                 // entry from the available ID list.  We detect this
259                 // here by looking for a change in countAvailableIDs.
260             int32_t nn = Transliterator::countAvailableIDs();
261             if (nn == (n - 1)) {
262                 n = nn;
263                 --i; // Compensate for deleted entry
264             }
265         } else {
266             logln(UnicodeString("OK: ") + name + " (" + id + ")");
267 
268             // Now test toRules
269             UnicodeString rules;
270             t->toRules(rules, TRUE);
271             Transliterator *u = Transliterator::createFromRules("x",
272                                     rules, UTRANS_FORWARD, parseError,status);
273             if (u == 0) {
274                 errln(UnicodeString("FAIL: ") + id +
275                       ".createFromRules() => bad rules" +
276                       /*", parse error " + parseError.code +*/
277                       ", line " + parseError.line +
278                       ", offset " + parseError.offset +
279                       ", context " + prettify(parseError.preContext, TRUE) +
280                       ", rules: " + prettify(rules, TRUE));
281             } else {
282                 delete u;
283             }
284             delete t;
285         }
286     }
287     assertTrue("snext()==NULL", avail->snext(ec)==NULL);
288     assertSuccess("snext()", ec);
289     delete avail;
290 
291     // Now test the failure path
292     UParseError parseError;
293     UErrorCode status = U_ZERO_ERROR;
294     UnicodeString id("<Not a valid Transliterator ID>");
295     Transliterator* t = Transliterator::createInstance(id, UTRANS_FORWARD, parseError, status);
296     if (t != 0) {
297         errln("FAIL: " + id + " returned a transliterator");
298         delete t;
299     } else {
300         logln("OK: Bogus ID handled properly");
301     }
302 }
303 
TestSimpleRules(void)304 void TransliteratorTest::TestSimpleRules(void) {
305     /* Example: rules 1. ab>x|y
306      *                2. yc>z
307      *
308      * []|eabcd  start - no match, copy e to tranlated buffer
309      * [e]|abcd  match rule 1 - copy output & adjust cursor
310      * [ex|y]cd  match rule 2 - copy output & adjust cursor
311      * [exz]|d   no match, copy d to transliterated buffer
312      * [exzd]|   done
313      */
314     expect(UnicodeString("ab>x|y;", "") +
315            "yc>z",
316            "eabcd", "exzd");
317 
318     /* Another set of rules:
319      *    1. ab>x|yzacw
320      *    2. za>q
321      *    3. qc>r
322      *    4. cw>n
323      *
324      * []|ab       Rule 1
325      * [x|yzacw]   No match
326      * [xy|zacw]   Rule 2
327      * [xyq|cw]    Rule 4
328      * [xyqn]|     Done
329      */
330     expect(UnicodeString("ab>x|yzacw;") +
331            "za>q;" +
332            "qc>r;" +
333            "cw>n",
334            "ab", "xyqn");
335 
336     /* Test categories
337      */
338     UErrorCode status = U_ZERO_ERROR;
339     UParseError parseError;
340     Transliterator *t = Transliterator::createFromRules(
341         "<ID>",
342         UnicodeString("$dummy=").append((UChar)0xE100) +
343         UnicodeString(";"
344                       "$vowel=[aeiouAEIOU];"
345                       "$lu=[:Lu:];"
346                       "$vowel } $lu > '!';"
347                       "$vowel > '&';"
348                       "'!' { $lu > '^';"
349                       "$lu > '*';"
350                       "a > ERROR", ""),
351         UTRANS_FORWARD, parseError,
352         status);
353     if (U_FAILURE(status)) {
354         dataerrln("FAIL: RBT constructor failed - %s", u_errorName(status));
355         return;
356     }
357     expect(*t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
358     delete t;
359 }
360 
361 /**
362  * Test inline set syntax and set variable syntax.
363  */
TestInlineSet(void)364 void TransliteratorTest::TestInlineSet(void) {
365     expect("{ [:Ll:] } x > y; [:Ll:] > z;", "aAbxq", "zAyzz");
366     expect("a[0-9]b > qrs", "1a7b9", "1qrs9");
367 
368     expect(UnicodeString(
369            "$digit = [0-9];"
370            "$alpha = [a-zA-Z];"
371            "$alphanumeric = [$digit $alpha];" // ***
372            "$special = [^$alphanumeric];"     // ***
373            "$alphanumeric > '-';"
374            "$special > '*';", ""),
375 
376            "thx-1138", "---*----");
377 }
378 
379 /**
380  * Create some inverses and confirm that they work.  We have to be
381  * careful how we do this, since the inverses will not be true
382  * inverses -- we can't throw any random string at the composition
383  * of the transliterators and expect the identity function.  F x
384  * F' != I.  However, if we are careful about the input, we will
385  * get the expected results.
386  */
TestRuleBasedInverse(void)387 void TransliteratorTest::TestRuleBasedInverse(void) {
388     UnicodeString RULES =
389         UnicodeString("abc>zyx;") +
390         "ab>yz;" +
391         "bc>zx;" +
392         "ca>xy;" +
393         "a>x;" +
394         "b>y;" +
395         "c>z;" +
396 
397         "abc<zyx;" +
398         "ab<yz;" +
399         "bc<zx;" +
400         "ca<xy;" +
401         "a<x;" +
402         "b<y;" +
403         "c<z;" +
404 
405         "";
406 
407     const char* DATA[] = {
408         // Careful here -- random strings will not work.  If we keep
409         // the left side to the domain and the right side to the range
410         // we will be okay though (left, abc; right xyz).
411         "a", "x",
412         "abcacab", "zyxxxyy",
413         "caccb", "xyzzy",
414     };
415 
416     int32_t DATA_length = UPRV_LENGTHOF(DATA);
417 
418     UErrorCode status = U_ZERO_ERROR;
419     UParseError parseError;
420     Transliterator *fwd = Transliterator::createFromRules("<ID>", RULES,
421                                 UTRANS_FORWARD, parseError, status);
422     Transliterator *rev = Transliterator::createFromRules("<ID>", RULES,
423                                 UTRANS_REVERSE, parseError, status);
424     if (U_FAILURE(status)) {
425         errln("FAIL: RBT constructor failed");
426         return;
427     }
428     for (int32_t i=0; i<DATA_length; i+=2) {
429         expect(*fwd, DATA[i], DATA[i+1]);
430         expect(*rev, DATA[i+1], DATA[i]);
431     }
432     delete fwd;
433     delete rev;
434 }
435 
436 /**
437  * Basic test of keyboard.
438  */
TestKeyboard(void)439 void TransliteratorTest::TestKeyboard(void) {
440     UParseError parseError;
441     UErrorCode status = U_ZERO_ERROR;
442     Transliterator *t = Transliterator::createFromRules("<ID>",
443                               UnicodeString("psch>Y;")
444                               +"ps>y;"
445                               +"ch>x;"
446                               +"a>A;",
447                               UTRANS_FORWARD, parseError,
448                               status);
449     if (U_FAILURE(status)) {
450         errln("FAIL: RBT constructor failed");
451         return;
452     }
453     const char* DATA[] = {
454         // insertion, buffer
455         "a", "A",
456         "p", "Ap",
457         "s", "Aps",
458         "c", "Apsc",
459         "a", "AycA",
460         "psch", "AycAY",
461         0, "AycAY", // null means finishKeyboardTransliteration
462     };
463 
464     keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
465     delete t;
466 }
467 
468 /**
469  * Basic test of keyboard with cursor.
470  */
TestKeyboard2(void)471 void TransliteratorTest::TestKeyboard2(void) {
472     UParseError parseError;
473     UErrorCode status = U_ZERO_ERROR;
474     Transliterator *t = Transliterator::createFromRules("<ID>",
475                               UnicodeString("ych>Y;")
476                               +"ps>|y;"
477                               +"ch>x;"
478                               +"a>A;",
479                               UTRANS_FORWARD, parseError,
480                               status);
481     if (U_FAILURE(status)) {
482         errln("FAIL: RBT constructor failed");
483         return;
484     }
485     const char* DATA[] = {
486         // insertion, buffer
487         "a", "A",
488         "p", "Ap",
489         "s", "Aps", // modified for rollback - "Ay",
490         "c", "Apsc", // modified for rollback - "Ayc",
491         "a", "AycA",
492         "p", "AycAp",
493         "s", "AycAps", // modified for rollback - "AycAy",
494         "c", "AycApsc", // modified for rollback - "AycAyc",
495         "h", "AycAY",
496         0, "AycAY", // null means finishKeyboardTransliteration
497     };
498 
499     keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
500     delete t;
501 }
502 
503 /**
504  * Test keyboard transliteration with back-replacement.
505  */
TestKeyboard3(void)506 void TransliteratorTest::TestKeyboard3(void) {
507     // We want th>z but t>y.  Furthermore, during keyboard
508     // transliteration we want t>y then yh>z if t, then h are
509     // typed.
510     UnicodeString RULES("t>|y;"
511                         "yh>z;");
512 
513     const char* DATA[] = {
514         // Column 1: characters to add to buffer (as if typed)
515         // Column 2: expected appearance of buffer after
516         //           keyboard xliteration.
517         "a", "a",
518         "b", "ab",
519         "t", "abt", // modified for rollback - "aby",
520         "c", "abyc",
521         "t", "abyct", // modified for rollback - "abycy",
522         "h", "abycz",
523         0, "abycz", // null means finishKeyboardTransliteration
524     };
525 
526     UParseError parseError;
527     UErrorCode status = U_ZERO_ERROR;
528     Transliterator *t = Transliterator::createFromRules("<ID>", RULES, UTRANS_FORWARD, parseError, status);
529     if (U_FAILURE(status)) {
530         errln("FAIL: RBT constructor failed");
531         return;
532     }
533     keyboardAux(*t, DATA, UPRV_LENGTHOF(DATA));
534     delete t;
535 }
536 
keyboardAux(const Transliterator & t,const char * DATA[],int32_t DATA_length)537 void TransliteratorTest::keyboardAux(const Transliterator& t,
538                                      const char* DATA[], int32_t DATA_length) {
539     UErrorCode status = U_ZERO_ERROR;
540     UTransPosition index={0, 0, 0, 0};
541     UnicodeString s;
542     for (int32_t i=0; i<DATA_length; i+=2) {
543         UnicodeString log;
544         if (DATA[i] != 0) {
545             log = s + " + "
546                 + DATA[i]
547                 + " -> ";
548             t.transliterate(s, index, DATA[i], status);
549         } else {
550             log = s + " => ";
551             t.finishTransliteration(s, index);
552         }
553         // Show the start index '{' and the cursor '|'
554         UnicodeString a, b, c;
555         s.extractBetween(0, index.contextStart, a);
556         s.extractBetween(index.contextStart, index.start, b);
557         s.extractBetween(index.start, s.length(), c);
558         log.append(a).
559             append((UChar)LEFT_BRACE).
560             append(b).
561             append((UChar)PIPE).
562             append(c);
563         if (s == DATA[i+1] && U_SUCCESS(status)) {
564             logln(log);
565         } else {
566             errln(UnicodeString("FAIL: ") + log + ", expected " + DATA[i+1]);
567         }
568     }
569 }
570 
/**
 * Placeholder for the Latin-Arabic test.  The whole body is commented
 * out, so this method currently does nothing; the old test code is kept
 * below for reference.
 */
void TransliteratorTest::TestArabic(void) {
// Test disabled for 2.0 until new Arabic transliterator can be written.
//    /*
//    const char* DATA[] = {
//        "Arabic", "\u062a\u062a\u0645\u062a\u0639\u0020"+
//                  "\u0627\u0644\u0644\u063a\u0629\u0020"+
//                  "\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+
//                  "\u0628\u0628\u0646\u0638\u0645\u0020"+
//                  "\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+
//                  "\u062c\u0645\u064a\u0644\u0629",
//    };
//    */
//
//    UChar ar_raw[] = {
//        0x062a, 0x062a, 0x0645, 0x062a, 0x0639, 0x0020, 0x0627,
//        0x0644, 0x0644, 0x063a, 0x0629, 0x0020, 0x0627, 0x0644,
//        0x0639, 0x0631, 0x0628, 0x0628, 0x064a, 0x0629, 0x0020,
//        0x0628, 0x0628, 0x0646, 0x0638, 0x0645, 0x0020, 0x0643,
//        0x062a, 0x0627, 0x0628, 0x0628, 0x064a, 0x0629, 0x0020,
//        0x062c, 0x0645, 0x064a, 0x0644, 0x0629, 0
//    };
//    UnicodeString ar(ar_raw);
//    UErrorCode status=U_ZERO_ERROR;
//    UParseError parseError;
//    Transliterator *t = Transliterator::createInstance("Latin-Arabic", UTRANS_FORWARD, parseError, status);
//    if (t == 0) {
//        errln("FAIL: createInstance failed");
//        return;
//    }
//    expect(*t, "Arabic", ar);
//    delete t;
}
603 
604 /**
605  * Compose the Kana transliterator forward and reverse and try
606  * some strings that should come out unchanged.
607  */
TestCompoundKana(void)608 void TransliteratorTest::TestCompoundKana(void) {
609     UParseError parseError;
610     UErrorCode status = U_ZERO_ERROR;
611     Transliterator* t = Transliterator::createInstance("Latin-Hiragana;Hiragana-Latin", UTRANS_FORWARD, parseError, status);
612     if (t == 0) {
613         dataerrln("FAIL: construction of Latin-Hiragana;Hiragana-Latin failed - %s", u_errorName(status));
614     } else {
615         expect(*t, "aaaaa", "aaaaa");
616         delete t;
617     }
618 }
619 
620 /**
621  * Compose the hex transliterators forward and reverse.
622  */
TestCompoundHex(void)623 void TransliteratorTest::TestCompoundHex(void) {
624     UParseError parseError;
625     UErrorCode status = U_ZERO_ERROR;
626     Transliterator* a = Transliterator::createInstance("Any-Hex", UTRANS_FORWARD, parseError, status);
627     Transliterator* b = Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, parseError, status);
628     Transliterator* transab[] = { a, b };
629     Transliterator* transba[] = { b, a };
630     if (a == 0 || b == 0) {
631         errln("FAIL: construction failed");
632         delete a;
633         delete b;
634         return;
635     }
636     // Do some basic tests of a
637     expect(*a, "01", UnicodeString("\\u0030\\u0031", ""));
638     // Do some basic tests of b
639     expect(*b, UnicodeString("\\u0030\\u0031", ""), "01");
640 
641     Transliterator* ab = new CompoundTransliterator(transab, 2);
642     UnicodeString s("abcde", "");
643     expect(*ab, s, s);
644 
645     UnicodeString str(s);
646     a->transliterate(str);
647     Transliterator* ba = new CompoundTransliterator(transba, 2);
648     expect(*ba, str, str);
649 
650     delete ab;
651     delete ba;
652     delete a;
653     delete b;
654 }
655 
656 int gTestFilterClassID = 0;
657 /**
658  * Used by TestFiltering().
659  */
660 class TestFilter : public UnicodeFilter {
clone() const661     virtual TestFilter* clone() const {
662         return new TestFilter(*this);
663     }
contains(UChar32 c) const664     virtual UBool contains(UChar32 c) const {
665         return c != (UChar)0x0063 /*c*/;
666     }
667     // Stubs
toPattern(UnicodeString & result,UBool) const668     virtual UnicodeString& toPattern(UnicodeString& result,
669                                      UBool /*escapeUnprintable*/) const {
670         return result;
671     }
matchesIndexValue(uint8_t) const672     virtual UBool matchesIndexValue(uint8_t /*v*/) const {
673         return FALSE;
674     }
addMatchSetTo(UnicodeSet &) const675     virtual void addMatchSetTo(UnicodeSet& /*toUnionTo*/) const {}
676 public:
getDynamicClassID() const677     UClassID getDynamicClassID() const { return (UClassID)&gTestFilterClassID; }
678 };
679 
680 /**
681  * Do some basic tests of filtering.
682  */
TestFiltering(void)683 void TransliteratorTest::TestFiltering(void) {
684     UParseError parseError;
685     UErrorCode status = U_ZERO_ERROR;
686     Transliterator* hex = Transliterator::createInstance("Any-Hex", UTRANS_FORWARD, parseError, status);
687     if (hex == 0) {
688         errln("FAIL: createInstance(Any-Hex) failed");
689         return;
690     }
691     hex->adoptFilter(new TestFilter());
692     UnicodeString s("abcde");
693     hex->transliterate(s);
694     UnicodeString exp("\\u0061\\u0062c\\u0064\\u0065", "");
695     if (s == exp) {
696         logln(UnicodeString("Ok:   \"") + exp + "\"");
697     } else {
698         logln(UnicodeString("FAIL: \"") + s + "\", wanted \"" + exp + "\"");
699     }
700 
701     // ICU4C ONLY. Do not find Transliterator.orphanFilter() in ICU4J.
702     UnicodeFilter *f = hex->orphanFilter();
703     if (f == NULL){
704         errln("FAIL: orphanFilter() should get a UnicodeFilter");
705     } else {
706         delete f;
707     }
708     delete hex;
709 }
710 
711 /**
712  * Test anchors
713  */
TestAnchors(void)714 void TransliteratorTest::TestAnchors(void) {
715     expect(UnicodeString("^a  > 0; a$ > 2 ; a > 1;", ""),
716            "aaa",
717            "012");
718     expect(UnicodeString("$s=[z$]; $s{a>0; a}$s>2; a>1;", ""),
719            "aaa",
720            "012");
721     expect(UnicodeString("^ab  > 01 ;"
722            " ab  > |8 ;"
723            "  b  > k ;"
724            " 8x$ > 45 ;"
725            " 8x  > 77 ;", ""),
726 
727            "ababbabxabx",
728            "018k7745");
729     expect(UnicodeString("$s = [z$] ;"
730            "$s{ab    > 01 ;"
731            "   ab    > |8 ;"
732            "    b    > k ;"
733            "   8x}$s > 45 ;"
734            "   8x    > 77 ;", ""),
735 
736            "abzababbabxzabxabx",
737            "01z018k45z01x45");
738 }
739 
740 /**
741  * Test pattern quoting and escape mechanisms.
742  */
TestPatternQuoting(void)743 void TransliteratorTest::TestPatternQuoting(void) {
744     // Array of 3n items
745     // Each item is <rules>, <input>, <expected output>
746     const UnicodeString DATA[] = {
747         UnicodeString(UChar(0x4E01)) + ">'[male adult]'",
748         UnicodeString(UChar(0x4E01)),
749         "[male adult]"
750     };
751 
752     for (int32_t i=0; i<3; i+=3) {
753         logln(UnicodeString("Pattern: ") + prettify(DATA[i]));
754         UParseError parseError;
755         UErrorCode status = U_ZERO_ERROR;
756         Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
757         if (U_FAILURE(status)) {
758             errln("RBT constructor failed");
759         } else {
760             expect(*t, DATA[i+1], DATA[i+2]);
761         }
762         delete t;
763     }
764 }
765 
766 /**
767  * Regression test for bugs found in Greek transliteration.
768  */
TestJ277(void)769 void TransliteratorTest::TestJ277(void) {
770     UErrorCode status = U_ZERO_ERROR;
771     UParseError parseError;
772     Transliterator *gl = Transliterator::createInstance("Greek-Latin; NFD; [:M:]Remove; NFC", UTRANS_FORWARD, parseError, status);
773     if (gl == NULL) {
774         dataerrln("FAIL: createInstance(Greek-Latin) returned NULL - %s", u_errorName(status));
775         return;
776     }
777 
778     UChar sigma = 0x3C3;
779     UChar upsilon = 0x3C5;
780     UChar nu = 0x3BD;
781 //    UChar PHI = 0x3A6;
782     UChar alpha = 0x3B1;
783 //    UChar omega = 0x3C9;
784 //    UChar omicron = 0x3BF;
785 //    UChar epsilon = 0x3B5;
786 
787     // sigma upsilon nu -> syn
788     UnicodeString syn;
789     syn.append(sigma).append(upsilon).append(nu);
790     expect(*gl, syn, "syn");
791 
792     // sigma alpha upsilon nu -> saun
793     UnicodeString sayn;
794     sayn.append(sigma).append(alpha).append(upsilon).append(nu);
795     expect(*gl, sayn, "saun");
796 
797     // Again, using a smaller rule set
798     UnicodeString rules(
799                 "$alpha   = \\u03B1;"
800                 "$nu      = \\u03BD;"
801                 "$sigma   = \\u03C3;"
802                 "$ypsilon = \\u03C5;"
803                 "$vowel   = [aeiouAEIOU$alpha$ypsilon];"
804                 "s <>           $sigma;"
805                 "a <>           $alpha;"
806                 "u <>  $vowel { $ypsilon;"
807                 "y <>           $ypsilon;"
808                 "n <>           $nu;",
809                 "");
810     Transliterator *mini = Transliterator::createFromRules("mini", rules, UTRANS_REVERSE, parseError, status);
811     if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
812     expect(*mini, syn, "syn");
813     expect(*mini, sayn, "saun");
814     delete mini;
815     mini = NULL;
816 
817 #if !UCONFIG_NO_FORMATTING
818     // Transliterate the Greek locale data
819     Locale el("el");
820     DateFormatSymbols syms(el, status);
821     if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
822     int32_t i, count;
823     const UnicodeString* data = syms.getMonths(count);
824     for (i=0; i<count; ++i) {
825         if (data[i].length() == 0) {
826             continue;
827         }
828         UnicodeString out(data[i]);
829         gl->transliterate(out);
830         UBool ok = TRUE;
831         if (data[i].length() >= 2 && out.length() >= 2 &&
832             u_isupper(data[i].charAt(0)) && u_islower(data[i].charAt(1))) {
833             if (!(u_isupper(out.charAt(0)) && u_islower(out.charAt(1)))) {
834                 ok = FALSE;
835             }
836         }
837         if (ok) {
838             logln(prettify(data[i] + " -> " + out));
839         } else {
840             errln(UnicodeString("FAIL: ") + prettify(data[i] + " -> " + out));
841         }
842     }
843 #endif
844 
845     delete gl;
846 }
847 
848 /**
849  * Prefix, suffix support in hex transliterators
850  */
TestJ243(void)851 void TransliteratorTest::TestJ243(void) {
852     UErrorCode ec = U_ZERO_ERROR;
853 
854     // Test default Hex-Any, which should handle
855     // \u, \U, u+, and U+
856     Transliterator *hex =
857         Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, ec);
858     if (assertSuccess("getInstance", ec)) {
859         expect(*hex, UnicodeString("\\u0041+\\U00000042,U+0043uU+0044z", ""), "A+B,CuDz");
860     }
861     delete hex;
862 
863 //    // Try a custom Hex-Unicode
864 //    // \uXXXX and &#xXXXX;
865 //    ec = U_ZERO_ERROR;
866 //    HexToUnicodeTransliterator hex2(UnicodeString("\\\\u###0;&\\#x###0\\;", ""), ec);
867 //    expect(hex2, UnicodeString("\\u61\\u062\\u0063\\u00645\\u66x&#x30;&#x031;&#x0032;&#x00033;", ""),
868 //           "abcd5fx012&#x00033;");
869 //    // Try custom Any-Hex (default is tested elsewhere)
870 //    ec = U_ZERO_ERROR;
871 //    UnicodeToHexTransliterator hex3(UnicodeString("&\\#x###0;", ""), ec);
872 //    expect(hex3, "012", "&#x30;&#x31;&#x32;");
873 }
874 
875 /**
876  * Parsers need better syntax error messages.
877  */
TestJ329(void)878 void TransliteratorTest::TestJ329(void) {
879 
880     struct { UBool containsErrors; const char* rule; } DATA[] = {
881         { FALSE, "a > b; c > d" },
882         { TRUE,  "a > b; no operator; c > d" },
883     };
884     int32_t DATA_length = UPRV_LENGTHOF(DATA);
885 
886     for (int32_t i=0; i<DATA_length; ++i) {
887         UErrorCode status = U_ZERO_ERROR;
888         UParseError parseError;
889         Transliterator *rbt = Transliterator::createFromRules("<ID>",
890                                     DATA[i].rule,
891                                     UTRANS_FORWARD,
892                                     parseError,
893                                     status);
894         UBool gotError = U_FAILURE(status);
895         UnicodeString desc(DATA[i].rule);
896         desc.append(gotError ? " -> error" : " -> no error");
897         if (gotError) {
898             desc = desc + ", ParseError code=" + u_errorName(status) +
899                 " line=" + parseError.line +
900                 " offset=" + parseError.offset +
901                 " context=" + parseError.preContext;
902         }
903         if (gotError == DATA[i].containsErrors) {
904             logln(UnicodeString("Ok:   ") + desc);
905         } else {
906             errln(UnicodeString("FAIL: ") + desc);
907         }
908         delete rbt;
909     }
910 }
911 
912 /**
913  * Test segments and segment references.
914  */
TestSegments(void)915 void TransliteratorTest::TestSegments(void) {
916     // Array of 3n items
917     // Each item is <rules>, <input>, <expected output>
918     UnicodeString DATA[] = {
919         "([a-z]) '.' ([0-9]) > $2 '-' $1",
920         "abc.123.xyz.456",
921         "ab1-c23.xy4-z56",
922 
923         // nested
924         "(([a-z])([0-9])) > $1 '.' $2 '.' $3;",
925         "a1 b2",
926         "a1.a.1 b2.b.2",
927     };
928     int32_t DATA_length = UPRV_LENGTHOF(DATA);
929 
930     for (int32_t i=0; i<DATA_length; i+=3) {
931         logln("Pattern: " + prettify(DATA[i]));
932         UParseError parseError;
933         UErrorCode status = U_ZERO_ERROR;
934         Transliterator *t = Transliterator::createFromRules("ID", DATA[i], UTRANS_FORWARD, parseError, status);
935         if (U_FAILURE(status)) {
936             errln("FAIL: RBT constructor");
937         } else {
938             expect(*t, DATA[i+1], DATA[i+2]);
939         }
940         delete t;
941     }
942 }
943 
944 /**
945  * Test cursor positioning outside of the key
946  */
TestCursorOffset(void)947 void TransliteratorTest::TestCursorOffset(void) {
948     // Array of 3n items
949     // Each item is <rules>, <input>, <expected output>
950     UnicodeString DATA[] = {
951         "pre {alpha} post > | @ ALPHA ;"
952         "eALPHA > beta ;"
953         "pre {beta} post > BETA @@ | ;"
954         "post > xyz",
955 
956         "prealphapost prebetapost",
957 
958         "prbetaxyz preBETApost",
959     };
960     int32_t DATA_length = UPRV_LENGTHOF(DATA);
961 
962     for (int32_t i=0; i<DATA_length; i+=3) {
963         logln("Pattern: " + prettify(DATA[i]));
964         UParseError parseError;
965         UErrorCode status = U_ZERO_ERROR;
966         Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
967         if (U_FAILURE(status)) {
968             errln("FAIL: RBT constructor");
969         } else {
970             expect(*t, DATA[i+1], DATA[i+2]);
971         }
972         delete t;
973     }
974 }
975 
976 /**
977  * Test zero length and > 1 char length variable values.  Test
978  * use of variable refs in UnicodeSets.
979  */
TestArbitraryVariableValues(void)980 void TransliteratorTest::TestArbitraryVariableValues(void) {
981     // Array of 3n items
982     // Each item is <rules>, <input>, <expected output>
983     UnicodeString DATA[] = {
984         "$abe = ab;"
985         "$pat = x[yY]z;"
986         "$ll  = 'a-z';"
987         "$llZ = [$ll];"
988         "$llY = [$ll$pat];"
989         "$emp = ;"
990 
991         "$abe > ABE;"
992         "$pat > END;"
993         "$llZ > 1;"
994         "$llY > 2;"
995         "7$emp 8 > 9;"
996         "",
997 
998         "ab xYzxyz stY78",
999         "ABE ENDEND 1129",
1000     };
1001     int32_t DATA_length = UPRV_LENGTHOF(DATA);
1002 
1003     for (int32_t i=0; i<DATA_length; i+=3) {
1004         logln("Pattern: " + prettify(DATA[i]));
1005         UParseError parseError;
1006         UErrorCode status = U_ZERO_ERROR;
1007         Transliterator *t = Transliterator::createFromRules("<ID>", DATA[i], UTRANS_FORWARD, parseError, status);
1008         if (U_FAILURE(status)) {
1009             errln("FAIL: RBT constructor");
1010         } else {
1011             expect(*t, DATA[i+1], DATA[i+2]);
1012         }
1013         delete t;
1014     }
1015 }
1016 
1017 /**
1018  * Confirm that the contextStart, contextLimit, start, and limit
1019  * behave correctly. J474.
1020  */
TestPositionHandling(void)1021 void TransliteratorTest::TestPositionHandling(void) {
1022     // Array of 3n items
1023     // Each item is <rules>, <input>, <expected output>
1024     const char* DATA[] = {
1025         "a{t} > SS ; {t}b > UU ; {t} > TT ;",
1026         "xtat txtb", // pos 0,9,0,9
1027         "xTTaSS TTxUUb",
1028 
1029         "a{t} > SS ; {t}b > UU ; {t} > TT ; a > A ; b > B ;",
1030         "xtat txtb", // pos 2,9,3,8
1031         "xtaSS TTxUUb",
1032 
1033         "a{t} > SS ; {t}b > UU ; {t} > TT ; a > A ; b > B ;",
1034         "xtat txtb", // pos 3,8,3,8
1035         "xtaTT TTxTTb",
1036     };
1037 
1038     // Array of 4n positions -- these go with the DATA array
1039     // They are: contextStart, contextLimit, start, limit
1040     int32_t POS[] = {
1041         0, 9, 0, 9,
1042         2, 9, 3, 8,
1043         3, 8, 3, 8,
1044     };
1045 
1046     int32_t n = UPRV_LENGTHOF(DATA) / 3;
1047     for (int32_t i=0; i<n; i++) {
1048         UErrorCode status = U_ZERO_ERROR;
1049         UParseError parseError;
1050         Transliterator *t = Transliterator::createFromRules("<ID>",
1051                                 DATA[3*i], UTRANS_FORWARD, parseError, status);
1052         if (U_FAILURE(status)) {
1053             delete t;
1054             errln("FAIL: RBT constructor");
1055             return;
1056         }
1057         UTransPosition pos;
1058         pos.contextStart= POS[4*i];
1059         pos.contextLimit = POS[4*i+1];
1060         pos.start = POS[4*i+2];
1061         pos.limit = POS[4*i+3];
1062         UnicodeString rsource(DATA[3*i+1]);
1063         t->transliterate(rsource, pos, status);
1064         if (U_FAILURE(status)) {
1065             delete t;
1066             errln("FAIL: transliterate");
1067             return;
1068         }
1069         t->finishTransliteration(rsource, pos);
1070         expectAux(DATA[3*i],
1071                   DATA[3*i+1],
1072                   rsource,
1073                   DATA[3*i+2]);
1074         delete t;
1075     }
1076 }
1077 
1078 /**
1079  * Test the Hiragana-Katakana transliterator.
1080  */
TestHiraganaKatakana(void)1081 void TransliteratorTest::TestHiraganaKatakana(void) {
1082     UParseError parseError;
1083     UErrorCode status = U_ZERO_ERROR;
1084     Transliterator* hk = Transliterator::createInstance("Hiragana-Katakana", UTRANS_FORWARD, parseError, status);
1085     Transliterator* kh = Transliterator::createInstance("Katakana-Hiragana", UTRANS_FORWARD, parseError, status);
1086     if (hk == 0 || kh == 0) {
1087         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1088         delete hk;
1089         delete kh;
1090         return;
1091     }
1092 
1093     // Array of 3n items
1094     // Each item is "hk"|"kh"|"both", <Hiragana>, <Katakana>
1095     const char* DATA[] = {
1096         "both",
1097         "\\u3042\\u3090\\u3099\\u3092\\u3050",
1098         "\\u30A2\\u30F8\\u30F2\\u30B0",
1099 
1100         "kh",
1101         "\\u307C\\u3051\\u3060\\u3042\\u3093\\u30FC",
1102         "\\u30DC\\u30F6\\u30C0\\u30FC\\u30F3\\u30FC",
1103     };
1104     int32_t DATA_length = UPRV_LENGTHOF(DATA);
1105 
1106     for (int32_t i=0; i<DATA_length; i+=3) {
1107         UnicodeString h = CharsToUnicodeString(DATA[i+1]);
1108         UnicodeString k = CharsToUnicodeString(DATA[i+2]);
1109         switch (*DATA[i]) {
1110         case 0x68: //'h': // Hiragana-Katakana
1111             expect(*hk, h, k);
1112             break;
1113         case 0x6B: //'k': // Katakana-Hiragana
1114             expect(*kh, k, h);
1115             break;
1116         case 0x62: //'b': // both
1117             expect(*hk, h, k);
1118             expect(*kh, k, h);
1119             break;
1120         }
1121     }
1122     delete hk;
1123     delete kh;
1124 }
1125 
1126 /**
1127  * Test cloning / copy constructor of RBT.
1128  */
TestCopyJ476(void)1129 void TransliteratorTest::TestCopyJ476(void) {
1130     // The real test here is what happens when the destructors are
1131     // called.  So we let one object get destructed, and check to
1132     // see that its copy still works.
1133     Transliterator *t2 = 0;
1134     {
1135         UParseError parseError;
1136         UErrorCode status = U_ZERO_ERROR;
1137         Transliterator *t1 = Transliterator::createFromRules("t1",
1138             "a>A;b>B;'foo'+>'bar'", UTRANS_FORWARD, parseError, status);
1139         if (U_FAILURE(status)) {
1140             errln("FAIL: RBT constructor");
1141             return;
1142         }
1143         t2 = t1->clone(); // Call copy constructor under the covers.
1144         expect(*t1, "abcfoofoo", "ABcbar");
1145         delete t1;
1146     }
1147     expect(*t2, "abcfoofoo", "ABcbar");
1148     delete t2;
1149 }
1150 
1151 /**
1152  * Test inter-Indic transliterators.  These are composed.
1153  * ICU4C Jitterbug 483.
1154  */
TestInterIndic(void)1155 void TransliteratorTest::TestInterIndic(void) {
1156     UnicodeString ID("Devanagari-Gujarati", "");
1157     UErrorCode status = U_ZERO_ERROR;
1158     UParseError parseError;
1159     Transliterator* dg = Transliterator::createInstance(ID, UTRANS_FORWARD, parseError, status);
1160     if (dg == 0) {
1161         dataerrln("FAIL: createInstance(" + ID + ") returned NULL - " + u_errorName(status));
1162         return;
1163     }
1164     UnicodeString id = dg->getID();
1165     if (id != ID) {
1166         errln("FAIL: createInstance(" + ID + ")->getID() => " + id);
1167     }
1168     UnicodeString dev = CharsToUnicodeString("\\u0901\\u090B\\u0925");
1169     UnicodeString guj = CharsToUnicodeString("\\u0A81\\u0A8B\\u0AA5");
1170     expect(*dg, dev, guj);
1171     delete dg;
1172 }
1173 
1174 /**
1175  * Test filter syntax in IDs. (J918)
1176  */
TestFilterIDs(void)1177 void TransliteratorTest::TestFilterIDs(void) {
1178     // Array of 3n strings:
1179     // <id>, <inverse id>, <input>, <expected output>
1180     const char* DATA[] = {
1181         "[aeiou]Any-Hex", // ID
1182         "[aeiou]Hex-Any", // expected inverse ID
1183         "quizzical",      // src
1184         "q\\u0075\\u0069zz\\u0069c\\u0061l", // expected ID.translit(src)
1185 
1186         "[aeiou]Any-Hex;[^5]Hex-Any",
1187         "[^5]Any-Hex;[aeiou]Hex-Any",
1188         "quizzical",
1189         "q\\u0075izzical",
1190 
1191         "[abc]Null",
1192         "[abc]Null",
1193         "xyz",
1194         "xyz",
1195     };
1196     enum { DATA_length = UPRV_LENGTHOF(DATA) };
1197 
1198     for (int i=0; i<DATA_length; i+=4) {
1199         UnicodeString ID(DATA[i], "");
1200         UnicodeString uID(DATA[i+1], "");
1201         UnicodeString data2(DATA[i+2], "");
1202         UnicodeString data3(DATA[i+3], "");
1203         UParseError parseError;
1204         UErrorCode status = U_ZERO_ERROR;
1205         Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, parseError, status);
1206         if (t == 0) {
1207             errln("FAIL: createInstance(" + ID + ") returned NULL");
1208             return;
1209         }
1210         expect(*t, data2, data3);
1211 
1212         // Check the ID
1213         if (ID != t->getID()) {
1214             errln("FAIL: createInstance(" + ID + ").getID() => " +
1215                   t->getID());
1216         }
1217 
1218         // Check the inverse
1219         Transliterator *u = t->createInverse(status);
1220         if (u == 0) {
1221             errln("FAIL: " + ID + ".createInverse() returned NULL");
1222         } else if (u->getID() != uID) {
1223             errln("FAIL: " + ID + ".createInverse().getID() => " +
1224                   u->getID() + ", expected " + uID);
1225         }
1226 
1227         delete t;
1228         delete u;
1229     }
1230 }
1231 
1232 /**
1233  * Test the case mapping transliterators.
1234  */
TestCaseMap(void)1235 void TransliteratorTest::TestCaseMap(void) {
1236     UParseError parseError;
1237     UErrorCode status = U_ZERO_ERROR;
1238     Transliterator* toUpper =
1239         Transliterator::createInstance("Any-Upper[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1240     Transliterator* toLower =
1241         Transliterator::createInstance("Any-Lower[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1242     Transliterator* toTitle =
1243         Transliterator::createInstance("Any-Title[^xyzXYZ]", UTRANS_FORWARD, parseError, status);
1244     if (toUpper==0 || toLower==0 || toTitle==0) {
1245         errln("FAIL: createInstance returned NULL");
1246         delete toUpper;
1247         delete toLower;
1248         delete toTitle;
1249         return;
1250     }
1251 
1252     expect(*toUpper, "The quick brown fox jumped over the lazy dogs.",
1253            "THE QUICK BROWN FOx JUMPED OVER THE LAzy DOGS.");
1254     expect(*toLower, "The quIck brown fOX jUMPED OVER THE LAzY dogs.",
1255            "the quick brown foX jumped over the lazY dogs.");
1256     expect(*toTitle, "the quick brown foX can't jump over the laZy dogs.",
1257            "The Quick Brown FoX Can't Jump Over The LaZy Dogs.");
1258 
1259     delete toUpper;
1260     delete toLower;
1261     delete toTitle;
1262 }
1263 
1264 /**
1265  * Test the name mapping transliterators.
1266  */
TestNameMap(void)1267 void TransliteratorTest::TestNameMap(void) {
1268     UParseError parseError;
1269     UErrorCode status = U_ZERO_ERROR;
1270     Transliterator* uni2name =
1271         Transliterator::createInstance("Any-Name[^abc]", UTRANS_FORWARD, parseError, status);
1272     Transliterator* name2uni =
1273         Transliterator::createInstance("Name-Any", UTRANS_FORWARD, parseError, status);
1274     if (uni2name==0 || name2uni==0) {
1275         errln("FAIL: createInstance returned NULL");
1276         delete uni2name;
1277         delete name2uni;
1278         return;
1279     }
1280 
1281     // Careful:  CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
1282     expect(*uni2name, CharsToUnicodeString("\\u00A0abc\\u4E01\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF"),
1283            CharsToUnicodeString("\\\\N{NO-BREAK SPACE}abc\\\\N{CJK UNIFIED IDEOGRAPH-4E01}\\\\N{MICRO SIGN}\\\\N{GUJARATI SIGN CANDRABINDU}\\\\N{REPLACEMENT CHARACTER}\\\\N{<control-0004>}\\\\N{<control-0009>}\\\\N{<control-0081>}\\\\N{<noncharacter-FFFF>}"));
1284     expect(*name2uni, UNICODE_STRING_SIMPLE("{\\N { NO-BREAK SPACE}abc\\N{  CJK UNIFIED  IDEOGRAPH-4E01  }\\N{x\\N{MICRO SIGN}\\N{GUJARATI SIGN CANDRABINDU}\\N{REPLACEMENT CHARACTER}\\N{<control-0004>}\\N{<control-0009>}\\N{<control-0081>}\\N{<noncharacter-FFFF>}\\N{<control-0004>}\\N{"),
1285            CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{"));
1286 
1287     delete uni2name;
1288     delete name2uni;
1289 
1290     // round trip
1291     Transliterator* t =
1292         Transliterator::createInstance("Any-Name;Name-Any", UTRANS_FORWARD, parseError, status);
1293     if (t==0) {
1294         errln("FAIL: createInstance returned NULL");
1295         delete t;
1296         return;
1297     }
1298 
1299     // Careful:  CharsToUS will convert "\\N" => "N"; use "\\\\N" for \N
1300     UnicodeString s = CharsToUnicodeString("{\\u00A0abc\\u4E01\\\\N{x\\u00B5\\u0A81\\uFFFD\\u0004\\u0009\\u0081\\uFFFF\\u0004\\\\N{");
1301     expect(*t, s, s);
1302     delete t;
1303 }
1304 
1305 /**
1306  * Test liberalized ID syntax.  1006c
1307  */
TestLiberalizedID(void)1308 void TransliteratorTest::TestLiberalizedID(void) {
1309     // Some test cases have an expected getID() value of NULL.  This
1310     // means I have disabled the test case for now.  This stuff is
1311     // still under development, and I haven't decided whether to make
1312     // getID() return canonical case yet.  It will all get rewritten
1313     // with the move to Source-Target/Variant IDs anyway. [aliu]
1314     const char* DATA[] = {
1315         "latin-greek", NULL /*"Latin-Greek"*/, "case insensitivity",
1316         "  Null  ", "Null", "whitespace",
1317         " Latin[a-z]-Greek  ", "[a-z]Latin-Greek", "inline filter",
1318         "  null  ; latin-greek  ", NULL /*"Null;Latin-Greek"*/, "compound whitespace",
1319     };
1320     const int32_t DATA_length = UPRV_LENGTHOF(DATA);
1321     UParseError parseError;
1322     UErrorCode status= U_ZERO_ERROR;
1323     for (int32_t i=0; i<DATA_length; i+=3) {
1324         Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_FORWARD, parseError, status);
1325         if (t == 0) {
1326             dataerrln(UnicodeString("FAIL: ") + DATA[i+2] +
1327                   " cannot create ID \"" + DATA[i] + "\" - " + u_errorName(status));
1328         } else {
1329             UnicodeString exp;
1330             if (DATA[i+1]) {
1331                 exp = UnicodeString(DATA[i+1], "");
1332             }
1333             // Don't worry about getID() if the expected char*
1334             // is NULL -- see above.
1335             if (exp.length() == 0 || exp == t->getID()) {
1336                 logln(UnicodeString("Ok: ") + DATA[i+2] +
1337                       " create ID \"" + DATA[i] + "\" => \"" +
1338                       exp + "\"");
1339             } else {
1340                 errln(UnicodeString("FAIL: ") + DATA[i+2] +
1341                       " create ID \"" + DATA[i] + "\" => \"" +
1342                       t->getID() + "\", exp \"" + exp + "\"");
1343             }
1344             delete t;
1345         }
1346     }
1347 }
1348 
1349 /* test for Jitterbug 912 */
TestCreateInstance()1350 void TransliteratorTest::TestCreateInstance(){
1351     const char* FORWARD = "F";
1352     const char* REVERSE = "R";
1353     const char* DATA[] = {
1354         // Column 1: id
1355         // Column 2: direction
1356         // Column 3: expected ID, or "" if expect failure
1357         "Latin-Hangul", REVERSE, "Hangul-Latin", // JB#912
1358 
1359         // JB#2689: bad compound causes crash
1360         "InvalidSource-InvalidTarget", FORWARD, "",
1361         "InvalidSource-InvalidTarget", REVERSE, "",
1362         "Hex-Any;InvalidSource-InvalidTarget", FORWARD, "",
1363         "Hex-Any;InvalidSource-InvalidTarget", REVERSE, "",
1364         "InvalidSource-InvalidTarget;Hex-Any", FORWARD, "",
1365         "InvalidSource-InvalidTarget;Hex-Any", REVERSE, "",
1366 
1367         NULL
1368     };
1369 
1370     for (int32_t i=0; DATA[i]; i+=3) {
1371         UParseError err;
1372         UErrorCode ec = U_ZERO_ERROR;
1373         UnicodeString id(DATA[i]);
1374         UTransDirection dir = (DATA[i+1]==FORWARD)?
1375             UTRANS_FORWARD:UTRANS_REVERSE;
1376         UnicodeString expID(DATA[i+2]);
1377         Transliterator* t =
1378             Transliterator::createInstance(id,dir,err,ec);
1379         UnicodeString newID;
1380         if (t) {
1381             newID = t->getID();
1382         }
1383         UBool ok = (newID == expID);
1384         if (!t) {
1385             newID = u_errorName(ec);
1386         }
1387         if (ok) {
1388             logln((UnicodeString)"Ok: createInstance(" +
1389                   id + "," + DATA[i+1] + ") => " + newID);
1390         } else {
1391             dataerrln((UnicodeString)"FAIL: createInstance(" +
1392                   id + "," + DATA[i+1] + ") => " + newID +
1393                   ", expected " + expID);
1394         }
1395         delete t;
1396     }
1397 }
1398 
1399 /**
1400  * Test the normalization transliterator.
1401  */
TestNormalizationTransliterator()1402 void TransliteratorTest::TestNormalizationTransliterator() {
1403     // THE FOLLOWING TWO TABLES ARE COPIED FROM com.ibm.test.normalizer.BasicTest
1404     // PLEASE KEEP THEM IN SYNC WITH BasicTest.
1405     const char* CANON[] = {
1406         // Input               Decomposed            Composed
1407         "cat",                "cat",                "cat"               ,
1408         "\\u00e0ardvark",      "a\\u0300ardvark",     "\\u00e0ardvark"    ,
1409 
1410         "\\u1e0a",             "D\\u0307",            "\\u1e0a"            , // D-dot_above
1411         "D\\u0307",            "D\\u0307",            "\\u1e0a"            , // D dot_above
1412 
1413         "\\u1e0c\\u0307",       "D\\u0323\\u0307",      "\\u1e0c\\u0307"      , // D-dot_below dot_above
1414         "\\u1e0a\\u0323",       "D\\u0323\\u0307",      "\\u1e0c\\u0307"      , // D-dot_above dot_below
1415         "D\\u0307\\u0323",      "D\\u0323\\u0307",      "\\u1e0c\\u0307"      , // D dot_below dot_above
1416 
1417         "\\u1e10\\u0307\\u0323", "D\\u0327\\u0323\\u0307","\\u1e10\\u0323\\u0307", // D dot_below cedilla dot_above
1418         "D\\u0307\\u0328\\u0323","D\\u0328\\u0323\\u0307","\\u1e0c\\u0328\\u0307", // D dot_above ogonek dot_below
1419 
1420         "\\u1E14",             "E\\u0304\\u0300",      "\\u1E14"            , // E-macron-grave
1421         "\\u0112\\u0300",       "E\\u0304\\u0300",      "\\u1E14"            , // E-macron + grave
1422         "\\u00c8\\u0304",       "E\\u0300\\u0304",      "\\u00c8\\u0304"      , // E-grave + macron
1423 
1424         "\\u212b",             "A\\u030a",            "\\u00c5"            , // angstrom_sign
1425         "\\u00c5",             "A\\u030a",            "\\u00c5"            , // A-ring
1426 
1427         "\\u00fdffin",         "y\\u0301ffin",        "\\u00fdffin"        ,    //updated with 3.0
1428         "\\u00fd\\uFB03n",      "y\\u0301\\uFB03n",     "\\u00fd\\uFB03n"     , //updated with 3.0
1429 
1430         "Henry IV",           "Henry IV",           "Henry IV"          ,
1431         "Henry \\u2163",       "Henry \\u2163",       "Henry \\u2163"      ,
1432 
1433         "\\u30AC",             "\\u30AB\\u3099",       "\\u30AC"            , // ga (Katakana)
1434         "\\u30AB\\u3099",       "\\u30AB\\u3099",       "\\u30AC"            , // ka + ten
1435         "\\uFF76\\uFF9E",       "\\uFF76\\uFF9E",       "\\uFF76\\uFF9E"      , // hw_ka + hw_ten
1436         "\\u30AB\\uFF9E",       "\\u30AB\\uFF9E",       "\\u30AB\\uFF9E"      , // ka + hw_ten
1437         "\\uFF76\\u3099",       "\\uFF76\\u3099",       "\\uFF76\\u3099"      , // hw_ka + ten
1438 
1439         "A\\u0300\\u0316",      "A\\u0316\\u0300",      "\\u00C0\\u0316"      ,
1440         0 // end
1441     };
1442 
1443     const char* COMPAT[] = {
1444         // Input               Decomposed            Composed
1445         "\\uFB4f",             "\\u05D0\\u05DC",       "\\u05D0\\u05DC"     , // Alef-Lamed vs. Alef, Lamed
1446 
1447         "\\u00fdffin",         "y\\u0301ffin",        "\\u00fdffin"        ,    //updated for 3.0
1448         "\\u00fd\\uFB03n",      "y\\u0301ffin",        "\\u00fdffin"        , // ffi ligature -> f + f + i
1449 
1450         "Henry IV",           "Henry IV",           "Henry IV"          ,
1451         "Henry \\u2163",       "Henry IV",           "Henry IV"          ,
1452 
1453         "\\u30AC",             "\\u30AB\\u3099",       "\\u30AC"            , // ga (Katakana)
1454         "\\u30AB\\u3099",       "\\u30AB\\u3099",       "\\u30AC"            , // ka + ten
1455 
1456         "\\uFF76\\u3099",       "\\u30AB\\u3099",       "\\u30AC"            , // hw_ka + ten
1457         0 // end
1458     };
1459 
1460     int32_t i;
1461     UParseError parseError;
1462     UErrorCode status = U_ZERO_ERROR;
1463     Transliterator* NFD = Transliterator::createInstance("NFD", UTRANS_FORWARD, parseError, status);
1464     Transliterator* NFC = Transliterator::createInstance("NFC", UTRANS_FORWARD, parseError, status);
1465     if (!NFD || !NFC) {
1466         dataerrln("FAIL: createInstance failed: %s", u_errorName(status));
1467         delete NFD;
1468         delete NFC;
1469         return;
1470     }
1471     for (i=0; CANON[i]; i+=3) {
1472         UnicodeString in = CharsToUnicodeString(CANON[i]);
1473         UnicodeString expd = CharsToUnicodeString(CANON[i+1]);
1474         UnicodeString expc = CharsToUnicodeString(CANON[i+2]);
1475         expect(*NFD, in, expd);
1476         expect(*NFC, in, expc);
1477     }
1478     delete NFD;
1479     delete NFC;
1480 
1481     Transliterator* NFKD = Transliterator::createInstance("NFKD", UTRANS_FORWARD, parseError, status);
1482     Transliterator* NFKC = Transliterator::createInstance("NFKC", UTRANS_FORWARD, parseError, status);
1483     if (!NFKD || !NFKC) {
1484         dataerrln("FAIL: createInstance failed");
1485         delete NFKD;
1486         delete NFKC;
1487         return;
1488     }
1489     for (i=0; COMPAT[i]; i+=3) {
1490         UnicodeString in = CharsToUnicodeString(COMPAT[i]);
1491         UnicodeString expkd = CharsToUnicodeString(COMPAT[i+1]);
1492         UnicodeString expkc = CharsToUnicodeString(COMPAT[i+2]);
1493         expect(*NFKD, in, expkd);
1494         expect(*NFKC, in, expkc);
1495     }
1496     delete NFKD;
1497     delete NFKC;
1498 
1499     UParseError pe;
1500     status = U_ZERO_ERROR;
1501     Transliterator *t = Transliterator::createInstance("NFD; [x]Remove",
1502                                                        UTRANS_FORWARD,
1503                                                        pe, status);
1504     if (t == 0) {
1505         errln("FAIL: createInstance failed");
1506     }
1507     expect(*t, CharsToUnicodeString("\\u010dx"),
1508            CharsToUnicodeString("c\\u030C"));
1509     delete t;
1510 }
1511 
1512 /**
1513  * Test we can create basic transliterator even without data.
1514  */
TestBasicTransliteratorEvenWithoutData()1515 void TransliteratorTest::TestBasicTransliteratorEvenWithoutData() {
1516     const char16_t* TEST_DATA = u"\u0124e\u0301 \uFB01nd x";
1517     const char16_t* EXPECTED_RESULTS[] = {
1518         u"H\u0302e\u0301 \uFB01nd x",  // NFD
1519         u"\u0124\u00E9 \uFB01nd x",  // NFC
1520         u"H\u0302e\u0301 find x",  // NFKD
1521         u"\u0124\u00E9 find x",  // NFKC
1522         u"\u0124e\u0301 \uFB01nd x",  // Hex-Any
1523         u"\u0125e\u0301 \uFB01nd x",  // Lower
1524         u"\u0124e\uFB01ndx",  // [:^L:]Remove
1525         u"H\u0302e\u0301 \uFB01nd ",  // NFD; [x]Remove
1526         u"h\u0302e\u0301 find x",  // Lower; NFKD;
1527         u"hefindx",  // Lower; NFKD; [:^L:]Remove; NFC;
1528         u"\u0124e \uFB01nd x",  // [:Nonspacing Mark:] Remove;
1529         u"He \uFB01nd x",  // NFD; [:Nonspacing Mark:] Remove; NFC;
1530         // end
1531         0
1532     };
1533 
1534     const char* BASIC_TRANSLITERATOR_ID[] = {
1535         "NFD",
1536         "NFC",
1537         "NFKD",
1538         "NFKC",
1539         "Hex-Any",
1540         "Lower",
1541         "[:^L:]Remove",
1542         "NFD; [x]Remove",
1543         "Lower; NFKD;",
1544         "Lower; NFKD; [:^L:]Remove; NFC;",
1545         "[:Nonspacing Mark:] Remove;",
1546         "NFD; [:Nonspacing Mark:] Remove; NFC;",
1547         // end
1548         0
1549     };
1550     const char* BASIC_TRANSLITERATOR_RULES[] = {
1551         "::Lower; ::NFKD;",
1552         "::Lower; ::NFKD; ::[:^L:]Remove; ::NFC;",
1553         "::[:Nonspacing Mark:] Remove;",
1554         "::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;",
1555         // end
1556         0
1557     };
1558     for (int32_t i=0; BASIC_TRANSLITERATOR_ID[i]; i++) {
1559         UErrorCode status = U_ZERO_ERROR;
1560         UParseError parseError;
1561         std::unique_ptr<Transliterator> translit(Transliterator::createInstance(
1562             BASIC_TRANSLITERATOR_ID[i], UTRANS_FORWARD, parseError, status));
1563         if (translit.get() == nullptr || !U_SUCCESS(status)) {
1564             dataerrln("FAIL: createInstance %s failed", BASIC_TRANSLITERATOR_ID[i]);
1565             continue;
1566         }
1567         UnicodeString data(TEST_DATA);
1568         UnicodeString expected(EXPECTED_RESULTS[i]);
1569         translit->transliterate(data);
1570         if (data != expected) {
1571             dataerrln(UnicodeString("FAIL: expected translit(") +
1572                       BASIC_TRANSLITERATOR_ID[i] + ") = '" +
1573                       EXPECTED_RESULTS[i] + "' but got '" + data);
1574             continue;
1575         }
1576     }
1577     for (int32_t i=0; BASIC_TRANSLITERATOR_RULES[i]; i++) {
1578         UErrorCode status = U_ZERO_ERROR;
1579         UParseError parseError;
1580         std::unique_ptr<Transliterator> translit(Transliterator::createFromRules(
1581             "Test",
1582             BASIC_TRANSLITERATOR_RULES[i], UTRANS_FORWARD, parseError, status));
1583         if (translit.get() == nullptr || !U_SUCCESS(status)) {
1584             dataerrln("FAIL: createFromRules %s failed", BASIC_TRANSLITERATOR_RULES[i]);
1585             continue;
1586         }
1587     }
1588 }
1589 
1590 /**
1591  * Test compound RBT rules.
1592  */
TestCompoundRBT(void)1593 void TransliteratorTest::TestCompoundRBT(void) {
1594     // Careful with spacing and ';' here:  Phrase this exactly
1595     // as toRules() is going to return it.  If toRules() changes
1596     // with regard to spacing or ';', then adjust this string.
1597     UnicodeString rule("::Hex-Any;\n"
1598                        "::Any-Lower;\n"
1599                        "a > '.A.';\n"
1600                        "b > '.B.';\n"
1601                        "::[^t]Any-Upper;", "");
1602     UParseError parseError;
1603     UErrorCode status = U_ZERO_ERROR;
1604     Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, parseError, status);
1605     if (t == 0) {
1606         errln("FAIL: createFromRules failed");
1607         return;
1608     }
1609     expect(*t, UNICODE_STRING_SIMPLE("\\u0043at in the hat, bat on the mat"),
1610            "C.A.t IN tHE H.A.t, .B..A.t ON tHE M.A.t");
1611     UnicodeString r;
1612     t->toRules(r, TRUE);
1613     if (r == rule) {
1614         logln((UnicodeString)"OK: toRules() => " + r);
1615     } else {
1616         errln((UnicodeString)"FAIL: toRules() => " + r +
1617               ", expected " + rule);
1618     }
1619     delete t;
1620 
1621     // Now test toRules
1622     t = Transliterator::createInstance("Greek-Latin; Latin-Cyrillic", UTRANS_FORWARD, parseError, status);
1623     if (t == 0) {
1624         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1625         return;
1626     }
1627     UnicodeString exp("::Greek-Latin;\n::Latin-Cyrillic;");
1628     t->toRules(r, TRUE);
1629     if (r != exp) {
1630         errln((UnicodeString)"FAIL: toRules() => " + r +
1631               ", expected " + exp);
1632     } else {
1633         logln((UnicodeString)"OK: toRules() => " + r);
1634     }
1635     delete t;
1636 
1637     // Round trip the result of toRules
1638     t = Transliterator::createFromRules("Test", r, UTRANS_FORWARD, parseError, status);
1639     if (t == 0) {
1640         errln("FAIL: createFromRules #2 failed");
1641         return;
1642     } else {
1643         logln((UnicodeString)"OK: createFromRules(" + r + ") succeeded");
1644     }
1645 
1646     // Test toRules again
1647     t->toRules(r, TRUE);
1648     if (r != exp) {
1649         errln((UnicodeString)"FAIL: toRules() => " + r +
1650               ", expected " + exp);
1651     } else {
1652         logln((UnicodeString)"OK: toRules() => " + r);
1653     }
1654 
1655     delete t;
1656 
1657     // Test Foo(Bar) IDs.  Careful with spacing in id; make it conform
1658     // to what the regenerated ID will look like.
1659     UnicodeString id("Upper(Lower);(NFKC)", "");
1660     t = Transliterator::createInstance(id, UTRANS_FORWARD, parseError, status);
1661     if (t == 0) {
1662         errln("FAIL: createInstance #2 failed");
1663         return;
1664     }
1665     if (t->getID() == id) {
1666         logln((UnicodeString)"OK: created " + id);
1667     } else {
1668         errln((UnicodeString)"FAIL: createInstance(" + id +
1669               ").getID() => " + t->getID());
1670     }
1671 
1672     Transliterator *u = t->createInverse(status);
1673     if (u == 0) {
1674         errln("FAIL: createInverse failed");
1675         delete t;
1676         return;
1677     }
1678     exp = "NFKC();Lower(Upper)";
1679     if (u->getID() == exp) {
1680         logln((UnicodeString)"OK: createInverse(" + id + ") => " +
1681               u->getID());
1682     } else {
1683         errln((UnicodeString)"FAIL: createInverse(" + id + ") => " +
1684               u->getID());
1685     }
1686     delete t;
1687     delete u;
1688 }
1689 
1690 /**
1691  * Compound filter semantics were orginially not implemented
1692  * correctly.  Originally, each component filter f(i) is replaced by
1693  * f'(i) = f(i) && g, where g is the filter for the compound
1694  * transliterator.
1695  *
1696  * From Mark:
1697  *
1698  * Suppose and I have a transliterator X. Internally X is
1699  * "Greek-Latin; Latin-Cyrillic; Any-Lower". I use a filter [^A].
1700  *
1701  * The compound should convert all greek characters (through latin) to
1702  * cyrillic, then lowercase the result. The filter should say "don't
1703  * touch 'A' in the original". But because an intermediate result
1704  * happens to go through "A", the Greek Alpha gets hung up.
1705  */
TestCompoundFilter(void)1706 void TransliteratorTest::TestCompoundFilter(void) {
1707     UParseError parseError;
1708     UErrorCode status = U_ZERO_ERROR;
1709     Transliterator *t = Transliterator::createInstance
1710         ("Greek-Latin; Latin-Greek; Lower", UTRANS_FORWARD, parseError, status);
1711     if (t == 0) {
1712         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
1713         return;
1714     }
1715     t->adoptFilter(new UnicodeSet("[^A]", status));
1716     if (U_FAILURE(status)) {
1717         errln("FAIL: UnicodeSet ct failed");
1718         delete t;
1719         return;
1720     }
1721 
1722     // Only the 'A' at index 1 should remain unchanged
1723     expect(*t,
1724            CharsToUnicodeString("BA\\u039A\\u0391"),
1725            CharsToUnicodeString("\\u03b2A\\u03ba\\u03b1"));
1726     delete t;
1727 }
1728 
TestRemove(void)1729 void TransliteratorTest::TestRemove(void) {
1730     UParseError parseError;
1731     UErrorCode status = U_ZERO_ERROR;
1732     Transliterator *t = Transliterator::createInstance("Remove[abc]", UTRANS_FORWARD, parseError, status);
1733     if (t == 0) {
1734         errln("FAIL: createInstance failed");
1735         return;
1736     }
1737 
1738     expect(*t, "Able bodied baker's cats", "Ale odied ker's ts");
1739 
1740     // extra test for RemoveTransliterator::clone(), which at one point wasn't
1741     // duplicating the filter
1742     Transliterator* t2 = t->clone();
1743     expect(*t2, "Able bodied baker's cats", "Ale odied ker's ts");
1744 
1745     delete t;
1746     delete t2;
1747 }
1748 
TestToRules(void)1749 void TransliteratorTest::TestToRules(void) {
1750     const char* RBT = "rbt";
1751     const char* SET = "set";
1752     static const char* DATA[] = {
1753         RBT,
1754         "$a=\\u4E61; [$a] > A;",
1755         "[\\u4E61] > A;",
1756 
1757         RBT,
1758         "$white=[[:Zs:][:Zl:]]; $white{a} > A;",
1759         "[[:Zs:][:Zl:]]{a} > A;",
1760 
1761         SET,
1762         "[[:Zs:][:Zl:]]",
1763         "[[:Zs:][:Zl:]]",
1764 
1765         SET,
1766         "[:Ps:]",
1767         "[:Ps:]",
1768 
1769         SET,
1770         "[:L:]",
1771         "[:L:]",
1772 
1773         SET,
1774         "[[:L:]-[A]]",
1775         "[[:L:]-[A]]",
1776 
1777         SET,
1778         "[~[:Lu:][:Ll:]]",
1779         "[~[:Lu:][:Ll:]]",
1780 
1781         SET,
1782         "[~[a-z]]",
1783         "[~[a-z]]",
1784 
1785         RBT,
1786         "$white=[:Zs:]; $black=[^$white]; $black{a} > A;",
1787         "[^[:Zs:]]{a} > A;",
1788 
1789         RBT,
1790         "$a=[:Zs:]; $b=[[a-z]-$a]; $b{a} > A;",
1791         "[[a-z]-[:Zs:]]{a} > A;",
1792 
1793         RBT,
1794         "$a=[:Zs:]; $b=[$a&[a-z]]; $b{a} > A;",
1795         "[[:Zs:]&[a-z]]{a} > A;",
1796 
1797         RBT,
1798         "$a=[:Zs:]; $b=[x$a]; $b{a} > A;",
1799         "[x[:Zs:]]{a} > A;",
1800 
1801         RBT,
1802         "$accentMinus = [ [\\u0300-\\u0345] & [:M:] - [\\u0338]] ;"
1803         "$macron = \\u0304 ;"
1804         "$evowel = [aeiouyAEIOUY] ;"
1805         "$iotasub = \\u0345 ;"
1806         "($evowel $macron $accentMinus *) i > | $1 $iotasub ;",
1807         "([AEIOUYaeiouy]\\u0304[[\\u0300-\\u0345]&[:M:]-[\\u0338]]*)i > | $1 \\u0345;",
1808 
1809         RBT,
1810         "([AEIOUYaeiouy]\\u0304[[:M:]-[\\u0304\\u0345]]*)i > | $1 \\u0345;",
1811         "([AEIOUYaeiouy]\\u0304[[:M:]-[\\u0304\\u0345]]*)i > | $1 \\u0345;",
1812     };
1813     static const int32_t DATA_length = UPRV_LENGTHOF(DATA);
1814 
1815     for (int32_t d=0; d < DATA_length; d+=3) {
1816         if (DATA[d] == RBT) {
1817             // Transliterator test
1818             UParseError parseError;
1819             UErrorCode status = U_ZERO_ERROR;
1820             Transliterator *t = Transliterator::createFromRules("ID",
1821                                                                 UnicodeString(DATA[d+1], -1, US_INV), UTRANS_FORWARD, parseError, status);
1822             if (t == 0) {
1823                 dataerrln("FAIL: createFromRules failed - %s", u_errorName(status));
1824                 return;
1825             }
1826             UnicodeString rules, escapedRules;
1827             t->toRules(rules, FALSE);
1828             t->toRules(escapedRules, TRUE);
1829             UnicodeString expRules = CharsToUnicodeString(DATA[d+2]);
1830             UnicodeString expEscapedRules(DATA[d+2], -1, US_INV);
1831             if (rules == expRules) {
1832                 logln((UnicodeString)"Ok: " + UnicodeString(DATA[d+1], -1, US_INV) +
1833                       " => " + rules);
1834             } else {
1835                 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[d+1], -1, US_INV) +
1836                       " => " + rules + ", exp " + expRules);
1837             }
1838             if (escapedRules == expEscapedRules) {
1839                 logln((UnicodeString)"Ok: " + UnicodeString(DATA[d+1], -1, US_INV) +
1840                       " => " + escapedRules);
1841             } else {
1842                 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[d+1], -1, US_INV) +
1843                       " => " + escapedRules + ", exp " + expEscapedRules);
1844             }
1845             delete t;
1846 
1847         } else {
1848             // UnicodeSet test
1849             UErrorCode status = U_ZERO_ERROR;
1850             UnicodeString pat(DATA[d+1], -1, US_INV);
1851             UnicodeString expToPat(DATA[d+2], -1, US_INV);
1852             UnicodeSet set(pat, status);
1853             if (U_FAILURE(status)) {
1854                 errln("FAIL: UnicodeSet ct failed");
1855                 return;
1856             }
1857             // Adjust spacing etc. as necessary.
1858             UnicodeString toPat;
1859             set.toPattern(toPat);
1860             if (expToPat == toPat) {
1861                 logln((UnicodeString)"Ok: " + pat +
1862                       " => " + toPat);
1863             } else {
1864                 errln((UnicodeString)"FAIL: " + pat +
1865                       " => " + prettify(toPat, TRUE) +
1866                       ", exp " + prettify(pat, TRUE));
1867             }
1868         }
1869     }
1870 }
1871 
TestContext()1872 void TransliteratorTest::TestContext() {
1873     UTransPosition pos = {0, 2, 0, 1}; // cs cl s l
1874     expect("de > x; {d}e > y;",
1875            "de",
1876            "ye",
1877            &pos);
1878 
1879     expect("ab{c} > z;",
1880            "xadabdabcy",
1881            "xadabdabzy");
1882 }
1883 
TestSupplemental()1884 void TransliteratorTest::TestSupplemental() {
1885 
1886     expect(CharsToUnicodeString("$a=\\U00010300; $s=[\\U00010300-\\U00010323];"
1887                                 "a > $a; $s > i;"),
1888            CharsToUnicodeString("ab\\U0001030Fx"),
1889            CharsToUnicodeString("\\U00010300bix"));
1890 
1891     expect(CharsToUnicodeString("$a=[a-z\\U00010300-\\U00010323];"
1892                                 "$b=[A-Z\\U00010400-\\U0001044D];"
1893                                 "($a)($b) > $2 $1;"),
1894            CharsToUnicodeString("aB\\U00010300\\U00010400c\\U00010401\\U00010301D"),
1895            CharsToUnicodeString("Ba\\U00010400\\U00010300\\U00010401cD\\U00010301"));
1896 
1897     // k|ax\\U00010300xm
1898 
1899     // k|a\\U00010400\\U00010300xm
1900     // ky|\\U00010400\\U00010300xm
1901     // ky\\U00010400|\\U00010300xm
1902 
1903     // ky\\U00010400|\\U00010300\\U00010400m
1904     // ky\\U00010400y|\\U00010400m
1905     expect(CharsToUnicodeString("$a=[a\\U00010300-\\U00010323];"
1906                                 "$a {x} > | @ \\U00010400;"
1907                                 "{$a} [^\\u0000-\\uFFFF] > y;"),
1908            CharsToUnicodeString("kax\\U00010300xm"),
1909            CharsToUnicodeString("ky\\U00010400y\\U00010400m"));
1910 
1911     expectT("Any-Name",
1912            CharsToUnicodeString("\\U00010330\\U000E0061\\u00A0"),
1913            UNICODE_STRING_SIMPLE("\\N{GOTHIC LETTER AHSA}\\N{TAG LATIN SMALL LETTER A}\\N{NO-BREAK SPACE}"));
1914 
1915     expectT("Any-Hex/Unicode",
1916            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1917            UNICODE_STRING_SIMPLE("U+10330U+10FF00U+E0061U+00A0"));
1918 
1919     expectT("Any-Hex/C",
1920            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1921            UNICODE_STRING_SIMPLE("\\U00010330\\U0010FF00\\U000E0061\\u00A0"));
1922 
1923     expectT("Any-Hex/Perl",
1924            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1925            UNICODE_STRING_SIMPLE("\\x{10330}\\x{10FF00}\\x{E0061}\\x{A0}"));
1926 
1927     expectT("Any-Hex/Java",
1928            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1929            UNICODE_STRING_SIMPLE("\\uD800\\uDF30\\uDBFF\\uDF00\\uDB40\\uDC61\\u00A0"));
1930 
1931     expectT("Any-Hex/XML",
1932            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1933            "&#x10330;&#x10FF00;&#xE0061;&#xA0;");
1934 
1935     expectT("Any-Hex/XML10",
1936            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1937            "&#66352;&#1113856;&#917601;&#160;");
1938 
1939     expectT(UNICODE_STRING_SIMPLE("[\\U000E0000-\\U000E0FFF] Remove"),
1940            CharsToUnicodeString("\\U00010330\\U0010FF00\\U000E0061\\u00A0"),
1941            CharsToUnicodeString("\\U00010330\\U0010FF00\\u00A0"));
1942 }
1943 
TestQuantifier()1944 void TransliteratorTest::TestQuantifier() {
1945 
1946     // Make sure @ in a quantified anteContext works
1947     expect("a+ {b} > | @@ c; A > a; (a+ c) > '(' $1 ')';",
1948            "AAAAAb",
1949            "aaa(aac)");
1950 
1951     // Make sure @ in a quantified postContext works
1952     expect("{b} a+ > c @@ |; (a+) > '(' $1 ')';",
1953            "baaaaa",
1954            "caa(aaa)");
1955 
1956     // Make sure @ in a quantified postContext with seg ref works
1957     expect("{(b)} a+ > $1 @@ |; (a+) > '(' $1 ')';",
1958            "baaaaa",
1959            "baa(aaa)");
1960 
1961     // Make sure @ past ante context doesn't enter ante context
1962     UTransPosition pos = {0, 5, 3, 5};
1963     expect("a+ {b} > | @@ c; x > y; (a+ c) > '(' $1 ')';",
1964            "xxxab",
1965            "xxx(ac)",
1966            &pos);
1967 
1968     // Make sure @ past post context doesn't pass limit
1969     UTransPosition pos2 = {0, 4, 0, 2};
1970     expect("{b} a+ > c @@ |; x > y; a > A;",
1971            "baxx",
1972            "caxx",
1973            &pos2);
1974 
1975     // Make sure @ past post context doesn't enter post context
1976     expect("{b} a+ > c @@ |; x > y; a > A;",
1977            "baxx",
1978            "cayy");
1979 
1980     expect("(ab)? c > d;",
1981            "c abc ababc",
1982            "d d abd");
1983 
1984     // NOTE: The (ab)+ when referenced just yields a single "ab",
1985     // not the full sequence of them.  This accords with perl behavior.
1986     expect("(ab)+ {x} > '(' $1 ')';",
1987            "x abx ababxy",
1988            "x ab(ab) abab(ab)y");
1989 
1990     expect("b+ > x;",
1991            "ac abc abbc abbbc",
1992            "ac axc axc axc");
1993 
1994     expect("[abc]+ > x;",
1995            "qac abrc abbcs abtbbc",
1996            "qx xrx xs xtx");
1997 
1998     expect("q{(ab)+} > x;",
1999            "qa qab qaba qababc qaba",
2000            "qa qx qxa qxc qxa");
2001 
2002     expect("q(ab)* > x;",
2003            "qa qab qaba qababc",
2004            "xa x xa xc");
2005 
2006     // NOTE: The (ab)+ when referenced just yields a single "ab",
2007     // not the full sequence of them.  This accords with perl behavior.
2008     expect("q(ab)* > '(' $1 ')';",
2009            "qa qab qaba qababc",
2010            "()a (ab) (ab)a (ab)c");
2011 
2012     // 'foo'+ and 'foo'* -- the quantifier should apply to the entire
2013     // quoted string
2014     expect("'ab'+ > x;",
2015            "bb ab ababb",
2016            "bb x xb");
2017 
2018     // $foo+ and $foo* -- the quantifier should apply to the entire
2019     // variable reference
2020     expect("$var = ab; $var+ > x;",
2021            "bb ab ababb",
2022            "bb x xb");
2023 }
2024 
2025 class TestTrans : public Transliterator {
2026 public:
TestTrans(const UnicodeString & id)2027     TestTrans(const UnicodeString& id) : Transliterator(id, 0) {
2028     }
clone(void) const2029     virtual TestTrans* clone(void) const {
2030         return new TestTrans(getID());
2031     }
handleTransliterate(Replaceable &,UTransPosition & offsets,UBool) const2032     virtual void handleTransliterate(Replaceable& /*text*/, UTransPosition& offsets,
2033         UBool /*isIncremental*/) const
2034     {
2035         offsets.start = offsets.limit;
2036     }
2037     virtual UClassID getDynamicClassID() const;
2038     static UClassID U_EXPORT2 getStaticClassID();
2039 };
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TestTrans)2040 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TestTrans)
2041 
2042 /**
2043  * Test Source-Target/Variant.
2044  */
2045 void TransliteratorTest::TestSTV(void) {
2046     int32_t ns = Transliterator::countAvailableSources();
2047     if (ns < 0 || ns > 255) {
2048         errln((UnicodeString)"FAIL: Bad source count: " + ns);
2049         return;
2050     }
2051     int32_t i, j;
2052     for (i=0; i<ns; ++i) {
2053         UnicodeString source;
2054         Transliterator::getAvailableSource(i, source);
2055         logln((UnicodeString)"" + i + ": " + source);
2056         if (source.length() == 0) {
2057             errln("FAIL: empty source");
2058             continue;
2059         }
2060         int32_t nt = Transliterator::countAvailableTargets(source);
2061         if (nt < 0 || nt > 255) {
2062             errln((UnicodeString)"FAIL: Bad target count: " + nt);
2063             continue;
2064         }
2065         for (int32_t j=0; j<nt; ++j) {
2066             UnicodeString target;
2067             Transliterator::getAvailableTarget(j, source, target);
2068             logln((UnicodeString)" " + j + ": " + target);
2069             if (target.length() == 0) {
2070                 errln("FAIL: empty target");
2071                 continue;
2072             }
2073             int32_t nv = Transliterator::countAvailableVariants(source, target);
2074             if (nv < 0 || nv > 255) {
2075                 errln((UnicodeString)"FAIL: Bad variant count: " + nv);
2076                 continue;
2077             }
2078             for (int32_t k=0; k<nv; ++k) {
2079                 UnicodeString variant;
2080                 Transliterator::getAvailableVariant(k, source, target, variant);
2081                 if (variant.length() == 0) {
2082                     logln((UnicodeString)"  " + k + ": <empty>");
2083                 } else {
2084                     logln((UnicodeString)"  " + k + ": " + variant);
2085                 }
2086             }
2087         }
2088     }
2089 
2090     // Test registration
2091     const char* IDS[] = { "Fieruwer", "Seoridf-Sweorie", "Oewoir-Oweri/Vsie" };
2092     const char* FULL_IDS[] = { "Any-Fieruwer", "Seoridf-Sweorie", "Oewoir-Oweri/Vsie" };
2093     const char* SOURCES[] = { NULL, "Seoridf", "Oewoir" };
2094     for (i=0; i<3; ++i) {
2095         Transliterator *t = new TestTrans(IDS[i]);
2096         if (t == 0) {
2097             errln("FAIL: out of memory");
2098             return;
2099         }
2100         if (t->getID() != IDS[i]) {
2101             errln((UnicodeString)"FAIL: ID mismatch for " + IDS[i]);
2102             delete t;
2103             return;
2104         }
2105         Transliterator::registerInstance(t);
2106         UErrorCode status = U_ZERO_ERROR;
2107         t = Transliterator::createInstance(IDS[i], UTRANS_FORWARD, status);
2108         if (t == NULL) {
2109             errln((UnicodeString)"FAIL: Registration/creation failed for ID " +
2110                   IDS[i]);
2111         } else {
2112             logln((UnicodeString)"Ok: Registration/creation succeeded for ID " +
2113                   IDS[i]);
2114             delete t;
2115         }
2116         Transliterator::unregister(IDS[i]);
2117         t = Transliterator::createInstance(IDS[i], UTRANS_FORWARD, status);
2118         if (t != NULL) {
2119             errln((UnicodeString)"FAIL: Unregistration failed for ID " +
2120                   IDS[i]);
2121             delete t;
2122         }
2123     }
2124 
2125     // Make sure getAvailable API reflects removal
2126     int32_t n = Transliterator::countAvailableIDs();
2127     for (i=0; i<n; ++i) {
2128         UnicodeString id = Transliterator::getAvailableID(i);
2129         for (j=0; j<3; ++j) {
2130             if (id.caseCompare(FULL_IDS[j],0)==0) {
2131                 errln((UnicodeString)"FAIL: unregister(" + id + ") failed");
2132             }
2133         }
2134     }
2135     n = Transliterator::countAvailableTargets("Any");
2136     for (i=0; i<n; ++i) {
2137         UnicodeString t;
2138         Transliterator::getAvailableTarget(i, "Any", t);
2139         if (t.caseCompare(IDS[0],0)==0) {
2140             errln((UnicodeString)"FAIL: unregister(Any-" + t + ") failed");
2141         }
2142     }
2143     n = Transliterator::countAvailableSources();
2144     for (i=0; i<n; ++i) {
2145         UnicodeString s;
2146         Transliterator::getAvailableSource(i, s);
2147         for (j=0; j<3; ++j) {
2148             if (SOURCES[j] == NULL) continue;
2149             if (s.caseCompare(SOURCES[j],0)==0) {
2150                 errln((UnicodeString)"FAIL: unregister(" + s + "-*) failed");
2151             }
2152         }
2153     }
2154 }
2155 
2156 /**
2157  * Test inverse of Greek-Latin; Title()
2158  */
TestCompoundInverse(void)2159 void TransliteratorTest::TestCompoundInverse(void) {
2160     UParseError parseError;
2161     UErrorCode status = U_ZERO_ERROR;
2162     Transliterator *t = Transliterator::createInstance
2163         ("Greek-Latin; Title()", UTRANS_REVERSE,parseError, status);
2164     if (t == 0) {
2165         dataerrln("FAIL: createInstance - %s", u_errorName(status));
2166         return;
2167     }
2168     UnicodeString exp("(Title);Latin-Greek");
2169     if (t->getID() == exp) {
2170         logln("Ok: inverse of \"Greek-Latin; Title()\" is \"" +
2171               t->getID());
2172     } else {
2173         errln("FAIL: inverse of \"Greek-Latin; Title()\" is \"" +
2174               t->getID() + "\", expected \"" + exp + "\"");
2175     }
2176     delete t;
2177 }
2178 
2179 /**
2180  * Test NFD chaining with RBT
2181  */
TestNFDChainRBT()2182 void TransliteratorTest::TestNFDChainRBT() {
2183     UParseError pe;
2184     UErrorCode ec = U_ZERO_ERROR;
2185     Transliterator* t = Transliterator::createFromRules(
2186                                "TEST", "::NFD; aa > Q; a > q;",
2187                                UTRANS_FORWARD, pe, ec);
2188     if (t == NULL || U_FAILURE(ec)) {
2189         dataerrln("FAIL: Transliterator::createFromRules failed with %s", u_errorName(ec));
2190         return;
2191     }
2192     expect(*t, "aa", "Q");
2193     delete t;
2194 
2195     // TEMPORARY TESTS -- BEING DEBUGGED
2196 //=-    UnicodeString s, s2;
2197 //=-    t = Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, pe, ec);
2198 //=-    s = CharsToUnicodeString("rmk\\u1E63\\u0113t");
2199 //=-    s2 = CharsToUnicodeString("\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D");
2200 //=-    expect(*t, s, s2);
2201 //=-    delete t;
2202 //=-
2203 //=-    t = Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, pe, ec);
2204 //=-    expect(*t, s2, s);
2205 //=-    delete t;
2206 //=-
2207 //=-    t = Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, pe, ec);
2208 //=-    s = CharsToUnicodeString("rmk\\u1E63\\u0113t");
2209 //=-    expect(*t, s, s);
2210 //=-    delete t;
2211 
2212 //    const char* source[] = {
2213 //        /*
2214 //        "\\u015Br\\u012Bmad",
2215 //        "bhagavadg\\u012Bt\\u0101",
2216 //        "adhy\\u0101ya",
2217 //        "arjuna",
2218 //        "vi\\u1E63\\u0101da",
2219 //        "y\\u014Dga",
2220 //        "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2221 //        "uv\\u0101cr\\u0325",
2222 //        */
2223 //        "rmk\\u1E63\\u0113t",
2224 //      //"dharmak\\u1E63\\u0113tr\\u0113",
2225 //        /*
2226 //        "kuruk\\u1E63\\u0113tr\\u0113",
2227 //        "samav\\u0113t\\u0101",
2228 //        "yuyutsava-\\u1E25",
2229 //        "m\\u0101mak\\u0101-\\u1E25",
2230 //     // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2231 //        "kimakurvata",
2232 //        "san\\u0304java",
2233 //        */
2234 //
2235 //        0
2236 //    };
2237 //    const char* expected[] = {
2238 //        /*
2239 //        "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2240 //        "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2241 //        "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2242 //        "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2243 //        "\\u0935\\u093f\\u0937\\u093e\\u0926",
2244 //        "\\u092f\\u094b\\u0917",
2245 //        "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2246 //        "\\u0909\\u0935\\u093E\\u091A\\u0943",
2247 //        */
2248 //        "\\u0927",
2249 //        //"\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2250 //        /*
2251 //        "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2252 //        "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2253 //        "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2254 //        "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2255 //    //  "\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2256 //        "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2257 //        "\\u0938\\u0902\\u091c\\u0935",
2258 //        */
2259 //        0
2260 //    };
2261 //    UErrorCode status = U_ZERO_ERROR;
2262 //    UParseError parseError;
2263 //    UnicodeString message;
2264 //    Transliterator* latinToDevToLatin=Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2265 //    Transliterator* devToLatinToDev=Transliterator::createInstance("Devanagari-Latin;Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2266 //    if(U_FAILURE(status)){
2267 //        errln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2268 //        errln("PreContext: " + prettify(parseError.preContext) + "PostContext: " + prettify( parseError.postContext) );
2269 //        delete latinToDevToLatin;
2270 //        delete devToLatinToDev;
2271 //        return;
2272 //    }
2273 //    UnicodeString gotResult;
2274 //    for(int i= 0; source[i] != 0; i++){
2275 //        gotResult = source[i];
2276 //        expect(*latinToDevToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2277 //        expect(*devToLatinToDev,CharsToUnicodeString(expected[i]),CharsToUnicodeString(expected[i]));
2278 //    }
2279 //    delete latinToDevToLatin;
2280 //    delete devToLatinToDev;
2281 }
2282 
2283 /**
2284  * Inverse of "Null" should be "Null". (J21)
2285  */
TestNullInverse()2286 void TransliteratorTest::TestNullInverse() {
2287     UParseError pe;
2288     UErrorCode ec = U_ZERO_ERROR;
2289     Transliterator *t = Transliterator::createInstance("Null", UTRANS_FORWARD, pe, ec);
2290     if (t == 0 || U_FAILURE(ec)) {
2291         errln("FAIL: createInstance");
2292         return;
2293     }
2294     Transliterator *u = t->createInverse(ec);
2295     if (u == 0 || U_FAILURE(ec)) {
2296         errln("FAIL: createInverse");
2297         delete t;
2298         return;
2299     }
2300     if (u->getID() != "Null") {
2301         errln("FAIL: Inverse of Null should be Null");
2302     }
2303     delete t;
2304     delete u;
2305 }
2306 
2307 /**
2308  * Check ID of inverse of alias. (J22)
2309  */
TestAliasInverseID()2310 void TransliteratorTest::TestAliasInverseID() {
2311     UnicodeString ID("Latin-Hangul", ""); // This should be any alias ID with an inverse
2312     UParseError pe;
2313     UErrorCode ec = U_ZERO_ERROR;
2314     Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
2315     if (t == 0 || U_FAILURE(ec)) {
2316         dataerrln("FAIL: createInstance - %s", u_errorName(ec));
2317         return;
2318     }
2319     Transliterator *u = t->createInverse(ec);
2320     if (u == 0 || U_FAILURE(ec)) {
2321         errln("FAIL: createInverse");
2322         delete t;
2323         return;
2324     }
2325     UnicodeString exp = "Hangul-Latin";
2326     UnicodeString got = u->getID();
2327     if (got != exp) {
2328         errln((UnicodeString)"FAIL: Inverse of " + ID + " is " + got +
2329               ", expected " + exp);
2330     }
2331     delete t;
2332     delete u;
2333 }
2334 
2335 /**
2336  * Test IDs of inverses of compound transliterators. (J20)
2337  */
TestCompoundInverseID()2338 void TransliteratorTest::TestCompoundInverseID() {
2339     UnicodeString ID = "Latin-Jamo;NFC(NFD)";
2340     UParseError pe;
2341     UErrorCode ec = U_ZERO_ERROR;
2342     Transliterator *t = Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
2343     if (t == 0 || U_FAILURE(ec)) {
2344         dataerrln("FAIL: createInstance - %s", u_errorName(ec));
2345         return;
2346     }
2347     Transliterator *u = t->createInverse(ec);
2348     if (u == 0 || U_FAILURE(ec)) {
2349         errln("FAIL: createInverse");
2350         delete t;
2351         return;
2352     }
2353     UnicodeString exp = "NFD(NFC);Jamo-Latin";
2354     UnicodeString got = u->getID();
2355     if (got != exp) {
2356         errln((UnicodeString)"FAIL: Inverse of " + ID + " is " + got +
2357               ", expected " + exp);
2358     }
2359     delete t;
2360     delete u;
2361 }
2362 
2363 /**
2364  * Test undefined variable.
2365 
2366  */
TestUndefinedVariable()2367 void TransliteratorTest::TestUndefinedVariable() {
2368     UnicodeString rule = "$initial } a <> \\u1161;";
2369     UParseError pe;
2370     UErrorCode ec = U_ZERO_ERROR;
2371     Transliterator *t = Transliterator::createFromRules("<ID>", rule, UTRANS_FORWARD, pe, ec);
2372     delete t;
2373     if (U_FAILURE(ec)) {
2374         logln((UnicodeString)"OK: Got exception for " + rule + ", as expected: " +
2375               u_errorName(ec));
2376         return;
2377     }
2378     errln((UnicodeString)"Fail: bogus rule " + rule + " compiled with error " +
2379           u_errorName(ec));
2380 }
2381 
2382 /**
2383  * Test empty context.
2384  */
TestEmptyContext()2385 void TransliteratorTest::TestEmptyContext() {
2386     expect(" { a } > b;", "xay a ", "xby b ");
2387 }
2388 
2389 /**
2390 * Test compound filter ID syntax
2391 */
TestCompoundFilterID(void)2392 void TransliteratorTest::TestCompoundFilterID(void) {
2393     static const char* DATA[] = {
2394         // Col. 1 = ID or rule set (latter must start with #)
2395 
2396         // = columns > 1 are null if expect col. 1 to be illegal =
2397 
2398         // Col. 2 = direction, "F..." or "R..."
2399         // Col. 3 = source string
2400         // Col. 4 = exp result
2401 
2402         "[abc]; [abc]", NULL, NULL, NULL, // multiple filters
2403         "Latin-Greek; [abc];", NULL, NULL, NULL, // misplaced filter
2404         "[b]; Latin-Greek; Upper; ([xyz])", "F", "abc", "a\\u0392c",
2405         "[b]; (Lower); Latin-Greek; Upper(); ([\\u0392])", "R", "\\u0391\\u0392\\u0393", "\\u0391b\\u0393",
2406         "#\n::[b]; ::Latin-Greek; ::Upper; ::([xyz]);", "F", "abc", "a\\u0392c",
2407         "#\n::[b]; ::(Lower); ::Latin-Greek; ::Upper(); ::([\\u0392]);", "R", "\\u0391\\u0392\\u0393", "\\u0391b\\u0393",
2408         NULL,
2409     };
2410 
2411     for (int32_t i=0; DATA[i]; i+=4) {
2412         UnicodeString id = CharsToUnicodeString(DATA[i]);
2413         UTransDirection direction = (DATA[i+1] != NULL && DATA[i+1][0] == 'R') ?
2414             UTRANS_REVERSE : UTRANS_FORWARD;
2415         UnicodeString source;
2416         UnicodeString exp;
2417         if (DATA[i+2] != NULL) {
2418             source = CharsToUnicodeString(DATA[i+2]);
2419             exp = CharsToUnicodeString(DATA[i+3]);
2420         }
2421         UBool expOk = (DATA[i+1] != NULL);
2422         LocalPointer<Transliterator> t;
2423         UParseError pe;
2424         UErrorCode ec = U_ZERO_ERROR;
2425         if (id.charAt(0) == 0x23/*#*/) {
2426             t.adoptInstead(Transliterator::createFromRules("ID", id, direction, pe, ec));
2427         } else {
2428             t.adoptInstead(Transliterator::createInstance(id, direction, pe, ec));
2429         }
2430         UBool ok = (t.isValid() && U_SUCCESS(ec));
2431         UnicodeString transID;
2432         if (t.isValid()) {
2433             transID = t->getID();
2434         }
2435         else {
2436             transID = UnicodeString("NULL", "");
2437         }
2438         if (ok == expOk) {
2439             logln((UnicodeString)"Ok: " + id + " => " + transID + ", " +
2440                   u_errorName(ec));
2441             if (source.length() != 0) {
2442                 expect(*t, source, exp);
2443             }
2444         } else {
2445             dataerrln((UnicodeString)"FAIL: " + id + " => " + transID + ", " +
2446                   u_errorName(ec));
2447         }
2448     }
2449 }
2450 
2451 /**
2452  * Test new property set syntax
2453  */
TestPropertySet()2454 void TransliteratorTest::TestPropertySet() {
2455     expect(UNICODE_STRING_SIMPLE("a>A; \\p{Lu}>x; \\p{ANY}>y;"), "abcDEF", "Ayyxxx");
2456     expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9",
2457            "[ a stitch ]\n[ in time ]\r[ saves 9]");
2458 }
2459 
2460 /**
2461  * Test various failure points of the new 2.0 engine.
2462  */
TestNewEngine()2463 void TransliteratorTest::TestNewEngine() {
2464     UParseError pe;
2465     UErrorCode ec = U_ZERO_ERROR;
2466     Transliterator *t = Transliterator::createInstance("Latin-Hiragana", UTRANS_FORWARD, pe, ec);
2467     if (t == 0 || U_FAILURE(ec)) {
2468         dataerrln("FAIL: createInstance Latin-Hiragana - %s", u_errorName(ec));
2469         return;
2470     }
2471     // Katakana should be untouched
2472     expect(*t, CharsToUnicodeString("a\\u3042\\u30A2"),
2473            CharsToUnicodeString("\\u3042\\u3042\\u30A2"));
2474 
2475     delete t;
2476 
2477 #if 1
2478     // This test will only work if Transliterator.ROLLBACK is
2479     // true.  Otherwise, this test will fail, revealing a
2480     // limitation of global filters in incremental mode.
2481     Transliterator *a =
2482         Transliterator::createFromRules("a_to_A", "a > A;", UTRANS_FORWARD, pe, ec);
2483     Transliterator *A =
2484         Transliterator::createFromRules("A_to_b", "A > b;", UTRANS_FORWARD, pe, ec);
2485     if (U_FAILURE(ec)) {
2486         delete a;
2487         delete A;
2488         return;
2489     }
2490 
2491     Transliterator* array[3];
2492     array[0] = a;
2493     array[1] = Transliterator::createInstance("NFD", UTRANS_FORWARD, pe, ec);
2494     array[2] = A;
2495     if (U_FAILURE(ec)) {
2496         errln("FAIL: createInstance NFD");
2497         delete a;
2498         delete A;
2499         delete array[1];
2500         return;
2501     }
2502 
2503     t = new CompoundTransliterator(array, 3, new UnicodeSet("[:Ll:]", ec));
2504     if (U_FAILURE(ec)) {
2505         errln("FAIL: UnicodeSet constructor");
2506         delete a;
2507         delete A;
2508         delete array[1];
2509         delete t;
2510         return;
2511     }
2512 
2513     expect(*t, "aAaA", "bAbA");
2514 
2515     assertTrue("countElements", t->countElements() == 3);
2516     assertEquals("getElement(0)", t->getElement(0, ec).getID(), "a_to_A");
2517     assertEquals("getElement(1)", t->getElement(1, ec).getID(), "NFD");
2518     assertEquals("getElement(2)", t->getElement(2, ec).getID(), "A_to_b");
2519     assertSuccess("getElement", ec);
2520 
2521     delete a;
2522     delete A;
2523     delete array[1];
2524     delete t;
2525 #endif
2526 
2527     expect("$smooth = x; $macron = q; [:^L:] { ([aeiouyAEIOUY] $macron?) } [^aeiouyAEIOUY$smooth$macron] > | $1 $smooth ;",
2528            "a",
2529            "ax");
2530 
2531     UnicodeString gr = CharsToUnicodeString(
2532         "$ddot = \\u0308 ;"
2533         "$lcgvowel = [\\u03b1\\u03b5\\u03b7\\u03b9\\u03bf\\u03c5\\u03c9] ;"
2534         "$rough = \\u0314 ;"
2535         "($lcgvowel+ $ddot?) $rough > h | $1 ;"
2536         "\\u03b1 <> a ;"
2537         "$rough <> h ;");
2538 
2539     expect(gr, CharsToUnicodeString("\\u03B1\\u0314"), "ha");
2540 }
2541 
2542 /**
2543  * Test quantified segment behavior.  We want:
2544  * ([abc])+ > x $1 x; applied to "cba" produces "xax"
2545  */
TestQuantifiedSegment(void)2546 void TransliteratorTest::TestQuantifiedSegment(void) {
2547     // The normal case
2548     expect("([abc]+) > x $1 x;", "cba", "xcbax");
2549 
2550     // The tricky case; the quantifier is around the segment
2551     expect("([abc])+ > x $1 x;", "cba", "xax");
2552 
2553     // Tricky case in reverse direction
2554     expect("([abc])+ { q > x $1 x;", "cbaq", "cbaxax");
2555 
2556     // Check post-context segment
2557     expect("{q} ([a-d])+ > '(' $1 ')';", "ddqcba", "dd(a)cba");
2558 
2559     // Test toRule/toPattern for non-quantified segment.
2560     // Careful with spacing here.
2561     UnicodeString r("([a-c]){q} > x $1 x;");
2562     UParseError pe;
2563     UErrorCode ec = U_ZERO_ERROR;
2564     Transliterator* t = Transliterator::createFromRules("ID", r, UTRANS_FORWARD, pe, ec);
2565     if (U_FAILURE(ec)) {
2566         errln("FAIL: createFromRules");
2567         delete t;
2568         return;
2569     }
2570     UnicodeString rr;
2571     t->toRules(rr, TRUE);
2572     if (r != rr) {
2573         errln((UnicodeString)"FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
2574     } else {
2575         logln((UnicodeString)"Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
2576     }
2577     delete t;
2578 
2579     // Test toRule/toPattern for quantified segment.
2580     // Careful with spacing here.
2581     r = "([a-c])+{q} > x $1 x;";
2582     t = Transliterator::createFromRules("ID", r, UTRANS_FORWARD, pe, ec);
2583     if (U_FAILURE(ec)) {
2584         errln("FAIL: createFromRules");
2585         delete t;
2586         return;
2587     }
2588     t->toRules(rr, TRUE);
2589     if (r != rr) {
2590         errln((UnicodeString)"FAIL: \"" + r + "\" x toRules() => \"" + rr + "\"");
2591     } else {
2592         logln((UnicodeString)"Ok: \"" + r + "\" x toRules() => \"" + rr + "\"");
2593     }
2594     delete t;
2595 }
2596 
2597 //======================================================================
2598 // Ram's tests
2599 //======================================================================
TestDevanagariLatinRT()2600 void TransliteratorTest::TestDevanagariLatinRT(){
2601     const int MAX_LEN= 52;
2602     const char* const source[MAX_LEN] = {
2603         "bh\\u0101rata",
2604         "kra",
2605         "k\\u1E63a",
2606         "khra",
2607         "gra",
2608         "\\u1E45ra",
2609         "cra",
2610         "chra",
2611         "j\\u00F1a",
2612         "jhra",
2613         "\\u00F1ra",
2614         "\\u1E6Dya",
2615         "\\u1E6Dhra",
2616         "\\u1E0Dya",
2617       //"r\\u0323ya", // \u095c is not valid in Devanagari
2618         "\\u1E0Dhya",
2619         "\\u1E5Bhra",
2620         "\\u1E47ra",
2621         "tta",
2622         "thra",
2623         "dda",
2624         "dhra",
2625         "nna",
2626         "pra",
2627         "phra",
2628         "bra",
2629         "bhra",
2630         "mra",
2631         "\\u1E49ra",
2632       //"l\\u0331ra",
2633         "yra",
2634         "\\u1E8Fra",
2635       //"l-",
2636         "vra",
2637         "\\u015Bra",
2638         "\\u1E63ra",
2639         "sra",
2640         "hma",
2641         "\\u1E6D\\u1E6Da",
2642         "\\u1E6D\\u1E6Dha",
2643         "\\u1E6Dh\\u1E6Dha",
2644         "\\u1E0D\\u1E0Da",
2645         "\\u1E0D\\u1E0Dha",
2646         "\\u1E6Dya",
2647         "\\u1E6Dhya",
2648         "\\u1E0Dya",
2649         "\\u1E0Dhya",
2650         // Not roundtrippable --
2651         // \\u0939\\u094d\\u094d\\u092E  - hma
2652         // \\u0939\\u094d\\u092E         - hma
2653         // CharsToUnicodeString("hma"),
2654         "hya",
2655         "\\u015Br\\u0325",
2656         "\\u015Bca",
2657         "\\u0115",
2658         "san\\u0304j\\u012Bb s\\u0113nagupta",
2659         "\\u0101nand vaddir\\u0101ju",
2660         "\\u0101",
2661         "a"
2662     };
2663     const char* const expected[MAX_LEN] = {
2664         "\\u092D\\u093E\\u0930\\u0924",   /* bha\\u0304rata */
2665         "\\u0915\\u094D\\u0930",          /* kra         */
2666         "\\u0915\\u094D\\u0937",          /* ks\\u0323a  */
2667         "\\u0916\\u094D\\u0930",          /* khra        */
2668         "\\u0917\\u094D\\u0930",          /* gra         */
2669         "\\u0919\\u094D\\u0930",          /* n\\u0307ra  */
2670         "\\u091A\\u094D\\u0930",          /* cra         */
2671         "\\u091B\\u094D\\u0930",          /* chra        */
2672         "\\u091C\\u094D\\u091E",          /* jn\\u0303a  */
2673         "\\u091D\\u094D\\u0930",          /* jhra        */
2674         "\\u091E\\u094D\\u0930",          /* n\\u0303ra  */
2675         "\\u091F\\u094D\\u092F",          /* t\\u0323ya  */
2676         "\\u0920\\u094D\\u0930",          /* t\\u0323hra */
2677         "\\u0921\\u094D\\u092F",          /* d\\u0323ya  */
2678       //"\\u095C\\u094D\\u092F",        /* r\\u0323ya  */ // \u095c is not valid in Devanagari
2679         "\\u0922\\u094D\\u092F",          /* d\\u0323hya */
2680         "\\u0922\\u093C\\u094D\\u0930",   /* r\\u0323hra */
2681         "\\u0923\\u094D\\u0930",          /* n\\u0323ra  */
2682         "\\u0924\\u094D\\u0924",          /* tta         */
2683         "\\u0925\\u094D\\u0930",          /* thra        */
2684         "\\u0926\\u094D\\u0926",          /* dda         */
2685         "\\u0927\\u094D\\u0930",          /* dhra        */
2686         "\\u0928\\u094D\\u0928",          /* nna         */
2687         "\\u092A\\u094D\\u0930",          /* pra         */
2688         "\\u092B\\u094D\\u0930",          /* phra        */
2689         "\\u092C\\u094D\\u0930",          /* bra         */
2690         "\\u092D\\u094D\\u0930",          /* bhra        */
2691         "\\u092E\\u094D\\u0930",          /* mra         */
2692         "\\u0929\\u094D\\u0930",          /* n\\u0331ra  */
2693       //"\\u0934\\u094D\\u0930",        /* l\\u0331ra  */
2694         "\\u092F\\u094D\\u0930",          /* yra         */
2695         "\\u092F\\u093C\\u094D\\u0930",   /* y\\u0307ra  */
2696       //"l-",
2697         "\\u0935\\u094D\\u0930",          /* vra         */
2698         "\\u0936\\u094D\\u0930",          /* s\\u0301ra  */
2699         "\\u0937\\u094D\\u0930",          /* s\\u0323ra  */
2700         "\\u0938\\u094D\\u0930",          /* sra         */
2701         "\\u0939\\u094d\\u092E",          /* hma         */
2702         "\\u091F\\u094D\\u091F",          /* t\\u0323t\\u0323a  */
2703         "\\u091F\\u094D\\u0920",          /* t\\u0323t\\u0323ha */
2704         "\\u0920\\u094D\\u0920",          /* t\\u0323ht\\u0323ha*/
2705         "\\u0921\\u094D\\u0921",          /* d\\u0323d\\u0323a  */
2706         "\\u0921\\u094D\\u0922",          /* d\\u0323d\\u0323ha */
2707         "\\u091F\\u094D\\u092F",          /* t\\u0323ya  */
2708         "\\u0920\\u094D\\u092F",          /* t\\u0323hya */
2709         "\\u0921\\u094D\\u092F",          /* d\\u0323ya  */
2710         "\\u0922\\u094D\\u092F",          /* d\\u0323hya */
2711      // "hma",                         /* hma         */
2712         "\\u0939\\u094D\\u092F",          /* hya         */
2713         "\\u0936\\u0943",                 /* s\\u0301r\\u0325a  */
2714         "\\u0936\\u094D\\u091A",          /* s\\u0301ca  */
2715         "\\u090d",                        /* e\\u0306    */
2716         "\\u0938\\u0902\\u091C\\u0940\\u092C\\u094D \\u0938\\u0947\\u0928\\u0917\\u0941\\u092A\\u094D\\u0924",
2717         "\\u0906\\u0928\\u0902\\u0926\\u094D \\u0935\\u0926\\u094D\\u0926\\u093F\\u0930\\u093E\\u091C\\u0941",
2718         "\\u0906",
2719         "\\u0905",
2720     };
2721     UErrorCode status = U_ZERO_ERROR;
2722     UParseError parseError;
2723     UnicodeString message;
2724     Transliterator* latinToDev=Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2725     Transliterator* devToLatin=Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2726     if(U_FAILURE(status)){
2727         dataerrln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2728         dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2729         return;
2730     }
2731     UnicodeString gotResult;
2732     for(int i= 0; i<MAX_LEN; i++){
2733         gotResult = source[i];
2734         expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2735         expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2736     }
2737     delete latinToDev;
2738     delete devToLatin;
2739 }
2740 
TestTeluguLatinRT()2741 void TransliteratorTest::TestTeluguLatinRT(){
2742     const int MAX_LEN=10;
2743     const char* const source[MAX_LEN] = {
2744         "raghur\\u0101m vi\\u015Bvan\\u0101dha",                         /* Raghuram Viswanadha    */
2745         "\\u0101nand vaddir\\u0101ju",                                   /* Anand Vaddiraju        */
2746         "r\\u0101j\\u012Bv ka\\u015Barab\\u0101da",                      /* Rajeev Kasarabada      */
2747         "san\\u0304j\\u012Bv ka\\u015Barab\\u0101da",                    /* sanjeev kasarabada     */
2748         "san\\u0304j\\u012Bb sen'gupta",                                 /* sanjib sengupata       */
2749         "amar\\u0113ndra hanum\\u0101nula",                              /* Amarendra hanumanula   */
2750         "ravi kum\\u0101r vi\\u015Bvan\\u0101dha",                       /* Ravi Kumar Viswanadha  */
2751         "\\u0101ditya kandr\\u0113gula",                                 /* Aditya Kandregula      */
2752         "\\u015Br\\u012Bdhar ka\\u1E47\\u1E6Dama\\u015Be\\u1E6D\\u1E6Di",/* Shridhar Kantamsetty   */
2753         "m\\u0101dhav de\\u015Be\\u1E6D\\u1E6Di"                         /* Madhav Desetty         */
2754     };
2755 
2756     const char* const expected[MAX_LEN] = {
2757         "\\u0c30\\u0c18\\u0c41\\u0c30\\u0c3e\\u0c2e\\u0c4d \\u0c35\\u0c3f\\u0c36\\u0c4d\\u0c35\\u0c28\\u0c3e\\u0c27",
2758         "\\u0c06\\u0c28\\u0c02\\u0c26\\u0c4d \\u0C35\\u0C26\\u0C4D\\u0C26\\u0C3F\\u0C30\\u0C3E\\u0C1C\\u0C41",
2759         "\\u0c30\\u0c3e\\u0c1c\\u0c40\\u0c35\\u0c4d \\u0c15\\u0c36\\u0c30\\u0c2c\\u0c3e\\u0c26",
2760         "\\u0c38\\u0c02\\u0c1c\\u0c40\\u0c35\\u0c4d \\u0c15\\u0c36\\u0c30\\u0c2c\\u0c3e\\u0c26",
2761         "\\u0c38\\u0c02\\u0c1c\\u0c40\\u0c2c\\u0c4d \\u0c38\\u0c46\\u0c28\\u0c4d\\u0c17\\u0c41\\u0c2a\\u0c4d\\u0c24",
2762         "\\u0c05\\u0c2e\\u0c30\\u0c47\\u0c02\\u0c26\\u0c4d\\u0c30 \\u0c39\\u0c28\\u0c41\\u0c2e\\u0c3e\\u0c28\\u0c41\\u0c32",
2763         "\\u0c30\\u0c35\\u0c3f \\u0c15\\u0c41\\u0c2e\\u0c3e\\u0c30\\u0c4d \\u0c35\\u0c3f\\u0c36\\u0c4d\\u0c35\\u0c28\\u0c3e\\u0c27",
2764         "\\u0c06\\u0c26\\u0c3f\\u0c24\\u0c4d\\u0c2f \\u0C15\\u0C02\\u0C26\\u0C4D\\u0C30\\u0C47\\u0C17\\u0C41\\u0c32",
2765         "\\u0c36\\u0c4d\\u0c30\\u0c40\\u0C27\\u0C30\\u0C4D \\u0c15\\u0c02\\u0c1f\\u0c2e\\u0c36\\u0c46\\u0c1f\\u0c4d\\u0c1f\\u0c3f",
2766         "\\u0c2e\\u0c3e\\u0c27\\u0c35\\u0c4d \\u0c26\\u0c46\\u0c36\\u0c46\\u0c1f\\u0c4d\\u0c1f\\u0c3f",
2767     };
2768 
2769     UErrorCode status = U_ZERO_ERROR;
2770     UParseError parseError;
2771     UnicodeString message;
2772     Transliterator* latinToDev=Transliterator::createInstance("Latin-Telugu", UTRANS_FORWARD, parseError, status);
2773     Transliterator* devToLatin=Transliterator::createInstance("Telugu-Latin", UTRANS_FORWARD, parseError, status);
2774     if(U_FAILURE(status)){
2775         dataerrln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2776         dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2777         return;
2778     }
2779     UnicodeString gotResult;
2780     for(int i= 0; i<MAX_LEN; i++){
2781         gotResult = source[i];
2782         expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2783         expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2784     }
2785     delete latinToDev;
2786     delete devToLatin;
2787 }
2788 
TestSanskritLatinRT()2789 void TransliteratorTest::TestSanskritLatinRT(){
2790     const int MAX_LEN =16;
2791     const char* const source[MAX_LEN] = {
2792         "rmk\\u1E63\\u0113t",
2793         "\\u015Br\\u012Bmad",
2794         "bhagavadg\\u012Bt\\u0101",
2795         "adhy\\u0101ya",
2796         "arjuna",
2797         "vi\\u1E63\\u0101da",
2798         "y\\u014Dga",
2799         "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2800         "uv\\u0101cr\\u0325",
2801         "dharmak\\u1E63\\u0113tr\\u0113",
2802         "kuruk\\u1E63\\u0113tr\\u0113",
2803         "samav\\u0113t\\u0101",
2804         "yuyutsava\\u1E25",
2805         "m\\u0101mak\\u0101\\u1E25",
2806     // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2807         "kimakurvata",
2808         "san\\u0304java",
2809     };
2810     const char* const expected[MAX_LEN] = {
2811         "\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D",
2812         "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2813         "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2814         "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2815         "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2816         "\\u0935\\u093f\\u0937\\u093e\\u0926",
2817         "\\u092f\\u094b\\u0917",
2818         "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2819         "\\u0909\\u0935\\u093E\\u091A\\u0943",
2820         "\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2821         "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2822         "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2823         "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2824         "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2825     //"\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2826         "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2827         "\\u0938\\u0902\\u091c\\u0935",
2828     };
2829     UErrorCode status = U_ZERO_ERROR;
2830     UParseError parseError;
2831     UnicodeString message;
2832     Transliterator* latinToDev=Transliterator::createInstance("Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2833     Transliterator* devToLatin=Transliterator::createInstance("Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2834     if(U_FAILURE(status)){
2835         dataerrln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2836         dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2837         return;
2838     }
2839     UnicodeString gotResult;
2840     for(int i= 0; i<MAX_LEN; i++){
2841         gotResult = source[i];
2842         expect(*latinToDev,CharsToUnicodeString(source[i]),CharsToUnicodeString(expected[i]));
2843         expect(*devToLatin,CharsToUnicodeString(expected[i]),CharsToUnicodeString(source[i]));
2844     }
2845     delete latinToDev;
2846     delete devToLatin;
2847 }
2848 
2849 
TestCompoundLatinRT()2850 void TransliteratorTest::TestCompoundLatinRT(){
2851     const char* const source[] = {
2852         "rmk\\u1E63\\u0113t",
2853         "\\u015Br\\u012Bmad",
2854         "bhagavadg\\u012Bt\\u0101",
2855         "adhy\\u0101ya",
2856         "arjuna",
2857         "vi\\u1E63\\u0101da",
2858         "y\\u014Dga",
2859         "dhr\\u0325tar\\u0101\\u1E63\\u1E6Dra",
2860         "uv\\u0101cr\\u0325",
2861         "dharmak\\u1E63\\u0113tr\\u0113",
2862         "kuruk\\u1E63\\u0113tr\\u0113",
2863         "samav\\u0113t\\u0101",
2864         "yuyutsava\\u1E25",
2865         "m\\u0101mak\\u0101\\u1E25",
2866      // "p\\u0101\\u1E47\\u1E0Dav\\u0101\\u015Bcaiva",
2867         "kimakurvata",
2868         "san\\u0304java"
2869     };
2870     const int MAX_LEN = UPRV_LENGTHOF(source);
2871     const char* const expected[MAX_LEN] = {
2872         "\\u0930\\u094D\\u092E\\u094D\\u0915\\u094D\\u0937\\u0947\\u0924\\u094D",
2873         "\\u0936\\u094d\\u0930\\u0940\\u092e\\u0926\\u094d",
2874         "\\u092d\\u0917\\u0935\\u0926\\u094d\\u0917\\u0940\\u0924\\u093e",
2875         "\\u0905\\u0927\\u094d\\u092f\\u093e\\u092f",
2876         "\\u0905\\u0930\\u094d\\u091c\\u0941\\u0928",
2877         "\\u0935\\u093f\\u0937\\u093e\\u0926",
2878         "\\u092f\\u094b\\u0917",
2879         "\\u0927\\u0943\\u0924\\u0930\\u093e\\u0937\\u094d\\u091f\\u094d\\u0930",
2880         "\\u0909\\u0935\\u093E\\u091A\\u0943",
2881         "\\u0927\\u0930\\u094d\\u092e\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2882         "\\u0915\\u0941\\u0930\\u0941\\u0915\\u094d\\u0937\\u0947\\u0924\\u094d\\u0930\\u0947",
2883         "\\u0938\\u092e\\u0935\\u0947\\u0924\\u093e",
2884         "\\u092f\\u0941\\u092f\\u0941\\u0924\\u094d\\u0938\\u0935\\u0903",
2885         "\\u092e\\u093e\\u092e\\u0915\\u093e\\u0903",
2886     //  "\\u092a\\u093e\\u0923\\u094d\\u0921\\u0935\\u093e\\u0936\\u094d\\u091a\\u0948\\u0935",
2887         "\\u0915\\u093f\\u092e\\u0915\\u0941\\u0930\\u094d\\u0935\\u0924",
2888         "\\u0938\\u0902\\u091c\\u0935"
2889     };
2890     if(MAX_LEN != UPRV_LENGTHOF(expected)) {
2891         errln("error in TestCompoundLatinRT: source[] and expected[] have different lengths!");
2892         return;
2893     }
2894 
2895     UErrorCode status = U_ZERO_ERROR;
2896     UParseError parseError;
2897     UnicodeString message;
2898     Transliterator* devToLatinToDev  =Transliterator::createInstance("Devanagari-Latin;Latin-Devanagari", UTRANS_FORWARD, parseError, status);
2899     Transliterator* latinToDevToLatin=Transliterator::createInstance("Latin-Devanagari;Devanagari-Latin", UTRANS_FORWARD, parseError, status);
2900     Transliterator* devToTelToDev    =Transliterator::createInstance("Devanagari-Telugu;Telugu-Devanagari", UTRANS_FORWARD, parseError, status);
2901     Transliterator* latinToTelToLatin=Transliterator::createInstance("Latin-Telugu;Telugu-Latin", UTRANS_FORWARD, parseError, status);
2902 
2903     if(U_FAILURE(status)){
2904         dataerrln("FAIL: construction " +   UnicodeString(" Error: ") + u_errorName(status));
2905         dataerrln("PreContext: " + prettify(parseError.preContext) + " PostContext: " + prettify( parseError.postContext) );
2906         return;
2907     }
2908     UnicodeString gotResult;
2909     for(int i= 0; i<MAX_LEN; i++){
2910         gotResult = source[i];
2911         expect(*devToLatinToDev,CharsToUnicodeString(expected[i]),CharsToUnicodeString(expected[i]));
2912         expect(*latinToDevToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2913         expect(*latinToTelToLatin,CharsToUnicodeString(source[i]),CharsToUnicodeString(source[i]));
2914 
2915     }
2916     delete(latinToDevToLatin);
2917     delete(devToLatinToDev);
2918     delete(devToTelToDev);
2919     delete(latinToTelToLatin);
2920 }
2921 
2922 /**
2923  * Test Gurmukhi-Devanagari Tippi and Bindi
2924  */
TestGurmukhiDevanagari()2925 void TransliteratorTest::TestGurmukhiDevanagari(){
2926     // the rule says:
2927     // (\u0902) (when preceded by vowel)      --->  (\u0A02)
2928     // (\u0902) (when preceded by consonant)  --->  (\u0A70)
2929     UErrorCode status = U_ZERO_ERROR;
2930     UnicodeSet vowel(UnicodeString("[\\u0905-\\u090A \\u090F\\u0910\\u0913\\u0914 \\u093e-\\u0942\\u0947\\u0948\\u094B\\u094C\\u094D]", -1, US_INV).unescape(), status);
2931     UnicodeSet non_vowel(UnicodeString("[\\u0915-\\u0928\\u092A-\\u0930]", -1, US_INV).unescape(), status);
2932     UParseError parseError;
2933 
2934     UnicodeSetIterator vIter(vowel);
2935     UnicodeSetIterator nvIter(non_vowel);
2936     Transliterator* trans = Transliterator::createInstance("Devanagari-Gurmukhi",UTRANS_FORWARD, parseError, status);
2937     if(U_FAILURE(status)) {
2938       dataerrln("Error creating transliterator %s", u_errorName(status));
2939       delete trans;
2940       return;
2941     }
2942     UnicodeString src (" \\u0902", -1, US_INV);
2943     UnicodeString expected(" \\u0A02", -1, US_INV);
2944     src = src.unescape();
2945     expected= expected.unescape();
2946 
2947     while(vIter.next()){
2948         src.setCharAt(0,(UChar) vIter.getCodepoint());
2949         expected.setCharAt(0,(UChar) (vIter.getCodepoint()+0x0100));
2950         expect(*trans,src,expected);
2951     }
2952 
2953     expected.setCharAt(1,0x0A70);
2954     while(nvIter.next()){
2955         //src.setCharAt(0,(char) nvIter.codepoint);
2956         src.setCharAt(0,(UChar)nvIter.getCodepoint());
2957         expected.setCharAt(0,(UChar) (nvIter.getCodepoint()+0x0100));
2958         expect(*trans,src,expected);
2959     }
2960     delete trans;
2961 }
2962 /**
2963  * Test instantiation from a locale.
2964  */
TestLocaleInstantiation(void)2965 void TransliteratorTest::TestLocaleInstantiation(void) {
2966     UParseError pe;
2967     UErrorCode ec = U_ZERO_ERROR;
2968     Transliterator *t = Transliterator::createInstance("ru_RU-Latin", UTRANS_FORWARD, pe, ec);
2969     if (U_FAILURE(ec)) {
2970         dataerrln("FAIL: createInstance(ru_RU-Latin) - %s", u_errorName(ec));
2971         delete t;
2972         return;
2973     }
2974     expect(*t, CharsToUnicodeString("\\u0430"), "a");
2975     delete t;
2976 
2977     t = Transliterator::createInstance("en-el", UTRANS_FORWARD, pe, ec);
2978     if (U_FAILURE(ec)) {
2979         errln("FAIL: createInstance(en-el)");
2980         delete t;
2981         return;
2982     }
2983     expect(*t, "a", CharsToUnicodeString("\\u03B1"));
2984     delete t;
2985 }
2986 
2987 /**
2988  * Test title case handling of accent (should ignore accents)
2989  */
TestTitleAccents(void)2990 void TransliteratorTest::TestTitleAccents(void) {
2991     UParseError pe;
2992     UErrorCode ec = U_ZERO_ERROR;
2993     Transliterator *t = Transliterator::createInstance("Title", UTRANS_FORWARD, pe, ec);
2994     if (U_FAILURE(ec)) {
2995         errln("FAIL: createInstance(Title)");
2996         delete t;
2997         return;
2998     }
2999     expect(*t, CharsToUnicodeString("a\\u0300b can't abe"), CharsToUnicodeString("A\\u0300b Can't Abe"));
3000     delete t;
3001 }
3002 
3003 /**
3004  * Basic test of a locale resource based rule.
3005  */
TestLocaleResource()3006 void TransliteratorTest::TestLocaleResource() {
3007     const char* DATA[] = {
3008         // id                    from               to
3009         //"Latin-Greek/UNGEGN",    "b",               "\\u03bc\\u03c0",
3010         "Latin-el",              "b",               "\\u03bc\\u03c0",
3011         "Latin-Greek",           "b",               "\\u03B2",
3012         "Greek-Latin/UNGEGN",    "\\u03B2",         "v",
3013         "el-Latin",              "\\u03B2",         "v",
3014         "Greek-Latin",           "\\u03B2",         "b",
3015     };
3016     const int32_t DATA_length = UPRV_LENGTHOF(DATA);
3017     for (int32_t i=0; i<DATA_length; i+=3) {
3018         UParseError pe;
3019         UErrorCode ec = U_ZERO_ERROR;
3020         Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_FORWARD, pe, ec);
3021         if (U_FAILURE(ec)) {
3022             dataerrln((UnicodeString)"FAIL: createInstance(" + DATA[i] + ") - " + u_errorName(ec));
3023             delete t;
3024             continue;
3025         }
3026         expect(*t, CharsToUnicodeString(DATA[i+1]),
3027                CharsToUnicodeString(DATA[i+2]));
3028         delete t;
3029     }
3030 }
3031 
3032 /**
3033  * Make sure parse errors reference the right line.
3034  */
TestParseError()3035 void TransliteratorTest::TestParseError() {
3036     static const char* rule =
3037         "a > b;\n"
3038         "# more stuff\n"
3039         "d << b;";
3040     UErrorCode ec = U_ZERO_ERROR;
3041     UParseError pe;
3042     Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3043     delete t;
3044     if (U_FAILURE(ec)) {
3045         UnicodeString err(pe.preContext);
3046         err.append((UChar)124/*|*/).append(pe.postContext);
3047         if (err.indexOf("d << b") >= 0) {
3048             logln("Ok: " + err);
3049         } else {
3050             errln("FAIL: " + err);
3051         }
3052     }
3053     else {
3054         errln("FAIL: no syntax error");
3055     }
3056     static const char* maskingRule =
3057         "a>x;\n"
3058         "# more stuff\n"
3059         "ab>y;";
3060     ec = U_ZERO_ERROR;
3061     delete Transliterator::createFromRules("ID", maskingRule, UTRANS_FORWARD, pe, ec);
3062     if (ec != U_RULE_MASK_ERROR) {
3063         errln("FAIL: returned %s instead of U_RULE_MASK_ERROR", u_errorName(ec));
3064     }
3065     else if (UnicodeString("a > x;") != UnicodeString(pe.preContext)) {
3066         errln("FAIL: did not get expected precontext");
3067     }
3068     else if (UnicodeString("ab > y;") != UnicodeString(pe.postContext)) {
3069         errln("FAIL: did not get expected postcontext");
3070     }
3071 }
3072 
3073 /**
3074  * Make sure sets on output are disallowed.
3075  */
TestOutputSet()3076 void TransliteratorTest::TestOutputSet() {
3077     UnicodeString rule = "$set = [a-cm-n]; b > $set;";
3078     UErrorCode ec = U_ZERO_ERROR;
3079     UParseError pe;
3080     Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3081     delete t;
3082     if (U_FAILURE(ec)) {
3083         UnicodeString err(pe.preContext);
3084         err.append((UChar)124/*|*/).append(pe.postContext);
3085         logln("Ok: " + err);
3086         return;
3087     }
3088     errln("FAIL: No syntax error");
3089 }
3090 
3091 /**
3092  * Test the use variable range pragma, making sure that use of
3093  * variable range characters is detected and flagged as an error.
3094  */
TestVariableRange()3095 void TransliteratorTest::TestVariableRange() {
3096     UnicodeString rule = "use variable range 0x70 0x72; a > A; b > B; q > Q;";
3097     UErrorCode ec = U_ZERO_ERROR;
3098     UParseError pe;
3099     Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3100     delete t;
3101     if (U_FAILURE(ec)) {
3102         UnicodeString err(pe.preContext);
3103         err.append((UChar)124/*|*/).append(pe.postContext);
3104         logln("Ok: " + err);
3105         return;
3106     }
3107     errln("FAIL: No syntax error");
3108 }
3109 
3110 /**
3111  * Test invalid post context error handling
3112  */
TestInvalidPostContext()3113 void TransliteratorTest::TestInvalidPostContext() {
3114     UnicodeString rule = "a}b{c>d;";
3115     UErrorCode ec = U_ZERO_ERROR;
3116     UParseError pe;
3117     Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
3118     delete t;
3119     if (U_FAILURE(ec)) {
3120         UnicodeString err(pe.preContext);
3121         err.append((UChar)124/*|*/).append(pe.postContext);
3122         if (err.indexOf("a}b{c") >= 0) {
3123             logln("Ok: " + err);
3124         } else {
3125             errln("FAIL: " + err);
3126         }
3127         return;
3128     }
3129     errln("FAIL: No syntax error");
3130 }
3131 
3132 /**
3133  * Test ID form variants
3134  */
TestIDForms()3135 void TransliteratorTest::TestIDForms() {
3136     const char* DATA[] = {
3137         "NFC", NULL, "NFD",
3138         "nfd", NULL, "NFC", // make sure case is ignored
3139         "Any-NFKD", NULL, "Any-NFKC",
3140         "Null", NULL, "Null",
3141         "-nfkc", "nfkc", "NFKD",
3142         "-nfkc/", "nfkc", "NFKD",
3143         "Latin-Greek/UNGEGN", NULL, "Greek-Latin/UNGEGN",
3144         "Greek/UNGEGN-Latin", "Greek-Latin/UNGEGN", "Latin-Greek/UNGEGN",
3145         "Bengali-Devanagari/", "Bengali-Devanagari", "Devanagari-Bengali",
3146         "Source-", NULL, NULL,
3147         "Source/Variant-", NULL, NULL,
3148         "Source-/Variant", NULL, NULL,
3149         "/Variant", NULL, NULL,
3150         "/Variant-", NULL, NULL,
3151         "-/Variant", NULL, NULL,
3152         "-/", NULL, NULL,
3153         "-", NULL, NULL,
3154         "/", NULL, NULL,
3155     };
3156     const int32_t DATA_length = UPRV_LENGTHOF(DATA);
3157 
3158     for (int32_t i=0; i<DATA_length; i+=3) {
3159         const char* ID = DATA[i];
3160         const char* expID = DATA[i+1];
3161         const char* expInvID = DATA[i+2];
3162         UBool expValid = (expInvID != NULL);
3163         if (expID == NULL) {
3164             expID = ID;
3165         }
3166         UParseError pe;
3167         UErrorCode ec = U_ZERO_ERROR;
3168         Transliterator *t =
3169             Transliterator::createInstance(ID, UTRANS_FORWARD, pe, ec);
3170         if (U_FAILURE(ec)) {
3171             if (!expValid) {
3172                 logln((UnicodeString)"Ok: getInstance(" + ID +") => " + u_errorName(ec));
3173             } else {
3174                 dataerrln((UnicodeString)"FAIL: Couldn't create " + ID + " - " + u_errorName(ec));
3175             }
3176             delete t;
3177             continue;
3178         }
3179         Transliterator *u = t->createInverse(ec);
3180         if (U_FAILURE(ec)) {
3181             errln((UnicodeString)"FAIL: Couldn't create inverse of " + ID);
3182             delete t;
3183             delete u;
3184             continue;
3185         }
3186         if (t->getID() == expID &&
3187             u->getID() == expInvID) {
3188             logln((UnicodeString)"Ok: " + ID + ".getInverse() => " + expInvID);
3189         } else {
3190             errln((UnicodeString)"FAIL: getInstance(" + ID + ") => " +
3191                   t->getID() + " x getInverse() => " + u->getID() +
3192                   ", expected " + expInvID);
3193         }
3194         delete t;
3195         delete u;
3196     }
3197 }
3198 
3199 static const UChar SPACE[]   = {32,0};
3200 static const UChar NEWLINE[] = {10,0};
3201 static const UChar RETURN[]  = {13,0};
3202 static const UChar EMPTY[]   = {0};
3203 
checkRules(const UnicodeString & label,Transliterator & t2,const UnicodeString & testRulesForward)3204 void TransliteratorTest::checkRules(const UnicodeString& label, Transliterator& t2,
3205                                     const UnicodeString& testRulesForward) {
3206     UnicodeString rules2; t2.toRules(rules2, TRUE);
3207     //rules2 = TestUtility.replaceAll(rules2, new UnicodeSet("[' '\n\r]"), "");
3208     rules2.findAndReplace(SPACE, EMPTY);
3209     rules2.findAndReplace(NEWLINE, EMPTY);
3210     rules2.findAndReplace(RETURN, EMPTY);
3211 
3212     UnicodeString testRules(testRulesForward); testRules.findAndReplace(SPACE, EMPTY);
3213 
3214     if (rules2 != testRules) {
3215         errln(label);
3216         logln((UnicodeString)"GENERATED RULES: " + rules2);
3217         logln((UnicodeString)"SHOULD BE:       " + testRulesForward);
3218     }
3219 }
3220 
3221 /**
3222  * Mark's toRules test.
3223  */
TestToRulesMark()3224 void TransliteratorTest::TestToRulesMark() {
3225     const char* testRules =
3226         "::[[:Latin:][:Mark:]];"
3227         "::NFKD (NFC);"
3228         "::Lower (Lower);"
3229         "a <> \\u03B1;" // alpha
3230         "::NFKC (NFD);"
3231         "::Upper (Lower);"
3232         "::Lower ();"
3233         "::([[:Greek:][:Mark:]]);"
3234         ;
3235     const char* testRulesForward =
3236         "::[[:Latin:][:Mark:]];"
3237         "::NFKD(NFC);"
3238         "::Lower(Lower);"
3239         "a > \\u03B1;"
3240         "::NFKC(NFD);"
3241         "::Upper (Lower);"
3242         "::Lower ();"
3243         ;
3244     const char* testRulesBackward =
3245         "::[[:Greek:][:Mark:]];"
3246         "::Lower (Upper);"
3247         "::NFD(NFKC);"
3248         "\\u03B1 > a;"
3249         "::Lower(Lower);"
3250         "::NFC(NFKD);"
3251         ;
3252     UnicodeString source = CharsToUnicodeString("\\u00E1"); // a-acute
3253     UnicodeString target = CharsToUnicodeString("\\u03AC"); // alpha-acute
3254 
3255     UParseError pe;
3256     UErrorCode ec = U_ZERO_ERROR;
3257     LocalPointer<Transliterator> t2(
3258             Transliterator::createFromRules("source-target", UnicodeString(testRules, -1, US_INV), UTRANS_FORWARD, pe, ec));
3259     LocalPointer<Transliterator> t3(
3260             Transliterator::createFromRules("target-source", UnicodeString(testRules, -1, US_INV), UTRANS_REVERSE, pe, ec));
3261 
3262     if (U_FAILURE(ec)) {
3263         dataerrln((UnicodeString)"FAIL: createFromRules => " + u_errorName(ec));
3264         return;
3265     }
3266 
3267     expect(*t2, source, target);
3268     expect(*t3, target, source);
3269 
3270     checkRules("Failed toRules FORWARD", *t2, UnicodeString(testRulesForward, -1, US_INV));
3271     checkRules("Failed toRules BACKWARD", *t3, UnicodeString(testRulesBackward, -1, US_INV));
3272 }
3273 
3274 /**
3275  * Test Escape and Unescape transliterators.
3276  */
TestEscape()3277 void TransliteratorTest::TestEscape() {
3278     UParseError pe;
3279     UErrorCode ec;
3280     Transliterator *t;
3281 
3282     ec = U_ZERO_ERROR;
3283     t = Transliterator::createInstance("Hex-Any", UTRANS_FORWARD, pe, ec);
3284     if (U_FAILURE(ec)) {
3285         errln((UnicodeString)"FAIL: createInstance");
3286     } else {
3287         expect(*t,
3288                UNICODE_STRING_SIMPLE("\\x{40}\\U00000031&#x32;&#81;"),
3289                "@12Q");
3290     }
3291     delete t;
3292 
3293     ec = U_ZERO_ERROR;
3294     t = Transliterator::createInstance("Any-Hex/C", UTRANS_FORWARD, pe, ec);
3295     if (U_FAILURE(ec)) {
3296         errln((UnicodeString)"FAIL: createInstance");
3297     } else {
3298         expect(*t,
3299                CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3300                UNICODE_STRING_SIMPLE("\\u0041\\U0010BEEF\\uFEED"));
3301     }
3302     delete t;
3303 
3304     ec = U_ZERO_ERROR;
3305     t = Transliterator::createInstance("Any-Hex/Java", UTRANS_FORWARD, pe, ec);
3306     if (U_FAILURE(ec)) {
3307         errln((UnicodeString)"FAIL: createInstance");
3308     } else {
3309         expect(*t,
3310                CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3311                UNICODE_STRING_SIMPLE("\\u0041\\uDBEF\\uDEEF\\uFEED"));
3312     }
3313     delete t;
3314 
3315     ec = U_ZERO_ERROR;
3316     t = Transliterator::createInstance("Any-Hex/Perl", UTRANS_FORWARD, pe, ec);
3317     if (U_FAILURE(ec)) {
3318         errln((UnicodeString)"FAIL: createInstance");
3319     } else {
3320         expect(*t,
3321                CharsToUnicodeString("A\\U0010BEEF\\uFEED"),
3322                UNICODE_STRING_SIMPLE("\\x{41}\\x{10BEEF}\\x{FEED}"));
3323     }
3324     delete t;
3325 }
3326 
3327 
TestAnchorMasking()3328 void TransliteratorTest::TestAnchorMasking(){
3329     UnicodeString rule ("^a > Q; a > q;");
3330     UErrorCode status= U_ZERO_ERROR;
3331     UParseError parseError;
3332 
3333     Transliterator* t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD,parseError,status);
3334     if(U_FAILURE(status)){
3335         errln(UnicodeString("FAIL: ") + "ID" +
3336               ".createFromRules() => bad rules" +
3337               /*", parse error " + parseError.code +*/
3338               ", line " + parseError.line +
3339               ", offset " + parseError.offset +
3340               ", context " + prettify(parseError.preContext, TRUE) +
3341               ", rules: " + prettify(rule, TRUE));
3342     }
3343     delete t;
3344 }
3345 
3346 /**
3347  * Make sure display names of variants look reasonable.
3348  */
TestDisplayName()3349 void TransliteratorTest::TestDisplayName() {
3350 #if UCONFIG_NO_FORMATTING
3351     logln("Skipping, UCONFIG_NO_FORMATTING is set\n");
3352     return;
3353 #else
3354     static const char* DATA[] = {
3355         // ID, forward name, reverse name
3356         // Update the text as necessary -- the important thing is
3357         // not the text itself, but how various cases are handled.
3358 
3359         // Basic test
3360         "Any-Hex", "Any to Hex Escape", "Hex Escape to Any",
3361 
3362         // Variants
3363         "Any-Hex/Perl", "Any to Hex Escape/Perl", "Hex Escape to Any/Perl",
3364 
3365         // Target-only IDs
3366         "NFC", "Any to NFC", "Any to NFD",
3367     };
3368 
3369     int32_t DATA_length = UPRV_LENGTHOF(DATA);
3370 
3371     Locale US("en", "US");
3372 
3373     for (int32_t i=0; i<DATA_length; i+=3) {
3374         UnicodeString name;
3375         Transliterator::getDisplayName(DATA[i], US, name);
3376         if (name != DATA[i+1]) {
3377             dataerrln((UnicodeString)"FAIL: " + DATA[i] + ".getDisplayName() => " +
3378                   name + ", expected " + DATA[i+1]);
3379         } else {
3380             logln((UnicodeString)"Ok: " + DATA[i] + ".getDisplayName() => " + name);
3381         }
3382         UErrorCode ec = U_ZERO_ERROR;
3383         UParseError pe;
3384         Transliterator *t = Transliterator::createInstance(DATA[i], UTRANS_REVERSE, pe, ec);
3385         if (U_FAILURE(ec)) {
3386             delete t;
3387             dataerrln("FAIL: createInstance failed - %s", u_errorName(ec));
3388             continue;
3389         }
3390         name = Transliterator::getDisplayName(t->getID(), US, name);
3391         if (name != DATA[i+2]) {
3392             dataerrln((UnicodeString)"FAIL: " + t->getID() + ".getDisplayName() => " +
3393                   name + ", expected " + DATA[i+2]);
3394         } else {
3395             logln((UnicodeString)"Ok: " + t->getID() + ".getDisplayName() => " + name);
3396         }
3397         delete t;
3398     }
3399 #endif
3400 }
3401 
TestSpecialCases(void)3402 void TransliteratorTest::TestSpecialCases(void) {
3403     const UnicodeString registerRules[] = {
3404         "Any-Dev1", "x > X; y > Y;",
3405         "Any-Dev2", "XY > Z",
3406         "Greek-Latin/FAKE",
3407             CharsToUnicodeString
3408             ("[^[:L:][:M:]] { \\u03bc\\u03c0 > b ; \\u03bc\\u03c0 } [^[:L:][:M:]] > b ; [^[:L:][:M:]] { [\\u039c\\u03bc][\\u03a0\\u03c0] > B ; [\\u039c\\u03bc][\\u03a0\\u03c0] } [^[:L:][:M:]] > B ;"),
3409         "" // END MARKER
3410     };
3411 
3412     const UnicodeString testCases[] = {
3413         // NORMALIZATION
3414         // should add more test cases
3415         "NFD" , CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3416         "NFC" , CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3417         "NFKD", CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3418         "NFKC", CharsToUnicodeString("a\\u0300 \\u00E0 \\u1100\\u1161 \\uFF76\\uFF9E\\u03D3"), "",
3419 
3420         // mp -> b BUG
3421         "Greek-Latin/UNGEGN", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
3422         "Greek-Latin/FAKE", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
3423 
3424         // check for devanagari bug
3425         "nfd;Dev1;Dev2;nfc", "xy", "Z",
3426 
3427         // ff, i, dotless-i, I, dotted-I, LJLjlj deseret deeDEE
3428         "Title", CharsToUnicodeString("ab'cD ffi\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3429                  CharsToUnicodeString("Ab'cd Ffi\\u0131ii\\u0307 \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
3430 
3431         //TODO: enable this test once Titlecase works right
3432         /*
3433         "Title", CharsToUnicodeString("\\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3434                  CharsToUnicodeString("Ffi\\u0131ii \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
3435                  */
3436         "Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3437                  CharsToUnicodeString("AB'CD FFIII\\u0130 \\u01C7\\u01C7\\u01C7 ") + DESERET_DEE + DESERET_DEE,
3438         "Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
3439                  CharsToUnicodeString("ab'cd \\uFB00i\\u0131ii\\u0307 \\u01C9\\u01C9\\u01C9 ") + DESERET_dee + DESERET_dee,
3440 
3441         "Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, "",
3442         "Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, "",
3443 
3444          // FORMS OF S
3445         "Greek-Latin/UNGEGN",  CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3446                                CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
3447         "Latin-Greek/UNGEGN",  CharsToUnicodeString("s ss s\\u0331s\\u0331"),
3448                                CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3") ,
3449         "Greek-Latin",  CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3450                         CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
3451         "Latin-Greek",  CharsToUnicodeString("s ss s\\u0331s\\u0331"),
3452                         CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
3453         // Tatiana bug
3454         // Upper: TAT\\u02B9\\u00C2NA
3455         // Lower: tat\\u02B9\\u00E2na
3456         // Title: Tat\\u02B9\\u00E2na
3457         "Upper", CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3458                  CharsToUnicodeString("TAT\\u02B9\\u00C2NA"),
3459         "Lower", CharsToUnicodeString("TAT\\u02B9\\u00C2NA"),
3460                  CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3461         "Title", CharsToUnicodeString("tat\\u02B9\\u00E2na"),
3462                  CharsToUnicodeString("Tat\\u02B9\\u00E2na"),
3463 
3464         "" // END MARKER
3465     };
3466 
3467     UParseError pos;
3468     int32_t i;
3469     for (i = 0; registerRules[i].length()!=0; i+=2) {
3470         UErrorCode status = U_ZERO_ERROR;
3471 
3472         Transliterator *t = Transliterator::createFromRules(registerRules[0+i],
3473             registerRules[i+1], UTRANS_FORWARD, pos, status);
3474         if (U_FAILURE(status)) {
3475             dataerrln("Fails: Unable to create the transliterator from rules. - %s", u_errorName(status));
3476         } else {
3477             Transliterator::registerInstance(t);
3478         }
3479     }
3480     for (i = 0; testCases[i].length()!=0; i+=3) {
3481         UErrorCode ec = U_ZERO_ERROR;
3482         UParseError pe;
3483         const UnicodeString& name = testCases[i];
3484         Transliterator *t = Transliterator::createInstance(name, UTRANS_FORWARD, pe, ec);
3485         if (U_FAILURE(ec)) {
3486             dataerrln((UnicodeString)"FAIL: Couldn't create " + name + " - " + u_errorName(ec));
3487             delete t;
3488             continue;
3489         }
3490         const UnicodeString& id = t->getID();
3491         const UnicodeString& source = testCases[i+1];
3492         UnicodeString target;
3493 
3494         // Automatic generation of targets, to make it simpler to add test cases (and more fail-safe)
3495 
3496         if (testCases[i+2].length() > 0) {
3497             target = testCases[i+2];
3498         } else if (0==id.caseCompare("NFD", U_FOLD_CASE_DEFAULT)) {
3499             Normalizer::normalize(source, UNORM_NFD, 0, target, ec);
3500         } else if (0==id.caseCompare("NFC", U_FOLD_CASE_DEFAULT)) {
3501             Normalizer::normalize(source, UNORM_NFC, 0, target, ec);
3502         } else if (0==id.caseCompare("NFKD", U_FOLD_CASE_DEFAULT)) {
3503             Normalizer::normalize(source, UNORM_NFKD, 0, target, ec);
3504         } else if (0==id.caseCompare("NFKC", U_FOLD_CASE_DEFAULT)) {
3505             Normalizer::normalize(source, UNORM_NFKC, 0, target, ec);
3506         } else if (0==id.caseCompare("Lower", U_FOLD_CASE_DEFAULT)) {
3507             target = source;
3508             target.toLower(Locale::getUS());
3509         } else if (0==id.caseCompare("Upper", U_FOLD_CASE_DEFAULT)) {
3510             target = source;
3511             target.toUpper(Locale::getUS());
3512         }
3513         if (U_FAILURE(ec)) {
3514             errln((UnicodeString)"FAIL: Internal error normalizing " + source);
3515             continue;
3516         }
3517 
3518         expect(*t, source, target);
3519         delete t;
3520     }
3521     for (i = 0; registerRules[i].length()!=0; i+=2) {
3522         Transliterator::unregister(registerRules[i]);
3523     }
3524 }
3525 
Char32ToEscapedChars(UChar32 ch,char * buffer)3526 char* Char32ToEscapedChars(UChar32 ch, char* buffer) {
3527     if (ch <= 0xFFFF) {
3528         sprintf(buffer, "\\u%04x", (int)ch);
3529     } else {
3530         sprintf(buffer, "\\U%08x", (int)ch);
3531     }
3532     return buffer;
3533 }
3534 
TestSurrogateCasing(void)3535 void TransliteratorTest::TestSurrogateCasing (void) {
3536     // check that casing handles surrogates
3537     // titlecase is currently defective
3538     char buffer[20];
3539     UChar buffer2[20];
3540     UChar32 dee;
3541     U16_GET(DESERET_dee,0, 0, DESERET_dee.length(), dee);
3542     UnicodeString DEE(u_totitle(dee));
3543     if (DEE != DESERET_DEE) {
3544         err("Fails titlecase of surrogates");
3545         err(Char32ToEscapedChars(dee, buffer));
3546         err(", ");
3547         errln(Char32ToEscapedChars(DEE.char32At(0), buffer));
3548     }
3549 
3550     UnicodeString deeDEETest=DESERET_dee + DESERET_DEE;
3551     UnicodeString deedeeTest = DESERET_dee + DESERET_dee;
3552     UnicodeString DEEDEETest = DESERET_DEE + DESERET_DEE;
3553     UErrorCode status= U_ZERO_ERROR;
3554 
3555     u_strToUpper(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), NULL, &status);
3556     if (U_FAILURE(status) || (UnicodeString(buffer2)!= DEEDEETest)) {
3557         errln("Fails: Can't uppercase surrogates.");
3558     }
3559 
3560     status= U_ZERO_ERROR;
3561     u_strToLower(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), NULL, &status);
3562     if (U_FAILURE(status) || (UnicodeString(buffer2)!= deedeeTest)) {
3563         errln("Fails: Can't lowercase surrogates.");
3564     }
3565 }
3566 
_trans(Transliterator & t,const UnicodeString & src,UnicodeString & result)3567 static void _trans(Transliterator& t, const UnicodeString& src,
3568                    UnicodeString& result) {
3569     result = src;
3570     t.transliterate(result);
3571 }
3572 
_trans(const UnicodeString & id,const UnicodeString & src,UnicodeString & result,UErrorCode ec)3573 static void _trans(const UnicodeString& id, const UnicodeString& src,
3574                    UnicodeString& result, UErrorCode ec) {
3575     UParseError pe;
3576     Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, pe, ec);
3577     if (U_SUCCESS(ec)) {
3578         _trans(*t, src, result);
3579     }
3580     delete t;
3581 }
3582 
/**
 * Looks source up in a flat key/value table.
 *
 * @param source key to search for, compared case-insensitively
 * @param pairs  alternating keys and values; the scan stops at the first
 *               empty string found in a key position
 * @return the value paired with the first matching key, or an empty
 *         string when no key matches
 */
static UnicodeString _findMatch(const UnicodeString& source,
                                       const UnicodeString* pairs) {
    for (const UnicodeString* entry = pairs; entry->length() > 0; entry += 2) {
        if (source.caseCompare(entry[0], U_FOLD_CASE_DEFAULT) == 0) {
            return entry[1];
        }
    }
    return UnicodeString();
}
3593 
3594 // Check to see that incremental gets at least part way through a reasonable string.
3595 
TestIncrementalProgress(void)3596 void TransliteratorTest::TestIncrementalProgress(void) {
3597     UErrorCode ec = U_ZERO_ERROR;
3598     UnicodeString latinTest = "The Quick Brown Fox.";
3599     UnicodeString devaTest;
3600     _trans("Latin-Devanagari", latinTest, devaTest, ec);
3601     UnicodeString kataTest;
3602     _trans("Latin-Katakana", latinTest, kataTest, ec);
3603     if (U_FAILURE(ec)) {
3604         errln("FAIL: Internal error");
3605         return;
3606     }
3607     const UnicodeString tests[] = {
3608         "Any", latinTest,
3609         "Latin", latinTest,
3610         "Halfwidth", latinTest,
3611         "Devanagari", devaTest,
3612         "Katakana", kataTest,
3613         "" // END MARKER
3614     };
3615 
3616     UnicodeString test("The Quick Brown Fox Jumped Over The Lazy Dog.");
3617     int32_t i = 0, j=0, k=0;
3618     int32_t sources = Transliterator::countAvailableSources();
3619     for (i = 0; i < sources; i++) {
3620         UnicodeString source;
3621         Transliterator::getAvailableSource(i, source);
3622         UnicodeString test = _findMatch(source, tests);
3623         if (test.length() == 0) {
3624             logln((UnicodeString)"Skipping " + source + "-X");
3625             continue;
3626         }
3627         int32_t targets = Transliterator::countAvailableTargets(source);
3628         for (j = 0; j < targets; j++) {
3629             UnicodeString target;
3630             Transliterator::getAvailableTarget(j, source, target);
3631             int32_t variants = Transliterator::countAvailableVariants(source, target);
3632             for (k =0; k< variants; k++) {
3633                 UnicodeString variant;
3634                 UParseError err;
3635                 UErrorCode status = U_ZERO_ERROR;
3636 
3637                 Transliterator::getAvailableVariant(k, source, target, variant);
3638                 UnicodeString id = source + "-" + target + "/" + variant;
3639 
3640                 Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, err, status);
3641                 if (U_FAILURE(status)) {
3642                     dataerrln((UnicodeString)"FAIL: Could not create " + id);
3643                     delete t;
3644                     continue;
3645                 }
3646                 status = U_ZERO_ERROR;
3647                 CheckIncrementalAux(t, test);
3648 
3649                 UnicodeString rev;
3650                 _trans(*t, test, rev);
3651                 Transliterator *inv = t->createInverse(status);
3652                 if (U_FAILURE(status)) {
3653                     // The following are forward-only, it is OK that creating an inverse will not work:
3654                     // 1. Devanagari-Arabic
3655                     // 2. Any-*/BGN
3656                     // 2a. Any-*/BGN_1981
3657                     // 3. Any-*/UNGEGN
3658                     // 4. Any-*/MNS
3659                     // If UCONFIG_NO_BREAK_ITERATION is on, Latin-Thai is also not expected to work.
3660                     if (    id.compare((UnicodeString)"Devanagari-Arabic/") != 0
3661                          && !(id.startsWith((UnicodeString)"Any-") &&
3662                                 (id.endsWith((UnicodeString)"/BGN") || id.endsWith((UnicodeString)"/BGN_1981") || id.endsWith((UnicodeString)"/UNGEGN") || id.endsWith((UnicodeString)"/MNS"))
3663                              )
3664 #if UCONFIG_NO_BREAK_ITERATION
3665                          && id.compare((UnicodeString)"Latin-Thai/") != 0
3666 #endif
3667                        )
3668                     {
3669                         errln((UnicodeString)"FAIL: Could not create inverse of " + id);
3670                     }
3671                     delete t;
3672                     delete inv;
3673                     continue;
3674                 }
3675                 CheckIncrementalAux(inv, rev);
3676                 delete t;
3677                 delete inv;
3678             }
3679         }
3680     }
3681 }
3682 
CheckIncrementalAux(const Transliterator * t,const UnicodeString & input)3683 void TransliteratorTest::CheckIncrementalAux(const Transliterator* t,
3684                                                       const UnicodeString& input) {
3685     UErrorCode ec = U_ZERO_ERROR;
3686     UTransPosition pos;
3687     UnicodeString test = input;
3688 
3689     pos.contextStart = 0;
3690     pos.contextLimit = input.length();
3691     pos.start = 0;
3692     pos.limit = input.length();
3693 
3694     t->transliterate(test, pos, ec);
3695     if (U_FAILURE(ec)) {
3696         errln((UnicodeString)"FAIL: transliterate() error " + u_errorName(ec));
3697         return;
3698     }
3699     UBool gotError = FALSE;
3700     (void)gotError;    // Suppress set but not used warning.
3701 
3702     // we have a few special cases. Any-Remove (pos.start = 0, but also = limit) and U+XXXXX?X?
3703 
3704     if (pos.start == 0 && pos.limit != 0 && t->getID() != "Hex-Any/Unicode") {
3705         errln((UnicodeString)"No Progress, " +
3706               t->getID() + ": " + formatInput(test, input, pos));
3707         gotError = TRUE;
3708     } else {
3709         logln((UnicodeString)"PASS Progress, " +
3710               t->getID() + ": " + formatInput(test, input, pos));
3711     }
3712     t->finishTransliteration(test, pos);
3713     if (pos.start != pos.limit) {
3714         errln((UnicodeString)"Incomplete, " +
3715               t->getID() + ": " + formatInput(test, input, pos));
3716         gotError = TRUE;
3717     }
3718 }
3719 
TestFunction()3720 void TransliteratorTest::TestFunction() {
3721     // Careful with spacing and ';' here:  Phrase this exactly
3722     // as toRules() is going to return it.  If toRules() changes
3723     // with regard to spacing or ';', then adjust this string.
3724     UnicodeString rule =
3725         "([:Lu:]) > $1 '(' &Lower( $1 ) '=' &Hex( &Any-Lower( $1 ) ) ')';";
3726 
3727     UParseError pe;
3728     UErrorCode ec = U_ZERO_ERROR;
3729     Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3730     if (t == NULL) {
3731         dataerrln("FAIL: createFromRules failed - %s", u_errorName(ec));
3732         return;
3733     }
3734 
3735     UnicodeString r;
3736     t->toRules(r, TRUE);
3737     if (r == rule) {
3738         logln((UnicodeString)"OK: toRules() => " + r);
3739     } else {
3740         errln((UnicodeString)"FAIL: toRules() => " + r +
3741               ", expected " + rule);
3742     }
3743 
3744     expect(*t, "The Quick Brown Fox",
3745            UNICODE_STRING_SIMPLE("T(t=\\u0074)he Q(q=\\u0071)uick B(b=\\u0062)rown F(f=\\u0066)ox"));
3746 
3747     delete t;
3748 }
3749 
TestInvalidBackRef(void)3750 void TransliteratorTest::TestInvalidBackRef(void) {
3751     UnicodeString rule =  ". > $1;";
3752     UnicodeString rule2 =CharsToUnicodeString("(.) <> &hex/unicode($1) &name($1); . > $1; [{}] >\\u0020;");
3753     UParseError pe;
3754     UErrorCode ec = U_ZERO_ERROR;
3755     Transliterator *t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3756     Transliterator *t2 = Transliterator::createFromRules("Test2", rule2, UTRANS_FORWARD, pe, ec);
3757 
3758     if (t != NULL) {
3759         errln("FAIL: createFromRules should have returned NULL");
3760         delete t;
3761     }
3762 
3763     if (t2 != NULL) {
3764         errln("FAIL: createFromRules should have returned NULL");
3765         delete t2;
3766     }
3767 
3768     if (U_SUCCESS(ec)) {
3769         errln("FAIL: Ok: . > $1; => no error");
3770     } else {
3771         logln((UnicodeString)"Ok: . > $1; => " + u_errorName(ec));
3772     }
3773 }
3774 
TestMulticharStringSet()3775 void TransliteratorTest::TestMulticharStringSet() {
3776     // Basic testing
3777     const char* rule =
3778         "       [{aa}]       > x;"
3779         "         a          > y;"
3780         "       [b{bc}]      > z;"
3781         "[{gd}] { e          > q;"
3782         "         e } [{fg}] > r;" ;
3783 
3784     UParseError pe;
3785     UErrorCode ec = U_ZERO_ERROR;
3786     Transliterator* t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3787     if (t == NULL || U_FAILURE(ec)) {
3788         delete t;
3789         errln("FAIL: createFromRules failed");
3790         return;
3791     }
3792 
3793     expect(*t, "a aa ab bc d gd de gde gdefg ddefg",
3794            "y x yz z d gd de gdq gdqfg ddrfg");
3795     delete t;
3796 
3797     // Overlapped string test.  Make sure that when multiple
3798     // strings can match that the longest one is matched.
3799     rule =
3800         "    [a {ab} {abc}]    > x;"
3801         "           b          > y;"
3802         "           c          > z;"
3803         " q [t {st} {rst}] { e > p;" ;
3804 
3805     t = Transliterator::createFromRules("Test", rule, UTRANS_FORWARD, pe, ec);
3806     if (t == NULL || U_FAILURE(ec)) {
3807         delete t;
3808         errln("FAIL: createFromRules failed");
3809         return;
3810     }
3811 
3812     expect(*t, "a ab abc qte qste qrste",
3813            "x x x qtp qstp qrstp");
3814     delete t;
3815 }
3816 
// vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
// BEGIN TestUserFunction support factory

// Prototype transliterators, one per slot.  _TUFFactory() returns a
// clone of the prototype selected by the integer token supplied when
// the factory was registered in _TUFReg().
Transliterator* _TUFF[4];
// Heap-allocated copy of the ID each slot was registered under; used
// by _TUFUnreg() to unregister the slot again.
UnicodeString* _TUFID[4];
3822 
_TUFFactory(const UnicodeString &,Transliterator::Token context)3823 static Transliterator* U_EXPORT2 _TUFFactory(const UnicodeString& /*ID*/,
3824                                    Transliterator::Token context) {
3825     return _TUFF[context.integer]->clone();
3826 }
3827 
_TUFReg(const UnicodeString & ID,Transliterator * t,int32_t n)3828 static void _TUFReg(const UnicodeString& ID, Transliterator* t, int32_t n) {
3829     _TUFF[n] = t;
3830     _TUFID[n] = new UnicodeString(ID);
3831     Transliterator::registerFactory(ID, _TUFFactory, Transliterator::integerToken(n));
3832 }
3833 
_TUFUnreg(int32_t n)3834 static void _TUFUnreg(int32_t n) {
3835     if (_TUFF[n] != NULL) {
3836         Transliterator::unregister(*_TUFID[n]);
3837         delete _TUFF[n];
3838         delete _TUFID[n];
3839     }
3840 }
3841 
3842 // END TestUserFunction support factory
3843 // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3844 
/**
 * Test that user-registered transliterators can be used under function
 * syntax.
 *
 * Rule-based transliterators are created and registered one at a time
 * through the _TUF* helpers, so that later rule sets can call the
 * earlier ones with &Name( ... ).  Once the first registration has
 * happened, every failure jumps to the FAIL label, where all slots are
 * unregistered and freed; the success path falls through to the same
 * cleanup.
 */
void TransliteratorTest::TestUserFunction() {

    Transliterator* t;
    UParseError pe;
    UErrorCode ec = U_ZERO_ERROR;

    // Setup our factory
    // (clear all slots so that the cleanup loop skips unused ones)
    int32_t i;
    for (i=0; i<4; ++i) {
        _TUFF[i] = NULL;
    }

    // There's no need to register inverses if we don't use them
    t = Transliterator::createFromRules("gif",
                                        UNICODE_STRING_SIMPLE("'\\'u(..)(..) > '<img src=\"http://www.unicode.org/gifs/24/' $1 '/U' $1$2 '.gif\">';"),
                                        UTRANS_FORWARD, pe, ec);
    if (t == NULL || U_FAILURE(ec)) {
        // Nothing has been registered yet, so a plain return is enough.
        dataerrln((UnicodeString)"FAIL: createFromRules gif " + u_errorName(ec));
        return;
    }
    // Slot 0 keeps t as its prototype from here on.
    _TUFReg("Any-gif", t, 0);

    t = Transliterator::createFromRules("RemoveCurly",
                                        UNICODE_STRING_SIMPLE("[\\{\\}] > ; '\\N' > ;"),
                                        UTRANS_FORWARD, pe, ec);
    if (t == NULL || U_FAILURE(ec)) {
        errln((UnicodeString)"FAIL: createFromRules RemoveCurly " + u_errorName(ec));
        goto FAIL;
    }
    expect(*t, UNICODE_STRING_SIMPLE("\\N{name}"), "name");
    _TUFReg("Any-RemoveCurly", t, 1);

    logln("Trying &hex");
    t = Transliterator::createFromRules("hex2",
                                        "(.) > &hex($1);",
                                        UTRANS_FORWARD, pe, ec);
    if (t == NULL || U_FAILURE(ec)) {
        errln("FAIL: createFromRules");
        goto FAIL;
    }
    logln("Registering");
    _TUFReg("Any-hex2", t, 2);
    // Fetch an instance back through the registry; this one is a
    // separate object and is deleted below.
    t = Transliterator::createInstance("Any-hex2", UTRANS_FORWARD, ec);
    if (t == NULL || U_FAILURE(ec)) {
        errln((UnicodeString)"FAIL: createInstance Any-hex2 " + u_errorName(ec));
        goto FAIL;
    }
    expect(*t, "abc", UNICODE_STRING_SIMPLE("\\u0061\\u0062\\u0063"));
    delete t;

    logln("Trying &gif");
    // Nested calls into the two transliterators registered above.
    t = Transliterator::createFromRules("gif2",
                                        "(.) > &Gif(&Hex2($1));",
                                        UTRANS_FORWARD, pe, ec);
    if (t == NULL || U_FAILURE(ec)) {
        errln((UnicodeString)"FAIL: createFromRules gif2 " + u_errorName(ec));
        goto FAIL;
    }
    logln("Registering");
    _TUFReg("Any-gif2", t, 3);
    t = Transliterator::createInstance("Any-gif2", UTRANS_FORWARD, ec);
    if (t == NULL || U_FAILURE(ec)) {
        errln((UnicodeString)"FAIL: createInstance Any-gif2 " + u_errorName(ec));
        goto FAIL;
    }
    expect(*t, "ab", "<img src=\"http://www.unicode.org/gifs/24/00/U0061.gif\">"
           "<img src=\"http://www.unicode.org/gifs/24/00/U0062.gif\">");
    delete t;

    // Test that filters are allowed after &
    t = Transliterator::createFromRules("test",
                                        "(.) > &Hex($1) ' ' &RemoveCurly(&Name($1)) ' ';",
                                        UTRANS_FORWARD, pe, ec);
    if (t == NULL || U_FAILURE(ec)) {
        errln((UnicodeString)"FAIL: createFromRules test " + u_errorName(ec));
        goto FAIL;
    }
    expect(*t, "abc",
           UNICODE_STRING_SIMPLE("\\u0061 LATIN SMALL LETTER A \\u0062 LATIN SMALL LETTER B \\u0063 LATIN SMALL LETTER C "));
    delete t;

    // Shared cleanup for success and failure: release every slot in use.
 FAIL:
    for (i=0; i<4; ++i) {
        _TUFUnreg(i);
    }
}
3935 
3936 /**
3937  * Test the Any-X transliterators.
3938  */
TestAnyX(void)3939 void TransliteratorTest::TestAnyX(void) {
3940     UParseError parseError;
3941     UErrorCode status = U_ZERO_ERROR;
3942     Transliterator* anyLatin =
3943         Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
3944     if (anyLatin==0) {
3945         dataerrln("FAIL: createInstance returned NULL - %s", u_errorName(status));
3946         delete anyLatin;
3947         return;
3948     }
3949 
3950     expect(*anyLatin,
3951            CharsToUnicodeString("greek:\\u03B1\\u03B2\\u03BA\\u0391\\u0392\\u039A hiragana:\\u3042\\u3076\\u304F cyrillic:\\u0430\\u0431\\u0446"),
3952            CharsToUnicodeString("greek:abkABK hiragana:abuku cyrillic:abc"));
3953 
3954     delete anyLatin;
3955 }
3956 
3957 /**
3958  * Test Any-X transliterators with sample letters from all scripts.
3959  */
TestAny(void)3960 void TransliteratorTest::TestAny(void) {
3961     UErrorCode status = U_ZERO_ERROR;
3962     // Note: there is a lot of implict construction of UnicodeStrings from (char *) in
3963     //       function call parameters going on in this test.
3964     UnicodeSet alphabetic("[:alphabetic:]", status);
3965     if (U_FAILURE(status)) {
3966         dataerrln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
3967         return;
3968     }
3969     alphabetic.freeze();
3970 
3971     UnicodeString testString;
3972     for (int32_t i = 0; i < USCRIPT_CODE_LIMIT; i++) {
3973         const char *scriptName = uscript_getShortName((UScriptCode)i);
3974         if (scriptName == NULL) {
3975             errln("Failure: file %s, line %d: Script Code %d is invalid, ", __FILE__, __LINE__, i);
3976             return;
3977         }
3978 
3979         UnicodeSet sample;
3980         sample.applyPropertyAlias("script", scriptName, status);
3981         if (U_FAILURE(status)) {
3982             errln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
3983             return;
3984         }
3985         sample.retainAll(alphabetic);
3986         for (int32_t count=0; count<5; count++) {
3987             UChar32 c = sample.charAt(count);
3988             if (c == -1) {
3989                 break;
3990             }
3991             testString.append(c);
3992         }
3993     }
3994 
3995     UParseError parseError;
3996     Transliterator* anyLatin =
3997         Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
3998     if (U_FAILURE(status)) {
3999         dataerrln("Failure: file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
4000         return;
4001     }
4002 
4003     logln(UnicodeString("Sample set for Any-Latin: ") + testString);
4004     anyLatin->transliterate(testString);
4005     logln(UnicodeString("Sample result for Any-Latin: ") + testString);
4006     delete anyLatin;
4007 }
4008 
4009 
4010 /**
4011  * Test the source and target set API.  These are only implemented
4012  * for RBT and CompoundTransliterator at this time.
4013  */
TestSourceTargetSet()4014 void TransliteratorTest::TestSourceTargetSet() {
4015     UErrorCode ec = U_ZERO_ERROR;
4016 
4017     // Rules
4018     const char* r =
4019         "a > b; "
4020         "r [x{lu}] > q;";
4021 
4022     // Expected source
4023     UnicodeSet expSrc("[arx{lu}]", ec);
4024 
4025     // Expected target
4026     UnicodeSet expTrg("[bq]", ec);
4027 
4028     UParseError pe;
4029     Transliterator* t = Transliterator::createFromRules("test", r, UTRANS_FORWARD, pe, ec);
4030 
4031     if (U_FAILURE(ec)) {
4032         delete t;
4033         errln("FAIL: Couldn't set up test");
4034         return;
4035     }
4036 
4037     UnicodeSet src; t->getSourceSet(src);
4038     UnicodeSet trg; t->getTargetSet(trg);
4039 
4040     if (src == expSrc && trg == expTrg) {
4041         UnicodeString a, b;
4042         logln((UnicodeString)"Ok: " +
4043               r + " => source = " + src.toPattern(a, TRUE) +
4044               ", target = " + trg.toPattern(b, TRUE));
4045     } else {
4046         UnicodeString a, b, c, d;
4047         errln((UnicodeString)"FAIL: " +
4048               r + " => source = " + src.toPattern(a, TRUE) +
4049               ", expected " + expSrc.toPattern(b, TRUE) +
4050               "; target = " + trg.toPattern(c, TRUE) +
4051               ", expected " + expTrg.toPattern(d, TRUE));
4052     }
4053 
4054     delete t;
4055 }
4056 
4057 /**
4058  * Test handling of Pattern_White_Space, for both RBT and UnicodeSet.
4059  */
TestPatternWhiteSpace()4060 void TransliteratorTest::TestPatternWhiteSpace() {
4061     // Rules
4062     const char* r = "a > \\u200E b;";
4063 
4064     UErrorCode ec = U_ZERO_ERROR;
4065     UParseError pe;
4066     Transliterator* t = Transliterator::createFromRules("test", CharsToUnicodeString(r), UTRANS_FORWARD, pe, ec);
4067 
4068     if (U_FAILURE(ec)) {
4069         errln("FAIL: Couldn't set up test");
4070     } else {
4071         expect(*t, "a", "b");
4072     }
4073     delete t;
4074 
4075     // UnicodeSet
4076     ec = U_ZERO_ERROR;
4077     UnicodeSet set(CharsToUnicodeString("[a \\u200E]"), ec);
4078 
4079     if (U_FAILURE(ec)) {
4080         errln("FAIL: Couldn't set up test");
4081     } else {
4082         if (set.contains(0x200E)) {
4083             errln("FAIL: U+200E not being ignored by UnicodeSet");
4084         }
4085     }
4086 }
4087 //======================================================================
4088 // this method is in TestUScript.java
4089 //======================================================================
TestAllCodepoints()4090 void TransliteratorTest::TestAllCodepoints(){
4091     UScriptCode code= USCRIPT_INVALID_CODE;
4092     char id[256]={'\0'};
4093     char abbr[256]={'\0'};
4094     char newId[256]={'\0'};
4095     char newAbbrId[256]={'\0'};
4096     char oldId[256]={'\0'};
4097     char oldAbbrId[256]={'\0'};
4098 
4099     UErrorCode status =U_ZERO_ERROR;
4100     UParseError pe;
4101 
4102     for(uint32_t i = 0; i<=0x10ffff; i++){
4103         code =  uscript_getScript(i,&status);
4104         if(code == USCRIPT_INVALID_CODE){
4105             dataerrln("uscript_getScript for codepoint \\U%08X failed.", i);
4106         }
4107         const char* myId = uscript_getName(code);
4108         if(!myId) {
4109           dataerrln("Valid script code returned NULL name. Check your data!");
4110           return;
4111         }
4112         uprv_strcpy(id,myId);
4113         uprv_strcpy(abbr,uscript_getShortName(code));
4114 
4115         uprv_strcpy(newId,"[:");
4116         uprv_strcat(newId,id);
4117         uprv_strcat(newId,":];NFD");
4118 
4119         uprv_strcpy(newAbbrId,"[:");
4120         uprv_strcat(newAbbrId,abbr);
4121         uprv_strcat(newAbbrId,":];NFD");
4122 
4123         if(uprv_strcmp(newId,oldId)!=0){
4124             Transliterator* t = Transliterator::createInstance(newId,UTRANS_FORWARD,pe,status);
4125             if(t==NULL || U_FAILURE(status)){
4126                 dataerrln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(status));
4127             }
4128             delete t;
4129         }
4130         if(uprv_strcmp(newAbbrId,oldAbbrId)!=0){
4131             Transliterator* t = Transliterator::createInstance(newAbbrId,UTRANS_FORWARD,pe,status);
4132             if(t==NULL || U_FAILURE(status)){
4133                 dataerrln((UnicodeString)"FAIL: Could not create " + id + " - " + u_errorName(status));
4134             }
4135             delete t;
4136         }
4137         uprv_strcpy(oldId,newId);
4138         uprv_strcpy(oldAbbrId, newAbbrId);
4139 
4140     }
4141 
4142 }
4143 
// TEST_TRANSLIT_ID(id, cls): create the transliterator named by the string
// literal `id` and report an error unless its dynamic class ID equals the
// static class ID of class `cls`.  A creation failure is reported through
// dataerrln().
#define TEST_TRANSLIT_ID(id, cls) UPRV_BLOCK_MACRO_BEGIN { \
  UErrorCode idStatus = U_ZERO_ERROR; \
  Transliterator* idTrans = Transliterator::createInstance(id, UTRANS_FORWARD, idStatus); \
  if (U_SUCCESS(idStatus)) { \
    /* self-assignment (coverage of the assignment op) can't be done here */ \
    if (idTrans->getDynamicClassID() != cls::getStaticClassID()) { \
      errln("FAIL: " #cls " dynamic and static class ID mismatch"); \
    } \
  } else { \
    dataerrln("FAIL: Couldn't create %s - %s", id, u_errorName(idStatus)); \
  } \
  delete idTrans; \
} UPRV_BLOCK_MACRO_END
4157 
// TEST_TRANSLIT_RULE(rule, cls): build a transliterator from the rule text
// `rule` and report an error unless its dynamic class ID equals the static
// class ID of class `cls`.  `rule` must be a string literal, because it is
// pasted onto the failure message by literal concatenation.
#define TEST_TRANSLIT_RULE(rule, cls) UPRV_BLOCK_MACRO_BEGIN { \
  UParseError ruleParseErr; \
  UErrorCode ruleStatus = U_ZERO_ERROR; \
  Transliterator* ruleTrans = \
      Transliterator::createFromRules("_", rule, UTRANS_FORWARD, ruleParseErr, ruleStatus); \
  if (U_SUCCESS(ruleStatus)) { \
    /* self-assignment (coverage of the assignment op) can't be done here */ \
    if (ruleTrans->getDynamicClassID() != cls ::getStaticClassID()) { \
      errln("FAIL: " #cls " dynamic and static class ID mismatch"); \
    } \
  } else { \
    errln("FAIL: Couldn't create " rule); \
  } \
  delete ruleTrans; \
} UPRV_BLOCK_MACRO_END
4172 
/**
 * Class-ID boilerplate check: for each ID (or rule) below, create the
 * transliterator and verify that its dynamic class ID matches the static
 * class ID of the named implementation class.
 */
void TransliteratorTest::TestBoilerplate() {
    // Transliterators obtained by ID
    TEST_TRANSLIT_ID("Any-Latin", AnyTransliterator);
    TEST_TRANSLIT_ID("Any-Hex", EscapeTransliterator);
    TEST_TRANSLIT_ID("Hex-Any", UnescapeTransliterator);
    TEST_TRANSLIT_ID("Lower", LowercaseTransliterator);
    TEST_TRANSLIT_ID("Upper", UppercaseTransliterator);
    TEST_TRANSLIT_ID("Title", TitlecaseTransliterator);
    TEST_TRANSLIT_ID("Null", NullTransliterator);
    TEST_TRANSLIT_ID("Remove", RemoveTransliterator);
    TEST_TRANSLIT_ID("Any-Name", UnicodeNameTransliterator);
    TEST_TRANSLIT_ID("Name-Any", NameUnicodeTransliterator);
    TEST_TRANSLIT_ID("NFD", NormalizationTransliterator);
    TEST_TRANSLIT_ID("Latin-Greek", CompoundTransliterator);
    // Transliterator built directly from rule text
    TEST_TRANSLIT_RULE("a>b;", RuleBasedTransliterator);
}
4188 
TestAlternateSyntax()4189 void TransliteratorTest::TestAlternateSyntax() {
4190     // U+2206 == &
4191     // U+2190 == <
4192     // U+2192 == >
4193     // U+2194 == <>
4194     expect(CharsToUnicodeString("a \\u2192 x; b \\u2190 y; c \\u2194 z"),
4195            "abc",
4196            "xbz");
4197     expect(CharsToUnicodeString("([:^ASCII:]) \\u2192 \\u2206Name($1);"),
4198            CharsToUnicodeString("<=\\u2190; >=\\u2192; <>=\\u2194; &=\\u2206"),
4199            UNICODE_STRING_SIMPLE("<=\\N{LEFTWARDS ARROW}; >=\\N{RIGHTWARDS ARROW}; <>=\\N{LEFT RIGHT ARROW}; &=\\N{INCREMENT}"));
4200 }
4201 
// Rule sets for TestBeginEnd() and TestBeginEndToRules().
//
// Each array element is one complete rule set, written as adjacent string
// literals that the compiler concatenates.  Entries are referenced by index
// (from BEGIN_END_TEST_CASES and as BEGIN_END_RULES[17]), so the position
// of every element must stay fixed.  Rule sets that use ::BEGIN / ::END are
// commented out and replaced by an empty-string placeholder that keeps the
// following indexes unchanged.
static const char* BEGIN_END_RULES[] = {
    // [0]
    "abc > xy;"
    "aba > z;",

    // [1]
/*
    "::BEGIN;"
    "abc > xy;"
    "::END;"
    "::BEGIN;"
    "aba > z;"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [2]
/*
    "abc > xy;"
    "::BEGIN;"
    "aba > z;"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [3]
/*
    "::BEGIN;"
    "abc > xy;"
    "::END;"
    "aba > z;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [4]
    "abc > xy;"
    "::Null;"
    "aba > z;",

    // [5]
    "::Upper;"
    "ABC > xy;"
    "AB > x;"
    "C > z;"
    "::Upper;"
    "XYZ > p;"
    "XY > q;"
    "Z > r;"
    "::Upper;",

    // [6]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [7]
    "::Null;"
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [8]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::Null;",

    // [9]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::Null;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",

    // [10]
/*
    "::BEGIN;"
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::END;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [11]
/*
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "::BEGIN;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [12]
/*
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ab = [ab];"
    "::BEGIN;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::END;"
    "::BEGIN;"
    "$ab { ' ' } $ab > '-';"
    "c { ' ' > ;"
    "::END;"
    "::BEGIN;"
    "'a-a' > a\\%|a;"
    "::END;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [13]
    "$ws = [[:Separator:][\\u0009-\\u000C]$];"
    "$delim = [\\-$ws];"
    "$ab = [ab];"
    "::Null;"
    "$ws $delim* > ' ';"
    "'-' $delim* > '-';"
    "::Null;"
    "$ab { ' ' } $ab > '-';"
    "c { ' ' > ;"
    "::Null;"
    "'a-a' > a\\%|a;",

    // [14]
/*
    "::[abc];"
    "::BEGIN;"
    "abc > xy;"
    "::END;"
    "::BEGIN;"
    "aba > yz;"
    "::END;"
    "::Upper;",
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [15]
    "::[abc];"
    "abc > xy;"
    "::Null;"
    "aba > yz;"
    "::Upper;",

    // [16]
/*
    "::[abc];"
    "::BEGIN;"
    "abc <> xy;"
    "::END;"
    "::BEGIN;"
    "aba <> yz;"
    "::END;"
    "::Upper(Lower);"
    "::([XYZ]);"
*/
    "", // test case commented out below, this is here to keep from messing up the indexes

    // [17]  (the one reversible rule set; also instantiated in reverse by the tests)
    "::[abc];"
    "abc <> xy;"
    "::Null;"
    "aba <> yz;"
    "::Upper(Lower);"
    "::([XYZ]);"
};
4375 
4376 /*
4377 (This entire test is commented out below and will need some heavy revision when we re-add
4378 the ::BEGIN/::END stuff)
4379 static const char* BOGUS_BEGIN_END_RULES[] = {
4380     // [7]
4381     "::BEGIN;"
4382     "abc > xy;"
4383     "::BEGIN;"
4384     "aba > z;"
4385     "::END;"
4386     "::END;",
4387 
4388     // [8]
4389     "abc > xy;"
4390     " aba > z;"
4391     "::END;",
4392 
4393     // [9]
4394     "::BEGIN;"
4395     "::Upper;"
4396     "::END;"
4397 };
4398 static const int32_t BOGUS_BEGIN_END_RULES_length = UPRV_LENGTHOF(BOGUS_BEGIN_END_RULES);
4399 */
4400 
// Test cases for TestBeginEnd() and TestBeginEndToRules().
//
// The array is a flat list of triples -- rule set, input text, expected
// output -- and is walked three elements at a time, so its length must stay
// a multiple of 3.  Commented-out rows correspond to the rule sets that are
// disabled (empty placeholders) in BEGIN_END_RULES.
static const char* BEGIN_END_TEST_CASES[] = {
    // rules             input                   expected output
    BEGIN_END_RULES[0],  "abc ababc aba",        "xy zbc z",
//    BEGIN_END_RULES[1],  "abc ababc aba",        "xy abxy z",
//    BEGIN_END_RULES[2],  "abc ababc aba",        "xy abxy z",
//    BEGIN_END_RULES[3],  "abc ababc aba",        "xy abxy z",
    BEGIN_END_RULES[4],  "abc ababc aba",        "xy abxy z",
    BEGIN_END_RULES[5],  "abccabaacababcbc",     "PXAARXQBR",

    BEGIN_END_RULES[6],  "e   e - e---e-  e",    "e e e-e-e",
    BEGIN_END_RULES[7],  "e   e - e---e-  e",    "e e e-e-e",
    BEGIN_END_RULES[8],  "e   e - e---e-  e",    "e e e-e-e",
    BEGIN_END_RULES[9],  "e   e - e---e-  e",    "e e e-e-e",
//    BEGIN_END_RULES[10],  "e   e - e---e-  e",    "e e e-e-e",
//    BEGIN_END_RULES[11], "e   e - e---e-  e",    "e e e-e-e",
//    BEGIN_END_RULES[12], "e   e - e---e-  e",    "e e e-e-e",
//    BEGIN_END_RULES[12], "a    a    a    a",     "a%a%a%a",
//    BEGIN_END_RULES[12], "a a-b c b a",          "a%a-b cb-a",
    BEGIN_END_RULES[13], "e   e - e---e-  e",    "e e e-e-e",
    BEGIN_END_RULES[13], "a    a    a    a",     "a%a%a%a",
    BEGIN_END_RULES[13], "a a-b c b a",          "a%a-b cb-a",

//    BEGIN_END_RULES[14], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
    BEGIN_END_RULES[15], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
//    BEGIN_END_RULES[16], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ",
    BEGIN_END_RULES[17], "abc xy ababc xyz aba", "XY xy ABXY xyz YZ"
};
// Number of array elements (three per test case).
static const int32_t BEGIN_END_TEST_CASES_length = UPRV_LENGTHOF(BEGIN_END_TEST_CASES);
4429 
TestBeginEnd()4430 void TransliteratorTest::TestBeginEnd() {
4431     // run through the list of test cases above
4432     int32_t i = 0;
4433     for (i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
4434         expect((UnicodeString)"Test case #" + (i / 3),
4435                UnicodeString(BEGIN_END_TEST_CASES[i], -1, US_INV),
4436                UnicodeString(BEGIN_END_TEST_CASES[i + 1], -1, US_INV),
4437                UnicodeString(BEGIN_END_TEST_CASES[i + 2], -1, US_INV));
4438     }
4439 
4440     // instantiate the one reversible rule set in the reverse direction and make sure it does the right thing
4441     UParseError parseError;
4442     UErrorCode status = U_ZERO_ERROR;
4443     Transliterator* reversed  = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
4444             UTRANS_REVERSE, parseError, status);
4445     if (reversed == 0 || U_FAILURE(status)) {
4446         reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
4447     } else {
4448         expect(*reversed, UnicodeString("xy XY XYZ yz YZ"), UnicodeString("xy abc xaba yz aba"));
4449     }
4450     delete reversed;
4451 
4452     // finally, run through the list of syntactically-ill-formed rule sets above and make sure
4453     // that all of them cause errors
4454 /*
4455 (commented out until we have the real ::BEGIN/::END stuff in place
4456     for (i = 0; i < BOGUS_BEGIN_END_RULES_length; i++) {
4457         UParseError parseError;
4458         UErrorCode status = U_ZERO_ERROR;
4459         Transliterator* t = Transliterator::createFromRules("foo", UnicodeString(BOGUS_BEGIN_END_RULES[i]),
4460                 UTRANS_FORWARD, parseError, status);
4461         if (!U_FAILURE(status)) {
4462             delete t;
4463             errln((UnicodeString)"Should have gotten syntax error from " + BOGUS_BEGIN_END_RULES[i]);
4464         }
4465     }
4466 */
4467 }
4468 
TestBeginEndToRules()4469 void TransliteratorTest::TestBeginEndToRules() {
4470     // run through the same list of test cases we used above, but this time, instead of just
4471     // instantiating a Transliterator from the rules and running the test against it, we instantiate
4472     // a Transliterator from the rules, do toRules() on it, instantiate a Transliterator from
4473     // the resulting set of rules, and make sure that the generated rule set is semantically equivalent
4474     // to (i.e., does the same thing as) the original rule set
4475     for (int32_t i = 0; i < BEGIN_END_TEST_CASES_length; i += 3) {
4476         UParseError parseError;
4477         UErrorCode status = U_ZERO_ERROR;
4478         Transliterator* t = Transliterator::createFromRules("--", UnicodeString(BEGIN_END_TEST_CASES[i], -1, US_INV),
4479                 UTRANS_FORWARD, parseError, status);
4480         if (U_FAILURE(status)) {
4481             reportParseError(UnicodeString("FAIL: Couldn't create transliterator"), parseError, status);
4482         } else {
4483             UnicodeString rules;
4484             t->toRules(rules, TRUE);
4485             Transliterator* t2 = Transliterator::createFromRules((UnicodeString)"Test case #" + (i / 3), rules,
4486                     UTRANS_FORWARD, parseError, status);
4487             if (U_FAILURE(status)) {
4488                 reportParseError(UnicodeString("FAIL: Couldn't create transliterator from generated rules"),
4489                         parseError, status);
4490                 delete t;
4491             } else {
4492                 expect(*t2,
4493                        UnicodeString(BEGIN_END_TEST_CASES[i + 1], -1, US_INV),
4494                        UnicodeString(BEGIN_END_TEST_CASES[i + 2], -1, US_INV));
4495                 delete t;
4496                 delete t2;
4497             }
4498         }
4499     }
4500 
4501     // do the same thing for the reversible test case
4502     UParseError parseError;
4503     UErrorCode status = U_ZERO_ERROR;
4504     Transliterator* reversed = Transliterator::createFromRules("Reversed", UnicodeString(BEGIN_END_RULES[17]),
4505             UTRANS_REVERSE, parseError, status);
4506     if (U_FAILURE(status)) {
4507         reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator"), parseError, status);
4508     } else {
4509         UnicodeString rules;
4510         reversed->toRules(rules, FALSE);
4511         Transliterator* reversed2 = Transliterator::createFromRules("Reversed", rules, UTRANS_FORWARD,
4512                 parseError, status);
4513         if (U_FAILURE(status)) {
4514             reportParseError(UnicodeString("FAIL: Couldn't create reversed transliterator from generated rules"),
4515                     parseError, status);
4516             delete reversed;
4517         } else {
4518             expect(*reversed2,
4519                    UnicodeString("xy XY XYZ yz YZ"),
4520                    UnicodeString("xy abc xaba yz aba"));
4521             delete reversed;
4522             delete reversed2;
4523         }
4524     }
4525 }
4526 
TestRegisterAlias()4527 void TransliteratorTest::TestRegisterAlias() {
4528     UnicodeString longID("Lower;[aeiou]Upper");
4529     UnicodeString shortID("Any-CapVowels");
4530     UnicodeString reallyShortID("CapVowels");
4531 
4532     Transliterator::registerAlias(shortID, longID);
4533 
4534     UErrorCode err = U_ZERO_ERROR;
4535     Transliterator* t1 = Transliterator::createInstance(longID, UTRANS_FORWARD, err);
4536     if (U_FAILURE(err)) {
4537         errln("Failed to instantiate transliterator with long ID");
4538         Transliterator::unregister(shortID);
4539         return;
4540     }
4541     Transliterator* t2 = Transliterator::createInstance(reallyShortID, UTRANS_FORWARD, err);
4542     if (U_FAILURE(err)) {
4543         errln("Failed to instantiate transliterator with short ID");
4544         delete t1;
4545         Transliterator::unregister(shortID);
4546         return;
4547     }
4548 
4549     if (t1->getID() != longID)
4550         errln("Transliterator instantiated with long ID doesn't have long ID");
4551     if (t2->getID() != reallyShortID)
4552         errln("Transliterator instantiated with short ID doesn't have short ID");
4553 
4554     UnicodeString rules1;
4555     UnicodeString rules2;
4556 
4557     t1->toRules(rules1, TRUE);
4558     t2->toRules(rules2, TRUE);
4559     if (rules1 != rules2)
4560         errln("Alias transliterators aren't the same");
4561 
4562     delete t1;
4563     delete t2;
4564     Transliterator::unregister(shortID);
4565 
4566     t1 = Transliterator::createInstance(shortID, UTRANS_FORWARD, err);
4567     if (U_SUCCESS(err)) {
4568         errln("Instantiation with short ID succeeded after short ID was unregistered");
4569         delete t1;
4570     }
4571 
4572     // try the same thing again, but this time with something other than
4573     // an instance of CompoundTransliterator
4574     UnicodeString realID("Latin-Greek");
4575     UnicodeString fakeID("Latin-dlgkjdflkjdl");
4576     Transliterator::registerAlias(fakeID, realID);
4577 
4578     err = U_ZERO_ERROR;
4579     t1 = Transliterator::createInstance(realID, UTRANS_FORWARD, err);
4580     if (U_FAILURE(err)) {
4581         dataerrln("Failed to instantiate transliterator with real ID - %s", u_errorName(err));
4582         Transliterator::unregister(realID);
4583         return;
4584     }
4585     t2 = Transliterator::createInstance(fakeID, UTRANS_FORWARD, err);
4586     if (U_FAILURE(err)) {
4587         errln("Failed to instantiate transliterator with fake ID");
4588         delete t1;
4589         Transliterator::unregister(realID);
4590         return;
4591     }
4592 
4593     t1->toRules(rules1, TRUE);
4594     t2->toRules(rules2, TRUE);
4595     if (rules1 != rules2)
4596         errln("Alias transliterators aren't the same");
4597 
4598     delete t1;
4599     delete t2;
4600     Transliterator::unregister(fakeID);
4601 }
4602 
TestRuleStripping()4603 void TransliteratorTest::TestRuleStripping() {
4604     /*
4605 #
4606 \uE001>\u0C01; # SIGN
4607     */
4608     static const UChar rule[] = {
4609         0x0023,0x0020,0x000D,0x000A,
4610         0xE001,0x003E,0x0C01,0x003B,0x0020,0x0023,0x0020,0x0053,0x0049,0x0047,0x004E,0
4611     };
4612     static const UChar expectedRule[] = {
4613         0xE001,0x003E,0x0C01,0x003B,0
4614     };
4615     UChar result[UPRV_LENGTHOF(rule)];
4616     UErrorCode status = U_ZERO_ERROR;
4617     int32_t len = utrans_stripRules(rule, UPRV_LENGTHOF(rule), result, &status);
4618     if (len != u_strlen(expectedRule)) {
4619         errln("utrans_stripRules return len = %d", len);
4620     }
4621     if (u_strncmp(expectedRule, result, len) != 0) {
4622         errln("utrans_stripRules did not return expected string");
4623     }
4624 }
4625 
4626 /**
4627  * Test the Halfwidth-Fullwidth transliterator (ticket 6281).
4628  */
TestHalfwidthFullwidth(void)4629 void TransliteratorTest::TestHalfwidthFullwidth(void) {
4630     UParseError parseError;
4631     UErrorCode status = U_ZERO_ERROR;
4632     Transliterator* hf = Transliterator::createInstance("Halfwidth-Fullwidth", UTRANS_FORWARD, parseError, status);
4633     Transliterator* fh = Transliterator::createInstance("Fullwidth-Halfwidth", UTRANS_FORWARD, parseError, status);
4634     if (hf == 0 || fh == 0) {
4635         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
4636         delete hf;
4637         delete fh;
4638         return;
4639     }
4640 
4641     // Array of 2n items
4642     // Each item is
4643     //   "hf"|"fh"|"both",
4644     //   <Halfwidth>,
4645     //   <Fullwidth>
4646     const char* DATA[] = {
4647         "both",
4648         "\\uFFE9\\uFFEA\\uFFEB\\uFFEC\\u0061\\uFF71\\u00AF\\u0020",
4649         "\\u2190\\u2191\\u2192\\u2193\\uFF41\\u30A2\\uFFE3\\u3000",
4650     };
4651     int32_t DATA_length = UPRV_LENGTHOF(DATA);
4652 
4653     for (int32_t i=0; i<DATA_length; i+=3) {
4654         UnicodeString h = CharsToUnicodeString(DATA[i+1]);
4655         UnicodeString f = CharsToUnicodeString(DATA[i+2]);
4656         switch (*DATA[i]) {
4657         case 0x68: //'h': // Halfwidth-Fullwidth only
4658             expect(*hf, h, f);
4659             break;
4660         case 0x66: //'f': // Fullwidth-Halfwidth only
4661             expect(*fh, f, h);
4662             break;
4663         case 0x62: //'b': // both directions
4664             expect(*hf, h, f);
4665             expect(*fh, f, h);
4666             break;
4667         }
4668     }
4669     delete hf;
4670     delete fh;
4671 }
4672 
4673 
4674     /**
4675      *  Test Thai.  The text is the first paragraph of "What is Unicode" from the Unicode.org web site.
4676      *              TODO: confirm that the expected results are correct.
4677      *              For now, test just confirms that C++ and Java give identical results.
4678      */
TestThai(void)4679 void TransliteratorTest::TestThai(void) {
4680 #if !UCONFIG_NO_BREAK_ITERATION
4681     UParseError parseError;
4682     UErrorCode status = U_ZERO_ERROR;
4683     Transliterator* tr = Transliterator::createInstance("Any-Latin", UTRANS_FORWARD, parseError, status);
4684     if (tr == 0) {
4685         dataerrln("FAIL: createInstance failed - %s", u_errorName(status));
4686         return;
4687     }
4688     if (U_FAILURE(status)) {
4689         errln("FAIL: createInstance failed with %s", u_errorName(status));
4690         return;
4691     }
4692     const char *thaiText =
4693         "\\u0e42\\u0e14\\u0e22\\u0e1e\\u0e37\\u0e49\\u0e19\\u0e10\\u0e32\\u0e19\\u0e41\\u0e25\\u0e49\\u0e27, \\u0e04\\u0e2d"
4694         "\\u0e21\\u0e1e\\u0e34\\u0e27\\u0e40\\u0e15\\u0e2d\\u0e23\\u0e4c\\u0e08\\u0e30\\u0e40\\u0e01\\u0e35\\u0e48\\u0e22"
4695         "\\u0e27\\u0e02\\u0e49\\u0e2d\\u0e07\\u0e01\\u0e31\\u0e1a\\u0e40\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07\\u0e02\\u0e2d"
4696         "\\u0e07\\u0e15\\u0e31\\u0e27\\u0e40\\u0e25\\u0e02. \\u0e04\\u0e2d\\u0e21\\u0e1e\\u0e34\\u0e27\\u0e40\\u0e15\\u0e2d"
4697         "\\u0e23\\u0e4c\\u0e08\\u0e31\\u0e14\\u0e40\\u0e01\\u0e47\\u0e1a\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e29"
4698         "\\u0e23\\u0e41\\u0e25\\u0e30\\u0e2d\\u0e31\\u0e01\\u0e02\\u0e23\\u0e30\\u0e2d\\u0e37\\u0e48\\u0e19\\u0e46 \\u0e42"
4699         "\\u0e14\\u0e22\\u0e01\\u0e32\\u0e23\\u0e01\\u0e33\\u0e2b\\u0e19\\u0e14\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e40\\u0e25"
4700         "\\u0e02\\u0e43\\u0e2b\\u0e49\\u0e2a\\u0e33\\u0e2b\\u0e23\\u0e31\\u0e1a\\u0e41\\u0e15\\u0e48\\u0e25\\u0e30\\u0e15"
4701         "\\u0e31\\u0e27. \\u0e01\\u0e48\\u0e2d\\u0e19\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48\\u0e4a Unicode \\u0e08"
4702         "\\u0e30\\u0e16\\u0e39\\u0e01\\u0e2a\\u0e23\\u0e49\\u0e32\\u0e07\\u0e02\\u0e36\\u0e49\\u0e19, \\u0e44\\u0e14\\u0e49"
4703         "\\u0e21\\u0e35\\u0e23\\u0e30\\u0e1a\\u0e1a encoding \\u0e2d\\u0e22\\u0e39\\u0e48\\u0e2b\\u0e25\\u0e32\\u0e22\\u0e23"
4704         "\\u0e49\\u0e2d\\u0e22\\u0e23\\u0e30\\u0e1a\\u0e1a\\u0e2a\\u0e33\\u0e2b\\u0e23\\u0e31\\u0e1a\\u0e01\\u0e32\\u0e23"
4705         "\\u0e01\\u0e33\\u0e2b\\u0e19\\u0e14\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e40\\u0e25\\u0e02\\u0e40\\u0e2b\\u0e25\\u0e48"
4706         "\\u0e32\\u0e19\\u0e35\\u0e49. \\u0e44\\u0e21\\u0e48\\u0e21\\u0e35 encoding \\u0e43\\u0e14\\u0e17\\u0e35\\u0e48"
4707         "\\u0e21\\u0e35\\u0e08\\u0e33\\u0e19\\u0e27\\u0e19\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e02\\u0e23\\u0e30"
4708         "\\u0e21\\u0e32\\u0e01\\u0e40\\u0e1e\\u0e35\\u0e22\\u0e07\\u0e1e\\u0e2d: \\u0e22\\u0e01\\u0e15\\u0e31\\u0e27\\u0e2d"
4709         "\\u0e22\\u0e48\\u0e32\\u0e07\\u0e40\\u0e0a\\u0e48\\u0e19, \\u0e40\\u0e09\\u0e1e\\u0e32\\u0e30\\u0e43\\u0e19\\u0e01"
4710         "\\u0e25\\u0e38\\u0e48\\u0e21\\u0e2a\\u0e2b\\u0e20\\u0e32\\u0e1e\\u0e22\\u0e38\\u0e42\\u0e23\\u0e1b\\u0e40\\u0e1e"
4711         "\\u0e35\\u0e22\\u0e07\\u0e41\\u0e2b\\u0e48\\u0e07\\u0e40\\u0e14\\u0e35\\u0e22\\u0e27 \\u0e01\\u0e47\\u0e15\\u0e49"
4712         "\\u0e2d\\u0e07\\u0e01\\u0e32\\u0e23\\u0e2b\\u0e25\\u0e32\\u0e22 encoding \\u0e43\\u0e19\\u0e01\\u0e32\\u0e23\\u0e04"
4713         "\\u0e23\\u0e2d\\u0e1a\\u0e04\\u0e25\\u0e38\\u0e21\\u0e17\\u0e38\\u0e01\\u0e20\\u0e32\\u0e29\\u0e32\\u0e43\\u0e19"
4714         "\\u0e01\\u0e25\\u0e38\\u0e48\\u0e21. \\u0e2b\\u0e23\\u0e37\\u0e2d\\u0e41\\u0e21\\u0e49\\u0e41\\u0e15\\u0e48\\u0e43"
4715         "\\u0e19\\u0e20\\u0e32\\u0e29\\u0e32\\u0e40\\u0e14\\u0e35\\u0e48\\u0e22\\u0e27 \\u0e40\\u0e0a\\u0e48\\u0e19 \\u0e20"
4716         "\\u0e32\\u0e29\\u0e32\\u0e2d\\u0e31\\u0e07\\u0e01\\u0e24\\u0e29 \\u0e01\\u0e47\\u0e44\\u0e21\\u0e48\\u0e21\\u0e35"
4717         " encoding \\u0e43\\u0e14\\u0e17\\u0e35\\u0e48\\u0e40\\u0e1e\\u0e35\\u0e22\\u0e07\\u0e1e\\u0e2d\\u0e2a\\u0e33\\u0e2b"
4718         "\\u0e23\\u0e31\\u0e1a\\u0e17\\u0e38\\u0e01\\u0e15\\u0e31\\u0e27\\u0e2d\\u0e31\\u0e01\\u0e29\\u0e23, \\u0e40\\u0e04"
4719         "\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07\\u0e2b\\u0e21\\u0e32\\u0e22\\u0e27\\u0e23\\u0e23\\u0e04\\u0e15\\u0e2d\\u0e19"
4720         " \\u0e41\\u0e25\\u0e30\\u0e2a\\u0e31\\u0e0d\\u0e25\\u0e31\\u0e01\\u0e29\\u0e13\\u0e4c\\u0e17\\u0e32\\u0e07\\u0e40"
4721         "\\u0e17\\u0e04\\u0e19\\u0e34\\u0e04\\u0e17\\u0e35\\u0e48\\u0e43\\u0e0a\\u0e49\\u0e01\\u0e31\\u0e19\\u0e2d\\u0e22"
4722         "\\u0e39\\u0e48\\u0e17\\u0e31\\u0e48\\u0e27\\u0e44\\u0e1b.";
4723 
4724     const char *latinText =
4725         "doy ph\\u1ee5\\u0304\\u0302n \\u1e6d\\u0304h\\u0101n l\\u00e6\\u0302w, khxmphiwtexr\\u0312 ca ke\\u012b\\u0300"
4726         "ywk\\u0304\\u0125xng k\\u1ea1b re\\u1ee5\\u0304\\u0300xng k\\u0304hxng t\\u1ea1wlek\\u0304h. khxmphiwtexr"
4727         "\\u0312 c\\u1ea1d k\\u0115b t\\u1ea1w x\\u1ea1ks\\u0304\\u02b9r l\\u00e6a x\\u1ea1kk\\u0304h ra x\\u1ee5\\u0304"
4728         "\\u0300n\\u00ab doy k\\u0101r k\\u1ea3h\\u0304nd h\\u0304m\\u0101ylek\\u0304h h\\u0304\\u0131\\u0302 s\\u0304"
4729         "\\u1ea3h\\u0304r\\u1ea1b t\\u00e6\\u0300la t\\u1ea1w. k\\u0300xn h\\u0304n\\u0302\\u0101 th\\u012b\\u0300\\u0301"
4730         " Unicode ca t\\u0304h\\u016bk s\\u0304r\\u0302\\u0101ng k\\u0304h\\u1ee5\\u0302n, d\\u1ecb\\u0302 m\\u012b "
4731         "rabb encoding xy\\u016b\\u0300 h\\u0304l\\u0101y r\\u0302xy rabb s\\u0304\\u1ea3h\\u0304r\\u1ea1b k\\u0101"
4732         "r k\\u1ea3h\\u0304nd h\\u0304m\\u0101ylek\\u0304h h\\u0304el\\u0300\\u0101 n\\u012b\\u0302. m\\u1ecb\\u0300m"
4733         "\\u012b encoding d\\u0131 th\\u012b\\u0300 m\\u012b c\\u1ea3nwn t\\u1ea1w x\\u1ea1kk\\u0304hra m\\u0101k p"
4734         "he\\u012byng phx: yk t\\u1ea1wx\\u1ef3\\u0101ng ch\\u00e8n, c\\u0304heph\\u0101a n\\u0131 kl\\u00f9m s\\u0304"
4735         "h\\u0304p\\u0323h\\u0101ph yurop phe\\u012byng h\\u0304\\u00e6\\u0300ng de\\u012byw k\\u0306 t\\u0302xngk\\u0101"
4736         "r h\\u0304l\\u0101y encoding n\\u0131 k\\u0101r khrxbkhlum thuk p\\u0323h\\u0101s\\u0304\\u02b9\\u0101 n\\u0131"
4737         " kl\\u00f9m. h\\u0304r\\u1ee5\\u0304x m\\u00e6\\u0302t\\u00e6\\u0300 n\\u0131 p\\u0323h\\u0101s\\u0304\\u02b9"
4738         "\\u0101 de\\u012b\\u0300yw ch\\u00e8n p\\u0323h\\u0101s\\u0304\\u02b9\\u0101 x\\u1ea1ngkvs\\u0304\\u02b9 k\\u0306"
4739         " m\\u1ecb\\u0300m\\u012b encoding d\\u0131 th\\u012b\\u0300 phe\\u012byng phx s\\u0304\\u1ea3h\\u0304r\\u1ea1"
4740         "b thuk t\\u1ea1w x\\u1ea1ks\\u0304\\u02b9r, kher\\u1ee5\\u0304\\u0300xngh\\u0304m\\u0101y wrrkh txn l\\u00e6"
4741         "a s\\u0304\\u1ea1\\u1ef5l\\u1ea1ks\\u0304\\u02b9\\u1e47\\u0312 th\\u0101ng thekhnikh th\\u012b\\u0300 ch\\u0131"
4742         "\\u0302 k\\u1ea1n xy\\u016b\\u0300 th\\u1ea1\\u0300wp\\u1ecb.";
4743 
4744 
4745     UnicodeString  xlitText(thaiText);
4746     xlitText = xlitText.unescape();
4747     tr->transliterate(xlitText);
4748 
4749     UnicodeString expectedText(latinText);
4750     expectedText = expectedText.unescape();
4751     expect(*tr, xlitText, expectedText);
4752 
4753     delete tr;
4754 #endif
4755 }
4756 
4757 
4758 //======================================================================
4759 // Support methods
4760 //======================================================================
expectT(const UnicodeString & id,const UnicodeString & source,const UnicodeString & expectedResult)4761 void TransliteratorTest::expectT(const UnicodeString& id,
4762                                  const UnicodeString& source,
4763                                  const UnicodeString& expectedResult) {
4764     UErrorCode ec = U_ZERO_ERROR;
4765     UParseError pe;
4766     Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, pe, ec);
4767     if (U_FAILURE(ec)) {
4768         errln((UnicodeString)"FAIL: Could not create " + id + " -  " + u_errorName(ec));
4769         delete t;
4770         return;
4771     }
4772     expect(*t, source, expectedResult);
4773     delete t;
4774 }
4775 
reportParseError(const UnicodeString & message,const UParseError & parseError,const UErrorCode & status)4776 void TransliteratorTest::reportParseError(const UnicodeString& message,
4777                                           const UParseError& parseError,
4778                                           const UErrorCode& status) {
4779     dataerrln(message +
4780           /*", parse error " + parseError.code +*/
4781           ", line " + parseError.line +
4782           ", offset " + parseError.offset +
4783           ", pre-context " + prettify(parseError.preContext, TRUE) +
4784           ", post-context " + prettify(parseError.postContext,TRUE) +
4785           ", Error: " + u_errorName(status));
4786 }
4787 
expect(const UnicodeString & rules,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4788 void TransliteratorTest::expect(const UnicodeString& rules,
4789                                 const UnicodeString& source,
4790                                 const UnicodeString& expectedResult,
4791                                 UTransPosition *pos) {
4792     expect("<ID>", rules, source, expectedResult, pos);
4793 }
4794 
expect(const UnicodeString & id,const UnicodeString & rules,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4795 void TransliteratorTest::expect(const UnicodeString& id,
4796                                 const UnicodeString& rules,
4797                                 const UnicodeString& source,
4798                                 const UnicodeString& expectedResult,
4799                                 UTransPosition *pos) {
4800     UErrorCode status = U_ZERO_ERROR;
4801     UParseError parseError;
4802     Transliterator* t = Transliterator::createFromRules(id, rules, UTRANS_FORWARD, parseError, status);
4803     if (U_FAILURE(status)) {
4804         reportParseError(UnicodeString("Couldn't create transliterator from ") + rules, parseError, status);
4805     } else {
4806         expect(*t, source, expectedResult, pos);
4807     }
4808     delete t;
4809 }
4810 
expect(const Transliterator & t,const UnicodeString & source,const UnicodeString & expectedResult,const Transliterator & reverseTransliterator)4811 void TransliteratorTest::expect(const Transliterator& t,
4812                                 const UnicodeString& source,
4813                                 const UnicodeString& expectedResult,
4814                                 const Transliterator& reverseTransliterator) {
4815     expect(t, source, expectedResult);
4816     expect(reverseTransliterator, expectedResult, source);
4817 }
4818 
expect(const Transliterator & t,const UnicodeString & source,const UnicodeString & expectedResult,UTransPosition * pos)4819 void TransliteratorTest::expect(const Transliterator& t,
4820                                 const UnicodeString& source,
4821                                 const UnicodeString& expectedResult,
4822                                 UTransPosition *pos) {
4823     if (pos == 0) {
4824         UnicodeString result(source);
4825         t.transliterate(result);
4826         expectAux(t.getID() + ":String", source, result, expectedResult);
4827     }
4828     UTransPosition index={0, 0, 0, 0};
4829     if (pos != 0) {
4830         index = *pos;
4831     }
4832 
4833     UnicodeString rsource(source);
4834     if (pos == 0) {
4835         t.transliterate(rsource);
4836     } else {
4837         // Do it all at once -- below we do it incrementally
4838         t.finishTransliteration(rsource, *pos);
4839     }
4840     expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult);
4841 
4842     // Test keyboard (incremental) transliteration -- this result
4843     // must be the same after we finalize (see below).
4844     UnicodeString log;
4845     rsource.remove();
4846     if (pos != 0) {
4847         rsource = source;
4848         formatInput(log, rsource, index);
4849         log.append(" -> ");
4850         UErrorCode status = U_ZERO_ERROR;
4851         t.transliterate(rsource, index, status);
4852         formatInput(log, rsource, index);
4853     } else {
4854         for (int32_t i=0; i<source.length(); ++i) {
4855             if (i != 0) {
4856                 log.append(" + ");
4857             }
4858             log.append(source.charAt(i)).append(" -> ");
4859             UErrorCode status = U_ZERO_ERROR;
4860             t.transliterate(rsource, index, source.charAt(i), status);
4861             formatInput(log, rsource, index);
4862         }
4863     }
4864 
4865     // As a final step in keyboard transliteration, we must call
4866     // transliterate to finish off any pending partial matches that
4867     // were waiting for more input.
4868     t.finishTransliteration(rsource, index);
4869     log.append(" => ").append(rsource);
4870 
4871     expectAux(t.getID() + ":Keyboard", log,
4872               rsource == expectedResult,
4873               expectedResult);
4874 }
4875 
4876 
4877 /**
4878  * @param appendTo result is appended to this param.
4879  * @param input the string being transliterated
4880  * @param pos the index struct
4881  */
formatInput(UnicodeString & appendTo,const UnicodeString & input,const UTransPosition & pos)4882 UnicodeString& TransliteratorTest::formatInput(UnicodeString &appendTo,
4883                                                const UnicodeString& input,
4884                                                const UTransPosition& pos) {
4885     // Output a string of the form aaa{bbb|ccc|ddd}eee, where
4886     // the {} indicate the context start and limit, and the ||
4887     // indicate the start and limit.
4888     if (0 <= pos.contextStart &&
4889         pos.contextStart <= pos.start &&
4890         pos.start <= pos.limit &&
4891         pos.limit <= pos.contextLimit &&
4892         pos.contextLimit <= input.length()) {
4893 
4894         UnicodeString a, b, c, d, e;
4895         input.extractBetween(0, pos.contextStart, a);
4896         input.extractBetween(pos.contextStart, pos.start, b);
4897         input.extractBetween(pos.start, pos.limit, c);
4898         input.extractBetween(pos.limit, pos.contextLimit, d);
4899         input.extractBetween(pos.contextLimit, input.length(), e);
4900         appendTo.append(a).append((UChar)123/*{*/).append(b).
4901             append((UChar)PIPE).append(c).append((UChar)PIPE).append(d).
4902             append((UChar)125/*}*/).append(e);
4903     } else {
4904         appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
4905                         pos.contextStart + ", s=" + pos.start + ", l=" +
4906                         pos.limit + ", cl=" + pos.contextLimit + "} on " +
4907                         input);
4908     }
4909     return appendTo;
4910 }
4911 
expectAux(const UnicodeString & tag,const UnicodeString & source,const UnicodeString & result,const UnicodeString & expectedResult)4912 void TransliteratorTest::expectAux(const UnicodeString& tag,
4913                                    const UnicodeString& source,
4914                                    const UnicodeString& result,
4915                                    const UnicodeString& expectedResult) {
4916     expectAux(tag, source + " -> " + result,
4917               result == expectedResult,
4918               expectedResult);
4919 }
4920 
expectAux(const UnicodeString & tag,const UnicodeString & summary,UBool pass,const UnicodeString & expectedResult)4921 void TransliteratorTest::expectAux(const UnicodeString& tag,
4922                                    const UnicodeString& summary, UBool pass,
4923                                    const UnicodeString& expectedResult) {
4924     if (pass) {
4925         logln(UnicodeString("(")+tag+") " + prettify(summary));
4926     } else {
4927         dataerrln(UnicodeString("FAIL: (")+tag+") "
4928               + prettify(summary)
4929               + ", expected " + prettify(expectedResult));
4930     }
4931 }
4932 
4933 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
4934