1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  **********************************************************************
5  *   Copyright (C) 2005-2016, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  **********************************************************************
8  */
9 
10 
11 #include "unicode/utypes.h"
12 #include "unicode/ucsdet.h"
13 #include "unicode/ucnv.h"
14 #include "unicode/unistr.h"
15 #include "unicode/putil.h"
16 #include "unicode/uniset.h"
17 
18 #include "intltest.h"
19 #include "csdetest.h"
20 
21 #include "xmlparser.h"
22 
23 #include <memory>
24 #include <stdlib.h>
25 #include <string.h>
26 
27 #ifdef DEBUG_DETECT
28 #include <stdio.h>
29 #endif
30 
31 
// Code units used as field separators by this file's helpers:
// U+0020 (space) and U+002F (slash).
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

// Evaluates x and, when it is false, reports the current file and line
// through errln(). Execution continues after the report.
// Must be expanded where errln() is callable.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END

// Checks errcode with U_FAILURE(); on failure it reports the file, line and
// error name through errcheckln() and then executes a bare `return;`, so it
// is only usable inside functions returning void.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END
47 
48 
49 //---------------------------------------------------------------------------
50 //
51 //  Test class boilerplate
52 //
53 //---------------------------------------------------------------------------
CharsetDetectionTest()54 CharsetDetectionTest::CharsetDetectionTest()
55 {
56 }
57 
58 
~CharsetDetectionTest()59 CharsetDetectionTest::~CharsetDetectionTest()
60 {
61 }
62 
63 
64 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)65 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
66 {
67     if (exec) logln("TestSuite CharsetDetectionTest: ");
68     switch (index) {
69        case 0: name = "ConstructionTest";
70             if (exec) ConstructionTest();
71             break;
72 
73        case 1: name = "UTF8Test";
74             if (exec) UTF8Test();
75             break;
76 
77        case 2: name = "UTF16Test";
78             if (exec) UTF16Test();
79             break;
80 
81        case 3: name = "C1BytesTest";
82             if (exec) C1BytesTest();
83             break;
84 
85        case 4: name = "InputFilterTest";
86             if (exec) InputFilterTest();
87             break;
88 
89        case 5: name = "DetectionTest";
90             if (exec) DetectionTest();
91             break;
92 #if !UCONFIG_NO_LEGACY_CONVERSION
93        case 6: name = "IBM424Test";
94             if (exec) IBM424Test();
95             break;
96 
97        case 7: name = "IBM420Test";
98             if (exec) IBM420Test();
99             break;
100 #else
101        case 6:
102        case 7: name = "skip"; break;
103 #endif
104        case 8: name = "Ticket6394Test";
105             if (exec) Ticket6394Test();
106             break;
107 
108        case 9: name = "Ticket6954Test";
109             if (exec) Ticket6954Test();
110             break;
111 
112         default: name = "";
113             break; //needed to end loop
114     }
115 }
116 
// Splits src at every occurrence of ch.
//
//   src    - the string to split.
//   ch     - the delimiter code unit; it is not included in any piece.
//   splits - receives the number of pieces (always at least 1).
//
// Returns an array of `splits` strings allocated with new[]; the caller must
// release it with delete[].
static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
{
    // Pass 1: one piece, plus one more for each delimiter found.
    int32_t pieceCount = 1;
    int32_t searchFrom = 0;
    for (int32_t hit = src.indexOf(ch, searchFrom); hit >= 0; hit = src.indexOf(ch, searchFrom)) {
        pieceCount += 1;
        searchFrom = hit + 1;
    }
    splits = pieceCount;

    UnicodeString *pieces = new UnicodeString[pieceCount];

    // Pass 2: copy out the text between consecutive delimiters.
    int32_t pieceStart = 0;
    int32_t pieceIndex = 0;
    for (int32_t delimiter = src.indexOf(ch, pieceStart);
         delimiter >= 0;
         delimiter = src.indexOf(ch, pieceStart)) {
        src.extractBetween(pieceStart, delimiter, pieces[pieceIndex]);
        pieceIndex += 1;
        pieceStart = delimiter + 1;
    }

    // Whatever follows the last delimiter (possibly nothing) is the final piece.
    src.extractBetween(pieceStart, src.length(), pieces[pieceIndex]);

    return pieces;
}
141 
extractBytes(const UnicodeString & source,const char * codepage,int32_t & length)142 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
143 {
144     int32_t sLength = source.length();
145     char *bytes = NULL;
146 
147     length = source.extract(0, sLength, NULL, codepage);
148 
149     if (length > 0) {
150         bytes = new char[length + 1];
151         source.extract(0, sLength, bytes, codepage);
152     }
153 
154     return bytes;
155 }
156 
checkEncoding(const UnicodeString & testString,const UnicodeString & encoding,const UnicodeString & id)157 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
158 {
159     int32_t splits = 0;
160     int32_t testLength = testString.length();
161     std::unique_ptr<UnicodeString []> eSplit(split(encoding, CH_SLASH, splits));
162     UErrorCode status = U_ZERO_ERROR;
163     int32_t cpLength = eSplit[0].length();
164     char codepage[64];
165 
166     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
167     codepage[cpLength] = '\0';
168 
169     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
170 
171     int32_t byteLength = 0;
172     std::unique_ptr<char []> bytes(extractBytes(testString, codepage, byteLength));
173 
174     if (! bytes) {
175 #if !UCONFIG_NO_LEGACY_CONVERSION
176         dataerrln("Can't open a " + encoding + " converter for " + id);
177 #endif
178         return;
179     }
180 
181     ucsdet_setText(csd.getAlias(), bytes.get(), byteLength, &status);
182 
183     int32_t matchCount = 0;
184     const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
185 
186 
187     UnicodeString name(ucsdet_getName(matches[0], &status));
188     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
189     UChar *decoded = NULL;
190     int32_t dLength = 0;
191 
192     if (matchCount == 0) {
193         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
194         return;
195     }
196 
197     if (name.compare(eSplit[0]) != 0) {
198         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
199 
200 #ifdef DEBUG_DETECT
201         for (int32_t m = 0; m < matchCount; m += 1) {
202             const char *name = ucsdet_getName(matches[m], &status);
203             const char *lang = ucsdet_getLanguage(matches[m], &status);
204             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
205 
206             printf("%s (%s) %d\n", name, lang, confidence);
207         }
208 #endif
209         return;
210     }
211 
212     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
213         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
214         return;
215     }
216 
217     decoded = new UChar[testLength];
218     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
219 
220     if (testString.compare(decoded, dLength) != 0) {
221         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
222 
223 #ifdef DEBUG_DETECT
224         for(int32_t i = 0; i < testLength; i += 1) {
225             if(testString[i] != decoded[i]) {
226                 printf("Strings differ at byte %d\n", i);
227                 break;
228             }
229         }
230 #endif
231 
232     }
233 
234     delete[] decoded;
235 }
236 
getPath(char buffer[2048],const char * filename)237 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
238     UErrorCode status = U_ZERO_ERROR;
239     const char *testDataDirectory = IntlTest::getSourceTestData(status);
240 
241     if (U_FAILURE(status)) {
242         errln("ERROR: getPath() failed - %s", u_errorName(status));
243         return NULL;
244     }
245 
246     strcpy(buffer, testDataDirectory);
247     strcat(buffer, filename);
248     return buffer;
249 }
250 
ConstructionTest()251 void CharsetDetectionTest::ConstructionTest()
252 {
253     IcuTestErrorCode status(*this, "ConstructionTest");
254     LocalUCharsetDetectorPointer csd(ucsdet_open(status));
255     LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
256     int32_t count = uenum_count(e.getAlias(), status);
257 
258 #ifdef DEBUG_DETECT
259     printf("There are %d recognizers.\n", count);
260 #endif
261 
262     for(int32_t i = 0; i < count; i += 1) {
263         int32_t length;
264         const char *name = uenum_next(e.getAlias(), &length, status);
265 
266         if(name == NULL || length <= 0) {
267             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
268         }
269 
270 #ifdef DEBUG_DETECT
271         printf("%s\n", name);
272 #endif
273     }
274 
275     const char* defDisabled[] = {
276         "IBM420_rtl", "IBM420_ltr",
277         "IBM424_rtl", "IBM424_ltr",
278         0
279     };
280 
281     LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
282     const char *activeName = NULL;
283 
284     while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
285         // the charset must be included in all list
286         UBool found = FALSE;
287 
288         const char *name = NULL;
289         uenum_reset(e.getAlias(), status);
290         while ((name = uenum_next(e.getAlias(), NULL, status))) {
291             if (strcmp(activeName, name) == 0) {
292                 found = TRUE;
293                 break;
294             }
295         }
296 
297         if (!found) {
298             errln(UnicodeString(activeName) + " is not included in the all charset list.");
299         }
300 
301         // some charsets are disabled by default
302         found = FALSE;
303         for (int32_t i = 0; defDisabled[i] != 0; i++) {
304             if (strcmp(activeName, defDisabled[i]) == 0) {
305                 found = TRUE;
306                 break;
307             }
308         }
309         if (found) {
310             errln(UnicodeString(activeName) + " should not be included in the default charset list.");
311         }
312     }
313 }
314 
UTF8Test()315 void CharsetDetectionTest::UTF8Test()
316 {
317     UErrorCode status = U_ZERO_ERROR;
318     UnicodeString ss = "This is a string with some non-ascii characters that will "
319                        "be converted to UTF-8, then shoved through the detection process.  "
320                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
321                        "Sure would be nice if our source could contain Unicode directly!";
322     UnicodeString s = ss.unescape();
323     int32_t byteLength = 0, sLength = s.length();
324     char *bytes = extractBytes(s, "UTF-8", byteLength);
325     UCharsetDetector *csd = ucsdet_open(&status);
326     const UCharsetMatch *match;
327     UChar *detected = new UChar[sLength];
328 
329     ucsdet_setText(csd, bytes, byteLength, &status);
330     match = ucsdet_detect(csd, &status);
331 
332     if (match == NULL) {
333         errln("Detection failure for UTF-8: got no matches.");
334         goto bail;
335     }
336 
337     ucsdet_getUChars(match, detected, sLength, &status);
338 
339     if (s.compare(detected, sLength) != 0) {
340         errln("Round-trip test failed!");
341     }
342 
343     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
344 
345 bail:
346     delete[] detected;
347     delete[] bytes;
348     ucsdet_close(csd);
349 }
350 
UTF16Test()351 void CharsetDetectionTest::UTF16Test()
352 {
353     UErrorCode status = U_ZERO_ERROR;
354     /* Notice the BOM on the start of this string */
355     UChar chars[] = {
356         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
357         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
358         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
359         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
360         0x064a, 0x062a, 0x0000};
361     UnicodeString s(chars);
362     int32_t beLength = 0, leLength = 0;
363     std::unique_ptr<char []>beBytes(extractBytes(s, "UTF-16BE", beLength));
364     std::unique_ptr<char []>leBytes(extractBytes(s, "UTF-16LE", leLength));
365     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
366     const UCharsetMatch *match;
367     const char *name;
368     int32_t conf;
369 
370     ucsdet_setText(csd.getAlias(), beBytes.get(), beLength, &status);
371     match = ucsdet_detect(csd.getAlias(), &status);
372 
373     if (match == NULL) {
374         errln("Encoding detection failure for UTF-16BE: got no matches.");
375     } else {
376 
377         name  = ucsdet_getName(match, &status);
378         conf  = ucsdet_getConfidence(match, &status);
379 
380         if (strcmp(name, "UTF-16BE") != 0) {
381             errln("Encoding detection failure for UTF-16BE: got %s", name);
382         } else if (conf != 100) {
383             errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
384         }
385     }
386 
387     ucsdet_setText(csd.getAlias(), leBytes.get(), leLength, &status);
388     match = ucsdet_detect(csd.getAlias(), &status);
389 
390     if (match == NULL) {
391         errln("Encoding detection failure for UTF-16LE: got no matches.");
392         return;
393     }
394 
395     name  = ucsdet_getName(match, &status);
396     conf = ucsdet_getConfidence(match, &status);
397 
398     if (strcmp(name, "UTF-16LE") != 0) {
399         errln("Enconding detection failure for UTF-16LE: got %s", name);
400         return;
401     }
402 
403     if (conf != 100) {
404         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
405     }
406 }
407 
InputFilterTest()408 void CharsetDetectionTest::InputFilterTest()
409 {
410     UErrorCode status = U_ZERO_ERROR;
411     UnicodeString s(u"<a> <lot> <of> <English> <inside> <the> <markup> Un très petit peu de Français. <to> <confuse> <the> <detector>");
412     int32_t byteLength = 0;
413     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
414     UCharsetDetector *csd = ucsdet_open(&status);
415     const UCharsetMatch *match;
416     const char *lang, *name;
417 
418     ucsdet_enableInputFilter(csd, TRUE);
419 
420     if (!ucsdet_isInputFilterEnabled(csd)) {
421         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
422     }
423 
424 
425     ucsdet_setText(csd, bytes, byteLength, &status);
426     match = ucsdet_detect(csd, &status);
427 
428     if (match == NULL) {
429         errln("Turning on the input filter resulted in no matches.");
430         goto turn_off;
431     }
432 
433     name = ucsdet_getName(match, &status);
434 
435     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
436         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
437     } else {
438         lang = ucsdet_getLanguage(match, &status);
439 
440         if (lang == NULL || strcmp(lang, "fr") != 0) {
441             errln("Input filter did not strip markup!");
442         }
443     }
444 
445 turn_off:
446     ucsdet_enableInputFilter(csd, FALSE);
447     ucsdet_setText(csd, bytes, byteLength, &status);
448     match = ucsdet_detect(csd, &status);
449 
450     if (match == NULL) {
451         errln("Turning off the input filter resulted in no matches.");
452         goto bail;
453     }
454 
455     name = ucsdet_getName(match, &status);
456 
457     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
458         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
459     } else {
460         lang = ucsdet_getLanguage(match, &status);
461 
462         if (lang == NULL || strcmp(lang, "en") != 0) {
463             errln("Unfiltered input did not detect as English!");
464         }
465     }
466 
467 bail:
468     delete[] bytes;
469     ucsdet_close(csd);
470 }
471 
C1BytesTest()472 void CharsetDetectionTest::C1BytesTest()
473 {
474 #if !UCONFIG_NO_LEGACY_CONVERSION
475     UErrorCode status = U_ZERO_ERROR;
476     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
477     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
478     UnicodeString sWindows  = ssWindows.unescape();
479     int32_t lISO = 0, lWindows = 0;
480     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
481     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
482     UCharsetDetector *csd = ucsdet_open(&status);
483     const UCharsetMatch *match;
484     const char *name;
485 
486     ucsdet_setText(csd, bWindows, lWindows, &status);
487     match = ucsdet_detect(csd, &status);
488 
489     if (match == NULL) {
490         errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
491         goto bail;
492     }
493 
494     name  = ucsdet_getName(match, &status);
495 
496     if (strcmp(name, "windows-1252") != 0) {
497         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
498     }
499 
500     ucsdet_setText(csd, bISO, lISO, &status);
501     match = ucsdet_detect(csd, &status);
502 
503     if (match == NULL) {
504         errln("English text without C1 bytes got no matches.");
505         goto bail;
506     }
507 
508     name  = ucsdet_getName(match, &status);
509 
510     if (strcmp(name, "ISO-8859-1") != 0) {
511         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
512     }
513 
514 bail:
515     delete[] bWindows;
516     delete[] bISO;
517 
518     ucsdet_close(csd);
519 #endif
520 }
521 
DetectionTest()522 void CharsetDetectionTest::DetectionTest()
523 {
524 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
525     UErrorCode status = U_ZERO_ERROR;
526     char path[2048];
527     const char *testFilePath = getPath(path, "csdetest.xml");
528 
529     if (testFilePath == NULL) {
530         return; /* Couldn't get path: error message already output. */
531     }
532 
533     UXMLParser  *parser = UXMLParser::createParser(status);
534     if (U_FAILURE(status)) {
535         dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
536         return;
537     }
538 
539     UXMLElement *root   = parser->parseFile(testFilePath, status);
540     if (!assertSuccess( "parseFile",status)) return;
541 
542     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
543     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
544     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
545 
546     const UXMLElement *testCase;
547     int32_t tc = 0;
548 
549     while((testCase = root->nextChildElement(tc)) != NULL) {
550         if (testCase->getTagName().compare(test_case) == 0) {
551             const UnicodeString *id = testCase->getAttribute(id_attr);
552             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
553             const UnicodeString  text = testCase->getText(TRUE);
554             int32_t encodingCount;
555             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
556 
557             for(int32_t e = 0; e < encodingCount; e += 1) {
558                 checkEncoding(text, encodingList[e], *id);
559             }
560 
561             delete[] encodingList;
562         }
563     }
564 
565     delete root;
566     delete parser;
567 #endif
568 }
569 
IBM424Test()570 void CharsetDetectionTest::IBM424Test()
571 {
572 #if !UCONFIG_ONLY_HTML_CONVERSION
573     UErrorCode status = U_ZERO_ERROR;
574 
575     static const UChar chars[] = {
576             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
577             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
578             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
579             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
580             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
581             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
582             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
583             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
584             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
585             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
586             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
587             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
588             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
589             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
590             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
591             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
592             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
593     };
594 
595     static const UChar chars_reverse[] = {
596             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
597             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
598             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
599             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
600             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
601             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
602             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
603             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
604             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
605             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
606             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
607             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
608             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
609             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
610             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
611             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
612             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
613             0x0000
614     };
615 
616     int32_t bLength = 0, brLength = 0;
617 
618     UnicodeString s1(chars);
619     UnicodeString s2(chars_reverse);
620 
621     char *bytes = extractBytes(s1, "IBM424", bLength);
622     char *bytes_r = extractBytes(s2, "IBM424", brLength);
623 
624     UCharsetDetector *csd = ucsdet_open(&status);
625 	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
626 	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
627 	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
628 	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
629     if (U_FAILURE(status)) {
630         errln("Error opening charset detector. - %s", u_errorName(status));
631     }
632     const UCharsetMatch *match;
633     const char *name;
634 
635     ucsdet_setText(csd, bytes, bLength, &status);
636     match = ucsdet_detect(csd, &status);
637 
638     if (match == NULL) {
639         errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
640         goto bail;
641     }
642 
643     name  = ucsdet_getName(match, &status);
644     if (strcmp(name, "IBM424_rtl") != 0) {
645         errln("Encoding detection failure for IBM424_rtl: got %s", name);
646     }
647 
648     ucsdet_setText(csd, bytes_r, brLength, &status);
649     match = ucsdet_detect(csd, &status);
650 
651     if (match == NULL) {
652         errln("Encoding detection failure for IBM424_ltr: got no matches.");
653         goto bail;
654     }
655 
656     name  = ucsdet_getName(match, &status);
657     if (strcmp(name, "IBM424_ltr") != 0) {
658         errln("Encoding detection failure for IBM424_ltr: got %s", name);
659     }
660 
661 bail:
662     delete[] bytes;
663     delete[] bytes_r;
664     ucsdet_close(csd);
665 #endif
666 }
667 
IBM420Test()668 void CharsetDetectionTest::IBM420Test()
669 {
670 #if !UCONFIG_ONLY_HTML_CONVERSION
671     UErrorCode status = U_ZERO_ERROR;
672 
673     static const UChar chars[] = {
674         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
675         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
676         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
677         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
678         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
679         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
680         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
681         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
682         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
683         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
684         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
685         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
686         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
687         0x0000
688     };
689     static const UChar chars_reverse[] = {
690         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
691         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
692         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
693         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
694         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
695         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
696         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
697         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
698         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
699         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
700         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
701         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
702         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
703         0x0000,
704     };
705 
706     int32_t bLength = 0, brLength = 0;
707 
708     UnicodeString s1(chars);
709     UnicodeString s2(chars_reverse);
710 
711     char *bytes = extractBytes(s1, "IBM420", bLength);
712     char *bytes_r = extractBytes(s2, "IBM420", brLength);
713 
714     UCharsetDetector *csd = ucsdet_open(&status);
715     if (U_FAILURE(status)) {
716         errln("Error opening charset detector. - %s", u_errorName(status));
717     }
718 	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
719 	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
720 	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
721 	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
722     const UCharsetMatch *match;
723     const char *name;
724 
725     ucsdet_setText(csd, bytes, bLength, &status);
726     match = ucsdet_detect(csd, &status);
727 
728     if (match == NULL) {
729         errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
730         goto bail;
731     }
732 
733     name  = ucsdet_getName(match, &status);
734     if (strcmp(name, "IBM420_rtl") != 0) {
735         errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
736     }
737 
738     ucsdet_setText(csd, bytes_r, brLength, &status);
739     match = ucsdet_detect(csd, &status);
740 
741     if (match == NULL) {
742         errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
743         goto bail;
744     }
745 
746     name  = ucsdet_getName(match, &status);
747     if (strcmp(name, "IBM420_ltr") != 0) {
748         errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
749     }
750 
751 bail:
752     delete[] bytes;
753     delete[] bytes_r;
754     ucsdet_close(csd);
755 #endif
756 }
757 
758 
Ticket6394Test()759 void CharsetDetectionTest::Ticket6394Test() {
760 #if !UCONFIG_NO_CONVERSION
761     const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
762                              "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
763                              "encodings more than once.  The hop through UnicodeString is for platforms "
764                              "where this char * string is be EBCDIC and needs conversion to Latin1.";
765     char latin1Text[sizeof(charText)];
766     UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
767 
768     UErrorCode status = U_ZERO_ERROR;
769     UCharsetDetector *csd = ucsdet_open(&status);
770     ucsdet_setText(csd, latin1Text, -1, &status);
771     if (U_FAILURE(status)) {
772         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
773         return;
774     }
775 
776     int32_t matchCount = 0;
777     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
778     if (U_FAILURE(status)) {
779         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
780         return;
781     }
782 
783     UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
784     int32_t i;
785     for (i=0; i<matchCount; i++) {
786         UnicodeString charSetName(ucsdet_getName(matches[i], &status));
787         if (U_FAILURE(status)) {
788             errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
789             status = U_ZERO_ERROR;
790         }
791         if (setOfCharsetNames.contains(charSetName)) {
792             errln("Fail at file %s, line %d ", __FILE__, __LINE__);
793             errln(UnicodeString("   Duplicate charset name = ") + charSetName);
794         }
795         setOfCharsetNames.add(charSetName);
796     }
797     ucsdet_close(csd);
798 #endif
799 }
800 
801 
802 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
803 //               similar Windows and non-Windows SBCS encodings. State was kept in the shared
804 //               Charset Recognizer objects, and could be overwritten.
Ticket6954Test()805 void CharsetDetectionTest::Ticket6954Test() {
806 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING
807     UErrorCode status = U_ZERO_ERROR;
808     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
809     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
810                             "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
811     UnicodeString sWindows  = ssWindows.unescape();
812     int32_t lISO = 0, lWindows = 0;
813     std::unique_ptr<char[]> bISO(extractBytes(sISO, "ISO-8859-1", lISO));
814     std::unique_ptr<char[]> bWindows(extractBytes(sWindows, "windows-1252", lWindows));
815 
816     // First do a plain vanilla detect of 1252 text
817 
818     LocalUCharsetDetectorPointer csd1(ucsdet_open(&status));
819     ucsdet_setText(csd1.getAlias(), bWindows.get(), lWindows, &status);
820     const UCharsetMatch *match1 = ucsdet_detect(csd1.getAlias(), &status);
821     const char *name1 = ucsdet_getName(match1, &status);
822     TEST_ASSERT_SUCCESS(status);
823     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
824 
825     // Next, using a completely separate detector, detect some 8859-1 text
826 
827     LocalUCharsetDetectorPointer csd2(ucsdet_open(&status));
828     ucsdet_setText(csd2.getAlias(), bISO.get(), lISO, &status);
829     const UCharsetMatch *match2 = ucsdet_detect(csd2.getAlias(), &status);
830     const char *name2 = ucsdet_getName(match2, &status);
831     TEST_ASSERT_SUCCESS(status);
832     TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
833 
834     // Recheck the 1252 results from the first detector, which should not have been
835     //  altered by the use of a different detector.
836 
837     name1 = ucsdet_getName(match1, &status);
838     TEST_ASSERT_SUCCESS(status);
839     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
840 #endif
841 }
842