1 /*
2  **********************************************************************
3  *   Copyright (C) 2005-2015, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  */
7 
8 
9 #include "unicode/utypes.h"
10 #include "unicode/ucsdet.h"
11 #include "unicode/ucnv.h"
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/uniset.h"
15 
16 #include "intltest.h"
17 #include "csdetest.h"
18 
19 #include "xmlparser.h"
20 
21 #include <stdlib.h>
22 #include <string.h>
23 
24 #ifdef DEBUG_DETECT
25 #include <stdio.h>
26 #endif
27 
/* Element count of a true array (not a pointer). The argument is fully
 * parenthesised so that expression arguments expand correctly. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* malloc()/free() based array helpers. NEW_ARRAY is wrapped in parentheses so
 * that the cast result can be used safely inside a larger expression. */
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

/* Delimiters used when splitting encoding lists and "charset/language" pairs. */
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

/* Reports a test error (with file and line) when the condition is false. */
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

/* Reports the failing status (with file and line) and returns from the
 * enclosing function; only usable in functions returning void. */
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
    return;}}
42 
43 
44 //---------------------------------------------------------------------------
45 //
46 //  Test class boilerplate
47 //
48 //---------------------------------------------------------------------------
CharsetDetectionTest()49 CharsetDetectionTest::CharsetDetectionTest()
50 {
51 }
52 
53 
~CharsetDetectionTest()54 CharsetDetectionTest::~CharsetDetectionTest()
55 {
56 }
57 
58 
59 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)60 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
61 {
62     if (exec) logln("TestSuite CharsetDetectionTest: ");
63     switch (index) {
64        case 0: name = "ConstructionTest";
65             if (exec) ConstructionTest();
66             break;
67 
68        case 1: name = "UTF8Test";
69             if (exec) UTF8Test();
70             break;
71 
72        case 2: name = "UTF16Test";
73             if (exec) UTF16Test();
74             break;
75 
76        case 3: name = "C1BytesTest";
77             if (exec) C1BytesTest();
78             break;
79 
80        case 4: name = "InputFilterTest";
81             if (exec) InputFilterTest();
82             break;
83 
84        case 5: name = "DetectionTest";
85             if (exec) DetectionTest();
86             break;
87 #if !UCONFIG_NO_LEGACY_CONVERSION
88        case 6: name = "IBM424Test";
89             if (exec) IBM424Test();
90             break;
91 
92        case 7: name = "IBM420Test";
93             if (exec) IBM420Test();
94             break;
95 #else
96        case 6:
97        case 7: name = "skip"; break;
98 #endif
99        case 8: name = "Ticket6394Test";
100             if (exec) Ticket6394Test();
101             break;
102 
103        case 9: name = "Ticket6954Test";
104             if (exec) Ticket6954Test();
105             break;
106 
107         default: name = "";
108             break; //needed to end loop
109     }
110 }
111 
/**
 * Splits `src` at every occurrence of `ch`.
 *
 * @param src    string to split
 * @param ch     delimiter code unit
 * @param splits receives the number of pieces (always at least 1)
 * @return array of `splits` strings allocated with new[]; the caller
 *         releases it with delete[]
 */
static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
{
    // Pass 1: one piece, plus one more for every delimiter found.
    splits = 1;
    for (int32_t pos = src.indexOf(ch, 0); pos >= 0; pos = src.indexOf(ch, pos + 1)) {
        ++splits;
    }

    UnicodeString *pieces = new UnicodeString[splits];

    // Pass 2: copy out the text between consecutive delimiters.
    int32_t filled = 0;
    int32_t begin = 0;
    for (int32_t delim = src.indexOf(ch, begin); delim >= 0; delim = src.indexOf(ch, begin)) {
        src.extractBetween(begin, delim, pieces[filled]);
        ++filled;
        begin = delim + 1;
    }

    // Whatever follows the last delimiter (possibly nothing) is the final piece.
    src.extractBetween(begin, src.length(), pieces[filled]);

    return pieces;
}
136 
extractBytes(const UnicodeString & source,const char * codepage,int32_t & length)137 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
138 {
139     int32_t sLength = source.length();
140     char *bytes = NULL;
141 
142     length = source.extract(0, sLength, NULL, codepage);
143 
144     if (length > 0) {
145         bytes = NEW_ARRAY(char, length + 1);
146         source.extract(0, sLength, bytes, codepage);
147     }
148 
149     return bytes;
150 }
151 
freeBytes(char * bytes)152 static void freeBytes(char *bytes)
153 {
154     DELETE_ARRAY(bytes);
155 }
156 
checkEncoding(const UnicodeString & testString,const UnicodeString & encoding,const UnicodeString & id)157 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
158 {
159     int32_t splits = 0;
160     int32_t testLength = testString.length();
161     UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
162     UErrorCode status = U_ZERO_ERROR;
163     int32_t cpLength = eSplit[0].length();
164     char codepage[64];
165 
166     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
167     codepage[cpLength] = '\0';
168 
169     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
170 
171     int32_t byteLength = 0;
172     char *bytes = extractBytes(testString, codepage, byteLength);
173 
174     if (bytes == NULL) {
175 #if !UCONFIG_NO_LEGACY_CONVERSION
176         dataerrln("Can't open a " + encoding + " converter for " + id);
177 #endif
178         return;
179     }
180 
181     ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
182 
183     int32_t matchCount = 0;
184     const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
185 
186 
187     UnicodeString name(ucsdet_getName(matches[0], &status));
188     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
189     UChar *decoded = NULL;
190     int32_t dLength = 0;
191 
192     if (matchCount == 0) {
193         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
194         goto bail;
195     }
196 
197     if (name.compare(eSplit[0]) != 0) {
198         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
199 
200 #ifdef DEBUG_DETECT
201         for (int32_t m = 0; m < matchCount; m += 1) {
202             const char *name = ucsdet_getName(matches[m], &status);
203             const char *lang = ucsdet_getLanguage(matches[m], &status);
204             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
205 
206             printf("%s (%s) %d\n", name, lang, confidence);
207         }
208 #endif
209         goto bail;
210     }
211 
212     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
213         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
214         goto bail;
215     }
216 
217     decoded = NEW_ARRAY(UChar, testLength);
218     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
219 
220     if (testString.compare(decoded, dLength) != 0) {
221         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
222 
223 #ifdef DEBUG_DETECT
224         for(int32_t i = 0; i < testLength; i += 1) {
225             if(testString[i] != decoded[i]) {
226                 printf("Strings differ at byte %d\n", i);
227                 break;
228             }
229         }
230 #endif
231 
232     }
233 
234     DELETE_ARRAY(decoded);
235 
236 bail:
237     freeBytes(bytes);
238     delete[] eSplit;
239 }
240 
getPath(char buffer[2048],const char * filename)241 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
242     UErrorCode status = U_ZERO_ERROR;
243     const char *testDataDirectory = IntlTest::getSourceTestData(status);
244 
245     if (U_FAILURE(status)) {
246         errln("ERROR: getPath() failed - %s", u_errorName(status));
247         return NULL;
248     }
249 
250     strcpy(buffer, testDataDirectory);
251     strcat(buffer, filename);
252     return buffer;
253 }
254 
ConstructionTest()255 void CharsetDetectionTest::ConstructionTest()
256 {
257     IcuTestErrorCode status(*this, "ConstructionTest");
258     LocalUCharsetDetectorPointer csd(ucsdet_open(status));
259     LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
260     int32_t count = uenum_count(e.getAlias(), status);
261 
262 #ifdef DEBUG_DETECT
263     printf("There are %d recognizers.\n", count);
264 #endif
265 
266     for(int32_t i = 0; i < count; i += 1) {
267         int32_t length;
268         const char *name = uenum_next(e.getAlias(), &length, status);
269 
270         if(name == NULL || length <= 0) {
271             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
272         }
273 
274 #ifdef DEBUG_DETECT
275         printf("%s\n", name);
276 #endif
277     }
278 
279     const char* defDisabled[] = {
280         "IBM420_rtl", "IBM420_ltr",
281         "IBM424_rtl", "IBM424_ltr",
282         0
283     };
284 
285     LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
286     const char *activeName = NULL;
287 
288     while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
289         // the charset must be included in all list
290         UBool found = FALSE;
291 
292         const char *name = NULL;
293         uenum_reset(e.getAlias(), status);
294         while ((name = uenum_next(e.getAlias(), NULL, status))) {
295             if (strcmp(activeName, name) == 0) {
296                 found = TRUE;
297                 break;
298             }
299         }
300 
301         if (!found) {
302             errln(UnicodeString(activeName) + " is not included in the all charset list.");
303         }
304 
305         // some charsets are disabled by default
306         found = FALSE;
307         for (int32_t i = 0; defDisabled[i] != 0; i++) {
308             if (strcmp(activeName, defDisabled[i]) == 0) {
309                 found = TRUE;
310                 break;
311             }
312         }
313         if (found) {
314             errln(UnicodeString(activeName) + " should not be included in the default charset list.");
315         }
316     }
317 }
318 
UTF8Test()319 void CharsetDetectionTest::UTF8Test()
320 {
321     UErrorCode status = U_ZERO_ERROR;
322     UnicodeString ss = "This is a string with some non-ascii characters that will "
323                        "be converted to UTF-8, then shoved through the detection process.  "
324                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
325                        "Sure would be nice if our source could contain Unicode directly!";
326     UnicodeString s = ss.unescape();
327     int32_t byteLength = 0, sLength = s.length();
328     char *bytes = extractBytes(s, "UTF-8", byteLength);
329     UCharsetDetector *csd = ucsdet_open(&status);
330     const UCharsetMatch *match;
331     UChar *detected = NEW_ARRAY(UChar, sLength);
332 
333     ucsdet_setText(csd, bytes, byteLength, &status);
334     match = ucsdet_detect(csd, &status);
335 
336     if (match == NULL) {
337         errln("Detection failure for UTF-8: got no matches.");
338         goto bail;
339     }
340 
341     ucsdet_getUChars(match, detected, sLength, &status);
342 
343     if (s.compare(detected, sLength) != 0) {
344         errln("Round-trip test failed!");
345     }
346 
347     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
348 
349 bail:
350     DELETE_ARRAY(detected);
351     freeBytes(bytes);
352     ucsdet_close(csd);
353 }
354 
UTF16Test()355 void CharsetDetectionTest::UTF16Test()
356 {
357     UErrorCode status = U_ZERO_ERROR;
358     /* Notice the BOM on the start of this string */
359     UChar chars[] = {
360         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
361         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
362         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
363         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
364         0x064a, 0x062a, 0x0000};
365     UnicodeString s(chars);
366     int32_t beLength = 0, leLength = 0;
367     char *beBytes = extractBytes(s, "UTF-16BE", beLength);
368     char *leBytes = extractBytes(s, "UTF-16LE", leLength);
369     UCharsetDetector *csd = ucsdet_open(&status);
370     const UCharsetMatch *match;
371     const char *name;
372     int32_t conf;
373 
374     ucsdet_setText(csd, beBytes, beLength, &status);
375     match = ucsdet_detect(csd, &status);
376 
377     if (match == NULL) {
378         errln("Encoding detection failure for UTF-16BE: got no matches.");
379         goto try_le;
380     }
381 
382     name  = ucsdet_getName(match, &status);
383     conf  = ucsdet_getConfidence(match, &status);
384 
385     if (strcmp(name, "UTF-16BE") != 0) {
386         errln("Encoding detection failure for UTF-16BE: got %s", name);
387         goto try_le; // no point in looking at confidence if we got the wrong character set.
388     }
389 
390     if (conf != 100) {
391         errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
392     }
393 
394 try_le:
395     ucsdet_setText(csd, leBytes, leLength, &status);
396     match = ucsdet_detect(csd, &status);
397 
398     if (match == NULL) {
399         errln("Encoding detection failure for UTF-16LE: got no matches.");
400         goto bail;
401     }
402 
403     name  = ucsdet_getName(match, &status);
404     conf = ucsdet_getConfidence(match, &status);
405 
406 
407     if (strcmp(name, "UTF-16LE") != 0) {
408         errln("Enconding detection failure for UTF-16LE: got %s", name);
409         goto bail; // no point in looking at confidence if we got the wrong character set.
410     }
411 
412     if (conf != 100) {
413         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
414     }
415 
416 bail:
417     freeBytes(leBytes);
418     freeBytes(beBytes);
419     ucsdet_close(csd);
420 }
421 
InputFilterTest()422 void CharsetDetectionTest::InputFilterTest()
423 {
424     UErrorCode status = U_ZERO_ERROR;
425     UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
426     UnicodeString s  = ss.unescape();
427     int32_t byteLength = 0;
428     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
429     UCharsetDetector *csd = ucsdet_open(&status);
430     const UCharsetMatch *match;
431     const char *lang, *name;
432 
433     ucsdet_enableInputFilter(csd, TRUE);
434 
435     if (!ucsdet_isInputFilterEnabled(csd)) {
436         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
437     }
438 
439 
440     ucsdet_setText(csd, bytes, byteLength, &status);
441     match = ucsdet_detect(csd, &status);
442 
443     if (match == NULL) {
444         errln("Turning on the input filter resulted in no matches.");
445         goto turn_off;
446     }
447 
448     name = ucsdet_getName(match, &status);
449 
450     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
451         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
452     } else {
453         lang = ucsdet_getLanguage(match, &status);
454 
455         if (lang == NULL || strcmp(lang, "fr") != 0) {
456             errln("Input filter did not strip markup!");
457         }
458     }
459 
460 turn_off:
461     ucsdet_enableInputFilter(csd, FALSE);
462     ucsdet_setText(csd, bytes, byteLength, &status);
463     match = ucsdet_detect(csd, &status);
464 
465     if (match == NULL) {
466         errln("Turning off the input filter resulted in no matches.");
467         goto bail;
468     }
469 
470     name = ucsdet_getName(match, &status);
471 
472     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
473         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
474     } else {
475         lang = ucsdet_getLanguage(match, &status);
476 
477         if (lang == NULL || strcmp(lang, "en") != 0) {
478             errln("Unfiltered input did not detect as English!");
479         }
480     }
481 
482 bail:
483     freeBytes(bytes);
484     ucsdet_close(csd);
485 }
486 
C1BytesTest()487 void CharsetDetectionTest::C1BytesTest()
488 {
489 #if !UCONFIG_NO_LEGACY_CONVERSION
490     UErrorCode status = U_ZERO_ERROR;
491     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
492     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
493     UnicodeString sWindows  = ssWindows.unescape();
494     int32_t lISO = 0, lWindows = 0;
495     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
496     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
497     UCharsetDetector *csd = ucsdet_open(&status);
498     const UCharsetMatch *match;
499     const char *name;
500 
501     ucsdet_setText(csd, bWindows, lWindows, &status);
502     match = ucsdet_detect(csd, &status);
503 
504     if (match == NULL) {
505         errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
506         goto bail;
507     }
508 
509     name  = ucsdet_getName(match, &status);
510 
511     if (strcmp(name, "windows-1252") != 0) {
512         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
513     }
514 
515     ucsdet_setText(csd, bISO, lISO, &status);
516     match = ucsdet_detect(csd, &status);
517 
518     if (match == NULL) {
519         errln("English text without C1 bytes got no matches.");
520         goto bail;
521     }
522 
523     name  = ucsdet_getName(match, &status);
524 
525     if (strcmp(name, "ISO-8859-1") != 0) {
526         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
527     }
528 
529 bail:
530     freeBytes(bWindows);
531     freeBytes(bISO);
532 
533     ucsdet_close(csd);
534 #endif
535 }
536 
DetectionTest()537 void CharsetDetectionTest::DetectionTest()
538 {
539 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
540     UErrorCode status = U_ZERO_ERROR;
541     char path[2048];
542     const char *testFilePath = getPath(path, "csdetest.xml");
543 
544     if (testFilePath == NULL) {
545         return; /* Couldn't get path: error message already output. */
546     }
547 
548     UXMLParser  *parser = UXMLParser::createParser(status);
549     if (U_FAILURE(status)) {
550         dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
551         return;
552     }
553 
554     UXMLElement *root   = parser->parseFile(testFilePath, status);
555     if (!assertSuccess( "parseFile",status)) return;
556 
557     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
558     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
559     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
560 
561     const UXMLElement *testCase;
562     int32_t tc = 0;
563 
564     while((testCase = root->nextChildElement(tc)) != NULL) {
565         if (testCase->getTagName().compare(test_case) == 0) {
566             const UnicodeString *id = testCase->getAttribute(id_attr);
567             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
568             const UnicodeString  text = testCase->getText(TRUE);
569             int32_t encodingCount;
570             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
571 
572             for(int32_t e = 0; e < encodingCount; e += 1) {
573                 checkEncoding(text, encodingList[e], *id);
574             }
575 
576             delete[] encodingList;
577         }
578     }
579 
580     delete root;
581     delete parser;
582 #endif
583 }
584 
IBM424Test()585 void CharsetDetectionTest::IBM424Test()
586 {
587 #if !UCONFIG_ONLY_HTML_CONVERSION
588     UErrorCode status = U_ZERO_ERROR;
589 
590     static const UChar chars[] = {
591             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
592             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
593             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
594             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
595             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
596             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
597             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
598             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
599             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
600             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
601             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
602             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
603             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
604             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
605             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
606             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
607             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
608     };
609 
610     static const UChar chars_reverse[] = {
611             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
612             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
613             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
614             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
615             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
616             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
617             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
618             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
619             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
620             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
621             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
622             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
623             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
624             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
625             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
626             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
627             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
628             0x0000
629     };
630 
631     int32_t bLength = 0, brLength = 0;
632 
633     UnicodeString s1(chars);
634     UnicodeString s2(chars_reverse);
635 
636     char *bytes = extractBytes(s1, "IBM424", bLength);
637     char *bytes_r = extractBytes(s2, "IBM424", brLength);
638 
639     UCharsetDetector *csd = ucsdet_open(&status);
640 	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
641 	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
642 	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
643 	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
644     if (U_FAILURE(status)) {
645         errln("Error opening charset detector. - %s", u_errorName(status));
646     }
647     const UCharsetMatch *match;
648     const char *name;
649 
650     ucsdet_setText(csd, bytes, bLength, &status);
651     match = ucsdet_detect(csd, &status);
652 
653     if (match == NULL) {
654         errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
655         goto bail;
656     }
657 
658     name  = ucsdet_getName(match, &status);
659     if (strcmp(name, "IBM424_rtl") != 0) {
660         errln("Encoding detection failure for IBM424_rtl: got %s", name);
661     }
662 
663     ucsdet_setText(csd, bytes_r, brLength, &status);
664     match = ucsdet_detect(csd, &status);
665 
666     if (match == NULL) {
667         errln("Encoding detection failure for IBM424_ltr: got no matches.");
668         goto bail;
669     }
670 
671     name  = ucsdet_getName(match, &status);
672     if (strcmp(name, "IBM424_ltr") != 0) {
673         errln("Encoding detection failure for IBM424_ltr: got %s", name);
674     }
675 
676 bail:
677     freeBytes(bytes);
678     freeBytes(bytes_r);
679     ucsdet_close(csd);
680 #endif
681 }
682 
IBM420Test()683 void CharsetDetectionTest::IBM420Test()
684 {
685 #if !UCONFIG_ONLY_HTML_CONVERSION
686     UErrorCode status = U_ZERO_ERROR;
687 
688     static const UChar chars[] = {
689         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
690         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
691         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
692         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
693         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
694         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
695         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
696         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
697         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
698         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
699         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
700         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
701         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
702         0x0000
703     };
704     static const UChar chars_reverse[] = {
705         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
706         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
707         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
708         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
709         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
710         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
711         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
712         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
713         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
714         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
715         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
716         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
717         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
718         0x0000,
719     };
720 
721     int32_t bLength = 0, brLength = 0;
722 
723     UnicodeString s1(chars);
724     UnicodeString s2(chars_reverse);
725 
726     char *bytes = extractBytes(s1, "IBM420", bLength);
727     char *bytes_r = extractBytes(s2, "IBM420", brLength);
728 
729     UCharsetDetector *csd = ucsdet_open(&status);
730     if (U_FAILURE(status)) {
731         errln("Error opening charset detector. - %s", u_errorName(status));
732     }
733 	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
734 	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
735 	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
736 	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
737     const UCharsetMatch *match;
738     const char *name;
739 
740     ucsdet_setText(csd, bytes, bLength, &status);
741     match = ucsdet_detect(csd, &status);
742 
743     if (match == NULL) {
744         errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
745         goto bail;
746     }
747 
748     name  = ucsdet_getName(match, &status);
749     if (strcmp(name, "IBM420_rtl") != 0) {
750         errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
751     }
752 
753     ucsdet_setText(csd, bytes_r, brLength, &status);
754     match = ucsdet_detect(csd, &status);
755 
756     if (match == NULL) {
757         errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
758         goto bail;
759     }
760 
761     name  = ucsdet_getName(match, &status);
762     if (strcmp(name, "IBM420_ltr") != 0) {
763         errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
764     }
765 
766 bail:
767     freeBytes(bytes);
768     freeBytes(bytes_r);
769     ucsdet_close(csd);
770 #endif
771 }
772 
773 
Ticket6394Test()774 void CharsetDetectionTest::Ticket6394Test() {
775 #if !UCONFIG_NO_CONVERSION
776     const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
777                              "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
778                              "encodings more than once.  The hop through UnicodeString is for platforms "
779                              "where this char * string is be EBCDIC and needs conversion to Latin1.";
780     char latin1Text[sizeof(charText)];
781     UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
782 
783     UErrorCode status = U_ZERO_ERROR;
784     UCharsetDetector *csd = ucsdet_open(&status);
785     ucsdet_setText(csd, latin1Text, -1, &status);
786     if (U_FAILURE(status)) {
787         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
788         return;
789     }
790 
791     int32_t matchCount = 0;
792     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
793     if (U_FAILURE(status)) {
794         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
795         return;
796     }
797 
798     UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
799     int32_t i;
800     for (i=0; i<matchCount; i++) {
801         UnicodeString charSetName(ucsdet_getName(matches[i], &status));
802         if (U_FAILURE(status)) {
803             errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
804             status = U_ZERO_ERROR;
805         }
806         if (setOfCharsetNames.contains(charSetName)) {
807             errln("Fail at file %s, line %d ", __FILE__, __LINE__);
808             errln(UnicodeString("   Duplicate charset name = ") + charSetName);
809         }
810         setOfCharsetNames.add(charSetName);
811     }
812     ucsdet_close(csd);
813 #endif
814 }
815 
816 
817 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
818 //               similar Windows and non-Windows SBCS encodings. State was kept in the shared
819 //               Charset Recognizer objects, and could be overwritten.
Ticket6954Test()820 void CharsetDetectionTest::Ticket6954Test() {
821 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING
822     UErrorCode status = U_ZERO_ERROR;
823     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
824     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
825                             "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
826     UnicodeString sWindows  = ssWindows.unescape();
827     int32_t lISO = 0, lWindows = 0;
828     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
829     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
830 
831     // First do a plain vanilla detect of 1252 text
832 
833     UCharsetDetector *csd1 = ucsdet_open(&status);
834     ucsdet_setText(csd1, bWindows, lWindows, &status);
835     const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
836     const char *name1 = ucsdet_getName(match1, &status);
837     TEST_ASSERT_SUCCESS(status);
838     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
839 
840     // Next, using a completely separate detector, detect some 8859-1 text
841 
842     UCharsetDetector *csd2 = ucsdet_open(&status);
843     ucsdet_setText(csd2, bISO, lISO, &status);
844     const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
845     const char *name2 = ucsdet_getName(match2, &status);
846     TEST_ASSERT_SUCCESS(status);
847     TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
848 
849     // Recheck the 1252 results from the first detector, which should not have been
850     //  altered by the use of a different detector.
851 
852     name1 = ucsdet_getName(match1, &status);
853     TEST_ASSERT_SUCCESS(status);
854     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
855 
856     ucsdet_close(csd1);
857     ucsdet_close(csd2);
858     freeBytes(bISO);
859     freeBytes(bWindows);
860 #endif
861 }
862