1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 //#define LOG_NDEBUG 0
18 #define LOG_TAG "CharacterEncodingDector"
19 #include <utils/Log.h>
20 
21 #include <CharacterEncodingDetector.h>
22 #include "CharacterEncodingDetectorTables.h"
23 
24 #include "utils/Vector.h"
25 #include "StringArray.h"
26 
27 #include "unicode/ucnv.h"
28 #include "unicode/ucsdet.h"
29 #include "unicode/ustring.h"
30 
31 namespace android {
32 
CharacterEncodingDetector()33 CharacterEncodingDetector::CharacterEncodingDetector() {
34 
35     UErrorCode status = U_ZERO_ERROR;
36     mUtf8Conv = ucnv_open("UTF-8", &status);
37     if (U_FAILURE(status)) {
38         ALOGE("could not create UConverter for UTF-8");
39         mUtf8Conv = NULL;
40     }
41 }
42 
~CharacterEncodingDetector()43 CharacterEncodingDetector::~CharacterEncodingDetector() {
44     ucnv_close(mUtf8Conv);
45 }
46 
addTag(const char * name,const char * value)47 void CharacterEncodingDetector::addTag(const char *name, const char *value) {
48     mNames.push_back(name);
49     mValues.push_back(value);
50 }
51 
size()52 size_t CharacterEncodingDetector::size() {
53     return mNames.size();
54 }
55 
getTag(int index,const char ** name,const char ** value)56 status_t CharacterEncodingDetector::getTag(int index, const char **name, const char**value) {
57     if (index >= mNames.size()) {
58         return BAD_VALUE;
59     }
60 
61     *name = mNames.getEntry(index);
62     *value = mValues.getEntry(index);
63     return OK;
64 }
65 
// Returns true when each of the first 'len' bytes of 'value' is a printable
// 7-bit ASCII character (0x20 through 0x7e). Control characters, DEL (0x7f)
// and any byte with the high bit set make the result false. An empty range
// is considered printable.
static bool isPrintableAscii(const char *value, size_t len) {
    const char *const end = value + len;
    for (const char *cursor = value; cursor != end; ++cursor) {
        const unsigned char byte = static_cast<unsigned char>(*cursor);
        const bool printable = (byte >= 0x20) && (byte < 0x7f);
        if (!printable) {
            return false;
        }
    }
    return true;
}
74 
detectAndConvert()75 void CharacterEncodingDetector::detectAndConvert() {
76 
77     int size = mNames.size();
78     ALOGV("%d tags before conversion", size);
79     for (int i = 0; i < size; i++) {
80         ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i));
81     }
82 
83     if (size && mUtf8Conv) {
84 
85         UErrorCode status = U_ZERO_ERROR;
86         UCharsetDetector *csd = ucsdet_open(&status);
87         const UCharsetMatch *ucm;
88 
89         // try combined detection of artist/album/title etc.
90         char buf[1024];
91         buf[0] = 0;
92         bool allprintable = true;
93         for (int i = 0; i < size; i++) {
94             const char *name = mNames.getEntry(i);
95             const char *value = mValues.getEntry(i);
96             if (!isPrintableAscii(value, strlen(value)) && (
97                         !strcmp(name, "artist") ||
98                         !strcmp(name, "albumartist") ||
99                         !strcmp(name, "composer") ||
100                         !strcmp(name, "genre") ||
101                         !strcmp(name, "album") ||
102                         !strcmp(name, "title"))) {
103                 strlcat(buf, value, sizeof(buf));
104                 // separate tags by space so ICU's ngram detector can do its job
105                 strlcat(buf, " ", sizeof(buf));
106                 allprintable = false;
107             }
108         }
109 
110         const char *combinedenc = "UTF-8";
111         if (allprintable) {
112             // since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so
113             // no need to even call it
114             ALOGV("all tags are printable, assuming ascii (%zu)", strlen(buf));
115         } else {
116             ucsdet_setText(csd, buf, strlen(buf), &status);
117             int32_t matches;
118             const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
119             bool goodmatch = true;
120             int highest = 0;
121             const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf),
122                     ucma, matches, &goodmatch, &highest);
123 
124             ALOGV("goodmatch: %s, highest: %d", goodmatch ? "true" : "false", highest);
125             if (!goodmatch && (highest < 15 || strlen(buf) < 20)) {
126                 ALOGV("not a good match, trying with more data");
127                 // This string might be too short for ICU to do anything useful with.
128                 // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because
129                 //  the ISO detector reports a confidence of 0, while the GB18030 detector reports
130                 //  a confidence of 10 with no invalid characters)
131                 // Append artist, album and title if they were previously omitted because they
132                 // were printable ascii.
133                 bool added = false;
134                 for (int i = 0; i < size; i++) {
135                     const char *name = mNames.getEntry(i);
136                     const char *value = mValues.getEntry(i);
137                     if (isPrintableAscii(value, strlen(value)) && (
138                                 !strcmp(name, "artist") ||
139                                 !strcmp(name, "album") ||
140                                 !strcmp(name, "title"))) {
141                         strlcat(buf, value, sizeof(buf));
142                         strlcat(buf, " ", sizeof(buf));
143                         added = true;
144                     }
145                 }
146                 if (added) {
147                     ucsdet_setText(csd, buf, strlen(buf), &status);
148                     ucma = ucsdet_detectAll(csd, &matches, &status);
149                     bestCombinedMatch = getPreferred(buf, strlen(buf),
150                             ucma, matches, &goodmatch, &highest);
151                     if (!goodmatch && highest <= 15) {
152                         ALOGV("still not a good match after adding printable tags");
153                         bestCombinedMatch = NULL;
154                     }
155                 } else {
156                     ALOGV("no printable tags to add");
157                 }
158             }
159 
160             if (bestCombinedMatch != NULL) {
161                 combinedenc = ucsdet_getName(bestCombinedMatch, &status);
162             } else {
163                 combinedenc = "ISO-8859-1";
164             }
165         }
166 
167         for (int i = 0; i < size; i++) {
168             const char *name = mNames.getEntry(i);
169             uint8_t* src = (uint8_t *)mValues.getEntry(i);
170             int len = strlen((char *)src);
171 
172             ALOGV("@@@ checking %s", name);
173             const char *s = mValues.getEntry(i);
174             int32_t inputLength = strlen(s);
175             const char *enc;
176 
177             if (!allprintable && (!strcmp(name, "artist") ||
178                     !strcmp(name, "albumartist") ||
179                     !strcmp(name, "composer") ||
180                     !strcmp(name, "genre") ||
181                     !strcmp(name, "album") ||
182                     !strcmp(name, "title"))) {
183                 // use encoding determined from the combination of artist/album/title etc.
184                 enc = combinedenc;
185             } else {
186                 if (isPrintableAscii(s, inputLength)) {
187                     enc = "UTF-8";
188                     ALOGV("@@@@ %s is ascii", mNames.getEntry(i));
189                 } else {
190                     ucsdet_setText(csd, s, inputLength, &status);
191                     ucm = ucsdet_detect(csd, &status);
192                     if (!ucm) {
193                         mValues.setEntry(i, "???");
194                         continue;
195                     }
196                     enc = ucsdet_getName(ucm, &status);
197                     ALOGV("@@@@ recognized charset: %s for %s confidence %d",
198                             enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
199                 }
200             }
201 
202             if (strcmp(enc,"UTF-8") != 0) {
203                 // only convert if the source encoding isn't already UTF-8
204                 ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i));
205                 status = U_ZERO_ERROR;
206                 UConverter *conv = ucnv_open(enc, &status);
207                 if (U_FAILURE(status)) {
208                     ALOGW("could not create UConverter for %s (%d), falling back to ISO-8859-1",
209                             enc, status);
210                     status = U_ZERO_ERROR;
211                     conv = ucnv_open("ISO-8859-1", &status);
212                     if (U_FAILURE(status)) {
213                         ALOGW("could not create UConverter for ISO-8859-1 either");
214                         continue;
215                     }
216                 }
217 
218                 // convert from native encoding to UTF-8
219                 const char* source = mValues.getEntry(i);
220                 int targetLength = len * 3 + 1;
221                 char* buffer = new char[targetLength];
222                 // don't normally check for NULL, but in this case targetLength may be large
223                 if (!buffer)
224                     break;
225                 char* target = buffer;
226 
227                 ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength,
228                         &source, source + strlen(source),
229                         NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
230 
231                 if (U_FAILURE(status)) {
232                     ALOGE("ucnv_convertEx failed: %d", status);
233                     mValues.setEntry(i, "???");
234                 } else {
235                     // zero terminate
236                     *target = 0;
237                     // strip trailing spaces
238                     while (--target > buffer && *target == ' ') {
239                         *target = 0;
240                     }
241                     // skip leading spaces
242                     char *start = buffer;
243                     while (*start == ' ') {
244                         start++;
245                     }
246                     mValues.setEntry(i, start);
247                 }
248 
249                 delete[] buffer;
250 
251                 ucnv_close(conv);
252             }
253         }
254 
255         for (int i = size - 1; i >= 0; --i) {
256             if (strlen(mValues.getEntry(i)) == 0) {
257                 ALOGV("erasing %s because entry is empty", mNames.getEntry(i));
258                 mNames.erase(i);
259                 mValues.erase(i);
260             }
261         }
262 
263         ucsdet_close(csd);
264     }
265 }
266 
267 /*
268  * When ICU detects multiple encoding matches, apply additional heuristics to determine
269  * which one is the best match, since ICU can't always be trusted to make the right choice.
270  *
271  * What this method does is:
272  * - decode the input using each of the matches found
273  * - recalculate the starting confidence level for multibyte encodings using a different
274  *   algorithm and larger frequent character lists than ICU
275  * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
276  * - pick the highest match
277  * - signal to the caller whether this match is considered good: confidence > 15, and confidence
278  *   delta with the next runner up > 15
279  */
getPreferred(const char * input,size_t len,const UCharsetMatch ** ucma,size_t nummatches,bool * goodmatch,int * highestmatch)280 const UCharsetMatch *CharacterEncodingDetector::getPreferred(
281         const char *input, size_t len,
282         const UCharsetMatch** ucma, size_t nummatches,
283         bool *goodmatch, int *highestmatch) {
284 
285     *goodmatch = false;
286     Vector<const UCharsetMatch*> matches;
287     UErrorCode status = U_ZERO_ERROR;
288 
289     ALOGV("%zu matches", nummatches);
290     for (size_t i = 0; i < nummatches; i++) {
291         const char *encname = ucsdet_getName(ucma[i], &status);
292         int confidence = ucsdet_getConfidence(ucma[i], &status);
293         ALOGV("%zu: %s %d", i, encname, confidence);
294         matches.push_back(ucma[i]);
295     }
296 
297     size_t num = matches.size();
298     if (num == 0) {
299         return NULL;
300     }
301     if (num == 1) {
302         int confidence = ucsdet_getConfidence(matches[0], &status);
303         if (confidence > 15) {
304             *goodmatch = true;
305         }
306         return matches[0];
307     }
308 
309     ALOGV("considering %zu matches", num);
310 
311     // keep track of how many "special" characters result when converting the input using each
312     // encoding
313     Vector<int> newconfidence;
314     for (size_t i = 0; i < num; i++) {
315         const uint16_t *freqdata = NULL;
316         float freqcoverage = 0;
317         status = U_ZERO_ERROR;
318         const char *encname = ucsdet_getName(matches[i], &status);
319         int confidence = ucsdet_getConfidence(matches[i], &status);
320         if (!strcmp("GB18030", encname)) {
321             freqdata = frequent_zhCN;
322             freqcoverage = frequent_zhCN_coverage;
323         } else if (!strcmp("Big5", encname)) {
324             freqdata = frequent_zhTW;
325             freqcoverage = frequent_zhTW_coverage;
326         } else if (!strcmp("EUC-KR", encname)) {
327             freqdata = frequent_ko;
328             freqcoverage = frequent_ko_coverage;
329         } else if (!strcmp("EUC-JP", encname)) {
330             freqdata = frequent_ja;
331             freqcoverage = frequent_ja_coverage;
332         } else if (!strcmp("Shift_JIS", encname)) {
333             freqdata = frequent_ja;
334             freqcoverage = frequent_ja_coverage;
335         }
336 
337         ALOGV("%zu: %s %d", i, encname, confidence);
338         status = U_ZERO_ERROR;
339         UConverter *conv = ucnv_open(encname, &status);
340         int demerit = 0;
341         if (U_FAILURE(status)) {
342             ALOGV("failed to open %s: %d", encname, status);
343             confidence = 0;
344             demerit += 1000;
345         }
346         const char *source = input;
347         const char *sourceLimit = input + len;
348         status = U_ZERO_ERROR;
349         int frequentchars = 0;
350         int totalchars = 0;
351         while (true) {
352             // demerit the current encoding for each "special" character found after conversion.
353             // The amount of demerit is somewhat arbitrarily chosen.
354             int inchar;
355             if (source != sourceLimit) {
356                 inchar = (source[0] << 8) + source[1];
357             }
358             UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
359             if (!U_SUCCESS(status)) {
360                 break;
361             }
362             if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) {
363                 ALOGV("control character %x", c);
364                 demerit += 100;
365             } else if ((c == 0xa0)                      // no-break space
366                     || (c >= 0xa2 && c <= 0xbe)         // symbols, superscripts
367                     || (c == 0xd7) || (c == 0xf7)       // multiplication and division signs
368                     || (c >= 0x2000 && c <= 0x209f)) {  // punctuation, superscripts
369                 ALOGV("unlikely character %x", c);
370                 demerit += 10;
371             } else if (c >= 0xe000 && c <= 0xf8ff) {
372                 ALOGV("private use character %x", c);
373                 demerit += 30;
374             } else if (c >= 0x2190 && c <= 0x2bff) {
375                 // this range comprises various symbol ranges that are unlikely to appear in
376                 // music file metadata.
377                 ALOGV("symbol %x", c);
378                 demerit += 10;
379             } else if (c == 0xfffd) {
380                 ALOGV("replacement character");
381                 demerit += 50;
382             } else if (c >= 0xfff0 && c <= 0xfffc) {
383                 ALOGV("unicode special %x", c);
384                 demerit += 50;
385             } else if (freqdata != NULL) {
386                 totalchars++;
387                 if (isFrequent(freqdata, c)) {
388                     frequentchars++;
389                 }
390             }
391         }
392         if (freqdata != NULL && totalchars != 0) {
393             int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage;
394             ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence,
395                     totalchars, frequentchars);
396             if (myconfidence > 100) myconfidence = 100;
397             if (myconfidence < 0) myconfidence = 0;
398             confidence = myconfidence;
399         }
400         ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit);
401         newconfidence.push_back(confidence - demerit);
402         ucnv_close(conv);
403         if (i == 0 && (confidence - demerit) == 100) {
404             // no need to check any further, we'll end up using this match anyway
405             break;
406         }
407     }
408 
409     // find match with highest confidence after adjusting for unlikely characters
410     int highest = newconfidence[0];
411     size_t highestidx = 0;
412     int runnerup = -10000;
413     int runnerupidx = -10000;
414     num = newconfidence.size();
415     for (size_t i = 1; i < num; i++) {
416         if (newconfidence[i] > highest) {
417             runnerup = highest;
418             runnerupidx = highestidx;
419             highest = newconfidence[i];
420             highestidx = i;
421         } else if (newconfidence[i] > runnerup){
422             runnerup = newconfidence[i];
423             runnerupidx = i;
424         }
425     }
426     status = U_ZERO_ERROR;
427     ALOGV("selecting: '%s' w/ %d confidence",
428             ucsdet_getName(matches[highestidx], &status), highest);
429     if (runnerupidx < 0) {
430         ALOGV("no runner up");
431         if (highest > 15) {
432             *goodmatch = true;
433         }
434     } else {
435         ALOGV("runner up: '%s' w/ %d confidence",
436                 ucsdet_getName(matches[runnerupidx], &status), runnerup);
437         if (runnerup < 0) {
438             runnerup = 0;
439         }
440         if ((highest - runnerup) > 15) {
441             *goodmatch = true;
442         }
443     }
444     *highestmatch = highest;
445     return matches[highestidx];
446 }
447 
448 
isFrequent(const uint16_t * values,uint32_t c)449 bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) {
450 
451     int start = 0;
452     int end = 511; // All the tables have 512 entries
453     int mid = (start+end)/2;
454 
455     while(start <= end) {
456         if(c == values[mid]) {
457             return true;
458         } else if (c > values[mid]) {
459             start = mid + 1;
460         } else {
461             end = mid - 1;
462         }
463 
464         mid = (start + end) / 2;
465     }
466 
467     return false;
468 }
469 
470 
471 }  // namespace android
472