1 /*
2  **********************************************************************
3  *   Copyright (C) 2005-2013, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_CONVERSION
11 
12 #include "csrucode.h"
13 #include "csmatch.h"
14 
15 U_NAMESPACE_BEGIN
16 
~CharsetRecog_Unicode()17 CharsetRecog_Unicode::~CharsetRecog_Unicode()
18 {
19     // nothing to do
20 }
21 
~CharsetRecog_UTF_16_BE()22 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
23 {
24     // nothing to do
25 }
26 
getName() const27 const char *CharsetRecog_UTF_16_BE::getName() const
28 {
29     return "UTF-16BE";
30 }
31 
32 // UTF-16 confidence calculation. Very simple minded, but better than nothing.
33 //   Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
34 //     and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
35 //   NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
36 //   NULs should be rare in actual text.
37 
adjustConfidence(UChar codeUnit,int32_t confidence)38 static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) {
39     if (codeUnit == 0) {
40         confidence -= 10;
41     } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {
42         confidence += 10;
43     }
44     if (confidence < 0) {
45         confidence = 0;
46     } else if (confidence > 100) {
47         confidence = 100;
48     }
49     return confidence;
50 }
51 
52 
match(InputText * textIn,CharsetMatch * results) const53 UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const
54 {
55     const uint8_t *input = textIn->fRawInput;
56     int32_t confidence = 10;
57     int32_t length = textIn->fRawLength;
58 
59     int32_t bytesToCheck = (length > 30) ? 30 : length;
60     for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
61         UChar codeUnit = (input[charIndex] << 8) | input[charIndex + 1];
62         if (charIndex == 0 && codeUnit == 0xFEFF) {
63             confidence = 100;
64             break;
65         }
66         confidence = adjustConfidence(codeUnit, confidence);
67         if (confidence == 0 || confidence == 100) {
68             break;
69         }
70     }
71     if (bytesToCheck < 4 && confidence < 100) {
72         confidence = 0;
73     }
74     results->set(textIn, this, confidence);
75     return (confidence > 0);
76 }
77 
~CharsetRecog_UTF_16_LE()78 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
79 {
80     // nothing to do
81 }
82 
getName() const83 const char *CharsetRecog_UTF_16_LE::getName() const
84 {
85     return "UTF-16LE";
86 }
87 
match(InputText * textIn,CharsetMatch * results) const88 UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const
89 {
90     const uint8_t *input = textIn->fRawInput;
91     int32_t confidence = 10;
92     int32_t length = textIn->fRawLength;
93 
94     int32_t bytesToCheck = (length > 30) ? 30 : length;
95     for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
96         UChar codeUnit = input[charIndex] | (input[charIndex + 1] << 8);
97         if (charIndex == 0 && codeUnit == 0xFEFF) {
98             confidence = 100;     // UTF-16 BOM
99             if (length >= 4 && input[2] == 0 && input[3] == 0) {
100                 confidence = 0;   // UTF-32 BOM
101             }
102             break;
103         }
104         confidence = adjustConfidence(codeUnit, confidence);
105         if (confidence == 0 || confidence == 100) {
106             break;
107         }
108     }
109     if (bytesToCheck < 4 && confidence < 100) {
110         confidence = 0;
111     }
112     results->set(textIn, this, confidence);
113     return (confidence > 0);
114 }
115 
~CharsetRecog_UTF_32()116 CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
117 {
118     // nothing to do
119 }
120 
match(InputText * textIn,CharsetMatch * results) const121 UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const
122 {
123     const uint8_t *input = textIn->fRawInput;
124     int32_t limit = (textIn->fRawLength / 4) * 4;
125     int32_t numValid = 0;
126     int32_t numInvalid = 0;
127     bool hasBOM = FALSE;
128     int32_t confidence = 0;
129 
130     if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) {
131         hasBOM = TRUE;
132     }
133 
134     for(int32_t i = 0; i < limit; i += 4) {
135         int32_t ch = getChar(input, i);
136 
137         if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
138             numInvalid += 1;
139         } else {
140             numValid += 1;
141         }
142     }
143 
144 
145     // Cook up some sort of confidence score, based on presense of a BOM
146     //    and the existence of valid and/or invalid multi-byte sequences.
147     if (hasBOM && numInvalid==0) {
148         confidence = 100;
149     } else if (hasBOM && numValid > numInvalid*10) {
150         confidence = 80;
151     } else if (numValid > 3 && numInvalid == 0) {
152         confidence = 100;
153     } else if (numValid > 0 && numInvalid == 0) {
154         confidence = 80;
155     } else if (numValid > numInvalid*10) {
156         // Probably corruput UTF-32BE data.  Valid sequences aren't likely by chance.
157         confidence = 25;
158     }
159 
160     results->set(textIn, this, confidence);
161     return (confidence > 0);
162 }
163 
~CharsetRecog_UTF_32_BE()164 CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
165 {
166     // nothing to do
167 }
168 
getName() const169 const char *CharsetRecog_UTF_32_BE::getName() const
170 {
171     return "UTF-32BE";
172 }
173 
getChar(const uint8_t * input,int32_t index) const174 int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const
175 {
176     return input[index + 0] << 24 | input[index + 1] << 16 |
177            input[index + 2] <<  8 | input[index + 3];
178 }
179 
~CharsetRecog_UTF_32_LE()180 CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
181 {
182     // nothing to do
183 }
184 
getName() const185 const char *CharsetRecog_UTF_32_LE::getName() const
186 {
187     return "UTF-32LE";
188 }
189 
getChar(const uint8_t * input,int32_t index) const190 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const
191 {
192     return input[index + 3] << 24 | input[index + 2] << 16 |
193            input[index + 1] <<  8 | input[index + 0];
194 }
195 
196 U_NAMESPACE_END
197 #endif
198 
199