1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #include "cmemory.h"
11
12 #if !UCONFIG_NO_CONVERSION
13 #include "csrsbcs.h"
14 #include "csmatch.h"
15
/* Number of mapped bytes that make up one n-gram (a trigram). */
#define N_GRAM_SIZE 3

/* Keeps the low 24 bits, i.e. the three most recent mapped bytes. */
#define N_GRAM_MASK 0xFFFFFF

/*
 * Element count of a true array (not a pointer). The argument is fully
 * parenthesised so that any array-typed expression can be passed, not only a
 * plain identifier.
 */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
19
20 U_NAMESPACE_BEGIN
21
NGramParser(const int32_t * theNgramList,const uint8_t * theCharMap)22 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
23 : ngram(0), byteIndex(0)
24 {
25 ngramList = theNgramList;
26 charMap = theCharMap;
27
28 ngramCount = hitCount = 0;
29 }
30
~NGramParser()31 NGramParser::~NGramParser()
32 {
33 }
34
35 /*
36 * Binary search for value in table, which must have exactly 64 entries.
37 */
38
search(const int32_t * table,int32_t value)39 int32_t NGramParser::search(const int32_t *table, int32_t value)
40 {
41 int32_t index = 0;
42
43 if (table[index + 32] <= value) {
44 index += 32;
45 }
46
47 if (table[index + 16] <= value) {
48 index += 16;
49 }
50
51 if (table[index + 8] <= value) {
52 index += 8;
53 }
54
55 if (table[index + 4] <= value) {
56 index += 4;
57 }
58
59 if (table[index + 2] <= value) {
60 index += 2;
61 }
62
63 if (table[index + 1] <= value) {
64 index += 1;
65 }
66
67 if (table[index] > value) {
68 index -= 1;
69 }
70
71 if (index < 0 || table[index] != value) {
72 return -1;
73 }
74
75 return index;
76 }
77
lookup(int32_t thisNgram)78 void NGramParser::lookup(int32_t thisNgram)
79 {
80 ngramCount += 1;
81
82 if (search(ngramList, thisNgram) >= 0) {
83 hitCount += 1;
84 }
85
86 }
87
addByte(int32_t b)88 void NGramParser::addByte(int32_t b)
89 {
90 ngram = ((ngram << 8) + b) & N_GRAM_MASK;
91 lookup(ngram);
92 }
93
nextByte(InputText * det)94 int32_t NGramParser::nextByte(InputText *det)
95 {
96 if (byteIndex >= det->fInputLen) {
97 return -1;
98 }
99
100 return det->fInputBytes[byteIndex++];
101 }
102
parseCharacters(InputText * det)103 void NGramParser::parseCharacters(InputText *det)
104 {
105 int32_t b;
106 bool ignoreSpace = FALSE;
107
108 while ((b = nextByte(det)) >= 0) {
109 uint8_t mb = charMap[b];
110
111 // TODO: 0x20 might not be a space in all character sets...
112 if (mb != 0) {
113 if (!(mb == 0x20 && ignoreSpace)) {
114 addByte(mb);
115 }
116
117 ignoreSpace = (mb == 0x20);
118 }
119 }
120 }
121
parse(InputText * det)122 int32_t NGramParser::parse(InputText *det)
123 {
124 parseCharacters(det);
125
126 // TODO: Is this OK? The buffer could have ended in the middle of a word...
127 addByte(0x20);
128
129 double rawPercent = (double) hitCount / (double) ngramCount;
130
131 // if (rawPercent <= 2.0) {
132 // return 0;
133 // }
134
135 // TODO - This is a bit of a hack to take care of a case
136 // were we were getting a confidence of 135...
137 if (rawPercent > 0.33) {
138 return 98;
139 }
140
141 return (int32_t) (rawPercent * 300.0);
142 }
143
144 #if !UCONFIG_ONLY_HTML_CONVERSION
/*
 * 256-entry table indexed by raw input byte; NGramParser_IBM420::nextByte()
 * returns the entry for every byte that is not a lam-alef ligature.
 * Bytes 0x00-0x3F all map to 0x40. Most other bytes map to themselves; the
 * remaining entries fold a byte onto another value (for example 0x43 -> 0x42
 * and 0x48 -> 0x47).
 * NOTE(review): the name suggests the folded bytes are shaped Arabic forms
 * being reduced to a base form -- confirm against the IBM420 code page.
 */
145 static const uint8_t unshapeMap_IBM420[] = {
146 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
147 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
148 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
149 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
150 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
151 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
152 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
153 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
154 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
155 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
156 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
157 /* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
158 /* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
159 /* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
160 /* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
161 /* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
162 /* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
163 };
164
NGramParser_IBM420(const int32_t * theNgramList,const uint8_t * theCharMap)165 NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap):NGramParser(theNgramList, theCharMap)
166 {
167 alef = 0x00;
168 }
169
170
isLamAlef(int32_t b)171 int32_t NGramParser_IBM420::isLamAlef(int32_t b)
172 {
173 if(b == 0xB2 || b == 0xB3){
174 return 0x47;
175 }else if(b == 0xB4 || b == 0xB5){
176 return 0x49;
177 }else if(b == 0xB8 || b == 0xB9){
178 return 0x56;
179 }else
180 return 0x00;
181 }
182
183 /*
184 * Arabic shaping needs to be done manually. Cannot call ArabicShaping class
185 * because CharsetDetector is dealing with bytes not Unicode code points. We could
186 * convert the bytes to Unicode code points but that would leave us dependent
187 * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
188 * of JDK can produce different results and therefore is also avoided.
189 */
nextByte(InputText * det)190 int32_t NGramParser_IBM420::nextByte(InputText *det)
191 {
192
193 if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) {
194 return -1;
195 }
196 int next;
197
198 alef = isLamAlef(det->fInputBytes[byteIndex]);
199 if(alef != 0x00)
200 next = 0xB1 & 0xFF;
201 else
202 next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF;
203
204 byteIndex++;
205
206 return next;
207 }
208
parseCharacters(InputText * det)209 void NGramParser_IBM420::parseCharacters(InputText *det)
210 {
211 int32_t b;
212 bool ignoreSpace = FALSE;
213
214 while ((b = nextByte(det)) >= 0) {
215 uint8_t mb = charMap[b];
216
217 // TODO: 0x20 might not be a space in all character sets...
218 if (mb != 0) {
219 if (!(mb == 0x20 && ignoreSpace)) {
220 addByte(mb);
221 }
222 ignoreSpace = (mb == 0x20);
223 }
224
225 if(alef != 0x00){
226 mb = charMap[alef & 0xFF];
227
228 // TODO: 0x20 might not be a space in all character sets...
229 if (mb != 0) {
230 if (!(mb == 0x20 && ignoreSpace)) {
231 addByte(mb);
232 }
233
234 ignoreSpace = (mb == 0x20);
235 }
236
237 }
238 }
239 }
240 #endif
241
CharsetRecog_sbcs()242 CharsetRecog_sbcs::CharsetRecog_sbcs()
243 {
244 // nothing else to do
245 }
246
~CharsetRecog_sbcs()247 CharsetRecog_sbcs::~CharsetRecog_sbcs()
248 {
249 // nothing to do
250 }
251
match_sbcs(InputText * det,const int32_t ngrams[],const uint8_t byteMap[]) const252 int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
253 {
254 NGramParser parser(ngrams, byteMap);
255 int32_t result;
256
257 result = parser.parse(det);
258
259 return result;
260 }
261
/*
 * 256-entry byte-folding table, indexed by raw input byte. NGramParser reads
 * a charMap this way: a value of 0x00 drops the byte, and 0x20 counts as a
 * space. Here every byte below 0x41 becomes 0x20 except 0x27 (-> 0x00), and
 * 0x41-0x5A fold onto 0x61-0x7A; entries from 0x80 up are specific to this
 * table.
 * NOTE(review): presumably the map for ISO-8859-1 -- its use is outside this
 * view.
 */
262 static const uint8_t charMap_8859_1[] = {
263 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
264 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
265 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
266 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
267 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
268 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
269 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
270 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
271 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
272 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
273 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
274 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
275 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
276 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
277 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
278 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
279 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
280 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
281 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
282 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
283 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
284 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
285 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
286 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
287 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
288 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
289 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
290 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
291 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
292 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
293 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
294 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
295 };
296
/*
 * 256-entry byte-folding table, indexed by raw input byte. NGramParser reads
 * a charMap this way: a value of 0x00 drops the byte, and 0x20 counts as a
 * space. Here every byte below 0x41 becomes 0x20 except 0x27 (-> 0x00), and
 * 0x41-0x5A fold onto 0x61-0x7A; entries from 0x80 up are specific to this
 * table.
 * NOTE(review): presumably the map for ISO-8859-2 -- its use is outside this
 * view.
 */
297 static const uint8_t charMap_8859_2[] = {
298 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
299 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
300 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
301 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
302 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
303 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
304 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
305 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
306 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
307 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
308 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
309 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
310 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
311 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
312 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
313 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
314 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
315 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
316 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
317 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
318 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20,
319 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
320 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7,
321 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
322 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
323 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
324 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
325 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
326 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
327 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
328 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
329 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
330 };
331
/*
 * 256-entry byte-folding table, indexed by raw input byte. NGramParser reads
 * a charMap this way: a value of 0x00 drops the byte, and 0x20 counts as a
 * space. Here every byte below 0x41 becomes 0x20 except 0x27 (-> 0x00), and
 * 0x41-0x5A fold onto 0x61-0x7A; entries from 0x80 up are specific to this
 * table.
 * NOTE(review): presumably the map for ISO-8859-5 -- its use is outside this
 * view.
 */
332 static const uint8_t charMap_8859_5[] = {
333 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
334 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
335 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
336 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
337 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
338 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
339 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
340 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
341 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
342 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
343 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
344 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
345 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
346 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
347 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
348 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
349 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
350 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
351 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
352 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
353 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
354 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
355 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
356 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
357 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
358 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
359 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
360 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
361 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
362 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
363 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
364 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
365 };
366
/*
 * 256-entry byte-folding table, indexed by raw input byte. NGramParser reads
 * a charMap this way: a value of 0x00 drops the byte, and 0x20 counts as a
 * space. Here every byte below 0x41 becomes 0x20 except 0x27 (-> 0x00), and
 * 0x41-0x5A fold onto 0x61-0x7A; entries from 0x80 up are specific to this
 * table.
 * NOTE(review): presumably the map for ISO-8859-6 -- its use is outside this
 * view.
 */
367 static const uint8_t charMap_8859_6[] = {
368 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
369 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
370 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
371 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
372 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
373 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
374 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
375 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
376 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
377 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
378 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
379 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
380 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
381 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
382 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
383 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
384 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
385 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
386 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
387 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
388 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
389 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
390 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
391 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
392 0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
393 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
394 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
395 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
396 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
397 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
398 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
399 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
400 };
401
/*
 * 256-entry byte-folding table, indexed by raw input byte. NGramParser reads
 * a charMap this way: a value of 0x00 drops the byte, and 0x20 counts as a
 * space. Here every byte below 0x41 becomes 0x20 except 0x27 (-> 0x00), and
 * 0x41-0x5A fold onto 0x61-0x7A; entries from 0x80 up are specific to this
 * table.
 * NOTE(review): presumably the map for ISO-8859-7 -- its use is outside this
 * view.
 */
402 static const uint8_t charMap_8859_7[] = {
403 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
404 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
405 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
406 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
407 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
408 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
409 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
410 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
411 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
412 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
413 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
414 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
415 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
416 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
417 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
418 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
419 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
420 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
421 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
422 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
423 0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
424 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
425 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
426 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
427 0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
428 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
429 0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
430 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
431 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
432 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
433 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
434 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
435 };
436
/*
 * 256-entry byte-folding table, indexed by raw input byte. NGramParser reads
 * a charMap this way: a value of 0x00 drops the byte, and 0x20 counts as a
 * space. Here every byte below 0x41 becomes 0x20 except 0x27 (-> 0x00), and
 * 0x41-0x5A fold onto 0x61-0x7A; entries from 0x80 up are specific to this
 * table.
 * NOTE(review): presumably the map for ISO-8859-8 -- its use is outside this
 * view.
 */
437 static const uint8_t charMap_8859_8[] = {
438 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
439 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
440 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
441 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
442 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
443 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
444 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
445 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
446 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
447 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
448 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
449 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
450 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
451 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
452 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
453 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
454 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
455 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
456 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
457 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
458 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
459 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
460 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
461 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
462 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
463 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
464 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
465 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
466 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
467 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
468 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
469 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
470 };
471
/*
 * 256-entry byte-folding table, indexed by raw input byte. NGramParser reads
 * a charMap this way: a value of 0x00 drops the byte, and 0x20 counts as a
 * space. Here every byte below 0x41 becomes 0x20 except 0x27 (-> 0x00), and
 * 0x41-0x5A fold onto 0x61-0x7A; entries from 0x80 up are specific to this
 * table.
 * NOTE(review): presumably the map for ISO-8859-9 -- its use is outside this
 * view.
 */
472 static const uint8_t charMap_8859_9[] = {
473 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
474 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
475 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
476 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
477 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
478 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
479 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
480 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
481 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
482 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
483 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
484 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
485 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
486 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
487 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
488 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
489 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
490 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
491 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
492 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
493 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
494 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
495 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
496 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
497 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
498 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
499 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
500 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
501 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
502 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
503 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
504 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
505 };
506
/*
 * 64 trigram keys in ascending order. Each key packs three mapped bytes as
 * 0xAABBCC, the form NGramParser::addByte() builds, and the ordering and
 * entry count are what NGramParser::search() relies on.
 * NOTE(review): presumably the frequent trigrams for windows-1251 text --
 * its use is outside this view.
 */
507 static const int32_t ngrams_windows_1251[] = {
508 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
509 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
510 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
511 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
512 };
513
/*
 * 256-entry byte-folding table, indexed by raw input byte. NGramParser reads
 * a charMap this way: a value of 0x00 drops the byte, and 0x20 counts as a
 * space. Here every byte below 0x41 becomes 0x20 except 0x27 (-> 0x00), and
 * 0x41-0x5A fold onto 0x61-0x7A; entries from 0x80 up are specific to this
 * table.
 * NOTE(review): presumably the map for windows-1251 -- its use is outside
 * this view.
 */
514 static const uint8_t charMap_windows_1251[] = {
515 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
516 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
517 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
518 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
519 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
520 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
521 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
522 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
523 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
524 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
525 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
526 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
527 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
528 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
529 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
530 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
531 0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
532 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
533 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
534 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
535 0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
536 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
537 0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
538 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
539 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
540 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
541 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
542 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
543 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
544 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
545 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
546 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
547 };
548
/*
 * 64 trigram keys in ascending order. Each key packs three mapped bytes as
 * 0xAABBCC, the form NGramParser::addByte() builds, and the ordering and
 * entry count are what NGramParser::search() relies on.
 * NOTE(review): presumably the frequent trigrams for windows-1256 text --
 * its use is outside this view.
 */
549 static const int32_t ngrams_windows_1256[] = {
550 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
551 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
552 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
553 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
554 };
555
/*
 * 256-entry byte-folding table, indexed by raw input byte. NGramParser reads
 * a charMap this way: a value of 0x00 drops the byte, and 0x20 counts as a
 * space. Here every byte below 0x41 becomes 0x20 except 0x27 (-> 0x00), and
 * 0x41-0x5A fold onto 0x61-0x7A; entries from 0x80 up are specific to this
 * table.
 * NOTE(review): presumably the map for windows-1256 -- its use is outside
 * this view.
 */
556 static const uint8_t charMap_windows_1256[] = {
557 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
558 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
559 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
560 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
561 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
562 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
563 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
564 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
565 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
566 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
567 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
568 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
569 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
570 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
571 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
572 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
573 0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
574 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
575 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
576 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
577 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
578 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
579 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
580 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
581 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
582 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
583 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
584 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
585 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
586 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
587 0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
588 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
589 };
590
/*
 * 64 trigram keys in ascending order. Each key packs three mapped bytes as
 * 0xAABBCC, the form NGramParser::addByte() builds, and the ordering and
 * entry count are what NGramParser::search() relies on.
 * NOTE(review): presumably the frequent trigrams for KOI8-R text -- its use
 * is outside this view.
 */
591 static const int32_t ngrams_KOI8_R[] = {
592 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
593 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
594 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
595 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
596 };
597
/*
 * 256-entry byte-folding table, indexed by raw input byte. NGramParser reads
 * a charMap this way: a value of 0x00 drops the byte, and 0x20 counts as a
 * space. Here every byte below 0x41 becomes 0x20 except 0x27 (-> 0x00), and
 * 0x41-0x5A fold onto 0x61-0x7A; entries from 0x80 up are specific to this
 * table.
 * NOTE(review): presumably the map for KOI8-R -- its use is outside this
 * view.
 */
598 static const uint8_t charMap_KOI8_R[] = {
599 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
600 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
601 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
602 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
603 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
604 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
605 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
606 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
607 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
608 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
609 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
610 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
611 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
612 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
613 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
614 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
615 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
616 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
617 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
618 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
619 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
620 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
621 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
622 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
623 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
624 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
625 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
626 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
627 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
628 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
629 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
630 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
631 };
632
633 #if !UCONFIG_ONLY_HTML_CONVERSION
/*
 * 64 trigram keys in ascending order. Each key packs three mapped bytes as
 * 0xAABBCC, the form NGramParser::addByte() builds, and the ordering and
 * entry count are what NGramParser::search() relies on.
 * NOTE(review): going by the name, the trigrams for right-to-left Hebrew
 * text in IBM424 -- its use is outside this view.
 */
634 static const int32_t ngrams_IBM424_he_rtl[] = {
635 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
636 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
637 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
638 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
639 };
640
/*
 * 64 trigram keys in ascending order. Each key packs three mapped bytes as
 * 0xAABBCC, the form NGramParser::addByte() builds, and the ordering and
 * entry count are what NGramParser::search() relies on.
 * NOTE(review): going by the name, the trigrams for left-to-right Hebrew
 * text in IBM424 -- its use is outside this view.
 */
641 static const int32_t ngrams_IBM424_he_ltr[] = {
642 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
643 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
644 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
645 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
646 };
647
/*
 * 256-entry byte-folding table, indexed by raw input byte, laid out as 16
 * rows of 16. Bytes that carry no letter map to 0x40 rather than 0x20, and
 * 0x7D maps to 0x00 (NGramParser drops a byte whose mapped value is 0x00).
 * Rows C-, D- and E- fold onto the values of rows 8-, 9- and A-.
 * NOTE(review): the parser's space collapsing only recognises 0x20, so it
 * does not trigger on the 0x40 filler used here -- see the TODO in
 * parseCharacters(). The table's use is outside this view.
 */
648 static const uint8_t charMap_IBM424_he[] = {
649 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
650 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
651 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
652 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
653 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
654 /* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
655 /* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
656 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
657 /* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
658 /* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
659 /* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
660 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
661 /* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
662 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
663 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
664 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
665 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
666 };
667
/*
 * 64 trigram keys in ascending order. Each key packs three mapped bytes as
 * 0xAABBCC, the form NGramParser::addByte() builds, and the ordering and
 * entry count are what NGramParser::search() relies on.
 * NOTE(review): going by the name, the trigrams for right-to-left Arabic
 * text in IBM420 -- its use is outside this view.
 */
668 static const int32_t ngrams_IBM420_ar_rtl[] = {
669 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
670 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
671 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
672 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
673 };
674
/*
 * 64 trigram keys in ascending order. Each key packs three mapped bytes as
 * 0xAABBCC, the form NGramParser::addByte() builds, and the ordering and
 * entry count are what NGramParser::search() relies on.
 * NOTE(review): going by the name, the trigrams for left-to-right Arabic
 * text in IBM420 -- its use is outside this view.
 */
675 static const int32_t ngrams_IBM420_ar_ltr[] = {
676 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
677 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
678 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
679 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
680 };
681
/*
 * 256-entry byte-folding table, indexed by byte value, laid out as 16 rows
 * of 16. Bytes that carry no letter map to 0x40 rather than 0x20; no entry
 * is 0x00, so this table never causes a byte to be dropped. Parts of rows
 * C-, D- and E- fold onto the values of rows 8-, 9- and A-.
 * NOTE(review): the parser's space collapsing only recognises 0x20, so it
 * does not trigger on the 0x40 filler used here -- see the TODO in
 * parseCharacters(). The table's use is outside this view.
 */
682 static const uint8_t charMap_IBM420_ar[]= {
683 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
684 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
685 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
686 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
687 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
688 /* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
689 /* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
690 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
691 /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
692 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
693 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
694 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
695 /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
696 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
697 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
698 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
699 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
700 };
701 #endif
702
// N-gram tables for ISO-8859-1, -2, -5, -6, -7, -8 and -9
704
/*
 * One trigram table together with the language string it is reported under.
 * Used as the element type of ngrams_8859_1 and ngrams_8859_2, whose
 * aggregate initializers rely on this member order.
 */
struct NGramsPlusLang {
    // 64 trigram values, handed to match_sbcs() as the n-gram list.
    const int32_t ngrams[64];
    // Language string passed to CharsetMatch::set() when this table scores best.
    const char * lang;
};
709
/*
 * Per-language trigram tables tried in turn by CharsetRecog_8859_1::match():
 * en, da, de, es, fr, it, nl, no, pt, sv.
 * Each value packs three bytes, earliest byte in the highest position
 * (see NGramParser::addByte); e.g. 0x746865 is the bytes 0x74 0x68 0x65.
 * NOTE(review): presumably looked up through NGramParser::search(), which
 * needs each table to hold 64 entries in ascending order - confirm before
 * editing.
 */
static const NGramsPlusLang ngrams_8859_1[] = {
    {
        {
            0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
            0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
            0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
            0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
        },
        "en"
    },
    {
        {
            0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
            0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
            0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
            0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
        },
        "da"
    },
    {
        {
            0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
            0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
            0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
            0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
        },
        "de"
    },
    {
        {
            0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
            0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
            0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
            0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
        },
        "es"
    },
    {
        {
            0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
            0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
            0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
            0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
        },
        "fr"
    },
    {
        {
            0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
            0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
            0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
            0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
        },
        "it"
    },
    {
        {
            0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
            0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
            0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
            0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
        },
        "nl"
    },
    {
        {
            0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
            0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
            0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
            0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
        },
        "no"
    },
    {
        {
            0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
            0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
            0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
            0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
        },
        "pt"
    },
    {
        {
            0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
            0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
            0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
            0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
        },
        "sv"
    }
};
802
803
/*
 * Per-language trigram tables tried in turn by CharsetRecog_8859_2::match():
 * cs, hu, pl, ro.
 * Each value packs three bytes, earliest byte in the highest position
 * (see NGramParser::addByte).
 * NOTE(review): presumably looked up through NGramParser::search(), which
 * needs each table to hold 64 entries in ascending order - confirm before
 * editing.
 */
static const NGramsPlusLang ngrams_8859_2[] = {
    {
        {
            0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
            0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
            0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
            0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
        },
        "cs"
    },
    {
        {
            0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
            0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
            0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
            0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
        },
        "hu"
    },
    {
        {
            0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
            0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
            0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
            0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
        },
        "pl"
    },
    {
        {
            0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
            0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
            0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
            0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
        },
        "ro"
    }
};
842
/*
 * Trigram table passed to match_sbcs() by CharsetRecog_8859_5_ru::match().
 * Each value packs three bytes, earliest byte in the highest position
 * (see NGramParser::addByte).
 * NOTE(review): presumably looked up through NGramParser::search(), which
 * needs exactly 64 entries in ascending order - confirm before editing.
 */
static const int32_t ngrams_8859_5_ru[] = {
    0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
    0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
    0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
    0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
};
849
/*
 * Trigram table passed to match_sbcs() by CharsetRecog_8859_6_ar::match().
 * Each value packs three bytes, earliest byte in the highest position
 * (see NGramParser::addByte).
 * NOTE(review): presumably looked up through NGramParser::search(), which
 * needs exactly 64 entries in ascending order - confirm before editing.
 */
static const int32_t ngrams_8859_6_ar[] = {
    0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
    0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
    0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
    0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
};
856
/*
 * Trigram table passed to match_sbcs() by CharsetRecog_8859_7_el::match().
 * Each value packs three bytes, earliest byte in the highest position
 * (see NGramParser::addByte).
 * NOTE(review): presumably looked up through NGramParser::search(), which
 * needs exactly 64 entries in ascending order - confirm before editing.
 */
static const int32_t ngrams_8859_7_el[] = {
    0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
    0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
    0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
    0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
};
863
/*
 * Trigram table passed to match_sbcs() by CharsetRecog_8859_8_I_he::match().
 * Each value packs three bytes, earliest byte in the highest position
 * (see NGramParser::addByte).
 * NOTE(review): presumably looked up through NGramParser::search(), which
 * needs exactly 64 entries in ascending order - confirm before editing.
 */
static const int32_t ngrams_8859_8_I_he[] = {
    0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
    0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
    0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
    0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
};
870
/*
 * Trigram table passed to match_sbcs() by CharsetRecog_8859_8_he::match().
 * Each value packs three bytes, earliest byte in the highest position
 * (see NGramParser::addByte).
 * NOTE(review): presumably looked up through NGramParser::search(), which
 * needs exactly 64 entries in ascending order - confirm before editing.
 */
static const int32_t ngrams_8859_8_he[] = {
    0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
    0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
    0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
    0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
};
877
/*
 * Trigram table passed to match_sbcs() by CharsetRecog_8859_9_tr::match().
 * Each value packs three bytes, earliest byte in the highest position
 * (see NGramParser::addByte).
 * NOTE(review): presumably looked up through NGramParser::search(), which
 * needs exactly 64 entries in ascending order - confirm before editing.
 */
static const int32_t ngrams_8859_9_tr[] = {
    0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
    0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
    0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
    0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
};
884
~CharsetRecog_8859_1()885 CharsetRecog_8859_1::~CharsetRecog_8859_1()
886 {
887 // nothing to do
888 }
889
match(InputText * textIn,CharsetMatch * results) const890 UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const {
891 const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1";
892 uint32_t i;
893 int32_t bestConfidenceSoFar = -1;
894 for (i=0; i < ARRAY_SIZE(ngrams_8859_1) ; i++) {
895 const int32_t *ngrams = ngrams_8859_1[i].ngrams;
896 const char *lang = ngrams_8859_1[i].lang;
897 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1);
898 if (confidence > bestConfidenceSoFar) {
899 results->set(textIn, this, confidence, name, lang);
900 bestConfidenceSoFar = confidence;
901 }
902 }
903 return (bestConfidenceSoFar > 0);
904 }
905
getName() const906 const char *CharsetRecog_8859_1::getName() const
907 {
908 return "ISO-8859-1";
909 }
910
911
~CharsetRecog_8859_2()912 CharsetRecog_8859_2::~CharsetRecog_8859_2()
913 {
914 // nothing to do
915 }
916
match(InputText * textIn,CharsetMatch * results) const917 UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const {
918 const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2";
919 uint32_t i;
920 int32_t bestConfidenceSoFar = -1;
921 for (i=0; i < ARRAY_SIZE(ngrams_8859_2) ; i++) {
922 const int32_t *ngrams = ngrams_8859_2[i].ngrams;
923 const char *lang = ngrams_8859_2[i].lang;
924 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2);
925 if (confidence > bestConfidenceSoFar) {
926 results->set(textIn, this, confidence, name, lang);
927 bestConfidenceSoFar = confidence;
928 }
929 }
930 return (bestConfidenceSoFar > 0);
931 }
932
getName() const933 const char *CharsetRecog_8859_2::getName() const
934 {
935 return "ISO-8859-2";
936 }
937
938
~CharsetRecog_8859_5()939 CharsetRecog_8859_5::~CharsetRecog_8859_5()
940 {
941 // nothing to do
942 }
943
getName() const944 const char *CharsetRecog_8859_5::getName() const
945 {
946 return "ISO-8859-5";
947 }
948
~CharsetRecog_8859_5_ru()949 CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
950 {
951 // nothing to do
952 }
953
getLanguage() const954 const char *CharsetRecog_8859_5_ru::getLanguage() const
955 {
956 return "ru";
957 }
958
match(InputText * textIn,CharsetMatch * results) const959 UBool CharsetRecog_8859_5_ru::match(InputText *textIn, CharsetMatch *results) const
960 {
961 int32_t confidence = match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
962 results->set(textIn, this, confidence);
963 return (confidence > 0);
964 }
965
~CharsetRecog_8859_6()966 CharsetRecog_8859_6::~CharsetRecog_8859_6()
967 {
968 // nothing to do
969 }
970
getName() const971 const char *CharsetRecog_8859_6::getName() const
972 {
973 return "ISO-8859-6";
974 }
975
~CharsetRecog_8859_6_ar()976 CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
977 {
978 // nothing to do
979 }
980
getLanguage() const981 const char *CharsetRecog_8859_6_ar::getLanguage() const
982 {
983 return "ar";
984 }
985
match(InputText * textIn,CharsetMatch * results) const986 UBool CharsetRecog_8859_6_ar::match(InputText *textIn, CharsetMatch *results) const
987 {
988 int32_t confidence = match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
989 results->set(textIn, this, confidence);
990 return (confidence > 0);
991 }
992
~CharsetRecog_8859_7()993 CharsetRecog_8859_7::~CharsetRecog_8859_7()
994 {
995 // nothing to do
996 }
997
getName() const998 const char *CharsetRecog_8859_7::getName() const
999 {
1000 return "ISO-8859-7";
1001 }
1002
~CharsetRecog_8859_7_el()1003 CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
1004 {
1005 // nothing to do
1006 }
1007
getLanguage() const1008 const char *CharsetRecog_8859_7_el::getLanguage() const
1009 {
1010 return "el";
1011 }
1012
match(InputText * textIn,CharsetMatch * results) const1013 UBool CharsetRecog_8859_7_el::match(InputText *textIn, CharsetMatch *results) const
1014 {
1015 const char *name = textIn->fC1Bytes? "windows-1253" : "ISO-8859-7";
1016 int32_t confidence = match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
1017 results->set(textIn, this, confidence, name, "el");
1018 return (confidence > 0);
1019 }
1020
~CharsetRecog_8859_8()1021 CharsetRecog_8859_8::~CharsetRecog_8859_8()
1022 {
1023 // nothing to do
1024 }
1025
getName() const1026 const char *CharsetRecog_8859_8::getName() const
1027 {
1028 return "ISO-8859-8";
1029 }
1030
~CharsetRecog_8859_8_I_he()1031 CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
1032 {
1033 // nothing to do
1034 }
1035
getName() const1036 const char *CharsetRecog_8859_8_I_he::getName() const
1037 {
1038 return "ISO-8859-8-I";
1039 }
1040
getLanguage() const1041 const char *CharsetRecog_8859_8_I_he::getLanguage() const
1042 {
1043 return "he";
1044 }
1045
match(InputText * textIn,CharsetMatch * results) const1046 UBool CharsetRecog_8859_8_I_he::match(InputText *textIn, CharsetMatch *results) const
1047 {
1048 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8-I";
1049 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
1050 results->set(textIn, this, confidence, name, "he");
1051 return (confidence > 0);
1052 }
1053
~CharsetRecog_8859_8_he()1054 CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
1055 {
1056 // od ot gnihton
1057 }
1058
getLanguage() const1059 const char *CharsetRecog_8859_8_he::getLanguage() const
1060 {
1061 return "he";
1062 }
1063
match(InputText * textIn,CharsetMatch * results) const1064 UBool CharsetRecog_8859_8_he::match(InputText *textIn, CharsetMatch *results) const
1065 {
1066 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8";
1067 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
1068 results->set(textIn, this, confidence, name, "he");
1069 return (confidence > 0);
1070 }
1071
~CharsetRecog_8859_9()1072 CharsetRecog_8859_9::~CharsetRecog_8859_9()
1073 {
1074 // nothing to do
1075 }
1076
getName() const1077 const char *CharsetRecog_8859_9::getName() const
1078 {
1079 return "ISO-8859-9";
1080 }
1081
~CharsetRecog_8859_9_tr()1082 CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
1083 {
1084 // nothing to do
1085 }
1086
getLanguage() const1087 const char *CharsetRecog_8859_9_tr::getLanguage() const
1088 {
1089 return "tr";
1090 }
1091
match(InputText * textIn,CharsetMatch * results) const1092 UBool CharsetRecog_8859_9_tr::match(InputText *textIn, CharsetMatch *results) const
1093 {
1094 const char *name = textIn->fC1Bytes? "windows-1254" : "ISO-8859-9";
1095 int32_t confidence = match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
1096 results->set(textIn, this, confidence, name, "tr");
1097 return (confidence > 0);
1098 }
1099
~CharsetRecog_windows_1256()1100 CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
1101 {
1102 // nothing to do
1103 }
1104
getName() const1105 const char *CharsetRecog_windows_1256::getName() const
1106 {
1107 return "windows-1256";
1108 }
1109
getLanguage() const1110 const char *CharsetRecog_windows_1256::getLanguage() const
1111 {
1112 return "ar";
1113 }
1114
match(InputText * textIn,CharsetMatch * results) const1115 UBool CharsetRecog_windows_1256::match(InputText *textIn, CharsetMatch *results) const
1116 {
1117 int32_t confidence = match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
1118 results->set(textIn, this, confidence);
1119 return (confidence > 0);
1120 }
1121
~CharsetRecog_windows_1251()1122 CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
1123 {
1124 // nothing to do
1125 }
1126
getName() const1127 const char *CharsetRecog_windows_1251::getName() const
1128 {
1129 return "windows-1251";
1130 }
1131
getLanguage() const1132 const char *CharsetRecog_windows_1251::getLanguage() const
1133 {
1134 return "ru";
1135 }
1136
match(InputText * textIn,CharsetMatch * results) const1137 UBool CharsetRecog_windows_1251::match(InputText *textIn, CharsetMatch *results) const
1138 {
1139 int32_t confidence = match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
1140 results->set(textIn, this, confidence);
1141 return (confidence > 0);
1142 }
1143
~CharsetRecog_KOI8_R()1144 CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
1145 {
1146 // nothing to do
1147 }
1148
getName() const1149 const char *CharsetRecog_KOI8_R::getName() const
1150 {
1151 return "KOI8-R";
1152 }
1153
getLanguage() const1154 const char *CharsetRecog_KOI8_R::getLanguage() const
1155 {
1156 return "ru";
1157 }
1158
match(InputText * textIn,CharsetMatch * results) const1159 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
1160 {
1161 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
1162 results->set(textIn, this, confidence);
1163 return (confidence > 0);
1164 }
1165
1166 #if !UCONFIG_ONLY_HTML_CONVERSION
~CharsetRecog_IBM424_he()1167 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
1168 {
1169 // nothing to do
1170 }
1171
getLanguage() const1172 const char *CharsetRecog_IBM424_he::getLanguage() const
1173 {
1174 return "he";
1175 }
1176
~CharsetRecog_IBM424_he_rtl()1177 CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
1178 {
1179 // nothing to do
1180 }
1181
getName() const1182 const char *CharsetRecog_IBM424_he_rtl::getName() const
1183 {
1184 return "IBM424_rtl";
1185 }
1186
match(InputText * textIn,CharsetMatch * results) const1187 UBool CharsetRecog_IBM424_he_rtl::match(InputText *textIn, CharsetMatch *results) const
1188 {
1189 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
1190 results->set(textIn, this, confidence);
1191 return (confidence > 0);
1192 }
1193
~CharsetRecog_IBM424_he_ltr()1194 CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
1195 {
1196 // nothing to do
1197 }
1198
getName() const1199 const char *CharsetRecog_IBM424_he_ltr::getName() const
1200 {
1201 return "IBM424_ltr";
1202 }
1203
match(InputText * textIn,CharsetMatch * results) const1204 UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results) const
1205 {
1206 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
1207 results->set(textIn, this, confidence);
1208 return (confidence > 0);
1209 }
1210
~CharsetRecog_IBM420_ar()1211 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
1212 {
1213 // nothing to do
1214 }
1215
getLanguage() const1216 const char *CharsetRecog_IBM420_ar::getLanguage() const
1217 {
1218 return "ar";
1219 }
1220
1221
match_sbcs(InputText * det,const int32_t ngrams[],const uint8_t byteMap[]) const1222 int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
1223 {
1224 NGramParser_IBM420 parser(ngrams, byteMap);
1225 int32_t result;
1226
1227 result = parser.parse(det);
1228
1229 return result;
1230 }
1231
~CharsetRecog_IBM420_ar_rtl()1232 CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
1233 {
1234 // nothing to do
1235 }
1236
getName() const1237 const char *CharsetRecog_IBM420_ar_rtl::getName() const
1238 {
1239 return "IBM420_rtl";
1240 }
1241
match(InputText * textIn,CharsetMatch * results) const1242 UBool CharsetRecog_IBM420_ar_rtl::match(InputText *textIn, CharsetMatch *results) const
1243 {
1244 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
1245 results->set(textIn, this, confidence);
1246 return (confidence > 0);
1247 }
1248
~CharsetRecog_IBM420_ar_ltr()1249 CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
1250 {
1251 // nothing to do
1252 }
1253
getName() const1254 const char *CharsetRecog_IBM420_ar_ltr::getName() const
1255 {
1256 return "IBM420_ltr";
1257 }
1258
match(InputText * textIn,CharsetMatch * results) const1259 UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results) const
1260 {
1261 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
1262 results->set(textIn, this, confidence);
1263 return (confidence > 0);
1264 }
1265 #endif
1266
1267 U_NAMESPACE_END
1268 #endif
1269
1270