1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2005-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 10 #ifndef __CSRSBCS_H 11 #define __CSRSBCS_H 12 13 #include "unicode/uobject.h" 14 15 #if !UCONFIG_NO_CONVERSION 16 17 #include "csrecog.h" 18 19 U_NAMESPACE_BEGIN 20 21 class NGramParser : public UMemory 22 { 23 private: 24 int32_t ngram; 25 const int32_t *ngramList; 26 27 int32_t ngramCount; 28 int32_t hitCount; 29 30 protected: 31 int32_t byteIndex; 32 const uint8_t *charMap; 33 34 void addByte(int32_t b); 35 36 public: 37 NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); 38 virtual ~NGramParser(); 39 40 private: 41 /* 42 * Binary search for value in table, which must have exactly 64 entries. 43 */ 44 int32_t search(const int32_t *table, int32_t value); 45 46 void lookup(int32_t thisNgram); 47 48 virtual int32_t nextByte(InputText *det); 49 virtual void parseCharacters(InputText *det); 50 51 public: 52 int32_t parse(InputText *det); 53 54 }; 55 56 #if !UCONFIG_ONLY_HTML_CONVERSION 57 class NGramParser_IBM420 : public NGramParser 58 { 59 public: 60 NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap); 61 ~NGramParser_IBM420(); 62 63 private: 64 int32_t alef; 65 int32_t isLamAlef(int32_t b); 66 int32_t nextByte(InputText *det); 67 void parseCharacters(InputText *det); 68 }; 69 #endif 70 71 72 class CharsetRecog_sbcs : public CharsetRecognizer 73 { 74 public: 75 CharsetRecog_sbcs(); 76 virtual ~CharsetRecog_sbcs(); 77 virtual const char *getName() const = 0; 78 virtual UBool match(InputText *det, CharsetMatch *results) const = 0; 79 virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; 80 }; 81 82 class CharsetRecog_8859_1 : public CharsetRecog_sbcs 83 { 84 public: 85 virtual ~CharsetRecog_8859_1(); 86 const char *getName() const; 87 virtual UBool match(InputText *det, CharsetMatch *results) const; 88 }; 89 90 class CharsetRecog_8859_2 : public CharsetRecog_sbcs 91 { 92 public: 93 virtual ~CharsetRecog_8859_2(); 94 const char *getName() const; 95 virtual UBool match(InputText *det, CharsetMatch *results) const; 96 }; 97 98 class CharsetRecog_8859_5 : public CharsetRecog_sbcs 99 { 100 public: 101 virtual ~CharsetRecog_8859_5(); 102 const char *getName() const; 103 }; 104 105 class CharsetRecog_8859_6 : public CharsetRecog_sbcs 106 { 107 public: 108 virtual ~CharsetRecog_8859_6(); 109 110 const char *getName() const; 111 }; 112 113 class CharsetRecog_8859_7 : public CharsetRecog_sbcs 114 { 115 public: 116 virtual ~CharsetRecog_8859_7(); 117 118 const char *getName() const; 119 }; 120 121 class CharsetRecog_8859_8 : public CharsetRecog_sbcs 122 { 123 public: 124 virtual ~CharsetRecog_8859_8(); 125 126 virtual const char *getName() const; 127 }; 128 129 class CharsetRecog_8859_9 : public CharsetRecog_sbcs 130 { 131 public: 132 virtual ~CharsetRecog_8859_9(); 133 134 const char *getName() const; 135 }; 136 137 138 139 class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 140 { 141 public: 142 virtual ~CharsetRecog_8859_5_ru(); 143 144 const char *getLanguage() const; 145 146 virtual UBool match(InputText *det, CharsetMatch *results) const; 147 }; 148 149 class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 150 { 151 public: 152 virtual ~CharsetRecog_8859_6_ar(); 153 154 const char *getLanguage() const; 155 156 virtual UBool match(InputText *det, CharsetMatch *results) const; 157 }; 158 159 class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 160 { 161 public: 162 virtual ~CharsetRecog_8859_7_el(); 163 164 const char *getLanguage() const; 165 166 virtual UBool match(InputText *det, CharsetMatch *results) const; 167 }; 168 169 class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 170 { 171 public: 172 virtual ~CharsetRecog_8859_8_I_he(); 173 174 const char *getName() const; 175 176 const char *getLanguage() const; 177 178 virtual UBool match(InputText *det, CharsetMatch *results) const; 179 }; 180 181 class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 182 { 183 public: 184 virtual ~CharsetRecog_8859_8_he (); 185 186 const char *getLanguage() const; 187 188 virtual UBool match(InputText *det, CharsetMatch *results) const; 189 }; 190 191 class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 192 { 193 public: 194 virtual ~CharsetRecog_8859_9_tr (); 195 196 const char *getLanguage() const; 197 198 virtual UBool match(InputText *det, CharsetMatch *results) const; 199 }; 200 201 class CharsetRecog_windows_1256 : public CharsetRecog_sbcs 202 { 203 public: 204 virtual ~CharsetRecog_windows_1256(); 205 206 const char *getName() const; 207 208 const char *getLanguage() const; 209 210 virtual UBool match(InputText *det, CharsetMatch *results) const; 211 }; 212 213 class CharsetRecog_windows_1251 : public CharsetRecog_sbcs 214 { 215 public: 216 virtual ~CharsetRecog_windows_1251(); 217 218 const char *getName() const; 219 220 const char *getLanguage() const; 221 222 virtual UBool match(InputText *det, CharsetMatch *results) const; 223 }; 224 225 226 class CharsetRecog_KOI8_R : public CharsetRecog_sbcs 227 { 228 public: 229 virtual ~CharsetRecog_KOI8_R(); 230 231 const char *getName() const; 232 233 const char *getLanguage() const; 234 235 virtual UBool match(InputText *det, CharsetMatch *results) const; 236 }; 237 238 #if !UCONFIG_ONLY_HTML_CONVERSION 239 class CharsetRecog_IBM424_he : public CharsetRecog_sbcs 240 { 241 public: 242 virtual ~CharsetRecog_IBM424_he(); 243 244 const char *getLanguage() const; 245 }; 246 247 class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { 248 public: 249 virtual ~CharsetRecog_IBM424_he_rtl(); 250 251 const char *getName() const; 252 253 virtual UBool match(InputText *det, CharsetMatch *results) const; 254 }; 255 256 class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { 257 virtual ~CharsetRecog_IBM424_he_ltr(); 258 259 const char *getName() const; 260 261 virtual UBool match(InputText *det, CharsetMatch *results) const; 262 }; 263 264 class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs 265 { 266 public: 267 virtual ~CharsetRecog_IBM420_ar(); 268 269 const char *getLanguage() const; 270 int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; 271 272 }; 273 274 class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { 275 public: 276 virtual ~CharsetRecog_IBM420_ar_rtl(); 277 278 const char *getName() const; 279 280 virtual UBool match(InputText *det, CharsetMatch *results) const; 281 }; 282 283 class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { 284 virtual ~CharsetRecog_IBM420_ar_ltr(); 285 286 const char *getName() const; 287 288 virtual UBool match(InputText *det, CharsetMatch *results) const; 289 }; 290 #endif 291 292 U_NAMESPACE_END 293 294 #endif /* !UCONFIG_NO_CONVERSION */ 295 #endif /* __CSRSBCS_H */ 296