1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #ifndef __CSRSBCS_H 9 #define __CSRSBCS_H 10 11 #include "unicode/uobject.h" 12 13 #if !UCONFIG_NO_CONVERSION 14 15 #include "csrecog.h" 16 17 U_NAMESPACE_BEGIN 18 19 class NGramParser : public UMemory 20 { 21 private: 22 int32_t ngram; 23 const int32_t *ngramList; 24 25 int32_t ngramCount; 26 int32_t hitCount; 27 28 protected: 29 int32_t byteIndex; 30 const uint8_t *charMap; 31 32 void addByte(int32_t b); 33 34 public: 35 NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); 36 virtual ~NGramParser(); 37 38 private: 39 /* 40 * Binary search for value in table, which must have exactly 64 entries. 41 */ 42 int32_t search(const int32_t *table, int32_t value); 43 44 void lookup(int32_t thisNgram); 45 46 virtual int32_t nextByte(InputText *det); 47 virtual void parseCharacters(InputText *det); 48 49 public: 50 int32_t parse(InputText *det); 51 52 }; 53 54 #if !UCONFIG_ONLY_HTML_CONVERSION 55 class NGramParser_IBM420 : public NGramParser 56 { 57 private: 58 int32_t alef; 59 int32_t isLamAlef(int32_t b); 60 int32_t nextByte(InputText *det); 61 void parseCharacters(InputText *det); 62 63 public: 64 NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap); 65 }; 66 #endif 67 68 69 class CharsetRecog_sbcs : public CharsetRecognizer 70 { 71 public: 72 CharsetRecog_sbcs(); 73 virtual ~CharsetRecog_sbcs(); 74 virtual const char *getName() const = 0; 75 virtual UBool match(InputText *det, CharsetMatch *results) const = 0; 76 virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; 77 }; 78 79 class CharsetRecog_8859_1 : public CharsetRecog_sbcs 80 { 81 public: 82 virtual ~CharsetRecog_8859_1(); 83 const char *getName() const; 84 virtual UBool match(InputText *det, CharsetMatch *results) const; 85 }; 86 87 class CharsetRecog_8859_2 : public CharsetRecog_sbcs 88 { 89 public: 90 virtual ~CharsetRecog_8859_2(); 91 const char *getName() const; 92 virtual UBool match(InputText *det, CharsetMatch *results) const; 93 }; 94 95 class CharsetRecog_8859_5 : public CharsetRecog_sbcs 96 { 97 public: 98 virtual ~CharsetRecog_8859_5(); 99 const char *getName() const; 100 }; 101 102 class CharsetRecog_8859_6 : public CharsetRecog_sbcs 103 { 104 public: 105 virtual ~CharsetRecog_8859_6(); 106 107 const char *getName() const; 108 }; 109 110 class CharsetRecog_8859_7 : public CharsetRecog_sbcs 111 { 112 public: 113 virtual ~CharsetRecog_8859_7(); 114 115 const char *getName() const; 116 }; 117 118 class CharsetRecog_8859_8 : public CharsetRecog_sbcs 119 { 120 public: 121 virtual ~CharsetRecog_8859_8(); 122 123 virtual const char *getName() const; 124 }; 125 126 class CharsetRecog_8859_9 : public CharsetRecog_sbcs 127 { 128 public: 129 virtual ~CharsetRecog_8859_9(); 130 131 const char *getName() const; 132 }; 133 134 135 136 class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 137 { 138 public: 139 virtual ~CharsetRecog_8859_5_ru(); 140 141 const char *getLanguage() const; 142 143 virtual UBool match(InputText *det, CharsetMatch *results) const; 144 }; 145 146 class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 147 { 148 public: 149 virtual ~CharsetRecog_8859_6_ar(); 150 151 const char *getLanguage() const; 152 153 virtual UBool match(InputText *det, CharsetMatch *results) const; 154 }; 155 156 class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 157 { 158 public: 159 virtual ~CharsetRecog_8859_7_el(); 160 161 const char *getLanguage() const; 162 163 virtual UBool match(InputText *det, CharsetMatch *results) const; 164 }; 165 166 class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 167 { 168 public: 169 virtual ~CharsetRecog_8859_8_I_he(); 170 171 const char *getName() const; 172 173 const char *getLanguage() const; 174 175 virtual UBool match(InputText *det, CharsetMatch *results) const; 176 }; 177 178 class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 179 { 180 public: 181 virtual ~CharsetRecog_8859_8_he (); 182 183 const char *getLanguage() const; 184 185 virtual UBool match(InputText *det, CharsetMatch *results) const; 186 }; 187 188 class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 189 { 190 public: 191 virtual ~CharsetRecog_8859_9_tr (); 192 193 const char *getLanguage() const; 194 195 virtual UBool match(InputText *det, CharsetMatch *results) const; 196 }; 197 198 class CharsetRecog_windows_1256 : public CharsetRecog_sbcs 199 { 200 public: 201 virtual ~CharsetRecog_windows_1256(); 202 203 const char *getName() const; 204 205 const char *getLanguage() const; 206 207 virtual UBool match(InputText *det, CharsetMatch *results) const; 208 }; 209 210 class CharsetRecog_windows_1251 : public CharsetRecog_sbcs 211 { 212 public: 213 virtual ~CharsetRecog_windows_1251(); 214 215 const char *getName() const; 216 217 const char *getLanguage() const; 218 219 virtual UBool match(InputText *det, CharsetMatch *results) const; 220 }; 221 222 223 class CharsetRecog_KOI8_R : public CharsetRecog_sbcs 224 { 225 public: 226 virtual ~CharsetRecog_KOI8_R(); 227 228 const char *getName() const; 229 230 const char *getLanguage() const; 231 232 virtual UBool match(InputText *det, CharsetMatch *results) const; 233 }; 234 235 #if !UCONFIG_ONLY_HTML_CONVERSION 236 class CharsetRecog_IBM424_he : public CharsetRecog_sbcs 237 { 238 public: 239 virtual ~CharsetRecog_IBM424_he(); 240 241 const char *getLanguage() const; 242 }; 243 244 class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { 245 public: 246 virtual ~CharsetRecog_IBM424_he_rtl(); 247 248 const char *getName() const; 249 250 virtual UBool match(InputText *det, CharsetMatch *results) const; 251 }; 252 253 class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { 254 virtual ~CharsetRecog_IBM424_he_ltr(); 255 256 const char *getName() const; 257 258 virtual UBool match(InputText *det, CharsetMatch *results) const; 259 }; 260 261 class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs 262 { 263 public: 264 virtual ~CharsetRecog_IBM420_ar(); 265 266 const char *getLanguage() const; 267 int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; 268 269 }; 270 271 class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { 272 public: 273 virtual ~CharsetRecog_IBM420_ar_rtl(); 274 275 const char *getName() const; 276 277 virtual UBool match(InputText *det, CharsetMatch *results) const; 278 }; 279 280 class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { 281 virtual ~CharsetRecog_IBM420_ar_ltr(); 282 283 const char *getName() const; 284 285 virtual UBool match(InputText *det, CharsetMatch *results) const; 286 }; 287 #endif 288 289 U_NAMESPACE_END 290 291 #endif /* !UCONFIG_NO_CONVERSION */ 292 #endif /* __CSRSBCS_H */ 293