// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
 *******************************************************************************
 * Copyright (C) 2006-2014, International Business Machines Corporation       *
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */

// Declarations of the dictionary-based break engines: the abstract
// DictionaryBreakEngine base class and its Thai, Lao, Burmese, Khmer and
// (when normalization is enabled) CJK subclasses.

#ifndef DICTBE_H
#define DICTBE_H

#include "unicode/utypes.h"
#include "unicode/uniset.h"
#include "unicode/utext.h"

#include "brkeng.h"

U_NAMESPACE_BEGIN

// Forward declarations: both types are only used through pointers below.
class DictionaryMatcher;
class Normalizer2;

/*******************************************************************
 * DictionaryBreakEngine
 */

/**
 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
 * dictionary to determine language-specific breaks.</p>
 *
 * <p>The class is abstract: divideUpDictionaryRange() is pure virtual and is
 * supplied by each language-specific subclass.</p>
 *
 * <p>After it is constructed a DictionaryBreakEngine may be shared between
 * threads without synchronization.</p>
 */
class DictionaryBreakEngine : public LanguageBreakEngine {
 private:
    /**
     * The set of characters handled by this engine
     * @internal
     */

  UnicodeSet    fSet;

    /**
     * The set of break types handled by this engine, stored as a bitmap
     * (see the breakTypes constructor parameter).
     * @internal
     */

  uint32_t      fTypes;

  /**
   * <p>Default constructor.</p>
   *
   * <p>Declared private, so it is not accessible outside this class; use the
   * constructor taking the break types instead.</p>
   */
  DictionaryBreakEngine();

 public:

  /**
   * <p>Constructor setting the break types handled.</p>
   *
   * @param breakTypes A bitmap of types handled by the engine.
   */
  DictionaryBreakEngine( uint32_t breakTypes );

  /**
   * <p>Virtual destructor.</p>
   */
  virtual ~DictionaryBreakEngine();

  /**
   * <p>Indicate whether this engine handles a particular character for
   * a particular kind of break.</p>
   *
   * @param c A character which begins a run that the engine might handle
   * @param breakType The type of text break which the caller wants to determine
   * @return TRUE if this engine handles the particular character and break
   * type.
   */
  virtual UBool handles( UChar32 c, int32_t breakType ) const;

  /**
   * <p>Find any breaks within a run in the supplied text.</p>
   *
   * @param text A UText representing the text. The iterator is left at
   * the end of the run of characters which the engine is capable of handling
   * that starts from the first (or last) character in the range.
   * @param startPos The start of the run within the supplied text.
   * @param endPos The end of the run within the supplied text.
   * @param reverse Whether the caller is looking for breaks in a reverse
   * direction.
   * @param breakType The type of break desired, or -1.
   * @param foundBreaks Output: the UStack that receives the breaks found,
   * if any.
   * @return The number of breaks found.
   */
  virtual int32_t findBreaks( UText *text,
                              int32_t startPos,
                              int32_t endPos,
                              UBool reverse,
                              int32_t breakType,
                              UStack &foundBreaks ) const;

 protected:

  /**
   * <p>Set the character set handled by this engine.</p>
   *
   * @param set A UnicodeSet of the set of characters handled by the engine
   */
  virtual void setCharacters( const UnicodeSet &set );

  /*
   * The declaration below is commented out and therefore not part of the
   * class. Its original description, kept for reference:
   *
   *   Set the break types handled by this engine.
   *   breakTypes: A bitmap of types handled by the engine.
   */
//  virtual void setBreakTypes( uint32_t breakTypes );

  /**
   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
   *
   * <p>Pure virtual: every concrete engine declared in this header overrides it.</p>
   *
   * @param text A UText representing the text
   * @param rangeStart The start of the range of dictionary characters
   * @param rangeEnd The end of the range of dictionary characters
   * @param foundBreaks Output: the UStack that receives the break positions
   * @return The number of breaks found
   */
  virtual int32_t divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const = 0;

};

/*******************************************************************
 * ThaiBreakEngine
 */

/**
 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
 * dictionary and heuristics to determine Thai-specific breaks.</p>
 *
 * <p>After it is constructed a ThaiBreakEngine may be shared between
 * threads without synchronization.</p>
 */
class ThaiBreakEngine : public DictionaryBreakEngine {
 private:
    /**
     * The set of characters handled by this engine
     * @internal
     */

  UnicodeSet                fThaiWordSet;

  // Further character sets. Their precise roles are not visible in this
  // header. NOTE(review): by name they look like word-end, word-begin, suffix
  // and mark character classes - confirm against the implementation.
  UnicodeSet                fEndWordSet;
  UnicodeSet                fBeginWordSet;
  UnicodeSet                fSuffixSet;
  UnicodeSet                fMarkSet;

  // NOTE(review): presumably the matcher adopted by the constructor - confirm.
  DictionaryMatcher  *fDictionary;

 public:

  /**
   * <p>Constructor; takes arguments, so it is not a default constructor.</p>
   *
   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
   * engine is deleted.
   * @param status NOTE(review): presumably the usual ICU in/out error code;
   * the conditions under which it is set are not visible in this header.
   */
  ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);

  /**
   * <p>Virtual destructor.</p>
   */
  virtual ~ThaiBreakEngine();

 protected:
  /**
   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
   *
   * @param text A UText representing the text
   * @param rangeStart The start of the range of dictionary characters
   * @param rangeEnd The end of the range of dictionary characters
   * @param foundBreaks Output: the UStack that receives the break positions
   * @return The number of breaks found
   */
  virtual int32_t divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const;

};

/*******************************************************************
 * LaoBreakEngine
 */

/**
 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
 * dictionary and heuristics to determine Lao-specific breaks.</p>
 *
 * <p>After it is constructed a LaoBreakEngine may be shared between
 * threads without synchronization.</p>
 */
class LaoBreakEngine : public DictionaryBreakEngine {
 private:
    /**
     * The set of characters handled by this engine
     * @internal
     */

  UnicodeSet                fLaoWordSet;

  // Further character sets. Their precise roles are not visible in this
  // header. NOTE(review): by name they look like word-end, word-begin and
  // mark character classes - confirm against the implementation.
  UnicodeSet                fEndWordSet;
  UnicodeSet                fBeginWordSet;
  UnicodeSet                fMarkSet;

  // NOTE(review): presumably the matcher adopted by the constructor - confirm.
  DictionaryMatcher  *fDictionary;

 public:

  /**
   * <p>Constructor; takes arguments, so it is not a default constructor.</p>
   *
   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
   * engine is deleted.
   * @param status NOTE(review): presumably the usual ICU in/out error code;
   * the conditions under which it is set are not visible in this header.
   */
  LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);

  /**
   * <p>Virtual destructor.</p>
   */
  virtual ~LaoBreakEngine();

 protected:
  /**
   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
   *
   * @param text A UText representing the text
   * @param rangeStart The start of the range of dictionary characters
   * @param rangeEnd The end of the range of dictionary characters
   * @param foundBreaks Output: the UStack that receives the break positions
   * @return The number of breaks found
   */
  virtual int32_t divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const;

};

/*******************************************************************
 * BurmeseBreakEngine
 */

/**
 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
 *
 * <p>After it is constructed a BurmeseBreakEngine may be shared between
 * threads without synchronization.</p>
 */
class BurmeseBreakEngine : public DictionaryBreakEngine {
 private:
    /**
     * The set of characters handled by this engine
     * @internal
     */

  UnicodeSet                fBurmeseWordSet;

  // Further character sets. Their precise roles are not visible in this
  // header. NOTE(review): by name they look like word-end, word-begin and
  // mark character classes - confirm against the implementation.
  UnicodeSet                fEndWordSet;
  UnicodeSet                fBeginWordSet;
  UnicodeSet                fMarkSet;

  // NOTE(review): presumably the matcher adopted by the constructor - confirm.
  DictionaryMatcher  *fDictionary;

 public:

  /**
   * <p>Constructor; takes arguments, so it is not a default constructor.</p>
   *
   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
   * engine is deleted.
   * @param status NOTE(review): presumably the usual ICU in/out error code;
   * the conditions under which it is set are not visible in this header.
   */
  BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);

  /**
   * <p>Virtual destructor.</p>
   */
  virtual ~BurmeseBreakEngine();

 protected:
  /**
   * <p>Divide up a range of known dictionary characters.</p>
   *
   * @param text A UText representing the text
   * @param rangeStart The start of the range of dictionary characters
   * @param rangeEnd The end of the range of dictionary characters
   * @param foundBreaks Output: the UStack that receives the break positions
   * @return The number of breaks found
   */
  virtual int32_t divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const;

};

/*******************************************************************
 * KhmerBreakEngine
 */

/**
 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
 *
 * <p>After it is constructed a KhmerBreakEngine may be shared between
 * threads without synchronization.</p>
 */
class KhmerBreakEngine : public DictionaryBreakEngine {
 private:
    /**
     * The set of characters handled by this engine
     * @internal
     */

  UnicodeSet                fKhmerWordSet;

  // Further character sets. Their precise roles are not visible in this
  // header. NOTE(review): by name they look like word-end, word-begin and
  // mark character classes - confirm against the implementation.
  UnicodeSet                fEndWordSet;
  UnicodeSet                fBeginWordSet;
  UnicodeSet                fMarkSet;

  // NOTE(review): presumably the matcher adopted by the constructor - confirm.
  DictionaryMatcher  *fDictionary;

 public:

  /**
   * <p>Constructor; takes arguments, so it is not a default constructor.</p>
   *
   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
   * engine is deleted.
   * @param status NOTE(review): presumably the usual ICU in/out error code;
   * the conditions under which it is set are not visible in this header.
   */
  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);

  /**
   * <p>Virtual destructor.</p>
   */
  virtual ~KhmerBreakEngine();

 protected:
  /**
   * <p>Divide up a range of known dictionary characters.</p>
   *
   * @param text A UText representing the text
   * @param rangeStart The start of the range of dictionary characters
   * @param rangeEnd The end of the range of dictionary characters
   * @param foundBreaks Output: the UStack that receives the break positions
   * @return The number of breaks found
   */
  virtual int32_t divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const;

};

// CjkBreakEngine and LanguageType are declared only when normalization
// support is compiled in (UCONFIG_NO_NORMALIZATION evaluates to false).
#if !UCONFIG_NO_NORMALIZATION

/*******************************************************************
 * CjkBreakEngine
 */

/**
 * Indicates the language/script that the CjkBreakEngine will handle.
 */
enum LanguageType {
    kKorean,
    kChineseJapanese
};

/**
 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
 * dictionary with costs associated with each word and
 * Viterbi decoding to determine CJK-specific breaks.</p>
 */
class CjkBreakEngine : public DictionaryBreakEngine {
 protected:
    /**
     * The set of characters handled by this engine
     * @internal
     */
  UnicodeSet                fHangulWordSet;

  // Further character sets. Their precise roles are not visible in this
  // header. NOTE(review): by name they look like per-script (Han, Katakana,
  // Hiragana) character classes - confirm against the implementation.
  UnicodeSet                fHanWordSet;
  UnicodeSet                fKatakanaWordSet;
  UnicodeSet                fHiraganaWordSet;

  // NOTE(review): presumably the matcher adopted by the constructor - confirm.
  DictionaryMatcher        *fDictionary;
  // NOTE(review): by name, a normalizer for NFKC - confirm where it is
  // obtained and whether this class owns it.
  const Normalizer2        *nfkcNorm2;

 public:

    /**
     * <p>Constructor; takes arguments, so it is not a default constructor.</p>
     *
     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
     * engine is deleted. The DictionaryMatcher must contain costs for each word
     * in order for the dictionary to work properly.
     * @param type The language/script this engine will handle
     * (kKorean or kChineseJapanese).
     * @param status NOTE(review): presumably the usual ICU in/out error code;
     * the conditions under which it is set are not visible in this header.
     */
  CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);

    /**
     * <p>Virtual destructor.</p>
     */
  virtual ~CjkBreakEngine();

 protected:
    /**
     * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
     *
     * @param text A UText representing the text
     * @param rangeStart The start of the range of dictionary characters
     * @param rangeEnd The end of the range of dictionary characters
     * @param foundBreaks Output: the UStack that receives the break positions
     * @return The number of breaks found
     */
  virtual int32_t divideUpDictionaryRange( UText *text,
          int32_t rangeStart,
          int32_t rangeEnd,
          UStack &foundBreaks ) const;

};

#endif  /* !UCONFIG_NO_NORMALIZATION */

U_NAMESPACE_END

    /* DICTBE_H */
#endif