1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2014, International Business Machines Corporation   *
6  * and others. All Rights Reserved.                                            *
7  *******************************************************************************
8  */
9 
10 #ifndef DICTBE_H
11 #define DICTBE_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utext.h"
16 
17 #include "brkeng.h"
18 #include "uvectr32.h"
19 
20 U_NAMESPACE_BEGIN
21 
22 class DictionaryMatcher;
23 class Normalizer2;
24 
25 /*******************************************************************
26  * DictionaryBreakEngine
27  */
28 
29 /**
30  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
31  * dictionary to determine language-specific breaks.</p>
32  *
33  * <p>After it is constructed a DictionaryBreakEngine may be shared between
34  * threads without synchronization.</p>
35  */
36 class DictionaryBreakEngine : public LanguageBreakEngine {
37  private:
38     /**
39      * The set of characters handled by this engine
40      * @internal
41      */
42 
43   UnicodeSet    fSet;
44 
45  public:
46 
47   /**
48    * <p>Constructor </p>
49    */
50   DictionaryBreakEngine();
51 
52   /**
53    * <p>Virtual destructor.</p>
54    */
55   virtual ~DictionaryBreakEngine();
56 
57   /**
58    * <p>Indicate whether this engine handles a particular character for
59    * a particular kind of break.</p>
60    *
61    * @param c A character which begins a run that the engine might handle
62    * @return TRUE if this engine handles the particular character and break
63    * type.
64    */
65   virtual UBool handles(UChar32 c) const;
66 
67   /**
68    * <p>Find any breaks within a run in the supplied text.</p>
69    *
70    * @param text A UText representing the text. The iterator is left at
71    * the end of the run of characters which the engine is capable of handling
72    * that starts from the first character in the range.
73    * @param startPos The start of the run within the supplied text.
74    * @param endPos The end of the run within the supplied text.
75    * @param foundBreaks vector of int32_t to receive the break positions
76    * @return The number of breaks found.
77    */
78   virtual int32_t findBreaks( UText *text,
79                               int32_t startPos,
80                               int32_t endPos,
81                               UVector32 &foundBreaks ) const;
82 
83  protected:
84 
85  /**
86   * <p>Set the character set handled by this engine.</p>
87   *
88   * @param set A UnicodeSet of the set of characters handled by the engine
89   */
90   virtual void setCharacters( const UnicodeSet &set );
91 
92  /**
93   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
94   *
95   * @param text A UText representing the text
96   * @param rangeStart The start of the range of dictionary characters
97   * @param rangeEnd The end of the range of dictionary characters
98   * @param foundBreaks Output of C array of int32_t break positions, or 0
99   * @return The number of breaks found
100   */
101   virtual int32_t divideUpDictionaryRange( UText *text,
102                                            int32_t rangeStart,
103                                            int32_t rangeEnd,
104                                            UVector32 &foundBreaks ) const = 0;
105 
106 };
107 
108 /*******************************************************************
109  * ThaiBreakEngine
110  */
111 
112 /**
113  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
114  * dictionary and heuristics to determine Thai-specific breaks.</p>
115  *
116  * <p>After it is constructed a ThaiBreakEngine may be shared between
117  * threads without synchronization.</p>
118  */
119 class ThaiBreakEngine : public DictionaryBreakEngine {
120  private:
121     /**
122      * The set of characters handled by this engine
123      * @internal
124      */
125 
126   UnicodeSet                fThaiWordSet;
127   UnicodeSet                fEndWordSet;
128   UnicodeSet                fBeginWordSet;
129   UnicodeSet                fSuffixSet;
130   UnicodeSet                fMarkSet;
131   DictionaryMatcher  *fDictionary;
132 
133  public:
134 
135   /**
136    * <p>Default constructor.</p>
137    *
138    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
139    * engine is deleted.
140    */
141   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
142 
143   /**
144    * <p>Virtual destructor.</p>
145    */
146   virtual ~ThaiBreakEngine();
147 
148  protected:
149  /**
150   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
151   *
152   * @param text A UText representing the text
153   * @param rangeStart The start of the range of dictionary characters
154   * @param rangeEnd The end of the range of dictionary characters
155   * @param foundBreaks Output of C array of int32_t break positions, or 0
156   * @return The number of breaks found
157   */
158   virtual int32_t divideUpDictionaryRange( UText *text,
159                                            int32_t rangeStart,
160                                            int32_t rangeEnd,
161                                            UVector32 &foundBreaks ) const;
162 
163 };
164 
165 /*******************************************************************
166  * LaoBreakEngine
167  */
168 
169 /**
170  * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
171  * dictionary and heuristics to determine Lao-specific breaks.</p>
172  *
173  * <p>After it is constructed a LaoBreakEngine may be shared between
174  * threads without synchronization.</p>
175  */
176 class LaoBreakEngine : public DictionaryBreakEngine {
177  private:
178     /**
179      * The set of characters handled by this engine
180      * @internal
181      */
182 
183   UnicodeSet                fLaoWordSet;
184   UnicodeSet                fEndWordSet;
185   UnicodeSet                fBeginWordSet;
186   UnicodeSet                fMarkSet;
187   DictionaryMatcher  *fDictionary;
188 
189  public:
190 
191   /**
192    * <p>Default constructor.</p>
193    *
194    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
195    * engine is deleted.
196    */
197   LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
198 
199   /**
200    * <p>Virtual destructor.</p>
201    */
202   virtual ~LaoBreakEngine();
203 
204  protected:
205  /**
206   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
207   *
208   * @param text A UText representing the text
209   * @param rangeStart The start of the range of dictionary characters
210   * @param rangeEnd The end of the range of dictionary characters
211   * @param foundBreaks Output of C array of int32_t break positions, or 0
212   * @return The number of breaks found
213   */
214   virtual int32_t divideUpDictionaryRange( UText *text,
215                                            int32_t rangeStart,
216                                            int32_t rangeEnd,
217                                            UVector32 &foundBreaks ) const;
218 
219 };
220 
221 /*******************************************************************
222  * BurmeseBreakEngine
223  */
224 
225 /**
226  * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
227  * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
228  *
229  * <p>After it is constructed a BurmeseBreakEngine may be shared between
230  * threads without synchronization.</p>
231  */
232 class BurmeseBreakEngine : public DictionaryBreakEngine {
233  private:
234     /**
235      * The set of characters handled by this engine
236      * @internal
237      */
238 
239   UnicodeSet                fBurmeseWordSet;
240   UnicodeSet                fEndWordSet;
241   UnicodeSet                fBeginWordSet;
242   UnicodeSet                fMarkSet;
243   DictionaryMatcher  *fDictionary;
244 
245  public:
246 
247   /**
248    * <p>Default constructor.</p>
249    *
250    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
251    * engine is deleted.
252    */
253   BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
254 
255   /**
256    * <p>Virtual destructor.</p>
257    */
258   virtual ~BurmeseBreakEngine();
259 
260  protected:
261  /**
262   * <p>Divide up a range of known dictionary characters.</p>
263   *
264   * @param text A UText representing the text
265   * @param rangeStart The start of the range of dictionary characters
266   * @param rangeEnd The end of the range of dictionary characters
267   * @param foundBreaks Output of C array of int32_t break positions, or 0
268   * @return The number of breaks found
269   */
270   virtual int32_t divideUpDictionaryRange( UText *text,
271                                            int32_t rangeStart,
272                                            int32_t rangeEnd,
273                                            UVector32 &foundBreaks ) const;
274 
275 };
276 
277 /*******************************************************************
278  * KhmerBreakEngine
279  */
280 
281 /**
282  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
283  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
284  *
285  * <p>After it is constructed a KhmerBreakEngine may be shared between
286  * threads without synchronization.</p>
287  */
288 class KhmerBreakEngine : public DictionaryBreakEngine {
289  private:
290     /**
291      * The set of characters handled by this engine
292      * @internal
293      */
294 
295   UnicodeSet                fKhmerWordSet;
296   UnicodeSet                fEndWordSet;
297   UnicodeSet                fBeginWordSet;
298   UnicodeSet                fMarkSet;
299   DictionaryMatcher  *fDictionary;
300 
301  public:
302 
303   /**
304    * <p>Default constructor.</p>
305    *
306    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
307    * engine is deleted.
308    */
309   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
310 
311   /**
312    * <p>Virtual destructor.</p>
313    */
314   virtual ~KhmerBreakEngine();
315 
316  protected:
317  /**
318   * <p>Divide up a range of known dictionary characters.</p>
319   *
320   * @param text A UText representing the text
321   * @param rangeStart The start of the range of dictionary characters
322   * @param rangeEnd The end of the range of dictionary characters
323   * @param foundBreaks Output of C array of int32_t break positions, or 0
324   * @return The number of breaks found
325   */
326   virtual int32_t divideUpDictionaryRange( UText *text,
327                                            int32_t rangeStart,
328                                            int32_t rangeEnd,
329                                            UVector32 &foundBreaks ) const;
330 
331 };
332 
333 #if !UCONFIG_NO_NORMALIZATION
334 
335 /*******************************************************************
336  * CjkBreakEngine
337  */
338 
// Indicates the language/script that a CjkBreakEngine will handle.
// The enumerator values are spelled out; they match the implicit numbering.
enum LanguageType {
    kKorean          = 0,
    kChineseJapanese = 1
};
344 
345 /**
346  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
347  * dictionary with costs associated with each word and
348  * Viterbi decoding to determine CJK-specific breaks.</p>
349  */
350 class CjkBreakEngine : public DictionaryBreakEngine {
351  protected:
352     /**
353      * The set of characters handled by this engine
354      * @internal
355      */
356   UnicodeSet                fHangulWordSet;
357   UnicodeSet                fHanWordSet;
358   UnicodeSet                fKatakanaWordSet;
359   UnicodeSet                fHiraganaWordSet;
360 
361   DictionaryMatcher        *fDictionary;
362   const Normalizer2        *nfkcNorm2;
363 
364  public:
365 
366     /**
367      * <p>Default constructor.</p>
368      *
369      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
370      * engine is deleted. The DictionaryMatcher must contain costs for each word
371      * in order for the dictionary to work properly.
372      */
373   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
374 
375     /**
376      * <p>Virtual destructor.</p>
377      */
378   virtual ~CjkBreakEngine();
379 
380  protected:
381     /**
382      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
383      *
384      * @param text A UText representing the text
385      * @param rangeStart The start of the range of dictionary characters
386      * @param rangeEnd The end of the range of dictionary characters
387      * @param foundBreaks Output of C array of int32_t break positions, or 0
388      * @return The number of breaks found
389      */
390   virtual int32_t divideUpDictionaryRange( UText *text,
391           int32_t rangeStart,
392           int32_t rangeEnd,
393           UVector32 &foundBreaks ) const;
394 
395 };
396 
397 #endif
398 
399 U_NAMESPACE_END
400 
401     /* DICTBE_H */
402 #endif
403