1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  ************************************************************************************
5  * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
6  * All Rights Reserved.                                                             *
7  ************************************************************************************
8  */
9 
10 #ifndef BRKENG_H
11 #define BRKENG_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uobject.h"
15 #include "unicode/utext.h"
16 #include "unicode/uscript.h"
17 
18 U_NAMESPACE_BEGIN
19 
20 class UnicodeSet;
21 class UStack;
22 class DictionaryMatcher;
23 
24 /*******************************************************************
25  * LanguageBreakEngine
26  */
27 
28 /**
29  * <p>LanguageBreakEngines implement language-specific knowledge for
30  * finding text boundaries within a run of characters belonging to a
31  * specific set. The boundaries will be of a specific kind, e.g. word,
32  * line, etc.</p>
33  *
34  * <p>LanguageBreakEngines should normally be implemented so as to
35  * be shared between threads without locking.</p>
36  */
37 class LanguageBreakEngine : public UMemory {
38  public:
39 
40   /**
41    * <p>Default constructor; takes no arguments.</p>
42    *
43    */
44   LanguageBreakEngine();
45 
46   /**
47    * <p>Virtual destructor, so an engine can be destroyed through a base-class pointer.</p>
48    */
49   virtual ~LanguageBreakEngine();
50 
51  /**
52   * <p>Indicate whether this engine handles a particular character for
53   * a particular kind of break.</p>
54   * <p>Pure virtual: every concrete engine must provide an implementation.</p>
55   * @param c A character which begins a run that the engine might handle
56   * @param breakType The type of text break which the caller wants to determine
57   * @return TRUE if this engine handles the particular character and break
58   * type.
59   */
60   virtual UBool handles(UChar32 c, int32_t breakType) const = 0;
61 
62  /**
63   * <p>Find any breaks within a run in the supplied text.</p>
64   * <p>Pure virtual: every concrete engine must provide an implementation.</p>
65   * @param text A UText representing the text. The
66   * iterator is left at the end of the run of characters which the engine
67   * is capable of handling.
68   * @param startPos The start of the run within the supplied text.
69   * @param endPos The end of the run within the supplied text.
70   * @param reverse Whether the caller is looking for breaks in a reverse
71   * direction.
72   * @param breakType The type of break desired, or -1.
73   * @param foundBreaks The UStack, passed by reference, for the breaks found, if any
74   * @return The number of breaks found.
75   */
76   virtual int32_t findBreaks( UText *text,
77                               int32_t startPos,
78                               int32_t endPos,
79                               UBool reverse,
80                               int32_t breakType,
81                               UStack &foundBreaks ) const = 0;
82 
83 };
84 
85 /*******************************************************************
86  * LanguageBreakFactory
87  */
88 
89 /**
90  * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
91  * that can determine breaks for characters in a specific set, if
92  * such an object can be found.</p>
93  *
94  * <p>If a LanguageBreakFactory is to be shared between threads,
95  * appropriate synchronization must be used; there is none internal
96  * to the factory.</p>
97  *
98  * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
99  * normally be shared between threads without synchronization, unless
100  * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
101  *
102  * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
103  * it returns when it itself is deleted, unless the specific subclass of
104  * LanguageBreakFactory indicates otherwise. Naturally, the factory should
105  * not be deleted until the LanguageBreakEngines it has returned are no
106  * longer needed.</p>
107  */
108 class LanguageBreakFactory : public UMemory {
109  public:
110 
111   /**
112    * <p>Default constructor; takes no arguments.</p>
113    *
114    */
115   LanguageBreakFactory();
116 
117   /**
118    * <p>Virtual destructor, so a factory can be destroyed through a base-class pointer.</p>
119    */
120   virtual ~LanguageBreakFactory();
121 
122  /**
123   * <p>Find and return a LanguageBreakEngine that can find the desired
124   * kind of break for the set of characters to which the supplied
125   * character belongs. It is up to the set of available engines to
126   * determine what the sets of characters are.</p>
127   * <p>Pure virtual and not declared const: every concrete factory must implement it.</p>
128   * @param c A character that begins a run for which a LanguageBreakEngine is
129   * sought.
130   * @param breakType The kind of text break for which a LanguageBreakEngine is
131   * sought.
132   * @return A LanguageBreakEngine with the desired characteristics, or 0 if none is found; see the class description for who deletes it.
133   */
134   virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0;
135 
136 };
137 
138 /*******************************************************************
139  * UnhandledEngine
140  */
141 
142 /**
143  * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
144  * handles characters that no other LanguageBreakEngine is available to
145  * handle. It is told the character and the type of break; at its
146  * discretion it may handle more than the specified character (e.g.,
147  * the entire script to which that character belongs).</p>
148  *
149  * <p>UnhandledEngines may not be shared between threads without
150  * external synchronization.</p>
151  */
152 
153 class UnhandledEngine : public LanguageBreakEngine {
154  private:
155 
156     /**
157      * The sets of characters handled, one per break type (NOTE(review): the size 4 is presumably the number of break types; confirm).
158      * @internal
159      */
160 
161   UnicodeSet    *fHandled[4];
162 
163  public:
164 
165   /**
166    * <p>Constructor. Declared explicit so that a UErrorCode is never implicitly converted to an engine.</p>
167    * @param status Caller-supplied error code; how it is used is defined by the implementation.
168    */
169   explicit UnhandledEngine(UErrorCode &status);
170 
171   /**
172    * <p>Virtual destructor.</p>
173    */
174   virtual ~UnhandledEngine();
175 
176  /**
177   * <p>Indicate whether this engine handles a particular character for
178   * a particular kind of break.</p>
179   * <p>Implements the pure virtual LanguageBreakEngine::handles().</p>
180   * @param c A character which begins a run that the engine might handle
181   * @param breakType The type of text break which the caller wants to determine
182   * @return TRUE if this engine handles the particular character and break
183   * type.
184   */
185   virtual UBool handles(UChar32 c, int32_t breakType) const;
186 
187  /**
188   * <p>Find any breaks within a run in the supplied text.</p>
189   * <p>Implements the pure virtual LanguageBreakEngine::findBreaks().</p>
190   * @param text A UText representing the text. The
191   * iterator is left at the end of the run of characters which the engine
192   * is capable of handling.
193   * @param startPos The start of the run within the supplied text.
194   * @param endPos The end of the run within the supplied text.
195   * @param reverse Whether the caller is looking for breaks in a reverse
196   * direction.
197   * @param breakType The type of break desired, or -1.
198   * @param foundBreaks The UStack, passed by reference, for the breaks found, if any
199   * @return The number of breaks found.
200   */
201   virtual int32_t findBreaks( UText *text,
202                               int32_t startPos,
203                               int32_t endPos,
204                               UBool reverse,
205                               int32_t breakType,
206                               UStack &foundBreaks ) const;
207 
208  /**
209   * <p>Tell the engine to handle a particular character and break type.</p>
210   * <p>Not declared const, unlike handles() and findBreaks().</p>
211   * @param c A character which the engine should handle
212   * @param breakType The type of text break for which the engine should handle c
213   */
214   virtual void handleCharacter(UChar32 c, int32_t breakType);
215 
216 };
217 
218 /*******************************************************************
219  * ICULanguageBreakFactory
220  */
221 
222 /**
223  * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
224  * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
225  * data in the ICU data file.</p>
226  */
227 class ICULanguageBreakFactory : public LanguageBreakFactory {
228  private:
229 
230     /**
231      * The stack of break engines created by this factory (held through a raw UStack pointer)
232      * @internal
233      */
234 
235   UStack    *fEngines;
236 
237  public:
238 
239   /**
240    * <p>Standard constructor. Declared explicit so that a UErrorCode is never implicitly converted to a factory.</p>
241    * @param status Caller-supplied error code; how it is used is defined by the implementation.
242    */
243   explicit ICULanguageBreakFactory(UErrorCode &status);
244 
245   /**
246    * <p>Virtual destructor.</p>
247    */
248   virtual ~ICULanguageBreakFactory();
249 
250  /**
251   * <p>Find and return a LanguageBreakEngine that can find the desired
252   * kind of break for the set of characters to which the supplied
253   * character belongs. It is up to the set of available engines to
254   * determine what the sets of characters are.</p>
255   * <p>Implements the pure virtual LanguageBreakFactory::getEngineFor().</p>
256   * @param c A character that begins a run for which a LanguageBreakEngine is
257   * sought.
258   * @param breakType The kind of text break for which a LanguageBreakEngine is
259   * sought.
260   * @return A LanguageBreakEngine with the desired characteristics, or 0 if none is found.
261   */
262   virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
263 
264 protected:
265  /**
266   * <p>Create a LanguageBreakEngine for the set of characters to which
267   * the supplied character belongs, for the specified break type.</p>
268   * <p>Protected and virtual, so a subclass may override how engines are created.</p>
269   * @param c A character that begins a run for which a LanguageBreakEngine is
270   * sought.
271   * @param breakType The kind of text break for which a LanguageBreakEngine is
272   * sought.
273   * @return A LanguageBreakEngine with the desired characteristics, or 0 if none can be created.
274   */
275   virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType);
276 
277   /**
278    * <p>Create a DictionaryMatcher for the specified script and break type.</p>
279    * @param script An ISO 15924 script code that identifies the dictionary to be
280    * created.
281    * @param breakType The kind of text break for which a dictionary is
282    * sought.
283    * @return A DictionaryMatcher with the desired characteristics, or NULL.
284    */
285   virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
286 };
287 
288 U_NAMESPACE_END
289 
290     /* BRKENG_H */
291 #endif
292