1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  ************************************************************************************
5  * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
6  * All Rights Reserved.                                                             *
7  ************************************************************************************
8  */
9 
10 #ifndef BRKENG_H
11 #define BRKENG_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uobject.h"
15 #include "unicode/utext.h"
16 #include "unicode/uscript.h"
17 
18 U_NAMESPACE_BEGIN
19 
20 class UnicodeSet;
21 class UStack;
22 class UVector32;
23 class DictionaryMatcher;
24 
25 /*******************************************************************
26  * LanguageBreakEngine
27  */
28 
29 /**
30  * <p>LanguageBreakEngines implement language-specific knowledge for
31  * finding text boundaries within a run of characters belonging to a
32  * specific set. The boundaries will be of a specific kind, e.g. word,
33  * line, etc.</p>
34  *
35  * <p>LanguageBreakEngines should normally be implemented so as to
36  * be shared between threads without locking.</p>
37  */
38 class LanguageBreakEngine : public UMemory {
39  public:
40 
41   /**
42    * <p>Default constructor.</p>
43    *
44    */
45   LanguageBreakEngine();
46 
47   /**
48    * <p>Virtual destructor.</p>
49    */
50   virtual ~LanguageBreakEngine();
51 
52  /**
53   * <p>Indicate whether this engine handles a particular character for
54   * a particular kind of break.</p>
55   *
56   * @param c A character which begins a run that the engine might handle
57   * @return TRUE if this engine handles the particular character and break
58   * type.
59   */
60   virtual UBool handles(UChar32 c) const = 0;
61 
62  /**
63   * <p>Find any breaks within a run in the supplied text.</p>
64   *
65   * @param text A UText representing the text. The
66   * iterator is left at the end of the run of characters which the engine
67   * is capable of handling.
68   * @param startPos The start of the run within the supplied text.
69   * @param endPos The end of the run within the supplied text.
70   * @param foundBreaks A Vector of int32_t to receive the breaks.
71   * @return The number of breaks found.
72   */
73   virtual int32_t findBreaks( UText *text,
74                               int32_t startPos,
75                               int32_t endPos,
76                               UVector32 &foundBreaks ) const = 0;
77 
78 };
79 
80 /*******************************************************************
81  * LanguageBreakFactory
82  */
83 
84 /**
85  * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
86  * that can determine breaks for characters in a specific set, if
87  * such an object can be found.</p>
88  *
89  * <p>If a LanguageBreakFactory is to be shared between threads,
90  * appropriate synchronization must be used; there is none internal
91  * to the factory.</p>
92  *
93  * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
94  * normally be shared between threads without synchronization, unless
95  * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
96  *
97  * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
98  * it returns when it itself is deleted, unless the specific subclass of
99  * LanguageBreakFactory indicates otherwise. Naturally, the factory should
100  * not be deleted until the LanguageBreakEngines it has returned are no
101  * longer needed.</p>
102  */
103 class LanguageBreakFactory : public UMemory {
104  public:
105 
106   /**
107    * <p>Default constructor.</p>
108    *
109    */
110   LanguageBreakFactory();
111 
112   /**
113    * <p>Virtual destructor.</p>
114    */
115   virtual ~LanguageBreakFactory();
116 
117  /**
118   * <p>Find and return a LanguageBreakEngine that can find the desired
119   * kind of break for the set of characters to which the supplied
120   * character belongs. It is up to the set of available engines to
121   * determine what the sets of characters are.</p>
122   *
123   * @param c A character that begins a run for which a LanguageBreakEngine is
124   * sought.
125   * @return A LanguageBreakEngine with the desired characteristics, or 0.
126   */
127   virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
128 
129 };
130 
131 /*******************************************************************
132  * UnhandledEngine
133  */
134 
135 /**
136  * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
137  * handles characters that no other LanguageBreakEngine is available to
138  * handle. It is told the character and the type of break; at its
139  * discretion it may handle more than the specified character (e.g.,
 * the entire script to which that character belongs).</p>
141  *
142  * <p>UnhandledEngines may not be shared between threads without
143  * external synchronization.</p>
144  */
145 
146 class UnhandledEngine : public LanguageBreakEngine {
147  private:
148 
149     /**
150      * The sets of characters handled.
151      * @internal
152      */
153 
154   UnicodeSet    *fHandled;
155 
156  public:
157 
158   /**
159    * <p>Default constructor.</p>
160    *
161    */
162   UnhandledEngine(UErrorCode &status);
163 
164   /**
165    * <p>Virtual destructor.</p>
166    */
167   virtual ~UnhandledEngine();
168 
169  /**
170   * <p>Indicate whether this engine handles a particular character for
171   * a particular kind of break.</p>
172   *
173   * @param c A character which begins a run that the engine might handle
174   * @return TRUE if this engine handles the particular character and break
175   * type.
176   */
177   virtual UBool handles(UChar32 c) const;
178 
179  /**
180   * <p>Find any breaks within a run in the supplied text.</p>
181   *
182   * @param text A UText representing the text (TODO: UText). The
183   * iterator is left at the end of the run of characters which the engine
184   * is capable of handling.
185   * @param startPos The start of the run within the supplied text.
186   * @param endPos The end of the run within the supplied text.
187   * @param foundBreaks An allocated C array of the breaks found, if any
188   * @return The number of breaks found.
189   */
190   virtual int32_t findBreaks( UText *text,
191                               int32_t startPos,
192                               int32_t endPos,
193                               UVector32 &foundBreaks ) const;
194 
195  /**
196   * <p>Tell the engine to handle a particular character and break type.</p>
197   *
198   * @param c A character which the engine should handle
199   */
200   virtual void handleCharacter(UChar32 c);
201 
202 };
203 
204 /*******************************************************************
205  * ICULanguageBreakFactory
206  */
207 
208 /**
209  * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
210  * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
211  * data in the ICU data file.</p>
212  */
213 class ICULanguageBreakFactory : public LanguageBreakFactory {
214  private:
215 
216     /**
217      * The stack of break engines created by this factory
218      * @internal
219      */
220 
221   UStack    *fEngines;
222 
223  public:
224 
225   /**
226    * <p>Standard constructor.</p>
227    *
228    */
229   ICULanguageBreakFactory(UErrorCode &status);
230 
231   /**
232    * <p>Virtual destructor.</p>
233    */
234   virtual ~ICULanguageBreakFactory();
235 
236  /**
237   * <p>Find and return a LanguageBreakEngine that can find the desired
238   * kind of break for the set of characters to which the supplied
239   * character belongs. It is up to the set of available engines to
240   * determine what the sets of characters are.</p>
241   *
242   * @param c A character that begins a run for which a LanguageBreakEngine is
243   * sought.
244   * @return A LanguageBreakEngine with the desired characteristics, or 0.
245   */
246   virtual const LanguageBreakEngine *getEngineFor(UChar32 c);
247 
248 protected:
249  /**
250   * <p>Create a LanguageBreakEngine for the set of characters to which
251   * the supplied character belongs, for the specified break type.</p>
252   *
253   * @param c A character that begins a run for which a LanguageBreakEngine is
254   * sought.
255   * @return A LanguageBreakEngine with the desired characteristics, or 0.
256   */
257   virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
258 
259   /**
260    * <p>Create a DictionaryMatcher for the specified script and break type.</p>
261    * @param script An ISO 15924 script code that identifies the dictionary to be
262    * created.
263    * @return A DictionaryMatcher with the desired characteristics, or NULL.
264    */
265   virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
266 };
267 
268 U_NAMESPACE_END
269 
270     /* BRKENG_H */
271 #endif
272