1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 1996-2015, International Business Machines Corporation and others.
6 * All Rights Reserved.
7 ******************************************************************************
8 */
9 
10 #ifndef UBRK_H
11 #define UBRK_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uloc.h"
15 #include "unicode/utext.h"
16 #include "unicode/localpointer.h"
17 
18 /**
19  * A text-break iterator, for usage in C programs.
20  *  The UBRK_TYPEDEF_UBREAK_ITERATOR guard below prevents a repeated typedef.
21  */
22 #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
23 #   define UBRK_TYPEDEF_UBREAK_ITERATOR
24     /**
25      *  Opaque type representing an ICU Break iterator object.
26      *  @stable ICU 2.0
27      */
28     typedef struct UBreakIterator UBreakIterator;
29 #endif
30 
31 #if !UCONFIG_NO_BREAK_ITERATION
32 
33 #include "unicode/parseerr.h"
34 
35 /**
36  * \file
37  * \brief C API: BreakIterator
38  *
39  * <h2> BreakIterator C API </h2>
40  *
41  * The BreakIterator C API defines methods for finding the location
42  * of boundaries in text. A UBreakIterator maintains a
43  * current position and scans over text, returning the index of characters
44  * where boundaries occur.
45  * <p>
46  * Line boundary analysis determines where a text string can be broken
47  * when line-wrapping. The mechanism correctly handles punctuation and
48  * hyphenated words.
49  * <p>
50  * Note: The locale keyword "lb" can be used to modify line break
51  * behavior according to the CSS level 3 line-break options, see
52  * <http://dev.w3.org/csswg/css-text/#line-breaking>. For example:
53  * "ja@lb=strict", "zh@lb=loose".
54  * <p>
55  * Sentence boundary analysis allows selection with correct
56  * interpretation of periods within numbers and abbreviations, and
57  * trailing punctuation marks such as quotation marks and parentheses.
58  * <p>
59  * Note: The locale keyword "ss" can be used to enable use of
60  * segmentation suppression data (preventing breaks in English after
61  * abbreviations such as "Mr." or "Est.", for example), as follows:
62  * "en@ss=standard".
63  * <p>
64  * Word boundary analysis is used by search and replace functions, as
65  * well as within text editing applications that allow the user to
66  * select words with a double click. Word selection provides correct
67  * interpretation of punctuation marks within and following
68  * words. Characters that are not part of a word, such as symbols or
69  * punctuation marks, have word-breaks on both sides.
70  * <p>
71  * Character boundary analysis identifies the boundaries of
72  * "Extended Grapheme Clusters", which are groupings of codepoints
73  * that should be treated as character-like units for many text operations.
74  * Please see Unicode Standard Annex #29, Unicode Text Segmentation,
75  * http://www.unicode.org/reports/tr29/ for additional information
76  * on grapheme clusters and guidelines on their use.
77  * <p>
78  * Title boundary analysis locates all positions,
79  * typically starts of words, that should be set to Title Case
80  * when title casing the text.
81  * <p>
82  * The text boundary positions are found according to the rules
83  * described in Unicode Standard Annex #14, Unicode Line Breaking Algorithm,
84  * and Unicode Standard Annex #29, Unicode Text Segmentation.  These
85  * are available at http://www.unicode.org/reports/tr14/ and
86  * http://www.unicode.org/reports/tr29/.
87  * <p>
88  * In addition to the plain C API defined in this header file, an
89  * object oriented C++ API with equivalent functionality is defined in the
90  * file brkiter.h.
91  * <p>
92  * Code snippets illustrating the use of the Break Iterator APIs
93  * are available in the ICU User Guide,
94  * http://icu-project.org/userguide/boundaryAnalysis.html
95  * and in the sample program icu/source/samples/break/break.cpp
96  */
97 
98 /** The kinds of text boundary a UBreakIterator can locate; passed to ubrk_open().  @stable ICU 2.0 */
99 typedef enum UBreakIteratorType {
100   /** Character breaks  @stable ICU 2.0 */
101   UBRK_CHARACTER = 0,
102   /** Word breaks @stable ICU 2.0 */
103   UBRK_WORD = 1,
104   /** Line breaks @stable ICU 2.0 */
105   UBRK_LINE = 2,
106   /** Sentence breaks @stable ICU 2.0 */
107   UBRK_SENTENCE = 3,
108 
109 #ifndef U_HIDE_DEPRECATED_API
110   /**
111    * Title Case breaks.
112    * The iterator created using this type locates title boundaries as described for
113    * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
114    * please use the word break iterator (UBRK_WORD).
115    *
116    * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
117    */
118   UBRK_TITLE = 4,
119     /**
120      * One more than the highest normal UBreakIteratorType value.
121      * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
122      */
123     UBRK_COUNT = 5
124 #endif  /* U_HIDE_DEPRECATED_API */
125 } UBreakIteratorType;
126 
127 /** Value returned by the iteration functions (for example ubrk_next) once all text boundaries have been returned.
128  *  @stable ICU 2.0
129  */
130 #define UBRK_DONE ((int32_t) -1)
131 
132 
133 /**
134  *  Enum constants for the word break tags returned by
135  *  ubrk_getRuleStatus().  A range of values is defined for each category of
136  *  word, to allow for further subdivisions of a category in future releases.
137  *  Applications should check for tag values falling within the range, rather
138  *  than for single individual values.
139  *
140  * The numeric values of all of these constants are stable (will not change).
141  *
142  * @stable ICU 2.2
143 */
144 typedef enum UWordBreak {
145     /** Tag value for "words" that do not fit into any of the other categories.
146      *  Includes spaces and most punctuation. */
147     UBRK_WORD_NONE           = 0,
148     /** Upper bound for tags for uncategorized words. */
149     UBRK_WORD_NONE_LIMIT     = 100,
150     /** Tag value for words that appear to be numbers, lower limit.    */
151     UBRK_WORD_NUMBER         = 100,
152     /** Tag value for words that appear to be numbers, upper limit.    */
153     UBRK_WORD_NUMBER_LIMIT   = 200,
154     /** Tag value for words that contain letters, excluding
155      *  hiragana, katakana or ideographic characters, lower limit.    */
156     UBRK_WORD_LETTER         = 200,
157     /** Tag value for words containing letters, upper limit.  */
158     UBRK_WORD_LETTER_LIMIT   = 300,
159     /** Tag value for words containing kana characters, lower limit. */
160     UBRK_WORD_KANA           = 300,
161     /** Tag value for words containing kana characters, upper limit. */
162     UBRK_WORD_KANA_LIMIT     = 400,
163     /** Tag value for words containing ideographic characters, lower limit. */
164     UBRK_WORD_IDEO           = 400,
165     /** Tag value for words containing ideographic characters, upper limit. */
166     UBRK_WORD_IDEO_LIMIT     = 500
167 } UWordBreak;
168 
169 /**
170  *  Enum constants for the line break tags returned by ubrk_getRuleStatus().
171  *  A range of values is defined for each category of
172  *  line break, to allow for further subdivisions of a category in future releases.
173  *  Applications should check for tag values falling within the range, rather
174  *  than for single individual values.
175  *
176  * The numeric values of all of these constants are stable (will not change).
177  *
178  * @stable ICU 2.8
179 */
180 typedef enum ULineBreakTag {
181     /** Tag value for soft line breaks, positions at which a line break
182       *  is acceptable but not required.               */
183     UBRK_LINE_SOFT            = 0,
184     /** Upper bound for soft line breaks.              */
185     UBRK_LINE_SOFT_LIMIT      = 100,
186     /** Tag value for a hard, or mandatory, line break. */
187     UBRK_LINE_HARD            = 100,
188     /** Upper bound for hard line breaks.              */
189     UBRK_LINE_HARD_LIMIT      = 200
190 } ULineBreakTag;
191 
192 
193 
194 /**
195  *  Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
196  *  A range of values is defined for each category of
197  *  sentence, to allow for further subdivisions of a category in future releases.
198  *  Applications should check for tag values falling within the range, rather
199  *  than for single individual values.
200  *
201  * The numeric values of all of these constants are stable (will not change).
202  *
203  * @stable ICU 2.8
204 */
205 typedef enum USentenceBreakTag {
206     /** Tag value for sentences ending with a sentence terminator
207       * ('.', '?', '!', etc.) character, possibly followed by a
208       * hard separator (CR, LF, PS, etc.)
209       */
210     UBRK_SENTENCE_TERM       = 0,
211     /** Upper bound for tags for sentences ended by sentence terminators.    */
212     UBRK_SENTENCE_TERM_LIMIT = 100,
213     /** Tag value for sentences that do not contain an ending
214       * sentence terminator ('.', '?', '!', etc.) character, but
215       * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
216       */
217     UBRK_SENTENCE_SEP        = 100,
218     /** Upper bound for tags for sentences ended by a separator.              */
219     UBRK_SENTENCE_SEP_LIMIT  = 200
220 
221 } USentenceBreakTag;
222 
223 
224 /**
225  * Open a new UBreakIterator for locating text boundaries for a specified locale.
226  * A UBreakIterator may be used for detecting character, line, word,
227  * and sentence breaks in text.
228  * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
229  * UBRK_LINE, UBRK_SENTENCE
230  * @param locale The locale specifying the text-breaking conventions. Note that
231  * locale keys such as "lb" and "ss" may be used to modify text break behavior,
232  * see the general discussion of the BreakIterator C API at the top of this file.
233  * @param text The text to be iterated over.
234  * @param textLength The number of characters in text, or -1 if null-terminated.
235  * @param status A UErrorCode to receive any errors.
236  * @return A UBreakIterator for the specified locale (see ubrk_close).
237  * @see ubrk_openRules
238  * @stable ICU 2.0
239  */
240 U_STABLE UBreakIterator* U_EXPORT2
241 ubrk_open(UBreakIteratorType type,
242       const char *locale,
243       const UChar *text,
244       int32_t textLength,
245       UErrorCode *status);
246 
247 /**
248  * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
249  * The rule syntax is not described in this header.
250  * @param rules A set of rules specifying the text breaking conventions.
251  * @param rulesLength The number of characters in rules, or -1 if null-terminated.
252  * @param text The text to be iterated over.  May be null, in which case ubrk_setText() is
253  *        used to specify the text to be iterated.
254  * @param textLength The number of characters in text, or -1 if null-terminated.
255  * @param parseErr   Receives position and context information for any syntax errors
256  *                   detected while parsing the rules.
257  * @param status A UErrorCode to receive any errors.
258  * @return A UBreakIterator for the specified rules (see ubrk_close).
259  * @see ubrk_open
260  * @stable ICU 2.2
261  */
262 U_STABLE UBreakIterator* U_EXPORT2
263 ubrk_openRules(const UChar     *rules,
264                int32_t         rulesLength,
265                const UChar     *text,
266                int32_t          textLength,
267                UParseError     *parseErr,
268                UErrorCode      *status);
269 
270 /**
271  * Thread-safe cloning operation.
272  * @param bi iterator to be cloned
273  * @param stackBuffer <em>Deprecated functionality as of ICU 52, use NULL.</em><br>
274  *  user allocated space for the new clone. If NULL new memory will be allocated.
275  *  If buffer is not large enough, new memory will be allocated.
276  *  Clients can use U_BRK_SAFECLONE_BUFFERSIZE as the buffer size.
277  * @param pBufferSize <em>Deprecated functionality as of ICU 52, use NULL or 1.</em><br>
278  *  pointer to size of allocated space.
279  *  If *pBufferSize == 0, a sufficient size for use in cloning will
280  *  be returned ('pre-flighting').
281  *  If *pBufferSize is not enough for a stack-based safe clone,
282  *  new memory will be allocated.
283  * @param status A UErrorCode indicating whether the operation succeeded or encountered errors.
284  *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
285  * @return pointer to the new clone
286  * @stable ICU 2.0
287  */
288 U_STABLE UBreakIterator * U_EXPORT2
289 ubrk_safeClone(
290           const UBreakIterator *bi,
291           void *stackBuffer,
292           int32_t *pBufferSize,
293           UErrorCode *status);
294 
295 #ifndef U_HIDE_DEPRECATED_API
296 
297 /**
298   * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
299   * @deprecated ICU 52. Do not rely on ubrk_safeClone() cloning into any provided buffer.
300   */
301 #define U_BRK_SAFECLONE_BUFFERSIZE 1
302 
303 #endif /* U_HIDE_DEPRECATED_API */
304 
305 /**
306  * Close a UBreakIterator.
307  * Once closed, a UBreakIterator may no longer be used.
308  * @param bi The break iterator to close.
309  * @stable ICU 2.0
310  */
311 U_STABLE void U_EXPORT2
312 ubrk_close(UBreakIterator *bi);
313 
314 #if U_SHOW_CPLUSPLUS_API
315 
316 U_NAMESPACE_BEGIN
317 
318 /**
319  * \class LocalUBreakIteratorPointer
320  * "Smart pointer" class that closes a UBreakIterator via ubrk_close().
321  * For most methods see the LocalPointerBase base class.
322  *
323  * @see LocalPointerBase
324  * @see LocalPointer
325  * @stable ICU 4.4
326  */
327 U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close);
328 
329 U_NAMESPACE_END
330 
331 #endif /* U_SHOW_CPLUSPLUS_API */
332 
333 /**
334  * Sets an existing iterator to point to a new piece of text.
335  * The break iterator retains a pointer to the supplied text.
336  * The caller must not modify or delete the text while the BreakIterator
337  * retains the reference.
338  *
339  * @param bi The break iterator to use.
340  * @param text The text to be set.
341  * @param textLength The length of the text.
342  * @param status A UErrorCode to receive any errors.
343  * @stable ICU 2.0
344  */
345 U_STABLE void U_EXPORT2
346 ubrk_setText(UBreakIterator* bi,
347              const UChar*    text,
348              int32_t         textLength,
349              UErrorCode*     status);
350 
351 
352 /**
353  * Sets an existing iterator to point to a new piece of text.
354  *
355  * All index positions returned by break iterator functions are
356  * native indices from the UText. For example, when breaking UTF-8
357  * encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc.
358  * will be UTF-8 string indices, not UTF-16 positions.
359  *
360  * @param bi The break iterator to use.
361  * @param text The text to be set.
362  *             This function makes a shallow clone of the supplied UText.  This means
363  *             that the caller is free to immediately close or otherwise reuse the
364  *             UText that was passed as a parameter, but that the underlying text itself
365  *             must not be altered while being referenced by the break iterator.
366  * @param status A UErrorCode to receive any errors.
367  * @stable ICU 3.4
368  */
369 U_STABLE void U_EXPORT2
370 ubrk_setUText(UBreakIterator* bi,
371              UText*          text,
372              UErrorCode*     status);
373 
374 
375 
376 /**
377  * Determine the most recently returned text boundary.
378  *
379  * @param bi The break iterator to use.
380  * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
381  * \ref ubrk_first, or \ref ubrk_last.
382  * @stable ICU 2.0
383  */
384 U_STABLE int32_t U_EXPORT2
385 ubrk_current(const UBreakIterator *bi);
386 
387 /**
388  * Advance the iterator to the boundary following the current boundary.
389  *
390  * @param bi The break iterator to use.
391  * @return The character index of the next text boundary, or UBRK_DONE (-1)
392  * if all text boundaries have been returned.
393  * @see ubrk_previous
394  * @stable ICU 2.0
395  */
396 U_STABLE int32_t U_EXPORT2
397 ubrk_next(UBreakIterator *bi);
398 
399 /**
400  * Set the iterator position to the boundary preceding the current boundary.
401  *
402  * @param bi The break iterator to use.
403  * @return The character index of the preceding text boundary, or UBRK_DONE (-1)
404  * if all text boundaries have been returned.
405  * @see ubrk_next
406  * @stable ICU 2.0
407  */
408 U_STABLE int32_t U_EXPORT2
409 ubrk_previous(UBreakIterator *bi);
410 
411 /**
412  * Set the iterator position to zero, the start of the text being scanned.
413  * @param bi The break iterator to use.
414  * @return The new iterator position, which is zero.
415  * @see ubrk_last
416  * @stable ICU 2.0
417  */
418 U_STABLE int32_t U_EXPORT2
419 ubrk_first(UBreakIterator *bi);
420 
421 /**
422  * Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned.
423  * This is not the same as the index of the last character.
424  * @param bi The break iterator to use.
425  * @return The character offset immediately <EM>beyond</EM> the last character in the
426  * text being scanned.
427  * @see ubrk_first
428  * @stable ICU 2.0
429  */
430 U_STABLE int32_t U_EXPORT2
431 ubrk_last(UBreakIterator *bi);
432 
433 /**
434  * Set the iterator position to the first boundary preceding the specified offset.
435  * The new position is always less than offset, or is UBRK_DONE (-1).
436  * @param bi The break iterator to use.
437  * @param offset The offset at which to begin scanning.
438  * @return The text boundary preceding offset, or UBRK_DONE.
439  * @see ubrk_following
440  * @stable ICU 2.0
441  */
442 U_STABLE int32_t U_EXPORT2
443 ubrk_preceding(UBreakIterator *bi,
444            int32_t offset);
445 
446 /**
447  * Advance the iterator to the first boundary following the specified offset.
448  * The value returned is always greater than offset, or is UBRK_DONE (-1).
449  * @param bi The break iterator to use.
450  * @param offset The offset at which to begin scanning.
451  * @return The text boundary following offset, or UBRK_DONE.
452  * @see ubrk_preceding
453  * @stable ICU 2.0
454  */
455 U_STABLE int32_t U_EXPORT2
456 ubrk_following(UBreakIterator *bi,
457            int32_t offset);
458 
459 /**
460 * Get a locale for which text breaking information is available.
461 * A UBreakIterator in a locale returned by this function will perform the correct
462 * text breaking for the locale.
463 * @param index The index of the desired locale.
464 * @return A locale for which text breaking information is available, or 0 if none.
465 * @see ubrk_countAvailable
466 * @stable ICU 2.0
467 */
468 U_STABLE const char* U_EXPORT2
469 ubrk_getAvailable(int32_t index);
470 
471 /**
472 * Determine how many locales have text breaking information available.
473 * This function is most useful for determining the loop ending condition for
474 * calls to \ref ubrk_getAvailable.
475 * @return The number of locales for which text breaking information is available.
476 * @see ubrk_getAvailable
477 * @stable ICU 2.0
478 */
479 U_STABLE int32_t U_EXPORT2
480 ubrk_countAvailable(void);
481 
482 
483 /**
484 * Returns true if the specified position is a boundary position.  As a side
485 * effect, leaves the iterator pointing to the first boundary position at
486 * or after "offset".
487 * @param bi The break iterator to use.
488 * @param offset the offset to check.
489 * @return True if "offset" is a boundary position.
490 * @stable ICU 2.0
491 */
492 U_STABLE  UBool U_EXPORT2
493 ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
494 
495 /**
496  * Return the status from the break rule that determined the most recently
497  * returned break position.  The values appear in the rule source within
498  * brackets, {123}, for example; rules that do not specify a status yield 0.
499  * For word break iterators, the possible values are defined in enum UWordBreak.
500  * @param bi The break iterator to use.
501  * @return The rule status value for the most recently returned break position.
502  * @stable ICU 2.2
503  */
504 U_STABLE  int32_t U_EXPORT2
505 ubrk_getRuleStatus(UBreakIterator *bi);
506 
507 /**
508  * Get the statuses from the break rules that determined the most recently
509  * returned break position.  The values appear in the rule source
510  * within brackets, {123}, for example.  The default status value for rules
511  * that do not explicitly provide one is zero.
512  * <p>
513  * For word break iterators, the possible values are defined in enum UWordBreak.
514  * @param bi        The break iterator to use
515  * @param fillInVec an array to be filled in with the status values.
516  * @param capacity  the length of the supplied vector.  A length of zero causes
517  *                  the function to return the number of status values, in the
518  *                  normal way, without attempting to store any values.
519  * @param status    receives error codes.
520  * @return          The number of rule status values from rules that determined
521  *                  the most recent boundary returned by the break iterator.
522  * @stable ICU 3.0
523  */
524 U_STABLE  int32_t U_EXPORT2
525 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
526 
527 /**
528  * Return the locale of the break iterator. You can choose between the valid and
529  * the actual locale.
530  * @param bi The break iterator to use.
531  * @param type The locale type to return (valid or actual).
532  * @param status A UErrorCode to receive any errors.
533  * @return The locale string.
534  * @stable ICU 2.8
535  */
536 U_STABLE const char* U_EXPORT2
537 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
538 
539 /**
540   *  Set the subject text string upon which the break iterator is operating
541   *  without changing any other aspect of the state.
542   *  The new and previous text strings must have the same content.
543   *
544   *  This function is intended for use in environments where ICU is operating on
545   *  strings that may move around in memory.  It provides a mechanism for notifying
546   *  ICU that the string has been relocated, and providing a new UText to access the
547   *  string in its new position.
548   *
549   *  Note that the break iterator never copies the underlying text
550   *  of a string being processed, but always operates directly on the original text
551   *  provided by the user. Refreshing simply drops the references to the old text
552   *  and replaces them with references to the new.
553   *
554   *  Caution:  this function is normally used only by very specialized
555   *            system-level code.   One example use case is a garbage collector
556   *            that moves the text in memory.
557   *
558   * @param bi         The break iterator to refresh.
559   * @param text       The new (moved) text string.
560   * @param status     Receives errors detected by this function.
561   *
562   * @stable ICU 49
563   */
564 U_STABLE void U_EXPORT2
565 ubrk_refreshUText(UBreakIterator *bi,
566                        UText          *text,
567                        UErrorCode     *status);
568 
569 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
570 
571 #endif
572