1 /*
2  *******************************************************************************
3  * Copyright (C) 1996-2015, International Business Machines Corporation and    *
4  * others. All Rights Reserved.                                                *
5  *******************************************************************************
6  */
7 
8 package com.ibm.icu.text;
9 
10 import java.lang.ref.SoftReference;
11 import java.text.CharacterIterator;
12 import java.text.StringCharacterIterator;
13 import java.util.Locale;
14 import java.util.MissingResourceException;
15 
16 import com.ibm.icu.impl.ICUDebug;
17 import com.ibm.icu.util.ICUCloneNotSupportedException;
18 import com.ibm.icu.util.ULocale;
19 
20 /**
21  * {@icuenhanced java.text.BreakIterator}.{@icu _usage_}
22  *
23  * <p>A class that locates boundaries in text.  This class defines a protocol for
24  * objects that break up a piece of natural-language text according to a set
25  * of criteria.  Instances or subclasses of BreakIterator can be provided, for
26  * example, to break a piece of text into words, sentences, or logical characters
27  * according to the conventions of some language or group of languages.
28  *
29  * We provide five built-in types of BreakIterator:
30  * <ul><li>getTitleInstance() returns a BreakIterator that locates boundaries
31  * between title breaks.
32  * <li>getSentenceInstance() returns a BreakIterator that locates boundaries
33  * between sentences.  This is useful for triple-click selection, for example.
34  * <li>getWordInstance() returns a BreakIterator that locates boundaries between
35  * words.  This is useful for double-click selection or "find whole words" searches.
36  * This type of BreakIterator makes sure there is a boundary position at the
37  * beginning and end of each legal word.  (Numbers count as words, too.)  Whitespace
38  * and punctuation are kept separate from real words.
39  * <li>getLineInstance() returns a BreakIterator that locates positions where it is
40  * legal for a text editor to wrap lines.  This is similar to word breaking, but
41  * not the same: punctuation and whitespace are generally kept with words (you don't
42  * want a line to start with whitespace, for example), and some special characters
43  * can force a position to be considered a line-break position or prevent a position
44  * from being a line-break position.
45  * <li>getCharacterInstance() returns a BreakIterator that locates boundaries between
46  * logical characters.  Because of the structure of the Unicode encoding, a logical
47  * character may be stored internally as more than one Unicode code point.  (A with an
48  * umlaut may be stored as an a followed by a separate combining umlaut character,
49  * for example, but the user still thinks of it as one character.)  This iterator allows
50  * various processes (especially text editors) to treat as characters the units of text
51  * that a user would think of as characters, rather than the units of text that the
52  * computer sees as "characters".</ul>
53  * The text boundary positions are found according to the rules
54  * described in Unicode Standard Annex #29, Text Boundaries, and
55  * Unicode Standard Annex #14, Line Breaking Properties.  These
56  * are available at http://www.unicode.org/reports/tr14/ and
57  * http://www.unicode.org/reports/tr29/.
58  * <p>
59  * BreakIterator's interface follows an "iterator" model (hence the name), meaning it
60  * has a concept of a "current position" and methods like first(), last(), next(),
61  * and previous() that update the current position.  All BreakIterators uphold the
62  * following invariants:
63  * <ul><li>The beginning and end of the text are always treated as boundary positions.
64  * <li>The current position of the iterator is always a boundary position (random-
65  * access methods move the iterator to the nearest boundary position before or
66  * after the specified position, not _to_ the specified position).
67  * <li>DONE is used as a flag to indicate when iteration has stopped.  DONE is only
68  * returned when the current position is the end of the text and the user calls next(),
69  * or when the current position is the beginning of the text and the user calls
70  * previous().
71  * <li>Break positions are numbered by the positions of the characters that follow
72  * them.  Thus, under normal circumstances, the position before the first character
73  * is 0, the position after the first character is 1, and the position after the
 * last character is the length of the string.
75  * <li>The client can change the position of an iterator, or the text it analyzes,
76  * at will, but cannot change the behavior.  If the user wants different behavior, he
77  * must instantiate a new iterator.</ul>
78  *
79  * BreakIterator accesses the text it analyzes through a CharacterIterator, which makes
80  * it possible to use BreakIterator to analyze text in any text-storage vehicle that
81  * provides a CharacterIterator interface.
82  *
83  * <b>Note:</b>  Some types of BreakIterator can take a long time to create, and
84  * instances of BreakIterator are not currently cached by the system.  For
85  * optimal performance, keep instances of BreakIterator around as long as makes
86  * sense.  For example, when word-wrapping a document, don't create and destroy a
87  * new BreakIterator for each line.  Create one break iterator for the whole document
88  * (or whatever stretch of text you're wrapping) and use it to do the whole job of
89  * wrapping the text.
90  *
91   * <P>
92  * <strong>Examples</strong>:<P>
93  * Creating and using text boundaries
94  * <blockquote>
95  * <pre>
96  * public static void main(String args[]) {
97  *      if (args.length == 1) {
98  *          String stringToExamine = args[0];
99  *          //print each word in order
100  *          BreakIterator boundary = BreakIterator.getWordInstance();
101  *          boundary.setText(stringToExamine);
102  *          printEachForward(boundary, stringToExamine);
103  *          //print each sentence in reverse order
104  *          boundary = BreakIterator.getSentenceInstance(Locale.US);
105  *          boundary.setText(stringToExamine);
106  *          printEachBackward(boundary, stringToExamine);
107  *          printFirst(boundary, stringToExamine);
108  *          printLast(boundary, stringToExamine);
109  *      }
110  * }
111  * </pre>
112  * </blockquote>
113  *
114  * Print each element in order
115  * <blockquote>
116  * <pre>
117  * public static void printEachForward(BreakIterator boundary, String source) {
118  *     int start = boundary.first();
119  *     for (int end = boundary.next();
120  *          end != BreakIterator.DONE;
121  *          start = end, end = boundary.next()) {
122  *          System.out.println(source.substring(start,end));
123  *     }
124  * }
125  * </pre>
126  * </blockquote>
127  *
128  * Print each element in reverse order
129  * <blockquote>
130  * <pre>
131  * public static void printEachBackward(BreakIterator boundary, String source) {
132  *     int end = boundary.last();
133  *     for (int start = boundary.previous();
134  *          start != BreakIterator.DONE;
135  *          end = start, start = boundary.previous()) {
136  *         System.out.println(source.substring(start,end));
137  *     }
138  * }
139  * </pre>
140  * </blockquote>
141  *
142  * Print first element
143  * <blockquote>
144  * <pre>
145  * public static void printFirst(BreakIterator boundary, String source) {
146  *     int start = boundary.first();
147  *     int end = boundary.next();
148  *     System.out.println(source.substring(start,end));
149  * }
150  * </pre>
151  * </blockquote>
152  *
153  * Print last element
154  * <blockquote>
155  * <pre>
156  * public static void printLast(BreakIterator boundary, String source) {
157  *     int end = boundary.last();
158  *     int start = boundary.previous();
159  *     System.out.println(source.substring(start,end));
160  * }
161  * </pre>
162  * </blockquote>
163  *
164  * Print the element at a specified position
165  * <blockquote>
166  * <pre>
167  * public static void printAt(BreakIterator boundary, int pos, String source) {
168  *     int end = boundary.following(pos);
169  *     int start = boundary.previous();
170  *     System.out.println(source.substring(start,end));
171  * }
172  * </pre>
173  * </blockquote>
174  *
175  * Find the next word
176  * <blockquote>
177  * <pre>
178  * public static int nextWordStartAfter(int pos, String text) {
179  *     BreakIterator wb = BreakIterator.getWordInstance();
180  *     wb.setText(text);
181  *     int last = wb.following(pos);
182  *     int current = wb.next();
183  *     while (current != BreakIterator.DONE) {
184  *         for (int p = last; p < current; p++) {
185  *             if (Character.isLetter(text.charAt(p)))
186  *                 return last;
187  *         }
188  *         last = current;
189  *         current = wb.next();
190  *     }
191  *     return BreakIterator.DONE;
192  * }
193  * </pre>
194  * (The iterator returned by BreakIterator.getWordInstance() is unique in that
195  * the break positions it returns don't represent both the start and end of the
196  * thing being iterated over.  That is, a sentence-break iterator returns breaks
197  * that each represent the end of one sentence and the beginning of the next.
198  * With the word-break iterator, the characters between two boundaries might be a
199  * word, or they might be the punctuation or whitespace between two words.  The
200  * above code uses a simple heuristic to determine which boundary is the beginning
201  * of a word: If the characters between this boundary and the next boundary
202  * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
203  * a Hangul syllable, a Kana character, etc.), then the text between this boundary
204  * and the next is a word; otherwise, it's the material between words.)
205  * </blockquote>
206  *
207  * @see CharacterIterator
208  * @stable ICU 2.0
209  *
210  */
211 
212 public abstract class BreakIterator implements Cloneable
213 {
214 
215     private static final boolean DEBUG = ICUDebug.enabled("breakiterator");
216 
217     /**
218      * Default constructor.  There is no state that is carried by this abstract
219      * base class.
220      * @stable ICU 2.0
221      */
BreakIterator()222     protected BreakIterator()
223     {
224     }
225 
226     /**
227      * Clone method.  Creates another BreakIterator with the same behavior and
228      * current state as this one.
229      * @return The clone.
230      * @stable ICU 2.0
231      */
clone()232     public Object clone()
233     {
234         try {
235             return super.clone();
236         }
237         catch (CloneNotSupportedException e) {
238             ///CLOVER:OFF
239             throw new ICUCloneNotSupportedException(e);
240             ///CLOVER:ON
241         }
242     }
243 
244     /**
245      * DONE is returned by previous() and next() after all valid
246      * boundaries have been returned.
247      * @stable ICU 2.0
248      */
249     public static final int DONE = -1;
250 
251     /**
252      * Set the iterator to the first boundary position.  This is always the beginning
253      * index of the text this iterator iterates over.  For example, if
254      * the iterator iterates over a whole string, this function will
255      * always return 0.
256      * @return The character offset of the beginning of the stretch of text
257      * being broken.
258      * @stable ICU 2.0
259      */
first()260     public abstract int first();
261 
262     /**
263      * Set the iterator to the last boundary position.  This is always the "past-the-end"
264      * index of the text this iterator iterates over.  For example, if the
265      * iterator iterates over a whole string (call it "text"), this function
266      * will always return text.length().
267      * @return The character offset of the end of the stretch of text
268      * being broken.
269      * @stable ICU 2.0
270      */
last()271     public abstract int last();
272 
273     /**
274      * Move the iterator by the specified number of steps in the text.
275      * A positive number moves the iterator forward; a negative number
276      * moves the iterator backwards. If this causes the iterator
277      * to move off either end of the text, this function returns DONE;
278      * otherwise, this function returns the position of the appropriate
279      * boundary.  Calling this function is equivalent to calling next() or
280      * previous() n times.
281      * @param n The number of boundaries to advance over (if positive, moves
282      * forward; if negative, moves backwards).
283      * @return The position of the boundary n boundaries from the current
284      * iteration position, or DONE if moving n boundaries causes the iterator
285      * to advance off either end of the text.
286      * @stable ICU 2.0
287      */
next(int n)288     public abstract int next(int n);
289 
290     /**
291      * Advances the iterator forward one boundary.  The current iteration
292      * position is updated to point to the next boundary position after the
293      * current position, and this is also the value that is returned.  If
294      * the current position is equal to the value returned by last(), or to
295      * DONE, this function returns DONE and sets the current position to
296      * DONE.
297      * @return The position of the first boundary position following the
298      * iteration position.
299      * @stable ICU 2.0
300      */
next()301     public abstract int next();
302 
303     /**
304      * Move the iterator backward one boundary.  The current iteration
305      * position is updated to point to the last boundary position before
306      * the current position, and this is also the value that is returned.  If
307      * the current position is equal to the value returned by first(), or to
308      * DONE, this function returns DONE and sets the current position to
309      * DONE.
310      * @return The position of the last boundary position preceding the
311      * iteration position.
312      * @stable ICU 2.0
313      */
previous()314     public abstract int previous();
315 
316     /**
317      * Sets the iterator's current iteration position to be the first
318      * boundary position following the specified position.  (Whether the
319      * specified position is itself a boundary position or not doesn't
320      * matter-- this function always moves the iteration position to the
321      * first boundary after the specified position.)  If the specified
322      * position is the past-the-end position, returns DONE.
323      * @param offset The character position to start searching from.
324      * @return The position of the first boundary position following
325      * "offset" (whether or not "offset" itself is a boundary position),
326      * or DONE if "offset" is the past-the-end offset.
327      * @stable ICU 2.0
328      */
following(int offset)329     public abstract int following(int offset);
330 
331     /**
332      * Sets the iterator's current iteration position to be the last
333      * boundary position preceding the specified position.  (Whether the
334      * specified position is itself a boundary position or not doesn't
335      * matter-- this function always moves the iteration position to the
336      * last boundary before the specified position.)  If the specified
337      * position is the starting position, returns DONE.
338      * @param offset The character position to start searching from.
339      * @return The position of the last boundary position preceding
     * "offset" (whether or not "offset" itself is a boundary position),
341      * or DONE if "offset" is the starting offset of the iterator.
342      * @stable ICU 2.0
343      */
preceding(int offset)344     public int preceding(int offset) {
345         // NOTE:  This implementation is here solely because we can't add new
346         // abstract methods to an existing class.  There is almost ALWAYS a
347         // better, faster way to do this.
348         int pos = following(offset);
349         while (pos >= offset && pos != DONE)
350             pos = previous();
351         return pos;
352     }
353 
354     /**
355      * Return true if the specified position is a boundary position.  If the
356      * function returns true, the current iteration position is set to the
357      * specified position; if the function returns false, the current
358      * iteration position is set as though following() had been called.
359      * @param offset the offset to check.
360      * @return True if "offset" is a boundary position.
361      * @stable ICU 2.0
362      */
isBoundary(int offset)363     public boolean isBoundary(int offset) {
364         // Again, this is the default implementation, which is provided solely because
365         // we couldn't add a new abstract method to an existing class.  The real
366         // implementations will usually need to do a little more work.
367         if (offset == 0) {
368             return true;
369         }
370         else
371             return following(offset - 1) == offset;
372     }
373 
374     /**
375      * Return the iterator's current position.
376      * @return The iterator's current position.
377      * @stable ICU 2.0
378      */
current()379     public abstract int current();
380 
381 
382     /**
     * Tag value for "words" that do not fit into any of the other categories.
384      * Includes spaces and most punctuation.
385      * @stable ICU 53
386      */
387     public static final int WORD_NONE           = 0;
388 
389     /**
390      * Upper bound for tags for uncategorized words.
391      * @stable ICU 53
392      */
393     public static final int WORD_NONE_LIMIT     = 100;
394 
395     /**
396      * Tag value for words that appear to be numbers, lower limit.
397      * @stable ICU 53
398      */
399     public static final int WORD_NUMBER         = 100;
400 
401     /**
402      * Tag value for words that appear to be numbers, upper limit.
403      * @stable ICU 53
404      */
405     public static final int WORD_NUMBER_LIMIT   = 200;
406 
407     /**
408      * Tag value for words that contain letters, excluding
409      * hiragana, katakana or ideographic characters, lower limit.
410      * @stable ICU 53
411      */
412     public static final int WORD_LETTER         = 200;
413 
414     /**
415      * Tag value for words containing letters, upper limit
416      * @stable ICU 53
417      */
418     public static final int WORD_LETTER_LIMIT   = 300;
419 
420     /**
421      * Tag value for words containing kana characters, lower limit
422      * @stable ICU 53
423      */
424     public static final int WORD_KANA           = 300;
425 
426     /**
427      * Tag value for words containing kana characters, upper limit
428      * @stable ICU 53
429      */
430     public static final int WORD_KANA_LIMIT     = 400;
431 
432     /**
433      * Tag value for words containing ideographic characters, lower limit
434      * @stable ICU 53
435      */
436     public static final int WORD_IDEO           = 400;
437 
438     /**
439      * Tag value for words containing ideographic characters, upper limit
440      * @stable ICU 53
441      */
442     public static final int WORD_IDEO_LIMIT     = 500;
443 
444     /**
445      * For RuleBasedBreakIterators, return the status tag from the
446      * break rule that determined the most recently
447      * returned break position.
448      * <p>
449      * For break iterator types that do not support a rule status,
450      * a default value of 0 is returned.
451      * <p>
452      * @return The status from the break rule that determined the most recently
453      *         returned break position.
454      *
455      * @stable ICU 52
456      */
457 
getRuleStatus()458     public int  getRuleStatus() {
459         return 0;
460     }
461 
462     /**
463      * For RuleBasedBreakIterators, get the status (tag) values from the break rule(s)
464      * that determined the most recently returned break position.
465      * <p>
     * For break iterator types that do not support rule status,
     * a single default status value of 0 is returned.
468      * <p>
469      * If the size  of the output array is insufficient to hold the data,
470      *  the output will be truncated to the available length.  No exception
471      *  will be thrown.
472      *
473      * @param fillInArray an array to be filled in with the status values.
474      * @return          The number of rule status values from rules that determined
475      *                  the most recent boundary returned by the break iterator.
476      *                  In the event that the array is too small, the return value
477      *                  is the total number of status values that were available,
478      *                  not the reduced number that were actually returned.
479      * @stable ICU 52
480      */
getRuleStatusVec(int[] fillInArray)481     public int getRuleStatusVec(int[] fillInArray) {
482         if (fillInArray != null && fillInArray.length > 0) {
483             fillInArray[0] = 0;
484         }
485         return 1;
486     }
487 
488     /**
489      * Returns a CharacterIterator over the text being analyzed.
490      * For at least some subclasses of BreakIterator, this is a reference
491      * to the <b>actual iterator being used</b> by the BreakIterator,
492      * and therefore, this function's return value should be treated as
493      * <tt>const</tt>.  No guarantees are made about the current position
494      * of this iterator when it is returned.  If you need to move that
495      * position to examine the text, clone this function's return value first.
496      * @return A CharacterIterator over the text being analyzed.
497      * @stable ICU 2.0
498      */
getText()499     public abstract CharacterIterator getText();
500 
501     /**
502      * Sets the iterator to analyze a new piece of text.  The new
503      * piece of text is passed in as a String, and the current
504      * iteration position is reset to the beginning of the string.
505      * (The old text is dropped.)
506      * @param newText A String containing the text to analyze with
507      * this BreakIterator.
508      * @stable ICU 2.0
509      */
setText(String newText)510     public void setText(String newText)
511     {
512         setText(new StringCharacterIterator(newText));
513     }
514 
515     /**
516      * Sets the iterator to analyze a new piece of text.  The
517      * BreakIterator is passed a CharacterIterator through which
518      * it will access the text itself.  The current iteration
519      * position is reset to the CharacterIterator's start index.
520      * (The old iterator is dropped.)
521      * @param newText A CharacterIterator referring to the text
522      * to analyze with this BreakIterator (the iterator's current
523      * position is ignored, but its other state is significant).
524      * @stable ICU 2.0
525      */
setText(CharacterIterator newText)526     public abstract void setText(CharacterIterator newText);
527 
528     /**
529      * {@icu}
530      * @stable ICU 2.4
531      */
532     public static final int KIND_CHARACTER = 0;
533     /**
534      * {@icu}
535      * @stable ICU 2.4
536      */
537     public static final int KIND_WORD = 1;
538     /**
539      * {@icu}
540      * @stable ICU 2.4
541      */
542     public static final int KIND_LINE = 2;
543     /**
544      * {@icu}
545      * @stable ICU 2.4
546      */
547     public static final int KIND_SENTENCE = 3;
548     /**
549      * {@icu}
550      * @stable ICU 2.4
551      */
552     public static final int KIND_TITLE = 4;
553 
554     /**
555      * @since ICU 2.8
556      */
557     private static final int KIND_COUNT = 5;
558 
559     private static final SoftReference<?>[] iterCache = new SoftReference<?>[5];
560 
561     /**
562      * Returns a new instance of BreakIterator that locates word boundaries.
563      * This function assumes that the text being analyzed is in the default
564      * locale's language.
565      * @return An instance of BreakIterator that locates word boundaries.
566      * @stable ICU 2.0
567      */
getWordInstance()568     public static BreakIterator getWordInstance()
569     {
570         return getWordInstance(ULocale.getDefault());
571     }
572 
573     /**
574      * Returns a new instance of BreakIterator that locates word boundaries.
575      * @param where A locale specifying the language of the text to be
576      * analyzed.
577      * @return An instance of BreakIterator that locates word boundaries.
578      * @throws NullPointerException if <code>where</code> is null.
579      * @stable ICU 2.0
580      */
getWordInstance(Locale where)581     public static BreakIterator getWordInstance(Locale where)
582     {
583         return getBreakInstance(ULocale.forLocale(where), KIND_WORD);
584     }
585 
586     /**
587      * {@icu} Returns a new instance of BreakIterator that locates word boundaries.
588      * @param where A locale specifying the language of the text to be
589      * analyzed.
590      * @return An instance of BreakIterator that locates word boundaries.
591      * @throws NullPointerException if <code>where</code> is null.
592      * @stable ICU 3.2
593      */
getWordInstance(ULocale where)594     public static BreakIterator getWordInstance(ULocale where)
595     {
596         return getBreakInstance(where, KIND_WORD);
597     }
598 
599     /**
600      * Returns a new instance of BreakIterator that locates legal line-
601      * wrapping positions.  This function assumes the text being broken
602      * is in the default locale's language.
603      * @return A new instance of BreakIterator that locates legal
604      * line-wrapping positions.
605      * @stable ICU 2.0
606      */
getLineInstance()607     public static BreakIterator getLineInstance()
608     {
609         return getLineInstance(ULocale.getDefault());
610     }
611 
612     /**
613      * Returns a new instance of BreakIterator that locates legal line-
614      * wrapping positions.
615      * @param where A Locale specifying the language of the text being broken.
616      * @return A new instance of BreakIterator that locates legal
617      * line-wrapping positions.
618      * @throws NullPointerException if <code>where</code> is null.
619      * @stable ICU 2.0
620      */
getLineInstance(Locale where)621     public static BreakIterator getLineInstance(Locale where)
622     {
623         return getBreakInstance(ULocale.forLocale(where), KIND_LINE);
624     }
625 
626     /**
627      * {@icu} Returns a new instance of BreakIterator that locates legal line-
628      * wrapping positions.
629      * @param where A Locale specifying the language of the text being broken.
630      * @return A new instance of BreakIterator that locates legal
631      * line-wrapping positions.
632      * @throws NullPointerException if <code>where</code> is null.
633      * @stable ICU 3.2
634      */
getLineInstance(ULocale where)635     public static BreakIterator getLineInstance(ULocale where)
636     {
637         return getBreakInstance(where, KIND_LINE);
638     }
639 
640     /**
641      * Returns a new instance of BreakIterator that locates logical-character
642      * boundaries.  This function assumes that the text being analyzed is
643      * in the default locale's language.
644      * @return A new instance of BreakIterator that locates logical-character
645      * boundaries.
646      * @stable ICU 2.0
647      */
getCharacterInstance()648     public static BreakIterator getCharacterInstance()
649     {
650         return getCharacterInstance(ULocale.getDefault());
651     }
652 
653     /**
654      * Returns a new instance of BreakIterator that locates logical-character
655      * boundaries.
656      * @param where A Locale specifying the language of the text being analyzed.
657      * @return A new instance of BreakIterator that locates logical-character
658      * boundaries.
659      * @throws NullPointerException if <code>where</code> is null.
660      * @stable ICU 2.0
661      */
getCharacterInstance(Locale where)662     public static BreakIterator getCharacterInstance(Locale where)
663     {
664         return getBreakInstance(ULocale.forLocale(where), KIND_CHARACTER);
665     }
666 
667     /**
668      * {@icu} Returns a new instance of BreakIterator that locates logical-character
669      * boundaries.
670      * @param where A Locale specifying the language of the text being analyzed.
671      * @return A new instance of BreakIterator that locates logical-character
672      * boundaries.
673      * @throws NullPointerException if <code>where</code> is null.
674      * @stable ICU 3.2
675      */
getCharacterInstance(ULocale where)676     public static BreakIterator getCharacterInstance(ULocale where)
677     {
678         return getBreakInstance(where, KIND_CHARACTER);
679     }
680 
681     /**
682      * Returns a new instance of BreakIterator that locates sentence boundaries.
683      * This function assumes the text being analyzed is in the default locale's
684      * language.
685      * @return A new instance of BreakIterator that locates sentence boundaries.
686      * @stable ICU 2.0
687      */
getSentenceInstance()688     public static BreakIterator getSentenceInstance()
689     {
690         return getSentenceInstance(ULocale.getDefault());
691     }
692 
693     /**
694      * Returns a new instance of BreakIterator that locates sentence boundaries.
695      * @param where A Locale specifying the language of the text being analyzed.
696      * @return A new instance of BreakIterator that locates sentence boundaries.
697      * @throws NullPointerException if <code>where</code> is null.
698      * @stable ICU 2.0
699      */
getSentenceInstance(Locale where)700     public static BreakIterator getSentenceInstance(Locale where)
701     {
702         return getBreakInstance(ULocale.forLocale(where), KIND_SENTENCE);
703     }
704 
705     /**
706      * {@icu} Returns a new instance of BreakIterator that locates sentence boundaries.
707      * @param where A Locale specifying the language of the text being analyzed.
708      * @return A new instance of BreakIterator that locates sentence boundaries.
709      * @throws NullPointerException if <code>where</code> is null.
710      * @stable ICU 3.2
711      */
getSentenceInstance(ULocale where)712     public static BreakIterator getSentenceInstance(ULocale where)
713     {
714         return getBreakInstance(where, KIND_SENTENCE);
715     }
716 
717     /**
718      * {@icu} Returns a new instance of BreakIterator that locates title boundaries.
719      * This function assumes the text being analyzed is in the default locale's
720      * language. The iterator returned locates title boundaries as described for
721      * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
722      * please use a word boundary iterator. {@link #getWordInstance}
723      * @return A new instance of BreakIterator that locates title boundaries.
724      * @stable ICU 2.0
725      */
getTitleInstance()726     public static BreakIterator getTitleInstance()
727     {
728         return getTitleInstance(ULocale.getDefault());
729     }
730 
731     /**
732      * {@icu} Returns a new instance of BreakIterator that locates title boundaries.
733      * The iterator returned locates title boundaries as described for
734      * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
     * please use a word boundary iterator. {@link #getWordInstance}
736      * @param where A Locale specifying the language of the text being analyzed.
737      * @return A new instance of BreakIterator that locates title boundaries.
738      * @throws NullPointerException if <code>where</code> is null.
739      * @stable ICU 2.0
740      */
getTitleInstance(Locale where)741     public static BreakIterator getTitleInstance(Locale where)
742     {
743         return getBreakInstance(ULocale.forLocale(where), KIND_TITLE);
744     }
745 
746     /**
747      * {@icu} Returns a new instance of BreakIterator that locates title boundaries.
748      * The iterator returned locates title boundaries as described for
749      * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
750      * please use Word Boundary iterator.{@link #getWordInstance}
751      * @param where A Locale specifying the language of the text being analyzed.
752      * @return A new instance of BreakIterator that locates title boundaries.
753      * @throws NullPointerException if <code>where</code> is null.
754      * @stable ICU 3.2
755 s     */
getTitleInstance(ULocale where)756     public static BreakIterator getTitleInstance(ULocale where)
757     {
758         return getBreakInstance(where, KIND_TITLE);
759     }
760 
761     /**
762      * {@icu} Registers a new break iterator of the indicated kind, to use in the given
763      * locale.  Clones of the iterator will be returned if a request for a break iterator
764      * of the given kind matches or falls back to this locale.
765      *
766      * <p>Because ICU may choose to cache BreakIterator objects internally, this must
767      * be called at application startup, prior to any calls to
768      * BreakIterator.getInstance to avoid undefined behavior.
769      *
770      * @param iter the BreakIterator instance to adopt.
771      * @param locale the Locale for which this instance is to be registered
772      * @param kind the type of iterator for which this instance is to be registered
773      * @return a registry key that can be used to unregister this instance
774      * @stable ICU 2.4
775      */
registerInstance(BreakIterator iter, Locale locale, int kind)776     public static Object registerInstance(BreakIterator iter, Locale locale, int kind) {
777         return registerInstance(iter, ULocale.forLocale(locale), kind);
778     }
779 
780     /**
781      * {@icu} Registers a new break iterator of the indicated kind, to use in the given
782      * locale.  Clones of the iterator will be returned if a request for a break iterator
783      * of the given kind matches or falls back to this locale.
784      *
785      * <p>Because ICU may choose to cache BreakIterator objects internally, this must
786      * be called at application startup, prior to any calls to
787      * BreakIterator.getInstance to avoid undefined behavior.
788      *
789      * @param iter the BreakIterator instance to adopt.
790      * @param locale the Locale for which this instance is to be registered
791      * @param kind the type of iterator for which this instance is to be registered
792      * @return a registry key that can be used to unregister this instance
793      * @stable ICU 3.2
794      */
registerInstance(BreakIterator iter, ULocale locale, int kind)795     public static Object registerInstance(BreakIterator iter, ULocale locale, int kind) {
796         // If the registered object matches the one in the cache, then
797         // flush the cached object.
798         if (iterCache[kind] != null) {
799             BreakIteratorCache cache = (BreakIteratorCache) iterCache[kind].get();
800             if (cache != null) {
801                 if (cache.getLocale().equals(locale)) {
802                     iterCache[kind] = null;
803                 }
804             }
805         }
806         return getShim().registerInstance(iter, locale, kind);
807     }
808 
809     /**
810      * {@icu} Unregisters a previously-registered BreakIterator using the key returned
811      * from the register call.  Key becomes invalid after this call and should not be used
812      * again.
813      * @param key the registry key returned by a previous call to registerInstance
814      * @return true if the iterator for the key was successfully unregistered
815      * @stable ICU 2.4
816      */
unregister(Object key)817     public static boolean unregister(Object key) {
818         if (key == null) {
819             throw new IllegalArgumentException("registry key must not be null");
820         }
821         // TODO: we don't do code coverage for the following lines
822         // because in getBreakInstance we always instantiate the shim,
823         // and test execution is such that we always instantiate a
824         // breakiterator before we get to the break iterator tests.
825         // this is for modularization, and we could remove the
826         // dependencies in getBreakInstance by rewriting part of the
827         // LocaleData code, or perhaps by accepting it into the
828         // module.
829         ///CLOVER:OFF
830         if (shim != null) {
831             // Unfortunately, we don't know what is being unregistered
832             // -- what `kind' and what locale -- so we flush all
833             // caches.  This is safe but inefficient if people are
834             // actively registering and unregistering.
835             for (int kind=0; kind<KIND_COUNT; ++kind) {
836                 iterCache[kind] = null;
837             }
838             return shim.unregister(key);
839         }
840         return false;
841         ///CLOVER:ON
842     }
843 
844     // end of registration
845 
846     /**
847      * Returns a particular kind of BreakIterator for a locale.
848      * Avoids writing a switch statement with getXYZInstance(where) calls.
849      * @internal
850      * @deprecated This API is ICU internal only.
851      */
852     @Deprecated
getBreakInstance(ULocale where, int kind)853     public static BreakIterator getBreakInstance(ULocale where, int kind) {
854         if (where == null) {
855             throw new NullPointerException("Specified locale is null");
856         }
857         if (iterCache[kind] != null) {
858             BreakIteratorCache cache = (BreakIteratorCache)iterCache[kind].get();
859             if (cache != null) {
860                 if (cache.getLocale().equals(where)) {
861                     return cache.createBreakInstance();
862                 }
863             }
864         }
865 
866         // sigh, all to avoid linking in ICULocaleData...
867         BreakIterator result = getShim().createBreakIterator(where, kind);
868 
869         BreakIteratorCache cache = new BreakIteratorCache(where, result);
870         iterCache[kind] = new SoftReference<BreakIteratorCache>(cache);
871         if (result instanceof RuleBasedBreakIterator) {
872             RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator)result;
873             rbbi.setBreakType(kind);
874         }
875 
876         return result;
877     }
878 
879 
880     /**
881      * Returns a list of locales for which BreakIterators can be used.
882      * @return An array of Locales.  All of the locales in the array can
883      * be used when creating a BreakIterator.
884      * @stable ICU 2.6
885      */
getAvailableLocales()886     public static synchronized Locale[] getAvailableLocales()
887     {
888         // to avoid linking ICULocaleData
889         return getShim().getAvailableLocales();
890     }
891 
892     /**
893      * {@icu} Returns a list of locales for which BreakIterators can be used.
894      * @return An array of Locales.  All of the locales in the array can
895      * be used when creating a BreakIterator.
896      * @draft ICU 3.2 (retain)
897      * @provisional This API might change or be removed in a future release.
898      */
getAvailableULocales()899     public static synchronized ULocale[] getAvailableULocales()
900     {
901         // to avoid linking ICULocaleData
902         return getShim().getAvailableULocales();
903     }
904 
905     private static final class BreakIteratorCache {
906 
907         private BreakIterator iter;
908         private ULocale where;
909 
BreakIteratorCache(ULocale where, BreakIterator iter)910         BreakIteratorCache(ULocale where, BreakIterator iter) {
911             this.where = where;
912             this.iter = (BreakIterator) iter.clone();
913         }
914 
getLocale()915         ULocale getLocale() {
916             return where;
917         }
918 
createBreakInstance()919         BreakIterator createBreakInstance() {
920             return (BreakIterator) iter.clone();
921         }
922     }
923 
924     static abstract class BreakIteratorServiceShim {
registerInstance(BreakIterator iter, ULocale l, int k)925         public abstract Object registerInstance(BreakIterator iter, ULocale l, int k);
unregister(Object key)926         public abstract boolean unregister(Object key);
getAvailableLocales()927         public abstract Locale[] getAvailableLocales();
getAvailableULocales()928         public abstract ULocale[] getAvailableULocales();
createBreakIterator(ULocale l, int k)929         public abstract BreakIterator createBreakIterator(ULocale l, int k);
930     }
931 
932     private static BreakIteratorServiceShim shim;
getShim()933     private static BreakIteratorServiceShim getShim() {
934         // Note: this instantiation is safe on loose-memory-model configurations
935         // despite lack of synchronization, since the shim instance has no state--
936         // it's all in the class init.  The worst problem is we might instantiate
937         // two shim instances, but they'll share the same state so that's ok.
938         if (shim == null) {
939             try {
940                 Class<?> cls = Class.forName("com.ibm.icu.text.BreakIteratorFactory");
941                 shim = (BreakIteratorServiceShim)cls.newInstance();
942             }
943             catch (MissingResourceException e)
944             {
945                 throw e;
946             }
947             catch (Exception e) {
948                 ///CLOVER:OFF
949                 if(DEBUG){
950                     e.printStackTrace();
951                 }
952                 throw new RuntimeException(e.getMessage());
953                 ///CLOVER:ON
954             }
955         }
956         return shim;
957     }
958 
959     // -------- BEGIN ULocale boilerplate --------
960 
961     /**
962      * {@icu} Returns the locale that was used to create this object, or null.
963      * This may may differ from the locale requested at the time of
964      * this object's creation.  For example, if an object is created
965      * for locale <tt>en_US_CALIFORNIA</tt>, the actual data may be
966      * drawn from <tt>en</tt> (the <i>actual</i> locale), and
967      * <tt>en_US</tt> may be the most specific locale that exists (the
968      * <i>valid</i> locale).
969      *
970      * <p>Note: The <i>actual</i> locale is returned correctly, but the <i>valid</i>
971      * locale is not, in most cases.
972      * @param type type of information requested, either {@link
973      * com.ibm.icu.util.ULocale#VALID_LOCALE} or {@link
974      * com.ibm.icu.util.ULocale#ACTUAL_LOCALE}.
975      * @return the information specified by <i>type</i>, or null if
976      * this object was not constructed from locale data.
977      * @see com.ibm.icu.util.ULocale
978      * @see com.ibm.icu.util.ULocale#VALID_LOCALE
979      * @see com.ibm.icu.util.ULocale#ACTUAL_LOCALE
980      * @draft ICU 2.8 (retain)
981      * @provisional This API might change or be removed in a future release.
982      */
getLocale(ULocale.Type type)983     public final ULocale getLocale(ULocale.Type type) {
984         return type == ULocale.ACTUAL_LOCALE ?
985             this.actualLocale : this.validLocale;
986     }
987 
988     /**
989      * Set information about the locales that were used to create this
990      * object.  If the object was not constructed from locale data,
991      * both arguments should be set to null.  Otherwise, neither
992      * should be null.  The actual locale must be at the same level or
993      * less specific than the valid locale.  This method is intended
994      * for use by factories or other entities that create objects of
995      * this class.
996      * @param valid the most specific locale containing any resource
997      * data, or null
998      * @param actual the locale containing data used to construct this
999      * object, or null
1000      * @see com.ibm.icu.util.ULocale
1001      * @see com.ibm.icu.util.ULocale#VALID_LOCALE
1002      * @see com.ibm.icu.util.ULocale#ACTUAL_LOCALE
1003      */
setLocale(ULocale valid, ULocale actual)1004     final void setLocale(ULocale valid, ULocale actual) {
1005         // Change the following to an assertion later
1006         if ((valid == null) != (actual == null)) {
1007             ///CLOVER:OFF
1008             throw new IllegalArgumentException();
1009             ///CLOVER:ON
1010         }
1011         // Another check we could do is that the actual locale is at
1012         // the same level or less specific than the valid locale.
1013         this.validLocale = valid;
1014         this.actualLocale = actual;
1015     }
1016 
1017     /**
1018      * The most specific locale containing any resource data, or null.
1019      * @see com.ibm.icu.util.ULocale
1020      */
1021     private ULocale validLocale;
1022 
1023     /**
1024      * The locale containing data used to construct this object, or
1025      * null.
1026      * @see com.ibm.icu.util.ULocale
1027      */
1028     private ULocale actualLocale;
1029 
1030     // -------- END ULocale boilerplate --------
1031 }
1032