1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.  Oracle designates this
9  * particular file as subject to the "Classpath" exception as provided
10  * by Oracle in the LICENSE file that accompanied this code.
11  *
12  * This code is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15  * version 2 for more details (a copy is included in the LICENSE file that
16  * accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License version
19  * 2 along with this work; if not, write to the Free Software Foundation,
20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23  * or visit www.oracle.com if you need additional information or have any
24  * questions.
25  */
26 
27 /*
28  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29  * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
30  *
31  * The original version of this source code and documentation
32  * is copyrighted and owned by Taligent, Inc., a wholly-owned
33  * subsidiary of IBM. These materials are provided under terms
34  * of a License Agreement between Taligent and Sun. This technology
35  * is protected by multiple US and International patents.
36  *
37  * This notice and attribution to Taligent may not be removed.
38  * Taligent is a registered trademark of Taligent, Inc.
39  *
40  */
41 
42 package java.text;
43 
44 import java.util.Locale;
45 
46 
47 /**
48  * The <code>BreakIterator</code> class implements methods for finding
49  * the location of boundaries in text. Instances of <code>BreakIterator</code>
50  * maintain a current position and scan over text
51  * returning the index of characters where boundaries occur.
52  * Internally, <code>BreakIterator</code> scans text using a
53  * <code>CharacterIterator</code>, and is thus able to scan text held
54  * by any object implementing that protocol. A <code>StringCharacterIterator</code>
55  * is used to scan <code>String</code> objects passed to <code>setText</code>.
56  *
57  * <p>
58  * You use the factory methods provided by this class to create
59  * instances of various types of break iterators. In particular,
60  * use <code>getWordInstance</code>, <code>getLineInstance</code>,
61  * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
62  * to create <code>BreakIterator</code>s that perform
63  * word, line, sentence, and character boundary analysis respectively.
64  * A single <code>BreakIterator</code> can work only on one unit
65  * (word, line, sentence, and so on). You must use a different iterator
66  * for each unit boundary analysis you wish to perform.
67  *
68  * <p><a name="line"></a>
69  * Line boundary analysis determines where a text string can be
70  * broken when line-wrapping. The mechanism correctly handles
71  * punctuation and hyphenated words. Actual line breaking needs
72  * to also consider the available line width and is handled by
73  * higher-level software.
74  *
75  * <p><a name="sentence"></a>
76  * Sentence boundary analysis allows selection with correct interpretation
77  * of periods within numbers and abbreviations, and trailing punctuation
78  * marks such as quotation marks and parentheses.
79  *
80  * <p><a name="word"></a>
81  * Word boundary analysis is used by search and replace functions, as
82  * well as within text editing applications that allow the user to
83  * select words with a double click. Word selection provides correct
84  * interpretation of punctuation marks within and following
85  * words. Characters that are not part of a word, such as symbols
86  * or punctuation marks, have word-breaks on both sides.
87  *
88  * <p><a name="character"></a>
89  * Character boundary analysis allows users to interact with characters
90  * as they expect to, for example, when moving the cursor through a text
91  * string. Character boundary analysis provides correct navigation
92  * through character strings, regardless of how the character is stored.
93  * The boundaries returned may be those of supplementary characters,
94  * combining character sequences, or ligature clusters.
95  * For example, an accented character might be stored as a base character
96  * and a diacritical mark. What users consider to be a character can
97  * differ between languages.
98  *
99  * <p>
100  * The <code>BreakIterator</code> instances returned by the factory methods
101  * of this class are intended for use with natural languages only, not for
102  * programming language text. It is however possible to define subclasses
103  * that tokenize a programming language.
104  *
105  * <P>
106  * <strong>Examples</strong>:<P>
107  * Creating and using text boundaries:
108  * <blockquote>
109  * <pre>
110  * public static void main(String args[]) {
111  *      if (args.length == 1) {
112  *          String stringToExamine = args[0];
113  *          //print each word in order
114  *          BreakIterator boundary = BreakIterator.getWordInstance();
115  *          boundary.setText(stringToExamine);
116  *          printEachForward(boundary, stringToExamine);
117  *          //print each sentence in reverse order
118  *          boundary = BreakIterator.getSentenceInstance(Locale.US);
119  *          boundary.setText(stringToExamine);
120  *          printEachBackward(boundary, stringToExamine);
121  *          printFirst(boundary, stringToExamine);
122  *          printLast(boundary, stringToExamine);
123  *      }
124  * }
125  * </pre>
126  * </blockquote>
127  *
128  * Print each element in order:
129  * <blockquote>
130  * <pre>
131  * public static void printEachForward(BreakIterator boundary, String source) {
132  *     int start = boundary.first();
133  *     for (int end = boundary.next();
134  *          end != BreakIterator.DONE;
135  *          start = end, end = boundary.next()) {
136  *          System.out.println(source.substring(start,end));
137  *     }
138  * }
139  * </pre>
140  * </blockquote>
141  *
142  * Print each element in reverse order:
143  * <blockquote>
144  * <pre>
145  * public static void printEachBackward(BreakIterator boundary, String source) {
146  *     int end = boundary.last();
147  *     for (int start = boundary.previous();
148  *          start != BreakIterator.DONE;
149  *          end = start, start = boundary.previous()) {
150  *         System.out.println(source.substring(start,end));
151  *     }
152  * }
153  * </pre>
154  * </blockquote>
155  *
156  * Print first element:
157  * <blockquote>
158  * <pre>
159  * public static void printFirst(BreakIterator boundary, String source) {
160  *     int start = boundary.first();
161  *     int end = boundary.next();
162  *     System.out.println(source.substring(start,end));
163  * }
164  * </pre>
165  * </blockquote>
166  *
167  * Print last element:
168  * <blockquote>
169  * <pre>
170  * public static void printLast(BreakIterator boundary, String source) {
171  *     int end = boundary.last();
172  *     int start = boundary.previous();
173  *     System.out.println(source.substring(start,end));
174  * }
175  * </pre>
176  * </blockquote>
177  *
178  * Print the element at a specified position:
179  * <blockquote>
180  * <pre>
181  * public static void printAt(BreakIterator boundary, int pos, String source) {
182  *     int end = boundary.following(pos);
183  *     int start = boundary.previous();
184  *     System.out.println(source.substring(start,end));
185  * }
186  * </pre>
187  * </blockquote>
188  *
189  * Find the next word:
190  * <blockquote>
191  * <pre>{@code
192  * public static int nextWordStartAfter(int pos, String text) {
193  *     BreakIterator wb = BreakIterator.getWordInstance();
194  *     wb.setText(text);
195  *     int last = wb.following(pos);
196  *     int current = wb.next();
197  *     while (current != BreakIterator.DONE) {
198  *         for (int p = last; p < current; p++) {
199  *             if (Character.isLetter(text.codePointAt(p)))
200  *                 return last;
201  *         }
202  *         last = current;
203  *         current = wb.next();
204  *     }
205  *     return BreakIterator.DONE;
206  * }
207  * }</pre>
208  * (The iterator returned by BreakIterator.getWordInstance() is unique in that
209  * the break positions it returns don't represent both the start and end of the
210  * thing being iterated over.  That is, a sentence-break iterator returns breaks
211  * that each represent the end of one sentence and the beginning of the next.
212  * With the word-break iterator, the characters between two boundaries might be a
213  * word, or they might be the punctuation or whitespace between two words.  The
214  * above code uses a simple heuristic to determine which boundary is the beginning
215  * of a word: If the characters between this boundary and the next boundary
216  * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
217  * a Hangul syllable, a Kana character, etc.), then the text between this boundary
218  * and the next is a word; otherwise, it's the material between words.)
219  * </blockquote>
220  *
221  * @see CharacterIterator
222  *
223  */
224 
225 public abstract class BreakIterator implements Cloneable
226 {
227     /**
228      * Constructor. BreakIterator is stateless and has no default behavior.
229      */
BreakIterator()230     protected BreakIterator()
231     {
232     }
233 
234     /**
235      * Create a copy of this iterator
236      * @return A copy of this
237      */
238     @Override
clone()239     public Object clone()
240     {
241         try {
242             return super.clone();
243         }
244         catch (CloneNotSupportedException e) {
245             throw new InternalError(e);
246         }
247     }
248 
249     /**
250      * DONE is returned by previous(), next(), next(int), preceding(int)
251      * and following(int) when either the first or last text boundary has been
252      * reached.
253      */
254     public static final int DONE = -1;
255 
256     /**
257      * Returns the first boundary. The iterator's current position is set
258      * to the first text boundary.
259      * @return The character index of the first text boundary.
260      */
first()261     public abstract int first();
262 
263     /**
264      * Returns the last boundary. The iterator's current position is set
265      * to the last text boundary.
266      * @return The character index of the last text boundary.
267      */
last()268     public abstract int last();
269 
270     /**
271      * Returns the nth boundary from the current boundary. If either
272      * the first or last text boundary has been reached, it returns
273      * <code>BreakIterator.DONE</code> and the current position is set to either
274      * the first or last text boundary depending on which one is reached. Otherwise,
275      * the iterator's current position is set to the new boundary.
276      * For example, if the iterator's current position is the mth text boundary
277      * and three more boundaries exist from the current boundary to the last text
278      * boundary, the next(2) call will return m + 2. The new text position is set
279      * to the (m + 2)th text boundary. A next(4) call would return
280      * <code>BreakIterator.DONE</code> and the last text boundary would become the
281      * new text position.
282      * @param n which boundary to return.  A value of 0
283      * does nothing.  Negative values move to previous boundaries
284      * and positive values move to later boundaries.
285      * @return The character index of the nth boundary from the current position
286      * or <code>BreakIterator.DONE</code> if either first or last text boundary
287      * has been reached.
288      */
next(int n)289     public abstract int next(int n);
290 
291     /**
292      * Returns the boundary following the current boundary. If the current boundary
293      * is the last text boundary, it returns <code>BreakIterator.DONE</code> and
294      * the iterator's current position is unchanged. Otherwise, the iterator's
295      * current position is set to the boundary following the current boundary.
296      * @return The character index of the next text boundary or
297      * <code>BreakIterator.DONE</code> if the current boundary is the last text
298      * boundary.
299      * Equivalent to next(1).
300      * @see #next(int)
301      */
next()302     public abstract int next();
303 
304     /**
305      * Returns the boundary preceding the current boundary. If the current boundary
306      * is the first text boundary, it returns <code>BreakIterator.DONE</code> and
307      * the iterator's current position is unchanged. Otherwise, the iterator's
308      * current position is set to the boundary preceding the current boundary.
309      * @return The character index of the previous text boundary or
310      * <code>BreakIterator.DONE</code> if the current boundary is the first text
311      * boundary.
312      */
previous()313     public abstract int previous();
314 
315     /**
316      * Returns the first boundary following the specified character offset. If the
317      * specified offset equals to the last text boundary, it returns
318      * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
319      * Otherwise, the iterator's current position is set to the returned boundary.
320      * The value returned is always greater than the offset or the value
321      * <code>BreakIterator.DONE</code>.
322      * @param offset the character offset to begin scanning.
323      * @return The first boundary after the specified offset or
324      * <code>BreakIterator.DONE</code> if the last text boundary is passed in
325      * as the offset.
326      * @exception  IllegalArgumentException if the specified offset is less than
327      * the first text boundary or greater than the last text boundary.
328      */
following(int offset)329     public abstract int following(int offset);
330 
331     /**
332      * Returns the last boundary preceding the specified character offset. If the
333      * specified offset equals to the first text boundary, it returns
334      * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
335      * Otherwise, the iterator's current position is set to the returned boundary.
336      * The value returned is always less than the offset or the value
337      * <code>BreakIterator.DONE</code>.
338      * @param offset the character offset to begin scanning.
339      * @return The last boundary before the specified offset or
340      * <code>BreakIterator.DONE</code> if the first text boundary is passed in
341      * as the offset.
342      * @exception   IllegalArgumentException if the specified offset is less than
343      * the first text boundary or greater than the last text boundary.
344      * @since 1.2
345      */
preceding(int offset)346     public int preceding(int offset) {
347         // NOTE:  This implementation is here solely because we can't add new
348         // abstract methods to an existing class.  There is almost ALWAYS a
349         // better, faster way to do this.
350         int pos = following(offset);
351         while (pos >= offset && pos != DONE) {
352             pos = previous();
353         }
354         return pos;
355     }
356 
357     /**
358      * Returns true if the specified character offset is a text boundary.
359      * @param offset the character offset to check.
360      * @return <code>true</code> if "offset" is a boundary position,
361      * <code>false</code> otherwise.
362      * @exception   IllegalArgumentException if the specified offset is less than
363      * the first text boundary or greater than the last text boundary.
364      * @since 1.2
365      */
isBoundary(int offset)366     public boolean isBoundary(int offset) {
367         // NOTE: This implementation probably is wrong for most situations
368         // because it fails to take into account the possibility that a
369         // CharacterIterator passed to setText() may not have a begin offset
370         // of 0.  But since the abstract BreakIterator doesn't have that
371         // knowledge, it assumes the begin offset is 0.  If you subclass
372         // BreakIterator, copy the SimpleTextBoundary implementation of this
373         // function into your subclass.  [This should have been abstract at
374         // this level, but it's too late to fix that now.]
375         if (offset == 0) {
376             return true;
377         }
378         int boundary = following(offset - 1);
379         if (boundary == DONE) {
380             throw new IllegalArgumentException();
381         }
382         return boundary == offset;
383     }
384 
385     /**
386      * Returns character index of the text boundary that was most
387      * recently returned by next(), next(int), previous(), first(), last(),
388      * following(int) or preceding(int). If any of these methods returns
389      * <code>BreakIterator.DONE</code> because either first or last text boundary
390      * has been reached, it returns the first or last text boundary depending on
391      * which one is reached.
392      * @return The text boundary returned from the above methods, first or last
393      * text boundary.
394      * @see #next()
395      * @see #next(int)
396      * @see #previous()
397      * @see #first()
398      * @see #last()
399      * @see #following(int)
400      * @see #preceding(int)
401      */
current()402     public abstract int current();
403 
404     /**
405      * Get the text being scanned
406      * @return the text being scanned
407      */
getText()408     public abstract CharacterIterator getText();
409 
410     /**
411      * Set a new text string to be scanned.  The current scan
412      * position is reset to first().
413      * @param newText new text to scan.
414      */
setText(String newText)415     public void setText(String newText)
416     {
417         setText(new StringCharacterIterator(newText));
418     }
419 
420     /**
421      * Set a new text for scanning.  The current scan
422      * position is reset to first().
423      * @param newText new text to scan.
424      */
setText(CharacterIterator newText)425     public abstract void setText(CharacterIterator newText);
426 
427     /**
428      * Returns a new <code>BreakIterator</code> instance
429      * for <a href="BreakIterator.html#word">word breaks</a>
430      * for the {@linkplain Locale#getDefault() default locale}.
431      * @return A break iterator for word breaks
432      */
getWordInstance()433     public static BreakIterator getWordInstance()
434     {
435         return getWordInstance(Locale.getDefault());
436     }
437 
438     /**
439      * Returns a new <code>BreakIterator</code> instance
440      * for <a href="BreakIterator.html#word">word breaks</a>
441      * for the given locale.
442      * @param locale the desired locale
443      * @return A break iterator for word breaks
444      * @exception NullPointerException if <code>locale</code> is null
445      */
getWordInstance(Locale locale)446     public static BreakIterator getWordInstance(Locale locale)
447     {
448         // Android-changed: Switched to ICU.
449         return new IcuIteratorWrapper(
450                 android.icu.text.BreakIterator.getWordInstance(locale));
451     }
452 
453     /**
454      * Returns a new <code>BreakIterator</code> instance
455      * for <a href="BreakIterator.html#line">line breaks</a>
456      * for the {@linkplain Locale#getDefault() default locale}.
457      * @return A break iterator for line breaks
458      */
getLineInstance()459     public static BreakIterator getLineInstance()
460     {
461         return getLineInstance(Locale.getDefault());
462     }
463 
464     /**
465      * Returns a new <code>BreakIterator</code> instance
466      * for <a href="BreakIterator.html#line">line breaks</a>
467      * for the given locale.
468      * @param locale the desired locale
469      * @return A break iterator for line breaks
470      * @exception NullPointerException if <code>locale</code> is null
471      */
getLineInstance(Locale locale)472     public static BreakIterator getLineInstance(Locale locale)
473     {
474         // Android-changed: Switched to ICU.
475         return new IcuIteratorWrapper(
476                 android.icu.text.BreakIterator.getLineInstance(locale));
477     }
478 
479     /**
480      * Returns a new <code>BreakIterator</code> instance
481      * for <a href="BreakIterator.html#character">character breaks</a>
482      * for the {@linkplain Locale#getDefault() default locale}.
483      * @return A break iterator for character breaks
484      */
getCharacterInstance()485     public static BreakIterator getCharacterInstance()
486     {
487         return getCharacterInstance(Locale.getDefault());
488     }
489 
490     /**
491      * Returns a new <code>BreakIterator</code> instance
492      * for <a href="BreakIterator.html#character">character breaks</a>
493      * for the given locale.
494      * @param locale the desired locale
495      * @return A break iterator for character breaks
496      * @exception NullPointerException if <code>locale</code> is null
497      */
getCharacterInstance(Locale locale)498     public static BreakIterator getCharacterInstance(Locale locale)
499     {
500         // Android-changed: Switched to ICU.
501         return new IcuIteratorWrapper(
502                 android.icu.text.BreakIterator.getCharacterInstance(locale));
503     }
504 
505     /**
506      * Returns a new <code>BreakIterator</code> instance
507      * for <a href="BreakIterator.html#sentence">sentence breaks</a>
508      * for the {@linkplain Locale#getDefault() default locale}.
509      * @return A break iterator for sentence breaks
510      */
getSentenceInstance()511     public static BreakIterator getSentenceInstance()
512     {
513         return getSentenceInstance(Locale.getDefault());
514     }
515 
516     /**
517      * Returns a new <code>BreakIterator</code> instance
518      * for <a href="BreakIterator.html#sentence">sentence breaks</a>
519      * for the given locale.
520      * @param locale the desired locale
521      * @return A break iterator for sentence breaks
522      * @exception NullPointerException if <code>locale</code> is null
523      */
getSentenceInstance(Locale locale)524     public static BreakIterator getSentenceInstance(Locale locale)
525     {
526         // Android-changed: Switched to ICU.
527         return new IcuIteratorWrapper(
528                 android.icu.text.BreakIterator.getSentenceInstance(locale));
529     }
530 
531     // Android-changed: Removed references to BreakIteratorProvider.
532     /**
533      * Returns an array of all locales for which the
534      * <code>get*Instance</code> methods of this class can return
535      * localized instances.
536      *
537      * @return An array of locales for which localized
538      *         <code>BreakIterator</code> instances are available.
539      */
getAvailableLocales()540     public static synchronized Locale[] getAvailableLocales()
541     {
542         // Android-changed: Switched to ICU.
543         return android.icu.text.BreakIterator.getAvailableLocales();
544     }
545 }
546