1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  * Copyright (c) 1996, 2020, Oracle and/or its affiliates. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.  Oracle designates this
9  * particular file as subject to the "Classpath" exception as provided
10  * by Oracle in the LICENSE file that accompanied this code.
11  *
12  * This code is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15  * version 2 for more details (a copy is included in the LICENSE file that
16  * accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License version
19  * 2 along with this work; if not, write to the Free Software Foundation,
20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23  * or visit www.oracle.com if you need additional information or have any
24  * questions.
25  */
26 
27 /*
28  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29  * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
30  *
31  * The original version of this source code and documentation
32  * is copyrighted and owned by Taligent, Inc., a wholly-owned
33  * subsidiary of IBM. These materials are provided under terms
34  * of a License Agreement between Taligent and Sun. This technology
35  * is protected by multiple US and International patents.
36  *
37  * This notice and attribution to Taligent may not be removed.
38  * Taligent is a registered trademark of Taligent, Inc.
39  *
40  */
41 
42 package java.text;
43 
44 import java.util.Locale;
45 
46 
47 // Android-changed: Discourage modification on CharacterIterator after setText. http://b/80456574
48 /**
49  * The {@code BreakIterator} class implements methods for finding
50  * the location of boundaries in text. Instances of {@code BreakIterator}
51  * maintain a current position and scan over text
52  * returning the index of characters where boundaries occur.
53  * Internally, {@code BreakIterator} scans text using a
54  * {@code CharacterIterator}, and is thus able to scan text held
55  * by any object implementing that protocol. A {@code StringCharacterIterator}
56  * is used to scan {@code String} objects passed to {@code setText}.
57  * The <code>CharacterIterator</code> object must not be modified after having been
58  * passed to <code>setText</code>. If the text in the <code>CharacterIterator</code> object
59  * is changed, the caller must reset <code>BreakIterator</code> by calling
60  * <code>setText</code>.
61  *
62  * <p>
63  * You use the factory methods provided by this class to create
64  * instances of various types of break iterators. In particular,
65  * use {@code getWordInstance}, {@code getLineInstance},
66  * {@code getSentenceInstance}, and {@code getCharacterInstance}
67  * to create {@code BreakIterator}s that perform
68  * word, line, sentence, and character boundary analysis respectively.
69  * A single {@code BreakIterator} can work only on one unit
70  * (word, line, sentence, and so on). You must use a different iterator
71  * for each unit boundary analysis you wish to perform.
72  *
73  * <p><a id="line"></a>
74  * Line boundary analysis determines where a text string can be
75  * broken when line-wrapping. The mechanism correctly handles
76  * punctuation and hyphenated words. Actual line breaking needs
77  * to also consider the available line width and is handled by
78  * higher-level software.
79  *
80  * <p><a id="sentence"></a>
81  * Sentence boundary analysis allows selection with correct interpretation
82  * of periods within numbers and abbreviations, and trailing punctuation
83  * marks such as quotation marks and parentheses.
84  *
85  * <p><a id="word"></a>
86  * Word boundary analysis is used by search and replace functions, as
87  * well as within text editing applications that allow the user to
88  * select words with a double click. Word selection provides correct
89  * interpretation of punctuation marks within and following
90  * words. Characters that are not part of a word, such as symbols
91  * or punctuation marks, have word-breaks on both sides.
92  *
93  * <p><a id="character"></a>
94  * Character boundary analysis allows users to interact with characters
95  * as they expect to, for example, when moving the cursor through a text
96  * string. Character boundary analysis provides correct navigation
97  * through character strings, regardless of how the character is stored.
98  * The boundaries returned may be those of supplementary characters,
99  * combining character sequences, or ligature clusters.
100  * For example, an accented character might be stored as a base character
101  * and a diacritical mark. What users consider to be a character can
102  * differ between languages.
103  *
104  * <p>
105  * The {@code BreakIterator} instances returned by the factory methods
106  * of this class are intended for use with natural languages only, not for
107  * programming language text. It is however possible to define subclasses
108  * that tokenize a programming language.
109  *
110  * <P>
111  * <strong>Examples</strong>:<P>
112  * Creating and using text boundaries:
113  * <blockquote>
114  * <pre>
115  * public static void main(String args[]) {
116  *      if (args.length == 1) {
117  *          String stringToExamine = args[0];
118  *          //print each word in order
119  *          BreakIterator boundary = BreakIterator.getWordInstance();
120  *          boundary.setText(stringToExamine);
121  *          printEachForward(boundary, stringToExamine);
122  *          //print each sentence in reverse order
123  *          boundary = BreakIterator.getSentenceInstance(Locale.US);
124  *          boundary.setText(stringToExamine);
125  *          printEachBackward(boundary, stringToExamine);
126  *          printFirst(boundary, stringToExamine);
127  *          printLast(boundary, stringToExamine);
128  *      }
129  * }
130  * </pre>
131  * </blockquote>
132  *
133  * Print each element in order:
134  * <blockquote>
135  * <pre>
136  * public static void printEachForward(BreakIterator boundary, String source) {
137  *     int start = boundary.first();
138  *     for (int end = boundary.next();
139  *          end != BreakIterator.DONE;
140  *          start = end, end = boundary.next()) {
141  *          System.out.println(source.substring(start,end));
142  *     }
143  * }
144  * </pre>
145  * </blockquote>
146  *
147  * Print each element in reverse order:
148  * <blockquote>
149  * <pre>
150  * public static void printEachBackward(BreakIterator boundary, String source) {
151  *     int end = boundary.last();
152  *     for (int start = boundary.previous();
153  *          start != BreakIterator.DONE;
154  *          end = start, start = boundary.previous()) {
155  *         System.out.println(source.substring(start,end));
156  *     }
157  * }
158  * </pre>
159  * </blockquote>
160  *
161  * Print first element:
162  * <blockquote>
163  * <pre>
164  * public static void printFirst(BreakIterator boundary, String source) {
165  *     int start = boundary.first();
166  *     int end = boundary.next();
167  *     System.out.println(source.substring(start,end));
168  * }
169  * </pre>
170  * </blockquote>
171  *
172  * Print last element:
173  * <blockquote>
174  * <pre>
175  * public static void printLast(BreakIterator boundary, String source) {
176  *     int end = boundary.last();
177  *     int start = boundary.previous();
178  *     System.out.println(source.substring(start,end));
179  * }
180  * </pre>
181  * </blockquote>
182  *
183  * Print the element at a specified position:
184  * <blockquote>
185  * <pre>
186  * public static void printAt(BreakIterator boundary, int pos, String source) {
187  *     int end = boundary.following(pos);
188  *     int start = boundary.previous();
189  *     System.out.println(source.substring(start,end));
190  * }
191  * </pre>
192  * </blockquote>
193  *
194  * Find the next word:
195  * <blockquote>
196  * <pre>{@code
197  * public static int nextWordStartAfter(int pos, String text) {
198  *     BreakIterator wb = BreakIterator.getWordInstance();
199  *     wb.setText(text);
200  *     int last = wb.following(pos);
201  *     int current = wb.next();
202  *     while (current != BreakIterator.DONE) {
203  *         for (int p = last; p < current; p++) {
204  *             if (Character.isLetter(text.codePointAt(p)))
205  *                 return last;
206  *         }
207  *         last = current;
208  *         current = wb.next();
209  *     }
210  *     return BreakIterator.DONE;
211  * }
212  * }</pre>
213  * (The iterator returned by BreakIterator.getWordInstance() is unique in that
214  * the break positions it returns don't represent both the start and end of the
215  * thing being iterated over.  That is, a sentence-break iterator returns breaks
216  * that each represent the end of one sentence and the beginning of the next.
217  * With the word-break iterator, the characters between two boundaries might be a
218  * word, or they might be the punctuation or whitespace between two words.  The
219  * above code uses a simple heuristic to determine which boundary is the beginning
220  * of a word: If the characters between this boundary and the next boundary
221  * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
222  * a Hangul syllable, a Kana character, etc.), then the text between this boundary
223  * and the next is a word; otherwise, it's the material between words.)
224  * </blockquote>
225  *
226  * @since 1.1
227  * @see CharacterIterator
228  *
229  */
230 
231 public abstract class BreakIterator implements Cloneable
232 {
233     /**
234      * Constructor. BreakIterator is stateless and has no default behavior.
235      */
BreakIterator()236     protected BreakIterator()
237     {
238     }
239 
240     /**
241      * Create a copy of this iterator
242      * @return A copy of this
243      */
244     @Override
clone()245     public Object clone()
246     {
247         try {
248             return super.clone();
249         }
250         catch (CloneNotSupportedException e) {
251             throw new InternalError(e);
252         }
253     }
254 
255     /**
256      * DONE is returned by previous(), next(), next(int), preceding(int)
257      * and following(int) when either the first or last text boundary has been
258      * reached.
259      */
260     public static final int DONE = -1;
261 
262     /**
263      * Returns the first boundary. The iterator's current position is set
264      * to the first text boundary.
265      * @return The character index of the first text boundary.
266      */
first()267     public abstract int first();
268 
269     /**
270      * Returns the last boundary. The iterator's current position is set
271      * to the last text boundary.
272      * @return The character index of the last text boundary.
273      */
last()274     public abstract int last();
275 
276     /**
277      * Returns the nth boundary from the current boundary. If either
278      * the first or last text boundary has been reached, it returns
279      * {@code BreakIterator.DONE} and the current position is set to either
280      * the first or last text boundary depending on which one is reached. Otherwise,
281      * the iterator's current position is set to the new boundary.
282      * For example, if the iterator's current position is the mth text boundary
283      * and three more boundaries exist from the current boundary to the last text
284      * boundary, the next(2) call will return m + 2. The new text position is set
285      * to the (m + 2)th text boundary. A next(4) call would return
286      * {@code BreakIterator.DONE} and the last text boundary would become the
287      * new text position.
288      * @param n which boundary to return.  A value of 0
289      * does nothing.  Negative values move to previous boundaries
290      * and positive values move to later boundaries.
291      * @return The character index of the nth boundary from the current position
292      * or {@code BreakIterator.DONE} if either first or last text boundary
293      * has been reached.
294      */
next(int n)295     public abstract int next(int n);
296 
297     /**
298      * Returns the boundary following the current boundary. If the current boundary
299      * is the last text boundary, it returns {@code BreakIterator.DONE} and
300      * the iterator's current position is unchanged. Otherwise, the iterator's
301      * current position is set to the boundary following the current boundary.
302      * @return The character index of the next text boundary or
303      * {@code BreakIterator.DONE} if the current boundary is the last text
304      * boundary.
305      * Equivalent to next(1).
306      * @see #next(int)
307      */
next()308     public abstract int next();
309 
310     /**
311      * Returns the boundary preceding the current boundary. If the current boundary
312      * is the first text boundary, it returns {@code BreakIterator.DONE} and
313      * the iterator's current position is unchanged. Otherwise, the iterator's
314      * current position is set to the boundary preceding the current boundary.
315      * @return The character index of the previous text boundary or
316      * {@code BreakIterator.DONE} if the current boundary is the first text
317      * boundary.
318      */
previous()319     public abstract int previous();
320 
321     /**
322      * Returns the first boundary following the specified character offset. If the
323      * specified offset is equal to the last text boundary, it returns
324      * {@code BreakIterator.DONE} and the iterator's current position is unchanged.
325      * Otherwise, the iterator's current position is set to the returned boundary.
326      * The value returned is always greater than the offset or the value
327      * {@code BreakIterator.DONE}.
328      * @param offset the character offset to begin scanning.
329      * @return The first boundary after the specified offset or
330      * {@code BreakIterator.DONE} if the last text boundary is passed in
331      * as the offset.
332      * @throws     IllegalArgumentException if the specified offset is less than
333      * the first text boundary or greater than the last text boundary.
334      */
following(int offset)335     public abstract int following(int offset);
336 
337     /**
338      * Returns the last boundary preceding the specified character offset. If the
339      * specified offset is equal to the first text boundary, it returns
340      * {@code BreakIterator.DONE} and the iterator's current position is unchanged.
341      * Otherwise, the iterator's current position is set to the returned boundary.
342      * The value returned is always less than the offset or the value
343      * {@code BreakIterator.DONE}.
344      * @param offset the character offset to begin scanning.
345      * @return The last boundary before the specified offset or
346      * {@code BreakIterator.DONE} if the first text boundary is passed in
347      * as the offset.
348      * @throws      IllegalArgumentException if the specified offset is less than
349      * the first text boundary or greater than the last text boundary.
350      * @since 1.2
351      */
preceding(int offset)352     public int preceding(int offset) {
353         // NOTE:  This implementation is here solely because we can't add new
354         // abstract methods to an existing class.  There is almost ALWAYS a
355         // better, faster way to do this.
356         int pos = following(offset);
357         while (pos >= offset && pos != DONE) {
358             pos = previous();
359         }
360         return pos;
361     }
362 
363     /**
364      * Returns true if the specified character offset is a text boundary.
365      * @param offset the character offset to check.
366      * @return {@code true} if "offset" is a boundary position,
367      * {@code false} otherwise.
368      * @throws      IllegalArgumentException if the specified offset is less than
369      * the first text boundary or greater than the last text boundary.
370      * @since 1.2
371      */
isBoundary(int offset)372     public boolean isBoundary(int offset) {
373         // NOTE: This implementation probably is wrong for most situations
374         // because it fails to take into account the possibility that a
375         // CharacterIterator passed to setText() may not have a begin offset
376         // of 0.  But since the abstract BreakIterator doesn't have that
377         // knowledge, it assumes the begin offset is 0.  If you subclass
378         // BreakIterator, copy the SimpleTextBoundary implementation of this
379         // function into your subclass.  [This should have been abstract at
380         // this level, but it's too late to fix that now.]
381         if (offset == 0) {
382             return true;
383         }
384         int boundary = following(offset - 1);
385         if (boundary == DONE) {
386             throw new IllegalArgumentException();
387         }
388         return boundary == offset;
389     }
390 
391     /**
392      * Returns character index of the text boundary that was most
393      * recently returned by next(), next(int), previous(), first(), last(),
394      * following(int) or preceding(int). If any of these methods returns
395      * {@code BreakIterator.DONE} because either first or last text boundary
396      * has been reached, it returns the first or last text boundary depending on
397      * which one is reached.
398      * @return The text boundary returned from the above methods, first or last
399      * text boundary.
400      * @see #next()
401      * @see #next(int)
402      * @see #previous()
403      * @see #first()
404      * @see #last()
405      * @see #following(int)
406      * @see #preceding(int)
407      */
current()408     public abstract int current();
409 
410     /**
411      * Get the text being scanned
412      * @return the text being scanned
413      */
getText()414     public abstract CharacterIterator getText();
415 
416     /**
417      * Set a new text string to be scanned.  The current scan
418      * position is reset to first().
419      * @param newText new text to scan.
420      */
setText(String newText)421     public void setText(String newText)
422     {
423         setText(new StringCharacterIterator(newText));
424     }
425 
426     /**
427      * Set a new text for scanning.  The current scan
428      * position is reset to first().
429      * @param newText new text to scan.
430      */
setText(CharacterIterator newText)431     public abstract void setText(CharacterIterator newText);
432 
433     // Android-removed: Removed code related to BreakIteratorProvider support.
434 
435     /**
436      * Returns a new {@code BreakIterator} instance
437      * for <a href="BreakIterator.html#word">word breaks</a>
438      * for the {@linkplain Locale#getDefault() default locale}.
439      * @return A break iterator for word breaks
440      */
getWordInstance()441     public static BreakIterator getWordInstance()
442     {
443         return getWordInstance(Locale.getDefault());
444     }
445 
446     /**
447      * Returns a new {@code BreakIterator} instance
448      * for <a href="BreakIterator.html#word">word breaks</a>
449      * for the given locale.
450      * @param locale the desired locale
451      * @return A break iterator for word breaks
452      * @throws    NullPointerException if {@code locale} is null
453      */
getWordInstance(Locale locale)454     public static BreakIterator getWordInstance(Locale locale)
455     {
456         // Android-changed: Switched to ICU.
457         return new IcuIteratorWrapper(
458                 android.icu.text.BreakIterator.getWordInstance(locale));
459     }
460 
461     /**
462      * Returns a new {@code BreakIterator} instance
463      * for <a href="BreakIterator.html#line">line breaks</a>
464      * for the {@linkplain Locale#getDefault() default locale}.
465      * @return A break iterator for line breaks
466      */
getLineInstance()467     public static BreakIterator getLineInstance()
468     {
469         return getLineInstance(Locale.getDefault());
470     }
471 
472     /**
473      * Returns a new {@code BreakIterator} instance
474      * for <a href="BreakIterator.html#line">line breaks</a>
475      * for the given locale.
476      * @param locale the desired locale
477      * @return A break iterator for line breaks
478      * @throws    NullPointerException if {@code locale} is null
479      */
getLineInstance(Locale locale)480     public static BreakIterator getLineInstance(Locale locale)
481     {
482         // Android-changed: Switched to ICU.
483         return new IcuIteratorWrapper(
484                 android.icu.text.BreakIterator.getLineInstance(locale));
485     }
486 
487     /**
488      * Returns a new {@code BreakIterator} instance
489      * for <a href="BreakIterator.html#character">character breaks</a>
490      * for the {@linkplain Locale#getDefault() default locale}.
491      * @return A break iterator for character breaks
492      */
getCharacterInstance()493     public static BreakIterator getCharacterInstance()
494     {
495         return getCharacterInstance(Locale.getDefault());
496     }
497 
498     /**
499      * Returns a new {@code BreakIterator} instance
500      * for <a href="BreakIterator.html#character">character breaks</a>
501      * for the given locale.
502      * @param locale the desired locale
503      * @return A break iterator for character breaks
504      * @throws    NullPointerException if {@code locale} is null
505      */
getCharacterInstance(Locale locale)506     public static BreakIterator getCharacterInstance(Locale locale)
507     {
508         // Android-changed: Switched to ICU.
509         return new IcuIteratorWrapper(
510                 android.icu.text.BreakIterator.getCharacterInstance(locale));
511     }
512 
513     /**
514      * Returns a new {@code BreakIterator} instance
515      * for <a href="BreakIterator.html#sentence">sentence breaks</a>
516      * for the {@linkplain Locale#getDefault() default locale}.
517      * @return A break iterator for sentence breaks
518      */
getSentenceInstance()519     public static BreakIterator getSentenceInstance()
520     {
521         return getSentenceInstance(Locale.getDefault());
522     }
523 
524     /**
525      * Returns a new {@code BreakIterator} instance
526      * for <a href="BreakIterator.html#sentence">sentence breaks</a>
527      * for the given locale.
528      * @param locale the desired locale
529      * @return A break iterator for sentence breaks
530      * @throws    NullPointerException if {@code locale} is null
531      */
getSentenceInstance(Locale locale)532     public static BreakIterator getSentenceInstance(Locale locale)
533     {
534         // Android-changed: Switched to ICU.
535         return new IcuIteratorWrapper(
536                 android.icu.text.BreakIterator.getSentenceInstance(locale));
537     }
538 
539     // Android-removed: Removed code related to BreakIteratorProvider support.
540     /*
541     private static BreakIterator getBreakInstance(Locale locale, int type) {
542         if (iterCache[type] != null) {
543             BreakIteratorCache cache = iterCache[type].get();
544             if (cache != null) {
545                 if (cache.getLocale().equals(locale)) {
546                     return cache.createBreakInstance();
547                 }
548             }
549         }
550 
551         BreakIterator result = createBreakInstance(locale, type);
552         BreakIteratorCache cache = new BreakIteratorCache(locale, result);
553         iterCache[type] = new SoftReference<>(cache);
554         return result;
555     }
556 
557     private static BreakIterator createBreakInstance(Locale locale,
558                                                      int type) {
559         LocaleProviderAdapter adapter = LocaleProviderAdapter.getAdapter(BreakIteratorProvider.class, locale);
560         BreakIterator iterator = createBreakInstance(adapter, locale, type);
561         if (iterator == null) {
562             iterator = createBreakInstance(LocaleProviderAdapter.forJRE(), locale, type);
563         }
564         return iterator;
565     }
566 
567     private static BreakIterator createBreakInstance(LocaleProviderAdapter adapter, Locale locale, int type) {
568         BreakIteratorProvider breakIteratorProvider = adapter.getBreakIteratorProvider();
569         return switch (type) {
570             case CHARACTER_INDEX -> breakIteratorProvider.getCharacterInstance(locale);
571             case WORD_INDEX      -> breakIteratorProvider.getWordInstance(locale);
572             case LINE_INDEX      -> breakIteratorProvider.getLineInstance(locale);
573             case SENTENCE_INDEX  -> breakIteratorProvider.getSentenceInstance(locale);
574             default              -> null;
575         };
576     }
577     */
578 
579     // Android-changed: Removed references to BreakIteratorProvider from JavaDoc.
580     /**
581      * Returns an array of all locales for which the
582      * {@code get*Instance} methods of this class can return
583      * localized instances.
584      * It must contain at least a {@code Locale}
585      * instance equal to {@link java.util.Locale#US Locale.US}.
586      *
587      * @return An array of locales for which localized
588      *         {@code BreakIterator} instances are available.
589      */
getAvailableLocales()590     public static synchronized Locale[] getAvailableLocales()
591     {
592         // Android-changed: Switched to ICU.
593         return android.icu.text.BreakIterator.getAvailableLocales();
594     }
595 }
596