1 /* 2 * Copyright (C) 2014 The Android Open Source Project 3 * Copyright (c) 1996, 2020, Oracle and/or its affiliates. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. Oracle designates this 9 * particular file as subject to the "Classpath" exception as provided 10 * by Oracle in the LICENSE file that accompanied this code. 11 * 12 * This code is distributed in the hope that it will be useful, but WITHOUT 13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 * version 2 for more details (a copy is included in the LICENSE file that 16 * accompanied this code). 17 * 18 * You should have received a copy of the GNU General Public License version 19 * 2 along with this work; if not, write to the Free Software Foundation, 20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 21 * 22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 23 * or visit www.oracle.com if you need additional information or have any 24 * questions. 25 */ 26 27 /* 28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 29 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved 30 * 31 * The original version of this source code and documentation 32 * is copyrighted and owned by Taligent, Inc., a wholly-owned 33 * subsidiary of IBM. These materials are provided under terms 34 * of a License Agreement between Taligent and Sun. This technology 35 * is protected by multiple US and International patents. 36 * 37 * This notice and attribution to Taligent may not be removed. 38 * Taligent is a registered trademark of Taligent, Inc. 39 * 40 */ 41 42 package java.text; 43 44 import java.util.Locale; 45 46 47 // Android-changed: Discourage modification on CharacterIterator after setText. http://b/80456574 48 /** 49 * The {@code BreakIterator} class implements methods for finding 50 * the location of boundaries in text. Instances of {@code BreakIterator} 51 * maintain a current position and scan over text 52 * returning the index of characters where boundaries occur. 53 * Internally, {@code BreakIterator} scans text using a 54 * {@code CharacterIterator}, and is thus able to scan text held 55 * by any object implementing that protocol. A {@code StringCharacterIterator} 56 * is used to scan {@code String} objects passed to {@code setText}. 57 * The <code>CharacterIterator</code> object must not be modified after having been 58 * passed to <code>setText</code>. If the text in the <code>CharacterIterator</code> object 59 * is changed, the caller must reset <code>BreakIterator</code> by calling 60 * <code>setText</code>. 61 * 62 * <p> 63 * You use the factory methods provided by this class to create 64 * instances of various types of break iterators. In particular, 65 * use {@code getWordInstance}, {@code getLineInstance}, 66 * {@code getSentenceInstance}, and {@code getCharacterInstance} 67 * to create {@code BreakIterator}s that perform 68 * word, line, sentence, and character boundary analysis respectively. 69 * A single {@code BreakIterator} can work only on one unit 70 * (word, line, sentence, and so on). You must use a different iterator 71 * for each unit boundary analysis you wish to perform. 72 * 73 * <p><a id="line"></a> 74 * Line boundary analysis determines where a text string can be 75 * broken when line-wrapping. The mechanism correctly handles 76 * punctuation and hyphenated words. Actual line breaking needs 77 * to also consider the available line width and is handled by 78 * higher-level software. 79 * 80 * <p><a id="sentence"></a> 81 * Sentence boundary analysis allows selection with correct interpretation 82 * of periods within numbers and abbreviations, and trailing punctuation 83 * marks such as quotation marks and parentheses. 84 * 85 * <p><a id="word"></a> 86 * Word boundary analysis is used by search and replace functions, as 87 * well as within text editing applications that allow the user to 88 * select words with a double click. Word selection provides correct 89 * interpretation of punctuation marks within and following 90 * words. Characters that are not part of a word, such as symbols 91 * or punctuation marks, have word-breaks on both sides. 92 * 93 * <p><a id="character"></a> 94 * Character boundary analysis allows users to interact with characters 95 * as they expect to, for example, when moving the cursor through a text 96 * string. Character boundary analysis provides correct navigation 97 * through character strings, regardless of how the character is stored. 98 * The boundaries returned may be those of supplementary characters, 99 * combining character sequences, or ligature clusters. 100 * For example, an accented character might be stored as a base character 101 * and a diacritical mark. What users consider to be a character can 102 * differ between languages. 103 * 104 * <p> 105 * The {@code BreakIterator} instances returned by the factory methods 106 * of this class are intended for use with natural languages only, not for 107 * programming language text. It is however possible to define subclasses 108 * that tokenize a programming language. 109 * 110 * <P> 111 * <strong>Examples</strong>:<P> 112 * Creating and using text boundaries: 113 * <blockquote> 114 * <pre> 115 * public static void main(String args[]) { 116 * if (args.length == 1) { 117 * String stringToExamine = args[0]; 118 * //print each word in order 119 * BreakIterator boundary = BreakIterator.getWordInstance(); 120 * boundary.setText(stringToExamine); 121 * printEachForward(boundary, stringToExamine); 122 * //print each sentence in reverse order 123 * boundary = BreakIterator.getSentenceInstance(Locale.US); 124 * boundary.setText(stringToExamine); 125 * printEachBackward(boundary, stringToExamine); 126 * printFirst(boundary, stringToExamine); 127 * printLast(boundary, stringToExamine); 128 * } 129 * } 130 * </pre> 131 * </blockquote> 132 * 133 * Print each element in order: 134 * <blockquote> 135 * <pre> 136 * public static void printEachForward(BreakIterator boundary, String source) { 137 * int start = boundary.first(); 138 * for (int end = boundary.next(); 139 * end != BreakIterator.DONE; 140 * start = end, end = boundary.next()) { 141 * System.out.println(source.substring(start,end)); 142 * } 143 * } 144 * </pre> 145 * </blockquote> 146 * 147 * Print each element in reverse order: 148 * <blockquote> 149 * <pre> 150 * public static void printEachBackward(BreakIterator boundary, String source) { 151 * int end = boundary.last(); 152 * for (int start = boundary.previous(); 153 * start != BreakIterator.DONE; 154 * end = start, start = boundary.previous()) { 155 * System.out.println(source.substring(start,end)); 156 * } 157 * } 158 * </pre> 159 * </blockquote> 160 * 161 * Print first element: 162 * <blockquote> 163 * <pre> 164 * public static void printFirst(BreakIterator boundary, String source) { 165 * int start = boundary.first(); 166 * int end = boundary.next(); 167 * System.out.println(source.substring(start,end)); 168 * } 169 * </pre> 170 * </blockquote> 171 * 172 * Print last element: 173 * <blockquote> 174 * <pre> 175 * public static void printLast(BreakIterator boundary, String source) { 176 * int end = boundary.last(); 177 * int start = boundary.previous(); 178 * System.out.println(source.substring(start,end)); 179 * } 180 * </pre> 181 * </blockquote> 182 * 183 * Print the element at a specified position: 184 * <blockquote> 185 * <pre> 186 * public static void printAt(BreakIterator boundary, int pos, String source) { 187 * int end = boundary.following(pos); 188 * int start = boundary.previous(); 189 * System.out.println(source.substring(start,end)); 190 * } 191 * </pre> 192 * </blockquote> 193 * 194 * Find the next word: 195 * <blockquote> 196 * <pre>{@code 197 * public static int nextWordStartAfter(int pos, String text) { 198 * BreakIterator wb = BreakIterator.getWordInstance(); 199 * wb.setText(text); 200 * int last = wb.following(pos); 201 * int current = wb.next(); 202 * while (current != BreakIterator.DONE) { 203 * for (int p = last; p < current; p++) { 204 * if (Character.isLetter(text.codePointAt(p))) 205 * return last; 206 * } 207 * last = current; 208 * current = wb.next(); 209 * } 210 * return BreakIterator.DONE; 211 * } 212 * }</pre> 213 * (The iterator returned by BreakIterator.getWordInstance() is unique in that 214 * the break positions it returns don't represent both the start and end of the 215 * thing being iterated over. That is, a sentence-break iterator returns breaks 216 * that each represent the end of one sentence and the beginning of the next. 217 * With the word-break iterator, the characters between two boundaries might be a 218 * word, or they might be the punctuation or whitespace between two words. The 219 * above code uses a simple heuristic to determine which boundary is the beginning 220 * of a word: If the characters between this boundary and the next boundary 221 * include at least one letter (this can be an alphabetical letter, a CJK ideograph, 222 * a Hangul syllable, a Kana character, etc.), then the text between this boundary 223 * and the next is a word; otherwise, it's the material between words.) 224 * </blockquote> 225 * 226 * @since 1.1 227 * @see CharacterIterator 228 * 229 */ 230 231 public abstract class BreakIterator implements Cloneable 232 { 233 /** 234 * Constructor. BreakIterator is stateless and has no default behavior. 235 */ BreakIterator()236 protected BreakIterator() 237 { 238 } 239 240 /** 241 * Create a copy of this iterator 242 * @return A copy of this 243 */ 244 @Override clone()245 public Object clone() 246 { 247 try { 248 return super.clone(); 249 } 250 catch (CloneNotSupportedException e) { 251 throw new InternalError(e); 252 } 253 } 254 255 /** 256 * DONE is returned by previous(), next(), next(int), preceding(int) 257 * and following(int) when either the first or last text boundary has been 258 * reached. 259 */ 260 public static final int DONE = -1; 261 262 /** 263 * Returns the first boundary. The iterator's current position is set 264 * to the first text boundary. 265 * @return The character index of the first text boundary. 266 */ first()267 public abstract int first(); 268 269 /** 270 * Returns the last boundary. The iterator's current position is set 271 * to the last text boundary. 272 * @return The character index of the last text boundary. 273 */ last()274 public abstract int last(); 275 276 /** 277 * Returns the nth boundary from the current boundary. If either 278 * the first or last text boundary has been reached, it returns 279 * {@code BreakIterator.DONE} and the current position is set to either 280 * the first or last text boundary depending on which one is reached. Otherwise, 281 * the iterator's current position is set to the new boundary. 282 * For example, if the iterator's current position is the mth text boundary 283 * and three more boundaries exist from the current boundary to the last text 284 * boundary, the next(2) call will return m + 2. The new text position is set 285 * to the (m + 2)th text boundary. A next(4) call would return 286 * {@code BreakIterator.DONE} and the last text boundary would become the 287 * new text position. 288 * @param n which boundary to return. A value of 0 289 * does nothing. Negative values move to previous boundaries 290 * and positive values move to later boundaries. 291 * @return The character index of the nth boundary from the current position 292 * or {@code BreakIterator.DONE} if either first or last text boundary 293 * has been reached. 294 */ next(int n)295 public abstract int next(int n); 296 297 /** 298 * Returns the boundary following the current boundary. If the current boundary 299 * is the last text boundary, it returns {@code BreakIterator.DONE} and 300 * the iterator's current position is unchanged. Otherwise, the iterator's 301 * current position is set to the boundary following the current boundary. 302 * @return The character index of the next text boundary or 303 * {@code BreakIterator.DONE} if the current boundary is the last text 304 * boundary. 305 * Equivalent to next(1). 306 * @see #next(int) 307 */ next()308 public abstract int next(); 309 310 /** 311 * Returns the boundary preceding the current boundary. If the current boundary 312 * is the first text boundary, it returns {@code BreakIterator.DONE} and 313 * the iterator's current position is unchanged. Otherwise, the iterator's 314 * current position is set to the boundary preceding the current boundary. 315 * @return The character index of the previous text boundary or 316 * {@code BreakIterator.DONE} if the current boundary is the first text 317 * boundary. 318 */ previous()319 public abstract int previous(); 320 321 /** 322 * Returns the first boundary following the specified character offset. If the 323 * specified offset is equal to the last text boundary, it returns 324 * {@code BreakIterator.DONE} and the iterator's current position is unchanged. 325 * Otherwise, the iterator's current position is set to the returned boundary. 326 * The value returned is always greater than the offset or the value 327 * {@code BreakIterator.DONE}. 328 * @param offset the character offset to begin scanning. 329 * @return The first boundary after the specified offset or 330 * {@code BreakIterator.DONE} if the last text boundary is passed in 331 * as the offset. 332 * @throws IllegalArgumentException if the specified offset is less than 333 * the first text boundary or greater than the last text boundary. 334 */ following(int offset)335 public abstract int following(int offset); 336 337 /** 338 * Returns the last boundary preceding the specified character offset. If the 339 * specified offset is equal to the first text boundary, it returns 340 * {@code BreakIterator.DONE} and the iterator's current position is unchanged. 341 * Otherwise, the iterator's current position is set to the returned boundary. 342 * The value returned is always less than the offset or the value 343 * {@code BreakIterator.DONE}. 344 * @param offset the character offset to begin scanning. 345 * @return The last boundary before the specified offset or 346 * {@code BreakIterator.DONE} if the first text boundary is passed in 347 * as the offset. 348 * @throws IllegalArgumentException if the specified offset is less than 349 * the first text boundary or greater than the last text boundary. 350 * @since 1.2 351 */ preceding(int offset)352 public int preceding(int offset) { 353 // NOTE: This implementation is here solely because we can't add new 354 // abstract methods to an existing class. There is almost ALWAYS a 355 // better, faster way to do this. 356 int pos = following(offset); 357 while (pos >= offset && pos != DONE) { 358 pos = previous(); 359 } 360 return pos; 361 } 362 363 /** 364 * Returns true if the specified character offset is a text boundary. 365 * @param offset the character offset to check. 366 * @return {@code true} if "offset" is a boundary position, 367 * {@code false} otherwise. 368 * @throws IllegalArgumentException if the specified offset is less than 369 * the first text boundary or greater than the last text boundary. 370 * @since 1.2 371 */ isBoundary(int offset)372 public boolean isBoundary(int offset) { 373 // NOTE: This implementation probably is wrong for most situations 374 // because it fails to take into account the possibility that a 375 // CharacterIterator passed to setText() may not have a begin offset 376 // of 0. But since the abstract BreakIterator doesn't have that 377 // knowledge, it assumes the begin offset is 0. If you subclass 378 // BreakIterator, copy the SimpleTextBoundary implementation of this 379 // function into your subclass. [This should have been abstract at 380 // this level, but it's too late to fix that now.] 381 if (offset == 0) { 382 return true; 383 } 384 int boundary = following(offset - 1); 385 if (boundary == DONE) { 386 throw new IllegalArgumentException(); 387 } 388 return boundary == offset; 389 } 390 391 /** 392 * Returns character index of the text boundary that was most 393 * recently returned by next(), next(int), previous(), first(), last(), 394 * following(int) or preceding(int). If any of these methods returns 395 * {@code BreakIterator.DONE} because either first or last text boundary 396 * has been reached, it returns the first or last text boundary depending on 397 * which one is reached. 398 * @return The text boundary returned from the above methods, first or last 399 * text boundary. 400 * @see #next() 401 * @see #next(int) 402 * @see #previous() 403 * @see #first() 404 * @see #last() 405 * @see #following(int) 406 * @see #preceding(int) 407 */ current()408 public abstract int current(); 409 410 /** 411 * Get the text being scanned 412 * @return the text being scanned 413 */ getText()414 public abstract CharacterIterator getText(); 415 416 /** 417 * Set a new text string to be scanned. The current scan 418 * position is reset to first(). 419 * @param newText new text to scan. 420 */ setText(String newText)421 public void setText(String newText) 422 { 423 setText(new StringCharacterIterator(newText)); 424 } 425 426 /** 427 * Set a new text for scanning. The current scan 428 * position is reset to first(). 429 * @param newText new text to scan. 430 */ setText(CharacterIterator newText)431 public abstract void setText(CharacterIterator newText); 432 433 // Android-removed: Removed code related to BreakIteratorProvider support. 434 435 /** 436 * Returns a new {@code BreakIterator} instance 437 * for <a href="BreakIterator.html#word">word breaks</a> 438 * for the {@linkplain Locale#getDefault() default locale}. 439 * @return A break iterator for word breaks 440 */ getWordInstance()441 public static BreakIterator getWordInstance() 442 { 443 return getWordInstance(Locale.getDefault()); 444 } 445 446 /** 447 * Returns a new {@code BreakIterator} instance 448 * for <a href="BreakIterator.html#word">word breaks</a> 449 * for the given locale. 450 * @param locale the desired locale 451 * @return A break iterator for word breaks 452 * @throws NullPointerException if {@code locale} is null 453 */ getWordInstance(Locale locale)454 public static BreakIterator getWordInstance(Locale locale) 455 { 456 // Android-changed: Switched to ICU. 457 return new IcuIteratorWrapper( 458 android.icu.text.BreakIterator.getWordInstance(locale)); 459 } 460 461 /** 462 * Returns a new {@code BreakIterator} instance 463 * for <a href="BreakIterator.html#line">line breaks</a> 464 * for the {@linkplain Locale#getDefault() default locale}. 465 * @return A break iterator for line breaks 466 */ getLineInstance()467 public static BreakIterator getLineInstance() 468 { 469 return getLineInstance(Locale.getDefault()); 470 } 471 472 /** 473 * Returns a new {@code BreakIterator} instance 474 * for <a href="BreakIterator.html#line">line breaks</a> 475 * for the given locale. 476 * @param locale the desired locale 477 * @return A break iterator for line breaks 478 * @throws NullPointerException if {@code locale} is null 479 */ getLineInstance(Locale locale)480 public static BreakIterator getLineInstance(Locale locale) 481 { 482 // Android-changed: Switched to ICU. 483 return new IcuIteratorWrapper( 484 android.icu.text.BreakIterator.getLineInstance(locale)); 485 } 486 487 /** 488 * Returns a new {@code BreakIterator} instance 489 * for <a href="BreakIterator.html#character">character breaks</a> 490 * for the {@linkplain Locale#getDefault() default locale}. 491 * @return A break iterator for character breaks 492 */ getCharacterInstance()493 public static BreakIterator getCharacterInstance() 494 { 495 return getCharacterInstance(Locale.getDefault()); 496 } 497 498 /** 499 * Returns a new {@code BreakIterator} instance 500 * for <a href="BreakIterator.html#character">character breaks</a> 501 * for the given locale. 502 * @param locale the desired locale 503 * @return A break iterator for character breaks 504 * @throws NullPointerException if {@code locale} is null 505 */ getCharacterInstance(Locale locale)506 public static BreakIterator getCharacterInstance(Locale locale) 507 { 508 // Android-changed: Switched to ICU. 509 return new IcuIteratorWrapper( 510 android.icu.text.BreakIterator.getCharacterInstance(locale)); 511 } 512 513 /** 514 * Returns a new {@code BreakIterator} instance 515 * for <a href="BreakIterator.html#sentence">sentence breaks</a> 516 * for the {@linkplain Locale#getDefault() default locale}. 517 * @return A break iterator for sentence breaks 518 */ getSentenceInstance()519 public static BreakIterator getSentenceInstance() 520 { 521 return getSentenceInstance(Locale.getDefault()); 522 } 523 524 /** 525 * Returns a new {@code BreakIterator} instance 526 * for <a href="BreakIterator.html#sentence">sentence breaks</a> 527 * for the given locale. 528 * @param locale the desired locale 529 * @return A break iterator for sentence breaks 530 * @throws NullPointerException if {@code locale} is null 531 */ getSentenceInstance(Locale locale)532 public static BreakIterator getSentenceInstance(Locale locale) 533 { 534 // Android-changed: Switched to ICU. 535 return new IcuIteratorWrapper( 536 android.icu.text.BreakIterator.getSentenceInstance(locale)); 537 } 538 539 // Android-removed: Removed code related to BreakIteratorProvider support. 540 /* 541 private static BreakIterator getBreakInstance(Locale locale, int type) { 542 if (iterCache[type] != null) { 543 BreakIteratorCache cache = iterCache[type].get(); 544 if (cache != null) { 545 if (cache.getLocale().equals(locale)) { 546 return cache.createBreakInstance(); 547 } 548 } 549 } 550 551 BreakIterator result = createBreakInstance(locale, type); 552 BreakIteratorCache cache = new BreakIteratorCache(locale, result); 553 iterCache[type] = new SoftReference<>(cache); 554 return result; 555 } 556 557 private static BreakIterator createBreakInstance(Locale locale, 558 int type) { 559 LocaleProviderAdapter adapter = LocaleProviderAdapter.getAdapter(BreakIteratorProvider.class, locale); 560 BreakIterator iterator = createBreakInstance(adapter, locale, type); 561 if (iterator == null) { 562 iterator = createBreakInstance(LocaleProviderAdapter.forJRE(), locale, type); 563 } 564 return iterator; 565 } 566 567 private static BreakIterator createBreakInstance(LocaleProviderAdapter adapter, Locale locale, int type) { 568 BreakIteratorProvider breakIteratorProvider = adapter.getBreakIteratorProvider(); 569 return switch (type) { 570 case CHARACTER_INDEX -> breakIteratorProvider.getCharacterInstance(locale); 571 case WORD_INDEX -> breakIteratorProvider.getWordInstance(locale); 572 case LINE_INDEX -> breakIteratorProvider.getLineInstance(locale); 573 case SENTENCE_INDEX -> breakIteratorProvider.getSentenceInstance(locale); 574 default -> null; 575 }; 576 } 577 */ 578 579 // Android-changed: Removed references to BreakIteratorProvider from JavaDoc. 580 /** 581 * Returns an array of all locales for which the 582 * {@code get*Instance} methods of this class can return 583 * localized instances. 584 * It must contain at least a {@code Locale} 585 * instance equal to {@link java.util.Locale#US Locale.US}. 586 * 587 * @return An array of locales for which localized 588 * {@code BreakIterator} instances are available. 589 */ getAvailableLocales()590 public static synchronized Locale[] getAvailableLocales() 591 { 592 // Android-changed: Switched to ICU. 593 return android.icu.text.BreakIterator.getAvailableLocales(); 594 } 595 } 596