1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2015, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 8 package com.ibm.icu.text; 9 10 import java.lang.ref.SoftReference; 11 import java.text.CharacterIterator; 12 import java.text.StringCharacterIterator; 13 import java.util.Locale; 14 import java.util.MissingResourceException; 15 16 import com.ibm.icu.impl.ICUDebug; 17 import com.ibm.icu.util.ICUCloneNotSupportedException; 18 import com.ibm.icu.util.ULocale; 19 20 /** 21 * {@icuenhanced java.text.BreakIterator}.{@icu _usage_} 22 * 23 * <p>A class that locates boundaries in text. This class defines a protocol for 24 * objects that break up a piece of natural-language text according to a set 25 * of criteria. Instances or subclasses of BreakIterator can be provided, for 26 * example, to break a piece of text into words, sentences, or logical characters 27 * according to the conventions of some language or group of languages. 28 * 29 * We provide five built-in types of BreakIterator: 30 * <ul><li>getTitleInstance() returns a BreakIterator that locates boundaries 31 * between title breaks. 32 * <li>getSentenceInstance() returns a BreakIterator that locates boundaries 33 * between sentences. This is useful for triple-click selection, for example. 34 * <li>getWordInstance() returns a BreakIterator that locates boundaries between 35 * words. This is useful for double-click selection or "find whole words" searches. 36 * This type of BreakIterator makes sure there is a boundary position at the 37 * beginning and end of each legal word. (Numbers count as words, too.) Whitespace 38 * and punctuation are kept separate from real words. 39 * <li>getLineInstance() returns a BreakIterator that locates positions where it is 40 * legal for a text editor to wrap lines. This is similar to word breaking, but 41 * not the same: punctuation and whitespace are generally kept with words (you don't 42 * want a line to start with whitespace, for example), and some special characters 43 * can force a position to be considered a line-break position or prevent a position 44 * from being a line-break position. 45 * <li>getCharacterInstance() returns a BreakIterator that locates boundaries between 46 * logical characters. Because of the structure of the Unicode encoding, a logical 47 * character may be stored internally as more than one Unicode code point. (A with an 48 * umlaut may be stored as an a followed by a separate combining umlaut character, 49 * for example, but the user still thinks of it as one character.) This iterator allows 50 * various processes (especially text editors) to treat as characters the units of text 51 * that a user would think of as characters, rather than the units of text that the 52 * computer sees as "characters".</ul> 53 * The text boundary positions are found according to the rules 54 * described in Unicode Standard Annex #29, Text Boundaries, and 55 * Unicode Standard Annex #14, Line Breaking Properties. These 56 * are available at http://www.unicode.org/reports/tr14/ and 57 * http://www.unicode.org/reports/tr29/. 58 * <p> 59 * BreakIterator's interface follows an "iterator" model (hence the name), meaning it 60 * has a concept of a "current position" and methods like first(), last(), next(), 61 * and previous() that update the current position. All BreakIterators uphold the 62 * following invariants: 63 * <ul><li>The beginning and end of the text are always treated as boundary positions. 64 * <li>The current position of the iterator is always a boundary position (random- 65 * access methods move the iterator to the nearest boundary position before or 66 * after the specified position, not _to_ the specified position). 67 * <li>DONE is used as a flag to indicate when iteration has stopped. DONE is only 68 * returned when the current position is the end of the text and the user calls next(), 69 * or when the current position is the beginning of the text and the user calls 70 * previous(). 71 * <li>Break positions are numbered by the positions of the characters that follow 72 * them. Thus, under normal circumstances, the position before the first character 73 * is 0, the position after the first character is 1, and the position after the 74 * last character is 1 plus the length of the string. 75 * <li>The client can change the position of an iterator, or the text it analyzes, 76 * at will, but cannot change the behavior. If the user wants different behavior, he 77 * must instantiate a new iterator.</ul> 78 * 79 * BreakIterator accesses the text it analyzes through a CharacterIterator, which makes 80 * it possible to use BreakIterator to analyze text in any text-storage vehicle that 81 * provides a CharacterIterator interface. 82 * 83 * <b>Note:</b> Some types of BreakIterator can take a long time to create, and 84 * instances of BreakIterator are not currently cached by the system. For 85 * optimal performance, keep instances of BreakIterator around as long as makes 86 * sense. For example, when word-wrapping a document, don't create and destroy a 87 * new BreakIterator for each line. Create one break iterator for the whole document 88 * (or whatever stretch of text you're wrapping) and use it to do the whole job of 89 * wrapping the text. 90 * 91 * <P> 92 * <strong>Examples</strong>:<P> 93 * Creating and using text boundaries 94 * <blockquote> 95 * <pre> 96 * public static void main(String args[]) { 97 * if (args.length == 1) { 98 * String stringToExamine = args[0]; 99 * //print each word in order 100 * BreakIterator boundary = BreakIterator.getWordInstance(); 101 * boundary.setText(stringToExamine); 102 * printEachForward(boundary, stringToExamine); 103 * //print each sentence in reverse order 104 * boundary = BreakIterator.getSentenceInstance(Locale.US); 105 * boundary.setText(stringToExamine); 106 * printEachBackward(boundary, stringToExamine); 107 * printFirst(boundary, stringToExamine); 108 * printLast(boundary, stringToExamine); 109 * } 110 * } 111 * </pre> 112 * </blockquote> 113 * 114 * Print each element in order 115 * <blockquote> 116 * <pre> 117 * public static void printEachForward(BreakIterator boundary, String source) { 118 * int start = boundary.first(); 119 * for (int end = boundary.next(); 120 * end != BreakIterator.DONE; 121 * start = end, end = boundary.next()) { 122 * System.out.println(source.substring(start,end)); 123 * } 124 * } 125 * </pre> 126 * </blockquote> 127 * 128 * Print each element in reverse order 129 * <blockquote> 130 * <pre> 131 * public static void printEachBackward(BreakIterator boundary, String source) { 132 * int end = boundary.last(); 133 * for (int start = boundary.previous(); 134 * start != BreakIterator.DONE; 135 * end = start, start = boundary.previous()) { 136 * System.out.println(source.substring(start,end)); 137 * } 138 * } 139 * </pre> 140 * </blockquote> 141 * 142 * Print first element 143 * <blockquote> 144 * <pre> 145 * public static void printFirst(BreakIterator boundary, String source) { 146 * int start = boundary.first(); 147 * int end = boundary.next(); 148 * System.out.println(source.substring(start,end)); 149 * } 150 * </pre> 151 * </blockquote> 152 * 153 * Print last element 154 * <blockquote> 155 * <pre> 156 * public static void printLast(BreakIterator boundary, String source) { 157 * int end = boundary.last(); 158 * int start = boundary.previous(); 159 * System.out.println(source.substring(start,end)); 160 * } 161 * </pre> 162 * </blockquote> 163 * 164 * Print the element at a specified position 165 * <blockquote> 166 * <pre> 167 * public static void printAt(BreakIterator boundary, int pos, String source) { 168 * int end = boundary.following(pos); 169 * int start = boundary.previous(); 170 * System.out.println(source.substring(start,end)); 171 * } 172 * </pre> 173 * </blockquote> 174 * 175 * Find the next word 176 * <blockquote> 177 * <pre> 178 * public static int nextWordStartAfter(int pos, String text) { 179 * BreakIterator wb = BreakIterator.getWordInstance(); 180 * wb.setText(text); 181 * int last = wb.following(pos); 182 * int current = wb.next(); 183 * while (current != BreakIterator.DONE) { 184 * for (int p = last; p < current; p++) { 185 * if (Character.isLetter(text.charAt(p))) 186 * return last; 187 * } 188 * last = current; 189 * current = wb.next(); 190 * } 191 * return BreakIterator.DONE; 192 * } 193 * </pre> 194 * (The iterator returned by BreakIterator.getWordInstance() is unique in that 195 * the break positions it returns don't represent both the start and end of the 196 * thing being iterated over. That is, a sentence-break iterator returns breaks 197 * that each represent the end of one sentence and the beginning of the next. 198 * With the word-break iterator, the characters between two boundaries might be a 199 * word, or they might be the punctuation or whitespace between two words. The 200 * above code uses a simple heuristic to determine which boundary is the beginning 201 * of a word: If the characters between this boundary and the next boundary 202 * include at least one letter (this can be an alphabetical letter, a CJK ideograph, 203 * a Hangul syllable, a Kana character, etc.), then the text between this boundary 204 * and the next is a word; otherwise, it's the material between words.) 205 * </blockquote> 206 * 207 * @see CharacterIterator 208 * @stable ICU 2.0 209 * 210 */ 211 212 public abstract class BreakIterator implements Cloneable 213 { 214 215 private static final boolean DEBUG = ICUDebug.enabled("breakiterator"); 216 217 /** 218 * Default constructor. There is no state that is carried by this abstract 219 * base class. 220 * @stable ICU 2.0 221 */ BreakIterator()222 protected BreakIterator() 223 { 224 } 225 226 /** 227 * Clone method. Creates another BreakIterator with the same behavior and 228 * current state as this one. 229 * @return The clone. 230 * @stable ICU 2.0 231 */ clone()232 public Object clone() 233 { 234 try { 235 return super.clone(); 236 } 237 catch (CloneNotSupportedException e) { 238 ///CLOVER:OFF 239 throw new ICUCloneNotSupportedException(e); 240 ///CLOVER:ON 241 } 242 } 243 244 /** 245 * DONE is returned by previous() and next() after all valid 246 * boundaries have been returned. 247 * @stable ICU 2.0 248 */ 249 public static final int DONE = -1; 250 251 /** 252 * Set the iterator to the first boundary position. This is always the beginning 253 * index of the text this iterator iterates over. For example, if 254 * the iterator iterates over a whole string, this function will 255 * always return 0. 256 * @return The character offset of the beginning of the stretch of text 257 * being broken. 258 * @stable ICU 2.0 259 */ first()260 public abstract int first(); 261 262 /** 263 * Set the iterator to the last boundary position. This is always the "past-the-end" 264 * index of the text this iterator iterates over. For example, if the 265 * iterator iterates over a whole string (call it "text"), this function 266 * will always return text.length(). 267 * @return The character offset of the end of the stretch of text 268 * being broken. 269 * @stable ICU 2.0 270 */ last()271 public abstract int last(); 272 273 /** 274 * Move the iterator by the specified number of steps in the text. 275 * A positive number moves the iterator forward; a negative number 276 * moves the iterator backwards. If this causes the iterator 277 * to move off either end of the text, this function returns DONE; 278 * otherwise, this function returns the position of the appropriate 279 * boundary. Calling this function is equivalent to calling next() or 280 * previous() n times. 281 * @param n The number of boundaries to advance over (if positive, moves 282 * forward; if negative, moves backwards). 283 * @return The position of the boundary n boundaries from the current 284 * iteration position, or DONE if moving n boundaries causes the iterator 285 * to advance off either end of the text. 286 * @stable ICU 2.0 287 */ next(int n)288 public abstract int next(int n); 289 290 /** 291 * Advances the iterator forward one boundary. The current iteration 292 * position is updated to point to the next boundary position after the 293 * current position, and this is also the value that is returned. If 294 * the current position is equal to the value returned by last(), or to 295 * DONE, this function returns DONE and sets the current position to 296 * DONE. 297 * @return The position of the first boundary position following the 298 * iteration position. 299 * @stable ICU 2.0 300 */ next()301 public abstract int next(); 302 303 /** 304 * Move the iterator backward one boundary. The current iteration 305 * position is updated to point to the last boundary position before 306 * the current position, and this is also the value that is returned. If 307 * the current position is equal to the value returned by first(), or to 308 * DONE, this function returns DONE and sets the current position to 309 * DONE. 310 * @return The position of the last boundary position preceding the 311 * iteration position. 312 * @stable ICU 2.0 313 */ previous()314 public abstract int previous(); 315 316 /** 317 * Sets the iterator's current iteration position to be the first 318 * boundary position following the specified position. (Whether the 319 * specified position is itself a boundary position or not doesn't 320 * matter-- this function always moves the iteration position to the 321 * first boundary after the specified position.) If the specified 322 * position is the past-the-end position, returns DONE. 323 * @param offset The character position to start searching from. 324 * @return The position of the first boundary position following 325 * "offset" (whether or not "offset" itself is a boundary position), 326 * or DONE if "offset" is the past-the-end offset. 327 * @stable ICU 2.0 328 */ following(int offset)329 public abstract int following(int offset); 330 331 /** 332 * Sets the iterator's current iteration position to be the last 333 * boundary position preceding the specified position. (Whether the 334 * specified position is itself a boundary position or not doesn't 335 * matter-- this function always moves the iteration position to the 336 * last boundary before the specified position.) If the specified 337 * position is the starting position, returns DONE. 338 * @param offset The character position to start searching from. 339 * @return The position of the last boundary position preceding 340 * "offset" (whether of not "offset" itself is a boundary position), 341 * or DONE if "offset" is the starting offset of the iterator. 342 * @stable ICU 2.0 343 */ preceding(int offset)344 public int preceding(int offset) { 345 // NOTE: This implementation is here solely because we can't add new 346 // abstract methods to an existing class. There is almost ALWAYS a 347 // better, faster way to do this. 348 int pos = following(offset); 349 while (pos >= offset && pos != DONE) 350 pos = previous(); 351 return pos; 352 } 353 354 /** 355 * Return true if the specified position is a boundary position. If the 356 * function returns true, the current iteration position is set to the 357 * specified position; if the function returns false, the current 358 * iteration position is set as though following() had been called. 359 * @param offset the offset to check. 360 * @return True if "offset" is a boundary position. 361 * @stable ICU 2.0 362 */ isBoundary(int offset)363 public boolean isBoundary(int offset) { 364 // Again, this is the default implementation, which is provided solely because 365 // we couldn't add a new abstract method to an existing class. The real 366 // implementations will usually need to do a little more work. 367 if (offset == 0) { 368 return true; 369 } 370 else 371 return following(offset - 1) == offset; 372 } 373 374 /** 375 * Return the iterator's current position. 376 * @return The iterator's current position. 377 * @stable ICU 2.0 378 */ current()379 public abstract int current(); 380 381 382 /** 383 * Tag value for "words" that do not fit into any of other categories. 384 * Includes spaces and most punctuation. 385 * @stable ICU 53 386 */ 387 public static final int WORD_NONE = 0; 388 389 /** 390 * Upper bound for tags for uncategorized words. 391 * @stable ICU 53 392 */ 393 public static final int WORD_NONE_LIMIT = 100; 394 395 /** 396 * Tag value for words that appear to be numbers, lower limit. 397 * @stable ICU 53 398 */ 399 public static final int WORD_NUMBER = 100; 400 401 /** 402 * Tag value for words that appear to be numbers, upper limit. 403 * @stable ICU 53 404 */ 405 public static final int WORD_NUMBER_LIMIT = 200; 406 407 /** 408 * Tag value for words that contain letters, excluding 409 * hiragana, katakana or ideographic characters, lower limit. 410 * @stable ICU 53 411 */ 412 public static final int WORD_LETTER = 200; 413 414 /** 415 * Tag value for words containing letters, upper limit 416 * @stable ICU 53 417 */ 418 public static final int WORD_LETTER_LIMIT = 300; 419 420 /** 421 * Tag value for words containing kana characters, lower limit 422 * @stable ICU 53 423 */ 424 public static final int WORD_KANA = 300; 425 426 /** 427 * Tag value for words containing kana characters, upper limit 428 * @stable ICU 53 429 */ 430 public static final int WORD_KANA_LIMIT = 400; 431 432 /** 433 * Tag value for words containing ideographic characters, lower limit 434 * @stable ICU 53 435 */ 436 public static final int WORD_IDEO = 400; 437 438 /** 439 * Tag value for words containing ideographic characters, upper limit 440 * @stable ICU 53 441 */ 442 public static final int WORD_IDEO_LIMIT = 500; 443 444 /** 445 * For RuleBasedBreakIterators, return the status tag from the 446 * break rule that determined the most recently 447 * returned break position. 448 * <p> 449 * For break iterator types that do not support a rule status, 450 * a default value of 0 is returned. 451 * <p> 452 * @return The status from the break rule that determined the most recently 453 * returned break position. 454 * 455 * @stable ICU 52 456 */ 457 getRuleStatus()458 public int getRuleStatus() { 459 return 0; 460 } 461 462 /** 463 * For RuleBasedBreakIterators, get the status (tag) values from the break rule(s) 464 * that determined the most recently returned break position. 465 * <p> 466 * For break iterator types that do not support rule status, 467 * no values are returned. 468 * <p> 469 * If the size of the output array is insufficient to hold the data, 470 * the output will be truncated to the available length. No exception 471 * will be thrown. 472 * 473 * @param fillInArray an array to be filled in with the status values. 474 * @return The number of rule status values from rules that determined 475 * the most recent boundary returned by the break iterator. 476 * In the event that the array is too small, the return value 477 * is the total number of status values that were available, 478 * not the reduced number that were actually returned. 479 * @stable ICU 52 480 */ getRuleStatusVec(int[] fillInArray)481 public int getRuleStatusVec(int[] fillInArray) { 482 if (fillInArray != null && fillInArray.length > 0) { 483 fillInArray[0] = 0; 484 } 485 return 1; 486 } 487 488 /** 489 * Returns a CharacterIterator over the text being analyzed. 490 * For at least some subclasses of BreakIterator, this is a reference 491 * to the <b>actual iterator being used</b> by the BreakIterator, 492 * and therefore, this function's return value should be treated as 493 * <tt>const</tt>. No guarantees are made about the current position 494 * of this iterator when it is returned. If you need to move that 495 * position to examine the text, clone this function's return value first. 496 * @return A CharacterIterator over the text being analyzed. 497 * @stable ICU 2.0 498 */ getText()499 public abstract CharacterIterator getText(); 500 501 /** 502 * Sets the iterator to analyze a new piece of text. The new 503 * piece of text is passed in as a String, and the current 504 * iteration position is reset to the beginning of the string. 505 * (The old text is dropped.) 506 * @param newText A String containing the text to analyze with 507 * this BreakIterator. 508 * @stable ICU 2.0 509 */ setText(String newText)510 public void setText(String newText) 511 { 512 setText(new StringCharacterIterator(newText)); 513 } 514 515 /** 516 * Sets the iterator to analyze a new piece of text. The 517 * BreakIterator is passed a CharacterIterator through which 518 * it will access the text itself. The current iteration 519 * position is reset to the CharacterIterator's start index. 520 * (The old iterator is dropped.) 521 * @param newText A CharacterIterator referring to the text 522 * to analyze with this BreakIterator (the iterator's current 523 * position is ignored, but its other state is significant). 524 * @stable ICU 2.0 525 */ setText(CharacterIterator newText)526 public abstract void setText(CharacterIterator newText); 527 528 /** 529 * {@icu} 530 * @stable ICU 2.4 531 */ 532 public static final int KIND_CHARACTER = 0; 533 /** 534 * {@icu} 535 * @stable ICU 2.4 536 */ 537 public static final int KIND_WORD = 1; 538 /** 539 * {@icu} 540 * @stable ICU 2.4 541 */ 542 public static final int KIND_LINE = 2; 543 /** 544 * {@icu} 545 * @stable ICU 2.4 546 */ 547 public static final int KIND_SENTENCE = 3; 548 /** 549 * {@icu} 550 * @stable ICU 2.4 551 */ 552 public static final int KIND_TITLE = 4; 553 554 /** 555 * @since ICU 2.8 556 */ 557 private static final int KIND_COUNT = 5; 558 559 private static final SoftReference<?>[] iterCache = new SoftReference<?>[5]; 560 561 /** 562 * Returns a new instance of BreakIterator that locates word boundaries. 563 * This function assumes that the text being analyzed is in the default 564 * locale's language. 565 * @return An instance of BreakIterator that locates word boundaries. 566 * @stable ICU 2.0 567 */ getWordInstance()568 public static BreakIterator getWordInstance() 569 { 570 return getWordInstance(ULocale.getDefault()); 571 } 572 573 /** 574 * Returns a new instance of BreakIterator that locates word boundaries. 575 * @param where A locale specifying the language of the text to be 576 * analyzed. 577 * @return An instance of BreakIterator that locates word boundaries. 578 * @throws NullPointerException if <code>where</code> is null. 579 * @stable ICU 2.0 580 */ getWordInstance(Locale where)581 public static BreakIterator getWordInstance(Locale where) 582 { 583 return getBreakInstance(ULocale.forLocale(where), KIND_WORD); 584 } 585 586 /** 587 * {@icu} Returns a new instance of BreakIterator that locates word boundaries. 588 * @param where A locale specifying the language of the text to be 589 * analyzed. 590 * @return An instance of BreakIterator that locates word boundaries. 591 * @throws NullPointerException if <code>where</code> is null. 592 * @stable ICU 3.2 593 */ getWordInstance(ULocale where)594 public static BreakIterator getWordInstance(ULocale where) 595 { 596 return getBreakInstance(where, KIND_WORD); 597 } 598 599 /** 600 * Returns a new instance of BreakIterator that locates legal line- 601 * wrapping positions. This function assumes the text being broken 602 * is in the default locale's language. 603 * @return A new instance of BreakIterator that locates legal 604 * line-wrapping positions. 605 * @stable ICU 2.0 606 */ getLineInstance()607 public static BreakIterator getLineInstance() 608 { 609 return getLineInstance(ULocale.getDefault()); 610 } 611 612 /** 613 * Returns a new instance of BreakIterator that locates legal line- 614 * wrapping positions. 615 * @param where A Locale specifying the language of the text being broken. 616 * @return A new instance of BreakIterator that locates legal 617 * line-wrapping positions. 618 * @throws NullPointerException if <code>where</code> is null. 619 * @stable ICU 2.0 620 */ getLineInstance(Locale where)621 public static BreakIterator getLineInstance(Locale where) 622 { 623 return getBreakInstance(ULocale.forLocale(where), KIND_LINE); 624 } 625 626 /** 627 * {@icu} Returns a new instance of BreakIterator that locates legal line- 628 * wrapping positions. 629 * @param where A Locale specifying the language of the text being broken. 630 * @return A new instance of BreakIterator that locates legal 631 * line-wrapping positions. 632 * @throws NullPointerException if <code>where</code> is null. 633 * @stable ICU 3.2 634 */ getLineInstance(ULocale where)635 public static BreakIterator getLineInstance(ULocale where) 636 { 637 return getBreakInstance(where, KIND_LINE); 638 } 639 640 /** 641 * Returns a new instance of BreakIterator that locates logical-character 642 * boundaries. This function assumes that the text being analyzed is 643 * in the default locale's language. 644 * @return A new instance of BreakIterator that locates logical-character 645 * boundaries. 646 * @stable ICU 2.0 647 */ getCharacterInstance()648 public static BreakIterator getCharacterInstance() 649 { 650 return getCharacterInstance(ULocale.getDefault()); 651 } 652 653 /** 654 * Returns a new instance of BreakIterator that locates logical-character 655 * boundaries. 656 * @param where A Locale specifying the language of the text being analyzed. 657 * @return A new instance of BreakIterator that locates logical-character 658 * boundaries. 659 * @throws NullPointerException if <code>where</code> is null. 660 * @stable ICU 2.0 661 */ getCharacterInstance(Locale where)662 public static BreakIterator getCharacterInstance(Locale where) 663 { 664 return getBreakInstance(ULocale.forLocale(where), KIND_CHARACTER); 665 } 666 667 /** 668 * {@icu} Returns a new instance of BreakIterator that locates logical-character 669 * boundaries. 670 * @param where A Locale specifying the language of the text being analyzed. 671 * @return A new instance of BreakIterator that locates logical-character 672 * boundaries. 673 * @throws NullPointerException if <code>where</code> is null. 674 * @stable ICU 3.2 675 */ getCharacterInstance(ULocale where)676 public static BreakIterator getCharacterInstance(ULocale where) 677 { 678 return getBreakInstance(where, KIND_CHARACTER); 679 } 680 681 /** 682 * Returns a new instance of BreakIterator that locates sentence boundaries. 683 * This function assumes the text being analyzed is in the default locale's 684 * language. 685 * @return A new instance of BreakIterator that locates sentence boundaries. 686 * @stable ICU 2.0 687 */ getSentenceInstance()688 public static BreakIterator getSentenceInstance() 689 { 690 return getSentenceInstance(ULocale.getDefault()); 691 } 692 693 /** 694 * Returns a new instance of BreakIterator that locates sentence boundaries. 695 * @param where A Locale specifying the language of the text being analyzed. 696 * @return A new instance of BreakIterator that locates sentence boundaries. 697 * @throws NullPointerException if <code>where</code> is null. 698 * @stable ICU 2.0 699 */ getSentenceInstance(Locale where)700 public static BreakIterator getSentenceInstance(Locale where) 701 { 702 return getBreakInstance(ULocale.forLocale(where), KIND_SENTENCE); 703 } 704 705 /** 706 * {@icu} Returns a new instance of BreakIterator that locates sentence boundaries. 707 * @param where A Locale specifying the language of the text being analyzed. 708 * @return A new instance of BreakIterator that locates sentence boundaries. 709 * @throws NullPointerException if <code>where</code> is null. 710 * @stable ICU 3.2 711 */ getSentenceInstance(ULocale where)712 public static BreakIterator getSentenceInstance(ULocale where) 713 { 714 return getBreakInstance(where, KIND_SENTENCE); 715 } 716 717 /** 718 * {@icu} Returns a new instance of BreakIterator that locates title boundaries. 719 * This function assumes the text being analyzed is in the default locale's 720 * language. The iterator returned locates title boundaries as described for 721 * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration, 722 * please use a word boundary iterator. {@link #getWordInstance} 723 * @return A new instance of BreakIterator that locates title boundaries. 724 * @stable ICU 2.0 725 */ getTitleInstance()726 public static BreakIterator getTitleInstance() 727 { 728 return getTitleInstance(ULocale.getDefault()); 729 } 730 731 /** 732 * {@icu} Returns a new instance of BreakIterator that locates title boundaries. 733 * The iterator returned locates title boundaries as described for 734 * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration, 735 * please use Word Boundary iterator.{@link #getWordInstance} 736 * @param where A Locale specifying the language of the text being analyzed. 737 * @return A new instance of BreakIterator that locates title boundaries. 738 * @throws NullPointerException if <code>where</code> is null. 739 * @stable ICU 2.0 740 */ getTitleInstance(Locale where)741 public static BreakIterator getTitleInstance(Locale where) 742 { 743 return getBreakInstance(ULocale.forLocale(where), KIND_TITLE); 744 } 745 746 /** 747 * {@icu} Returns a new instance of BreakIterator that locates title boundaries. 748 * The iterator returned locates title boundaries as described for 749 * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration, 750 * please use Word Boundary iterator.{@link #getWordInstance} 751 * @param where A Locale specifying the language of the text being analyzed. 752 * @return A new instance of BreakIterator that locates title boundaries. 753 * @throws NullPointerException if <code>where</code> is null. 754 * @stable ICU 3.2 755 s */ getTitleInstance(ULocale where)756 public static BreakIterator getTitleInstance(ULocale where) 757 { 758 return getBreakInstance(where, KIND_TITLE); 759 } 760 761 /** 762 * {@icu} Registers a new break iterator of the indicated kind, to use in the given 763 * locale. Clones of the iterator will be returned if a request for a break iterator 764 * of the given kind matches or falls back to this locale. 765 * 766 * <p>Because ICU may choose to cache BreakIterator objects internally, this must 767 * be called at application startup, prior to any calls to 768 * BreakIterator.getInstance to avoid undefined behavior. 769 * 770 * @param iter the BreakIterator instance to adopt. 771 * @param locale the Locale for which this instance is to be registered 772 * @param kind the type of iterator for which this instance is to be registered 773 * @return a registry key that can be used to unregister this instance 774 * @stable ICU 2.4 775 */ registerInstance(BreakIterator iter, Locale locale, int kind)776 public static Object registerInstance(BreakIterator iter, Locale locale, int kind) { 777 return registerInstance(iter, ULocale.forLocale(locale), kind); 778 } 779 780 /** 781 * {@icu} Registers a new break iterator of the indicated kind, to use in the given 782 * locale. Clones of the iterator will be returned if a request for a break iterator 783 * of the given kind matches or falls back to this locale. 784 * 785 * <p>Because ICU may choose to cache BreakIterator objects internally, this must 786 * be called at application startup, prior to any calls to 787 * BreakIterator.getInstance to avoid undefined behavior. 788 * 789 * @param iter the BreakIterator instance to adopt. 790 * @param locale the Locale for which this instance is to be registered 791 * @param kind the type of iterator for which this instance is to be registered 792 * @return a registry key that can be used to unregister this instance 793 * @stable ICU 3.2 794 */ registerInstance(BreakIterator iter, ULocale locale, int kind)795 public static Object registerInstance(BreakIterator iter, ULocale locale, int kind) { 796 // If the registered object matches the one in the cache, then 797 // flush the cached object. 798 if (iterCache[kind] != null) { 799 BreakIteratorCache cache = (BreakIteratorCache) iterCache[kind].get(); 800 if (cache != null) { 801 if (cache.getLocale().equals(locale)) { 802 iterCache[kind] = null; 803 } 804 } 805 } 806 return getShim().registerInstance(iter, locale, kind); 807 } 808 809 /** 810 * {@icu} Unregisters a previously-registered BreakIterator using the key returned 811 * from the register call. Key becomes invalid after this call and should not be used 812 * again. 813 * @param key the registry key returned by a previous call to registerInstance 814 * @return true if the iterator for the key was successfully unregistered 815 * @stable ICU 2.4 816 */ unregister(Object key)817 public static boolean unregister(Object key) { 818 if (key == null) { 819 throw new IllegalArgumentException("registry key must not be null"); 820 } 821 // TODO: we don't do code coverage for the following lines 822 // because in getBreakInstance we always instantiate the shim, 823 // and test execution is such that we always instantiate a 824 // breakiterator before we get to the break iterator tests. 825 // this is for modularization, and we could remove the 826 // dependencies in getBreakInstance by rewriting part of the 827 // LocaleData code, or perhaps by accepting it into the 828 // module. 829 ///CLOVER:OFF 830 if (shim != null) { 831 // Unfortunately, we don't know what is being unregistered 832 // -- what `kind' and what locale -- so we flush all 833 // caches. This is safe but inefficient if people are 834 // actively registering and unregistering. 835 for (int kind=0; kind<KIND_COUNT; ++kind) { 836 iterCache[kind] = null; 837 } 838 return shim.unregister(key); 839 } 840 return false; 841 ///CLOVER:ON 842 } 843 844 // end of registration 845 846 /** 847 * Returns a particular kind of BreakIterator for a locale. 848 * Avoids writing a switch statement with getXYZInstance(where) calls. 849 * @internal 850 * @deprecated This API is ICU internal only. 851 */ 852 @Deprecated getBreakInstance(ULocale where, int kind)853 public static BreakIterator getBreakInstance(ULocale where, int kind) { 854 if (where == null) { 855 throw new NullPointerException("Specified locale is null"); 856 } 857 if (iterCache[kind] != null) { 858 BreakIteratorCache cache = (BreakIteratorCache)iterCache[kind].get(); 859 if (cache != null) { 860 if (cache.getLocale().equals(where)) { 861 return cache.createBreakInstance(); 862 } 863 } 864 } 865 866 // sigh, all to avoid linking in ICULocaleData... 867 BreakIterator result = getShim().createBreakIterator(where, kind); 868 869 BreakIteratorCache cache = new BreakIteratorCache(where, result); 870 iterCache[kind] = new SoftReference<BreakIteratorCache>(cache); 871 if (result instanceof RuleBasedBreakIterator) { 872 RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator)result; 873 rbbi.setBreakType(kind); 874 } 875 876 return result; 877 } 878 879 880 /** 881 * Returns a list of locales for which BreakIterators can be used. 882 * @return An array of Locales. All of the locales in the array can 883 * be used when creating a BreakIterator. 884 * @stable ICU 2.6 885 */ getAvailableLocales()886 public static synchronized Locale[] getAvailableLocales() 887 { 888 // to avoid linking ICULocaleData 889 return getShim().getAvailableLocales(); 890 } 891 892 /** 893 * {@icu} Returns a list of locales for which BreakIterators can be used. 894 * @return An array of Locales. All of the locales in the array can 895 * be used when creating a BreakIterator. 896 * @draft ICU 3.2 (retain) 897 * @provisional This API might change or be removed in a future release. 898 */ getAvailableULocales()899 public static synchronized ULocale[] getAvailableULocales() 900 { 901 // to avoid linking ICULocaleData 902 return getShim().getAvailableULocales(); 903 } 904 905 private static final class BreakIteratorCache { 906 907 private BreakIterator iter; 908 private ULocale where; 909 BreakIteratorCache(ULocale where, BreakIterator iter)910 BreakIteratorCache(ULocale where, BreakIterator iter) { 911 this.where = where; 912 this.iter = (BreakIterator) iter.clone(); 913 } 914 getLocale()915 ULocale getLocale() { 916 return where; 917 } 918 createBreakInstance()919 BreakIterator createBreakInstance() { 920 return (BreakIterator) iter.clone(); 921 } 922 } 923 924 static abstract class BreakIteratorServiceShim { registerInstance(BreakIterator iter, ULocale l, int k)925 public abstract Object registerInstance(BreakIterator iter, ULocale l, int k); unregister(Object key)926 public abstract boolean unregister(Object key); getAvailableLocales()927 public abstract Locale[] getAvailableLocales(); getAvailableULocales()928 public abstract ULocale[] getAvailableULocales(); createBreakIterator(ULocale l, int k)929 public abstract BreakIterator createBreakIterator(ULocale l, int k); 930 } 931 932 private static BreakIteratorServiceShim shim; getShim()933 private static BreakIteratorServiceShim getShim() { 934 // Note: this instantiation is safe on loose-memory-model configurations 935 // despite lack of synchronization, since the shim instance has no state-- 936 // it's all in the class init. The worst problem is we might instantiate 937 // two shim instances, but they'll share the same state so that's ok. 938 if (shim == null) { 939 try { 940 Class<?> cls = Class.forName("com.ibm.icu.text.BreakIteratorFactory"); 941 shim = (BreakIteratorServiceShim)cls.newInstance(); 942 } 943 catch (MissingResourceException e) 944 { 945 throw e; 946 } 947 catch (Exception e) { 948 ///CLOVER:OFF 949 if(DEBUG){ 950 e.printStackTrace(); 951 } 952 throw new RuntimeException(e.getMessage()); 953 ///CLOVER:ON 954 } 955 } 956 return shim; 957 } 958 959 // -------- BEGIN ULocale boilerplate -------- 960 961 /** 962 * {@icu} Returns the locale that was used to create this object, or null. 963 * This may may differ from the locale requested at the time of 964 * this object's creation. For example, if an object is created 965 * for locale <tt>en_US_CALIFORNIA</tt>, the actual data may be 966 * drawn from <tt>en</tt> (the <i>actual</i> locale), and 967 * <tt>en_US</tt> may be the most specific locale that exists (the 968 * <i>valid</i> locale). 969 * 970 * <p>Note: The <i>actual</i> locale is returned correctly, but the <i>valid</i> 971 * locale is not, in most cases. 972 * @param type type of information requested, either {@link 973 * com.ibm.icu.util.ULocale#VALID_LOCALE} or {@link 974 * com.ibm.icu.util.ULocale#ACTUAL_LOCALE}. 975 * @return the information specified by <i>type</i>, or null if 976 * this object was not constructed from locale data. 977 * @see com.ibm.icu.util.ULocale 978 * @see com.ibm.icu.util.ULocale#VALID_LOCALE 979 * @see com.ibm.icu.util.ULocale#ACTUAL_LOCALE 980 * @draft ICU 2.8 (retain) 981 * @provisional This API might change or be removed in a future release. 982 */ getLocale(ULocale.Type type)983 public final ULocale getLocale(ULocale.Type type) { 984 return type == ULocale.ACTUAL_LOCALE ? 985 this.actualLocale : this.validLocale; 986 } 987 988 /** 989 * Set information about the locales that were used to create this 990 * object. If the object was not constructed from locale data, 991 * both arguments should be set to null. Otherwise, neither 992 * should be null. The actual locale must be at the same level or 993 * less specific than the valid locale. This method is intended 994 * for use by factories or other entities that create objects of 995 * this class. 996 * @param valid the most specific locale containing any resource 997 * data, or null 998 * @param actual the locale containing data used to construct this 999 * object, or null 1000 * @see com.ibm.icu.util.ULocale 1001 * @see com.ibm.icu.util.ULocale#VALID_LOCALE 1002 * @see com.ibm.icu.util.ULocale#ACTUAL_LOCALE 1003 */ setLocale(ULocale valid, ULocale actual)1004 final void setLocale(ULocale valid, ULocale actual) { 1005 // Change the following to an assertion later 1006 if ((valid == null) != (actual == null)) { 1007 ///CLOVER:OFF 1008 throw new IllegalArgumentException(); 1009 ///CLOVER:ON 1010 } 1011 // Another check we could do is that the actual locale is at 1012 // the same level or less specific than the valid locale. 1013 this.validLocale = valid; 1014 this.actualLocale = actual; 1015 } 1016 1017 /** 1018 * The most specific locale containing any resource data, or null. 1019 * @see com.ibm.icu.util.ULocale 1020 */ 1021 private ULocale validLocale; 1022 1023 /** 1024 * The locale containing data used to construct this object, or 1025 * null. 1026 * @see com.ibm.icu.util.ULocale 1027 */ 1028 private ULocale actualLocale; 1029 1030 // -------- END ULocale boilerplate -------- 1031 } 1032