/*
 *******************************************************************************
 * Copyright (C) 2005-2015 International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.text;

import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.next32;
import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
import static com.ibm.icu.impl.CharacterIteration.previous32;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.text.CharacterIterator;
import java.util.concurrent.ConcurrentHashMap;

import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.CharTrie;
import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;

/**
 * Rule Based Break Iterator
 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
 *
 * @stable ICU 2.0
 */
public class RuleBasedBreakIterator extends BreakIterator {
    //=======================================================================
    // Constructors & Factories
    //=======================================================================

    /**
     * Private constructor shared by every construction path.
     *
     * Marks the cached rule status as valid, zeroes the dictionary character
     * counter and registers the unhandled-break engine in fBreakEngines under
     * the key -1. It does NOT set fRData; each caller (the factories and the
     * public String constructor) must assign the rule data afterwards.
     */
    private RuleBasedBreakIterator() {
        fLastStatusIndexValid = true;
        fDictionaryCharCount = 0;
        fBreakEngines.put(-1, fUnhandledBreakEngine);
    }

    /**
     * Create a break iterator from a precompiled set of break rules.
     *
     * Creating a break iterator from the binary rules is much faster than
     * creating one from source rules.
57 * 58 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 59 * Binary break iterator rules are not guaranteed to be compatible between 60 * different versions of ICU. 61 * 62 * @param is an input stream supplying the compiled binary rules. 63 * @throws IOException if there is an error while reading the rules from the InputStream. 64 * @see #compileRules(String, OutputStream) 65 * @stable ICU 4.8 66 */ getInstanceFromCompiledRules(InputStream is)67 public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { 68 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 69 This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); 70 return This; 71 } 72 73 /** 74 * Create a break iterator from a precompiled set of break rules. 75 * 76 * Creating a break iterator from the binary rules is much faster than 77 * creating one from source rules. 78 * 79 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 80 * Binary break iterator rules are not guaranteed to be compatible between 81 * different versions of ICU. 82 * 83 * @param bytes a buffer supplying the compiled binary rules. 84 * @throws IOException if there is an error while reading the rules from the buffer. 85 * @see #compileRules(String, OutputStream) 86 * @internal 87 * @deprecated This API is ICU internal only. 88 */ 89 @Deprecated getInstanceFromCompiledRules(ByteBuffer bytes)90 public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { 91 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 92 This.fRData = RBBIDataWrapper.get(bytes); 93 return This; 94 } 95 96 /** 97 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 98 * @param rules The break rules to be used. 
99 * @stable ICU 2.2 100 */ RuleBasedBreakIterator(String rules)101 public RuleBasedBreakIterator(String rules) { 102 this(); 103 try { 104 ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); 105 compileRules(rules, ruleOS); 106 fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); 107 } catch (IOException e) { 108 ///CLOVER:OFF 109 // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, 110 // causing bogus compiled rules to be produced, but with no compile error raised. 111 RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " 112 + e.getMessage()); 113 throw rte; 114 ///CLOVER:ON 115 } 116 } 117 118 //======================================================================= 119 // Boilerplate 120 //======================================================================= 121 122 /** 123 * Clones this iterator. 124 * @return A newly-constructed RuleBasedBreakIterator with the same 125 * behavior as this one. 126 * @stable ICU 2.0 127 */ clone()128 public Object clone() 129 { 130 RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone(); 131 if (fText != null) { 132 result.fText = (CharacterIterator)(fText.clone()); 133 } 134 return result; 135 } 136 137 /** 138 * Returns true if both BreakIterators are of the same class, have the same 139 * rules, and iterate over the same text. 
140 * @stable ICU 2.0 141 */ equals(Object that)142 public boolean equals(Object that) { 143 if (that == null) { 144 return false; 145 } 146 if (this == that) { 147 return true; 148 } 149 try { 150 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; 151 if (fRData != other.fRData && (fRData == null || other.fRData == null)) { 152 return false; 153 } 154 if (fRData != null && other.fRData != null && 155 (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { 156 return false; 157 } 158 if (fText == null && other.fText == null) { 159 return true; 160 } 161 if (fText == null || other.fText == null) { 162 return false; 163 } 164 return fText.equals(other.fText); 165 } 166 catch(ClassCastException e) { 167 return false; 168 } 169 } 170 171 /** 172 * Returns the description (rules) used to create this iterator. 173 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) 174 * @stable ICU 2.0 175 */ toString()176 public String toString() { 177 String retStr = ""; 178 if (fRData != null) { 179 retStr = fRData.fRuleSource; 180 } 181 return retStr; 182 } 183 184 /** 185 * Compute a hashcode for this BreakIterator 186 * @return A hash code 187 * @stable ICU 2.0 188 */ hashCode()189 public int hashCode() 190 { 191 return fRData.fRuleSource.hashCode(); 192 } 193 194 195 private static final int START_STATE = 1; // The state number of the starting state 196 private static final int STOP_STATE = 0; // The state-transition value indicating "stop" 197 198 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 199 // of user text. A variable with this enum type keeps track of where we 200 // are. The state machine only fetches user text input while in RUN mode. 201 private static final int RBBI_START = 0; 202 private static final int RBBI_RUN = 1; 203 private static final int RBBI_END = 2; 204 205 /* 206 * The character iterator through which this BreakIterator accesses the text. 
     * Initialized to an iterator over the empty string.
     */
    private CharacterIterator   fText = new java.text.StringCharacterIterator("");

    /**
     * The rule data for this BreakIterator instance. Package private.
     * Not assigned by the private no-arg constructor; the factories and the
     * String constructor set it.
     */
    RBBIDataWrapper             fRData;

    /*
     * Index of the Rule {tag} values for the most recent match.
     * Used as an index into fRData.fStatusTable by getRuleStatus() and
     * getRuleStatusVec().
     */
    private int                 fLastRuleStatusIndex;

    /*
     * Rule tag value valid flag.
     * Some iterator operations don't intrinsically set the correct tag value.
     * This flag lets us lazily compute the value if we are ever asked for it.
     */
    private boolean             fLastStatusIndexValid;

    /**
     * Counter for the number of characters encountered with the "dictionary"
     * flag set. When it is non-zero after a rule-based move, next() and
     * previous() pass the proposed boundary to checkDictionary().
     * NOTE(review): an earlier version of this comment said that dictionary
     * based subclasses access this field directly; the field is declared
     * private here, so that is not possible as written.
     * @internal
     */
    private int fDictionaryCharCount;

    /*
     * ICU debug argument name for RBBI
     */
    private static final String RBBI_DEBUG_ARG = "rbbi";

    /**
     * Debugging flag. Trace operation of state machine when true.
     * True only when the "rbbi" debug argument is enabled and its value
     * contains the substring "trace".
     */
    private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
            && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;

    /**
     * What kind of break iterator this is. Set to KIND_LINE by default,
     * since this produces sensible output.
     */
    private int fBreakType = KIND_LINE;

    /**
     * The "default" break engine - just skips over ranges of dictionary words,
     * producing no breaks. Should only be used if characters need to be handled
     * by a dictionary but we have no dictionary implementation for them.
257 */ 258 private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine(); 259 260 /** 261 * when a range of characters is divided up using the dictionary, the break 262 * positions that are discovered are stored here, preventing us from having 263 * to use either the dictionary or the state table again until the iterator 264 * leaves this range of text 265 */ 266 private int[] fCachedBreakPositions; 267 268 /** 269 * if fCachedBreakPositions is not null, this indicates which item in the 270 * cache the current iteration position refers to 271 */ 272 private int fPositionInCache; 273 274 275 private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines = 276 new ConcurrentHashMap<Integer, LanguageBreakEngine>(); 277 /** 278 * Dumps caches and performs other actions associated with a complete change 279 * in text or iteration position. 280 */ reset()281 private void reset() { 282 fCachedBreakPositions = null; 283 // fNumCachedBreakPositions = 0; 284 fDictionaryCharCount = 0; 285 fPositionInCache = 0; 286 287 } 288 /** 289 * Dump the contents of the state table and character classes for this break iterator. 290 * For debugging only. 291 * @internal 292 * @deprecated This API is ICU internal only. 293 */ 294 @Deprecated dump()295 public void dump() { 296 this.fRData.dump(); 297 } 298 299 /** 300 * Compile a set of source break rules into the binary state tables used 301 * by the break iterator engine. Creating a break iterator from precompiled 302 * rules is much faster than creating one from source rules. 303 * 304 * Binary break rules are not guaranteed to be compatible between different 305 * versions of ICU. 306 * 307 * 308 * @param rules The source form of the break rules 309 * @param ruleBinary An output stream to receive the compiled rules. 310 * @throws IOException If there is an error writing the output. 
311 * @see #getInstanceFromCompiledRules(InputStream) 312 * @stable ICU 4.8 313 */ compileRules(String rules, OutputStream ruleBinary)314 public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { 315 RBBIRuleBuilder.compileRules(rules, ruleBinary); 316 } 317 318 //======================================================================= 319 // BreakIterator overrides 320 //======================================================================= 321 322 /** 323 * Sets the current iteration position to the beginning of the text. 324 * (i.e., the CharacterIterator's starting offset). 325 * @return The offset of the beginning of the text. 326 * @stable ICU 2.0 327 */ first()328 public int first() { 329 fCachedBreakPositions = null; 330 fDictionaryCharCount = 0; 331 fPositionInCache = 0; 332 fLastRuleStatusIndex = 0; 333 fLastStatusIndexValid = true; 334 if (fText == null) { 335 return BreakIterator.DONE; 336 } 337 fText.first(); 338 return fText.getIndex(); 339 } 340 341 /** 342 * Sets the current iteration position to the end of the text. 343 * (i.e., the CharacterIterator's ending offset). 344 * @return The text's past-the-end offset. 345 * @stable ICU 2.0 346 */ last()347 public int last() { 348 fCachedBreakPositions = null; 349 fDictionaryCharCount = 0; 350 fPositionInCache = 0; 351 352 if (fText == null) { 353 fLastRuleStatusIndex = 0; 354 fLastStatusIndexValid = true; 355 return BreakIterator.DONE; 356 } 357 358 // t.last() returns the offset of the last character, 359 // rather than the past-the-end offset 360 // so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ... 361 // will work correctly. 362 fLastStatusIndexValid = false; 363 int pos = fText.getEndIndex(); 364 fText.setIndex(pos); 365 return pos; 366 } 367 368 /** 369 * Advances the iterator either forward or backward the specified number of steps. 370 * Negative values move backward, and positive values move forward. 
     * This is equivalent to repeatedly calling next() or previous().
     * With n == 0 the current position is returned unchanged.
     * @param n The number of steps to move. The sign indicates the direction
     * (negative is backwards, and positive is forwards).
     * @return The character offset of the boundary position n boundaries away from
     * the current one.
     * @stable ICU 2.0
     */
    public int next(int n) {
        int result = current();
        while (n > 0) {
            result = next();
            --n;
        }
        while (n < 0) {
            result = previous();
            ++n;
        }
        return result;
    }

    /**
     * Advances the iterator to the next boundary position.
     * Served from the dictionary break cache when one exists and has a later
     * entry; otherwise the forward state table is run and, if dictionary
     * characters were counted, the result is refined by checkDictionary().
     * @return The position of the first boundary after this one.
     * @stable ICU 2.0
     */
    public int next() {
        // if we have cached break positions and we're still in the range
        // covered by them, just move one step forward in the cache
        if (fCachedBreakPositions != null) {
            if (fPositionInCache < fCachedBreakPositions.length - 1) {
                ++fPositionInCache;
                int pos = fCachedBreakPositions[fPositionInCache];
                fText.setIndex(pos);
                return pos;
            }
            else {
                // Walked off the end of the cache: discard it and fall through
                // to the rule-based path.
                reset();
            }
        }

        int startPos = current();
        fDictionaryCharCount = 0;
        int result = handleNext(fRData.fFTable);
        if (fDictionaryCharCount > 0) {
            result = checkDictionary(startPos, result, false);
        }
        return result;
    }

    /**
     * checkDictionary      This function handles all processing of characters in
     *                      the "dictionary" set. It will determine the appropriate
     *                      course of action, and possibly set up a cache in the
     *                      process.
     *
     * @param startPos the lower text offset of the range to examine.
     * @param endPos   the upper text offset of the range to examine.
     * @param reverse  true when called on behalf of a backward move; the
     *                 boundary returned is then taken from the low end of the
     *                 range (or via preceding()), otherwise from the high end
     *                 (or via following()).
     * @return the boundary position to report to the caller; the text
     *         iterator is left positioned there.
     */
    private int checkDictionary(int startPos, int endPos, boolean reverse) {

        // Reset the old break cache first.
        reset();

        // note: code segment below assumes that dictionary chars are in the
        // startPos-endPos range
        // value returned should be next character in sequence
        if ((endPos - startPos) <= 1) {
            return (reverse ? startPos : endPos);
        }

        // Starting from the starting point, scan towards the proposed result,
        // looking for the first dictionary character (which may be the one
        // we're on, if we're starting in the middle of a range).
        fText.setIndex(reverse ? endPos : startPos);
        if (reverse) {
            CharacterIteration.previous32(fText);
        }

        int  rangeStart = startPos;
        int  rangeEnd = endPos;

        int     category;
        int     current;
        DictionaryBreakEngine.DequeI breaks = new DictionaryBreakEngine.DequeI();
        int     foundBreakCount = 0;
        int     c = CharacterIteration.current32(fText);
        category = (short)fRData.fTrie.getCodePointValue(c);

        // Is the character we're starting on a dictionary character? If so, we
        // need to back up to include the entire run; otherwise the results of
        // the break algorithm will differ depending on where we start. Since
        // the result is cached and there is typically a non-dictionary break
        // within a small number of words, there should be little performance impact.
        // (The 0x4000 bit of the trie value is what this code tests to decide
        // whether a character is a dictionary character.)
        if ((category & 0x4000) != 0) {
            if (reverse) {
                do {
                    CharacterIteration.next32(fText);
                    c = CharacterIteration.current32(fText);
                    category = (short)fRData.fTrie.getCodePointValue(c);
                } while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0);

                // Back up to the last dictionary character
                rangeEnd = fText.getIndex();
                if (c == CharacterIteration.DONE32) {
                    // c = fText->last32();
                    //   TODO:  why was this if needed?
                    c = CharacterIteration.previous32(fText);
                }
                else {
                    c = CharacterIteration.previous32(fText);
                }
            }
            else {
                do {
                    c = CharacterIteration.previous32(fText);
                    category = (short)fRData.fTrie.getCodePointValue(c);
                }
                while (c != CharacterIteration.DONE32 && ((category & 0x4000) != 0));
                // Back up to the last dictionary character
                if (c == CharacterIteration.DONE32) {
                    // c = fText->first32();
                    c = CharacterIteration.current32(fText);
                }
                else {
                    CharacterIteration.next32(fText);
                    c = CharacterIteration.current32(fText);
                }
                rangeStart = fText.getIndex();
            }
            category = (short)fRData.fTrie.getCodePointValue(c);
        }


        // Loop through the text, looking for ranges of dictionary characters.
        // For each span, find the appropriate break engine, and ask it to find
        // any breaks within the span.
        // Note: we always do this in the forward direction, so that the break
        // cache is built in the right order.
        if (reverse) {
            fText.setIndex(rangeStart);
            c = CharacterIteration.current32(fText);
            category = (short)fRData.fTrie.getCodePointValue(c);
        }
        LanguageBreakEngine lbe = null;
        while(true) {
            // Skip forward over non-dictionary characters.
            while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) {
                CharacterIteration.next32(fText);
                c = CharacterIteration.current32(fText);
                category = (short)fRData.fTrie.getCodePointValue(c);
            }
            if (current >= rangeEnd) {
                break;
            }

            // We now have a dictionary character. Get the appropriate language object
            // to deal with it.
            lbe = getLanguageBreakEngine(c);

            // Ask the language object if there are any breaks. It will leave the text
            // pointer on the other side of its range, ready to search for the next one.
            if (lbe != null) {
                int startingIdx = fText.getIndex();
                foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, false, fBreakType, breaks);
                assert fText.getIndex() > startingIdx;
            }

            // Reload the loop variables for the next go-round
            c = CharacterIteration.current32(fText);
            category = (short)fRData.fTrie.getCodePointValue(c);
        }

        // If we found breaks, build a new break cache. The first and last entries must
        // be the original starting and ending position.
        if (foundBreakCount > 0) {
            if (foundBreakCount != breaks.size()) {
                // NOTE(review): diagnostic print to stdout from library code.
                System.out.println("oops, foundBreakCount != breaks.size(). LBE = " + lbe.getClass());
            }
            assert foundBreakCount == breaks.size();
            if (startPos < breaks.peekLast()) {
                breaks.offer(startPos);
            }
            if (endPos > breaks.peek()) {
                breaks.push(endPos);
            }

            // TODO: get rid of this array, use results from the deque directly
            fCachedBreakPositions = new int[breaks.size()];

            int i = 0;
            while (breaks.size() > 0) {
                fCachedBreakPositions[i++] = breaks.pollLast();
            }

            // If there are breaks, then by definition, we are replacing the original
            // proposed break by one of the breaks we found. Use following() and
            // preceding() to do the work. They should never recurse in this case.
            if (reverse) {
                return preceding(endPos);
            }
            else {
                return following(startPos);
            }
        }

        // If we get here, there were no language-based breaks. Set the text pointer
        // to the original proposed break.
        fText.setIndex(reverse ? startPos : endPos);
        return (reverse ? startPos : endPos);

    }


    /**
     * Moves the iterator backwards, to the last boundary preceding this one.
     * @return The position of the last boundary position preceding this one.
     *         Returns DONE when the iterator is already at the beginning of
     *         the text or there is no text.
     * @stable ICU 2.0
     */
    public int previous() {
        int result;
        int startPos;

        CharacterIterator text = getText();

        // The rule status is not tracked while moving backwards; it is
        // recomputed lazily if requested.
        fLastStatusIndexValid = false;

        // if we have cached break positions and we're still in the range
        // covered by them, just move one step backward in the cache
        if (fCachedBreakPositions != null) {
            if (fPositionInCache > 0) {
                --fPositionInCache;
                // If we're at the beginning of the cache, need to reevaluate the
                // rule status
                if (fPositionInCache <= 0) {
                    fLastStatusIndexValid = false;
                }
                int pos = fCachedBreakPositions[fPositionInCache];
                text.setIndex(pos);
                return pos;
            } else {
                // Already at the first cache entry: discard the cache and
                // fall through to the rule-based path.
                reset();
            }
        }

        // if we're already sitting at the beginning of the text, return DONE
        startPos = current();
        if (fText == null || startPos == fText.getBeginIndex()) {
            fLastRuleStatusIndex  = 0;
            fLastStatusIndexValid = true;
            return BreakIterator.DONE;
        }

        // Rules with an exact reverse table are handled here.
        // (The test is on the presence of either safe table; the reverse
        // table itself, fRTable, is what gets run.)
        if (fRData.fSRTable != null || fRData.fSFTable != null) {
            result =  handlePrevious(fRData.fRTable);
            if (fDictionaryCharCount > 0) {
                result = checkDictionary(result, startPos, true);
            }
            return result;
        }

        // old rule syntax
        // set things up.  handlePrevious() will back us up to some valid
        // break position before the current position (we back our internal
        // iterator up one step to prevent handlePrevious() from returning
        // the current position), but not necessarily the last one before
        // where we started

        int       start = current();

        previous32(fText);
        int       lastResult    = handlePrevious(fRData.fRTable);
        if (lastResult == BreakIterator.DONE) {
            lastResult = fText.getBeginIndex();
            fText.setIndex(lastResult);
        }
        result = lastResult;
        int      lastTag       = 0;
        boolean  breakTagValid = false;

        // iterate forward from the known break position until we pass our
        // starting point.  The last break position before the starting
        // point is our return value

        for (;;) {
            result         = next();
            if (result == BreakIterator.DONE || result >= start) {
                break;
            }
            lastResult     = result;
            lastTag        = fLastRuleStatusIndex;
            breakTagValid  = true;
        }

        // fLastBreakTag wants to have the value for section of text preceding
        // the result position that we are to return (in lastResult.)  If
        // the backwards rules overshot and the above loop had to do two or more
        // handleNext()s to move up to the desired return position, we will have a valid
        // tag value. But, if handlePrevious() took us to exactly the correct result position,
        // we wont have a tag value for that position, which is only set by handleNext().

        // Set the current iteration position to be the last break position
        // before where we started, and then return that value.
        fText.setIndex(lastResult);
        fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
        fLastStatusIndexValid = breakTagValid;
        return lastResult;
    }

    /**
     * Sets the iterator to refer to the first boundary position following
     * the specified position.
     * @param offset The position from which to begin searching for a break position.
     * @return The position of the first break after the specified offset.
     * @stable ICU 2.0
     */
    public int following(int offset) {
        CharacterIterator text = getText();

        // if we have no cached break positions, or if "offset" is outside the
        // range covered by the cache, then dump the cache and fall back to the
        // rule-based search in rulesFollowing(). That calls other methods in
        // this class that may refresh the cache.
        if (fCachedBreakPositions == null || offset < fCachedBreakPositions[0] ||
                offset >= fCachedBreakPositions[fCachedBreakPositions.length - 1]) {
            fCachedBreakPositions = null;
            return rulesFollowing(offset);
        }

        // on the other hand, if "offset" is within the range covered by the
        // cache, then just search the cache for the first break position
        // after "offset". The range check above guarantees that the last cache
        // entry is greater than offset, so the scan stops inside the array.
        else {
            fPositionInCache = 0;
            while (fPositionInCache < fCachedBreakPositions.length
                   && offset >= fCachedBreakPositions[fPositionInCache])
                ++fPositionInCache;
            text.setIndex(fCachedBreakPositions[fPositionInCache]);
            return text.getIndex();
        }
    }

    /**
     * Rule-based implementation of following(), used when the break-position
     * cache cannot answer the request.
     *
     * Three strategies are tried in order: the safe-point reverse table
     * (fSRTable), the safe-point forward table (fSFTable), and finally the
     * old-rule-syntax path built on previous() and next().
     *
     * @param offset the position from which to search forward.
     * @return the first boundary after offset; first() when offset is before
     *         the beginning of the text; the result of last() followed by
     *         next() when offset is at or past the end.
     */
    private int rulesFollowing(int offset) {
        // if the offset passed in is already past the end of the text,
        // move to the end and return whatever next() yields from there
        // (presumably DONE - handleNext() is not visible here); if it's
        // before the beginning, return the text's starting offset
        fLastRuleStatusIndex  = 0;
        fLastStatusIndexValid = true;
        if (fText == null || offset >= fText.getEndIndex()) {
            last();
            return next();
        }
        else if (offset < fText.getBeginIndex()) {
            return first();
        }

        // otherwise, set our internal iteration position (temporarily)
        // to the position passed in.  If this is the _beginning_ position,
        // then we can just use next() to get our return value

        int result = 0;

        if (fRData.fSRTable != null) {
            // Safe Point Reverse rules exist.
            //   This allows us to use the optimum algorithm.
            fText.setIndex(offset);
            // move forward one codepoint to prepare for moving back to a
            // safe point.
            // this handles offset being between a supplementary character
            next32(fText);
            // handlePrevious will move most of the time to < 1 boundary away
            handlePrevious(fRData.fSRTable);
            result = next();
            while (result <= offset) {
                result = next();
            }
            return result;
        }
        if (fRData.fSFTable != null) {
            // No Safe point reverse table, but there is a safe pt forward table.
            //
            fText.setIndex(offset);
            previous32(fText);
            // handle next will give result >= offset
            handleNext(fRData.fSFTable);
            // previous will give result 0 or 1 boundary away from offset,
            // most of the time
            // we have to
            int oldresult = previous();
            while (oldresult > offset) {
                result = previous();
                if (result <= offset) {
                    return oldresult;
                }
                oldresult = result;
            }
            result = next();
            if (result <= offset) {
                return next();
            }
            return result;
        }
        // otherwise, we have to sync up first.  Use handlePrevious() to back
        // us up to a known break position before the specified position (if
        // we can determine that the specified position is a break position,
        // we don't back up at all).  This may or may not be the last break
        // position at or before our starting position.  Advance forward
        // from here until we've passed the starting position.  The position
        // we stop on will be the first break position after the specified one.
        // old rule syntax

        fText.setIndex(offset);
        if (offset == fText.getBeginIndex()) {
            return next();
        }
        result = previous();

        while (result != BreakIterator.DONE && result <= offset) {
            result = next();
        }

        return result;
    }
    /**
     * Sets the iterator to refer to the last boundary position before the
     * specified position.
     * @param offset The position to begin searching for a break from.
     * @return The position of the last boundary before the starting position.
     * @stable ICU 2.0
     */
    public int preceding(int offset) {
        CharacterIterator text = getText();

        // if we have no cached break positions, or "offset" is outside the
        // range covered by the cache, dump the cache and fall back to the
        // rule-based search in rulesPreceding() (which will eventually call
        // other routines in this class that may refresh the cache)
        if (fCachedBreakPositions == null || offset <= fCachedBreakPositions[0] ||
                offset > fCachedBreakPositions[fCachedBreakPositions.length - 1]) {
            fCachedBreakPositions = null;
            return rulesPreceding(offset);
        }

        // on the other hand, if "offset" is within the range covered by the cache,
        // then all we have to do is search the cache for the last break position
        // before "offset": scan to the first entry >= offset, then step back one.
        else {
            fPositionInCache = 0;
            while (fPositionInCache < fCachedBreakPositions.length
                   && offset > fCachedBreakPositions[fPositionInCache])
                ++fPositionInCache;
            --fPositionInCache;
            text.setIndex(fCachedBreakPositions[fPositionInCache]);
            return text.getIndex();
        }
    }

    /**
     * Rule-based implementation of preceding(), used when the break-position
     * cache cannot answer the request.
     *
     * Tries the safe-point forward table (fSFTable) first, then the safe-point
     * reverse table (fSRTable), and finally the old-rule-syntax path that
     * simply positions at offset and calls previous().
     *
     * @param offset the position from which to search backward.
     * @return the last boundary before offset; last() when offset is past the
     *         end of the text (or there is no text); first() when offset is
     *         before the beginning.
     */
    private int rulesPreceding(int offset) {
        // if the offset passed in is already past the end of the text,
        // return last() (the ICU4C code returned DONE here; see the
        // commented-out line below); if it's before the beginning, return the
        // text's starting offset
        if (fText == null || offset > fText.getEndIndex()) {
            // return BreakIterator::DONE;
            return last();
        }
        else if (offset < fText.getBeginIndex()) {
            return first();
        }

        // if we start by updating the current iteration position to the
        // position specified by the caller, we can just use previous()
        // to carry out this operation

        int  result;
        if (fRData.fSFTable != null) {
            /// todo synwee
            // new rule syntax
            fText.setIndex(offset);
            // move backwards one codepoint to prepare for moving forwards to a
            // safe point.
            // this handles offset being between a supplementary character
            previous32(fText);
            handleNext(fRData.fSFTable);
            result = previous();
            while (result >= offset) {
                result = previous();
            }
            return result;
        }
        if (fRData.fSRTable != null) {
            // backup plan if forward safe table is not available
            fText.setIndex(offset);
            next32(fText);
            // handle previous will give result <= offset
            handlePrevious(fRData.fSRTable);

            // next will give result 0 or 1 boundary away from offset,
            // most of the time
            // we have to
            int oldresult = next();
            while (oldresult < offset) {
                result = next();
                if (result >= offset) {
                    return oldresult;
                }
                oldresult = result;
            }
            result = previous();
            if (result >= offset) {
                return previous();
            }
            return result;
        }

        // old rule syntax
        fText.setIndex(offset);
        return previous();
    }

    /**
     * Throw IllegalArgumentException unless begin &lt;= offset &lt;= end.
     * Note that offset == end (the past-the-end index) is accepted;
     * isBoundary() relies on this.
     * @param offset the offset to validate.
     * @param text the text whose begin and end indexes bound the valid range.
     * @throws IllegalArgumentException if offset lies outside [begin, end].
     * @stable ICU 2.0
     */
    protected static final void checkOffset(int offset, CharacterIterator text) {
        if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
            throw new IllegalArgumentException("offset out of bounds");
        }
    }


    /**
     * Returns true if the specified position is a boundary position.  As a side
     * effect, leaves the iterator pointing to the first boundary position at
     * or after "offset".
     * @param offset the offset to check.
     * @return True if "offset" is a boundary position.
     * @throws IllegalArgumentException if offset is outside the text range
     *         (see checkOffset).
     * @stable ICU 2.0
     */
    public boolean isBoundary(int offset) {
        checkOffset(offset, fText);

        // the beginning index of the iterator is always a boundary position by definition
        if (offset == fText.getBeginIndex()) {
            first();       // For side effects on current position, tag values.
            return true;
        }

        if (offset == fText.getEndIndex()) {
            last();       // For side effects on current position, tag values.
            return true;
        }

        // otherwise, we can use following() on the position before the specified
        // one and return true if the position we get back is the one the user
        // specified

        // return following(offset - 1) == offset;
        // TODO:  check whether it is safe to revert to the simpler offset-1 code
        //         The safe rules may take care of unpaired surrogates ok.
        // Step back one code point (not one char) from offset, then ask for
        // the boundary that follows that position.
        fText.setIndex(offset);
        previous32(fText);
        int pos = fText.getIndex();
        boolean result = following(pos) == offset;
        return result;
    }

    /**
     * Returns the current iteration position.
     * @return The current iteration position, or DONE if there is no text.
     * @stable ICU 2.0
     */
    public int current() {
        return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
    }

    /**
     * Ensures fLastRuleStatusIndex holds the status for the current position.
     *
     * Does nothing when the status is already flagged valid. At the start of
     * the text (or with no text) the status index is set to 0. Otherwise the
     * iterator is rewound with first() and advanced with next() until it
     * reaches the saved position again, so the status is recomputed on the way
     * (presumably set by handleNext(), which is not visible here).
     */
    private void makeRuleStatusValid() {
        if (fLastStatusIndexValid == false) {
            //  No cached status is available.
            int curr = current();
            if (curr == BreakIterator.DONE || curr == fText.getBeginIndex()) {
                //  At start of text, or there is no text.  Status is always zero.
                fLastRuleStatusIndex = 0;
                fLastStatusIndexValid = true;
            } else {
                //  Not at start of text.  Find status the tedious way.
                int pa = fText.getIndex();
                first();
                int pb = current();
                while (fText.getIndex() < pa) {
                    pb = next();
                }
                // The walk must land exactly on the position we started from.
                Assert.assrt(pa == pb);
            }
            Assert.assrt(fLastStatusIndexValid == true);
            Assert.assrt(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fRData.fStatusTable.length);
        }
    }

    /**
     * Return the status tag from the break rule that determined the most recently
     * returned break position.  The values appear in the rule source
     * within brackets, {123}, for example.  For rules that do not specify a
     * status, a default value of 0 is returned.
If more than one rule applies, 971 * the numerically largest of the possible status values is returned. 972 * <p> 973 * Of the standard types of ICU break iterators, only the word break 974 * iterator provides status values. The values are defined in 975 * class RuleBasedBreakIterator, and allow distinguishing between words 976 * that contain alphabetic letters, "words" that appear to be numbers, 977 * punctuation and spaces, words containing ideographic characters, and 978 * more. Call <code>getRuleStatus</code> after obtaining a boundary 979 * position from <code>next()<code>, <code>previous()</code>, or 980 * any other break iterator functions that returns a boundary position. 981 * <p> 982 * @return the status from the break rule that determined the most recently 983 * returned break position. 984 * 985 * @draft ICU 3.0 (retain) 986 * @provisional This is a draft API and might change in a future release of ICU. 987 */ 988 getRuleStatus()989 public int getRuleStatus() { 990 makeRuleStatusValid(); 991 // Status records have this form: 992 // Count N <-- fLastRuleStatusIndex points here. 993 // Status val 0 994 // Status val 1 995 // ... 996 // Status val N-1 <-- the value we need to return 997 // The status values are sorted in ascending order. 998 // This function returns the last (largest) of the array of status values. 999 int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex]; 1000 int tagVal = fRData.fStatusTable[idx]; 1001 return tagVal; 1002 } 1003 1004 /** 1005 * Get the status (tag) values from the break rule(s) that determined the most 1006 * recently returned break position. The values appear in the rule source 1007 * within brackets, {123}, for example. The default status value for rules 1008 * that do not explicitly provide one is zero. 1009 * <p> 1010 * The status values used by the standard ICU break rules are defined 1011 * as public constants in class RuleBasedBreakIterator. 
1012 * <p> 1013 * If the size of the output array is insufficient to hold the data, 1014 * the output will be truncated to the available length. No exception 1015 * will be thrown. 1016 * 1017 * @param fillInArray an array to be filled in with the status values. 1018 * @return The number of rule status values from rules that determined 1019 * the most recent boundary returned by the break iterator. 1020 * In the event that the array is too small, the return value 1021 * is the total number of status values that were available, 1022 * not the reduced number that were actually returned. 1023 * @draft ICU 3.0 (retain) 1024 * @provisional This is a draft API and might change in a future release of ICU. 1025 */ getRuleStatusVec(int[] fillInArray)1026 public int getRuleStatusVec(int[] fillInArray) { 1027 makeRuleStatusValid(); 1028 int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex]; 1029 if (fillInArray != null) { 1030 int numToCopy = Math.min(numStatusVals, fillInArray.length); 1031 for (int i=0; i<numToCopy; i++) { 1032 fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1]; 1033 } 1034 } 1035 return numStatusVals; 1036 } 1037 1038 /** 1039 * Return a CharacterIterator over the text being analyzed. This version 1040 * of this method returns the actual CharacterIterator we're using internally. 1041 * Changing the state of this iterator can have undefined consequences. If 1042 * you need to change it, clone it first. 1043 * @return An iterator over the text being analyzed. 1044 * @stable ICU 2.0 1045 */ getText()1046 public CharacterIterator getText() { 1047 return fText; 1048 } 1049 1050 /** 1051 * Set the iterator to analyze a new piece of text. This function resets 1052 * the current iteration position to the beginning of the text. 1053 * @param newText An iterator over the text to analyze. 
1054 * @stable ICU 2.0 1055 */ setText(CharacterIterator newText)1056 public void setText(CharacterIterator newText) { 1057 fText = newText; 1058 // first() resets the caches 1059 this.first(); 1060 } 1061 1062 /** 1063 * package private 1064 */ setBreakType(int type)1065 void setBreakType(int type) { 1066 fBreakType = type; 1067 } 1068 1069 /** 1070 * package private 1071 */ getBreakType()1072 int getBreakType() { 1073 return fBreakType; 1074 } 1075 1076 /** 1077 * Control debug, trace and dump options. 1078 * @internal 1079 */ 1080 static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? 1081 ICUDebug.value(RBBI_DEBUG_ARG) : null; 1082 1083 getLanguageBreakEngine(int c)1084 private LanguageBreakEngine getLanguageBreakEngine(int c) { 1085 1086 // We have a dictionary character. 1087 // Does an already instantiated break engine handle it? 1088 for (LanguageBreakEngine candidate : fBreakEngines.values()) { 1089 if (candidate.handles(c, fBreakType)) { 1090 return candidate; 1091 } 1092 } 1093 1094 // if we don't have an existing engine, build one. 1095 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 1096 if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { 1097 // Katakana, Hiragana and Han are handled by the same dictionary engine. 1098 // Fold them together for mapping from script -> engine. 
1099 script = UScript.HAN; 1100 } 1101 1102 LanguageBreakEngine eng = fBreakEngines.get(script); 1103 /* 1104 if (eng != null && !eng.handles(c, fBreakType)) { 1105 fUnhandledBreakEngine.handleChar(c, getBreakType()); 1106 eng = fUnhandledBreakEngine; 1107 } else */ { 1108 try { 1109 switch (script) { 1110 case UScript.THAI: 1111 eng = new ThaiBreakEngine(); 1112 break; 1113 case UScript.LAO: 1114 eng = new LaoBreakEngine(); 1115 break; 1116 case UScript.MYANMAR: 1117 eng = new BurmeseBreakEngine(); 1118 break; 1119 case UScript.KHMER: 1120 eng = new KhmerBreakEngine(); 1121 break; 1122 case UScript.HAN: 1123 if (getBreakType() == KIND_WORD) { 1124 eng = new CjkBreakEngine(false); 1125 } 1126 else { 1127 fUnhandledBreakEngine.handleChar(c, getBreakType()); 1128 eng = fUnhandledBreakEngine; 1129 } 1130 break; 1131 case UScript.HANGUL: 1132 if (getBreakType() == KIND_WORD) { 1133 eng = new CjkBreakEngine(true); 1134 } else { 1135 fUnhandledBreakEngine.handleChar(c, getBreakType()); 1136 eng = fUnhandledBreakEngine; 1137 } 1138 break; 1139 default: 1140 fUnhandledBreakEngine.handleChar(c, getBreakType()); 1141 eng = fUnhandledBreakEngine; 1142 break; 1143 } 1144 } catch (IOException e) { 1145 eng = null; 1146 } 1147 } 1148 1149 if (eng != null && eng != fUnhandledBreakEngine) { 1150 LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng); 1151 if (existingEngine != null) { 1152 // There was a race & another thread was first to register an engine for this script. 1153 // Use theirs and discard the one we just created. 1154 eng = existingEngine; 1155 } 1156 // assert eng.handles(c, fBreakType); 1157 } 1158 return eng; 1159 } 1160 1161 1162 1163 /** 1164 * The State Machine Engine for moving forward is here. 1165 * This function is the heart of the RBBI run time engine. 
/**
 * The State Machine Engine for moving forward is here.
 * This function is the heart of the RBBI run time engine.
 * <p>
 * Starting at the current position of fText, runs the supplied state table
 * over successive code points until the machine reaches STOP_STATE or the
 * end of the text, remembering the last accepting position seen.  On return
 * the text iterator is positioned at the returned boundary, and
 * fLastRuleStatusIndex / fLastStatusIndexValid describe that boundary.
 *
 * @param stateTable the state transition table to run (a caller in this
 *                   file passes fRData.fSFTable).
 * @return the new iterator position, or BreakIterator.DONE if the starting
 *         character is already past the end of the text.
 *
 * A note on supplementary characters and the position of underlying
 * Java CharacterIterator:   Normally, a character iterator is positioned at
 * the char most recently returned by next().  Within this function, when
 * a supplementary char is being processed, the char iterator is left
 * sitting on the trail surrogate, in the middle of the code point.
 * This is different from everywhere else, where an iterator always
 * points at the lead surrogate of a supplementary.
 */
private int handleNext(short stateTable[]) {
    if (TRACE) {
        System.out.println("Handle Next pos char state category");
    }

    // No matter what, handleNext always correctly sets the break tag value.
    fLastStatusIndexValid = true;
    fLastRuleStatusIndex = 0;

    // caches for quicker access
    CharacterIterator text = fText;
    CharTrie trie = fRData.fTrie;

    // Set up the starting char.  A value at or above the lead-surrogate range may be
    // half of a surrogate pair (or the iterator's DONE value), so let nextTrail32()
    // resolve it to a full code point or DONE32.
    int c = text.current();
    if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
        c = nextTrail32(text, c);
        if (c == DONE32) {
            return BreakIterator.DONE;
        }
    }
    int initialPosition = text.getIndex();
    int result = initialPosition;

    // Set the initial state for the state machine
    int state = START_STATE;
    int row = fRData.getRowIndex(state);
    short category = 3;
    int flagsState = fRData.getStateTableFlags(stateTable);
    int mode = RBBI_RUN;
    if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
        // The rules want an explicit beginning-of-input step: the first loop
        // iteration uses category 2 instead of a category taken from the text.
        category = 2;
        mode = RBBI_START;
        if (TRACE) {
            System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
            System.out.print(RBBIDataWrapper.intToHexString(c, 10));
            System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
        }
    }
    // State of a pending look-ahead ("/") match: the status value that will complete
    // it, and the tag index and text position to report if it does complete.
    int lookaheadStatus = 0;
    int lookaheadTagIdx = 0;
    int lookaheadResult = 0;

    // loop until we reach the end of the text or transition to state 0
    while (state != STOP_STATE) {
        if (c == DONE32) {
            // Reached end of input string.
            if (mode == RBBI_END) {
                // We have already run the loop one last time with the
                // character set to the pseudo {eof} value. Now it is time
                // to unconditionally bail out.

                if (lookaheadResult > result) {
                    // We ran off the end of the string with a pending
                    // look-ahead match.
                    // Treat this as if the look-ahead condition had been
                    // met, and return
                    // the match at the / position from the look-ahead rule.
                    result = lookaheadResult;
                    fLastRuleStatusIndex = lookaheadTagIdx;
                }
                break;
            }
            // Run the loop one last time with the fake end-of-input character category
            mode = RBBI_END;
            category = 1;
        }
        else if (mode == RBBI_RUN) {
            // Get the char category.  An incoming category of 1 or 2 means that
            // we are preset for doing the beginning or end of input, and
            // that we shouldn't get a category from an actual text input character.
            //

            // look up the current character's character category, which tells us
            // which column in the state table to look at.
            //
            category = (short) trie.getCodePointValue(c);

            // Check the dictionary bit in the character's category.
            // Counter is only used by dictionary based iterators (subclasses).
            // Chars that need to be handled by a dictionary have a flag bit set
            // in their category values.
            //
            if ((category & 0x4000) != 0) {
                fDictionaryCharCount++;
                // And off the dictionary flag bit.
                category &= ~0x4000;
            }

            if (TRACE) {
                System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
                System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
            }

            // Advance to the next character.
            // (A beginning-of-input iteration does not reach this branch, so it does
            //  not advance; the next iteration processes the first real input character.)
            c = (int)text.next();
            if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
                c = nextTrail32(text, c);
            }
        }
        else {
            // Beginning-of-input iteration (mode was RBBI_START): keep the preset
            // category, consume nothing, and switch to normal running.
            mode = RBBI_RUN;
        }

        // look up a state transition in the state table
        state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
        row = fRData.getRowIndex(state);

        if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
            // Match found, common case
            result = text.getIndex();
            if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                // The iterator has been left in the middle of a surrogate pair.
                // We want the start of it.
                result--;
            }

            // Remember the break status (tag) values.
            fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
        }

        if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
            if (lookaheadStatus != 0
                && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
                // Lookahead match is completed.  Report the position and tag that
                // were saved when the '/' was passed.
                result = lookaheadResult;
                fLastRuleStatusIndex = lookaheadTagIdx;
                lookaheadStatus = 0;
                // TODO: make a standalone hard break in a rule work.
                if ((flagsState & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0) {
                    // Hard-break option: stop immediately at the look-ahead position.
                    text.setIndex(result);
                    return result;
                }
                // Look-ahead completed, but other rules may match further.  Continue on.
                // TODO: junk this feature?  I don't think it's used anywhere.
                continue;
            }

            // Passing a possible look-ahead position: remember where we are and what
            // status value would complete the match.
            lookaheadResult = text.getIndex();
            if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                // The iterator has been left in the middle of a surrogate pair.
                // We want the beginning of it.
                lookaheadResult--;
            }
            lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
            lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
            continue;
        }

        if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
            // Because this is an accepting state, any in-progress look-ahead match
            // is no longer relevant.  Clear out the pending lookahead status.
            lookaheadStatus = 0;
        }
    } // End of state machine main loop

    // The state machine is done.  Check whether it found a match...

    // If the iterator failed to advance in the match engine force it ahead by one.
    // This indicates a defect in the break rules, which should always match
    // at least one character.

    if (result == initialPosition) {
        if (TRACE) {
            System.out.println("Iterator did not move. Advancing by 1.");
        }
        text.setIndex(initialPosition);
        next32(text);
        result = text.getIndex();
    }
    else {
        // Leave the iterator at our result position.
        // (we may have advanced beyond the last accepting position chasing after
        // longer matches that never completed.)
        text.setIndex(result);
    }
    if (TRACE) {
        System.out.println("result = " + result);
    }
    return result;
}

/**
 * The state machine engine for moving backward.
 * <p>
 * Starting at the current position of fText, runs the supplied state table
 * over the preceding code points (obtained with previous32) until the machine
 * reaches STOP_STATE or runs off the start of the text, and leaves the text
 * iterator at the returned position.  Unlike handleNext(), this method does not
 * compute the rule status; it marks the cached status as invalid instead.
 *
 * @param stateTable the state transition table to run (a caller in this
 *                   file passes fRData.fSRTable).
 * @return the new iterator position, or 0 if there is no text or no state table.
 */
private int handlePrevious(short stateTable[]) {
    if (fText == null || stateTable == null) {
        return 0;
    }

    int state;
    int category = 0;
    int mode;
    int row;
    int c;
    int lookaheadStatus = 0;
    int result = 0;
    int initialPosition = 0;
    int lookaheadResult = 0;
    boolean lookAheadHardBreak =
        (fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;

    // handlePrevious() never gets the rule status.
    // Flag the status as invalid; if the user ever asks for status, we will need
    // to back up, then re-find the break position using handleNext(), which does
    // get the status value.
    fLastStatusIndexValid = false;
    fLastRuleStatusIndex = 0;

    // set up the starting char
    initialPosition = fText.getIndex();
    result = initialPosition;
    c = previous32(fText);

    // Set up the initial state for the state machine
    state = START_STATE;
    row = fRData.getRowIndex(state);
    category = 3; // TODO: obsolete? from the old start/run mode scheme?
    mode = RBBI_RUN;
    if ((fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
        // First iteration uses the preset category 2 rather than a character category.
        category = 2;
        mode = RBBI_START;
    }

    if (TRACE) {
        System.out.println("Handle Prev pos char state category ");
    }

    // loop until we reach the beginning of the text or transition to state 0
    //
    mainLoop: for (;;) {
        // innerBlock exists so that "break innerBlock" can skip the remainder of the
        // transition handling while still running the loop-tail code below it.
        innerBlock: {
            if (c == DONE32) {
                // Ran off the start of the text (previous32 had nothing more to return).
                if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
                    // Either this is the old (ICU 3.2 and earlier) format data which
                    // does not support explicit support for matching {eof}, or
                    // we have already done the {eof} iteration. Now is the time
                    // to unconditionally bail out.
                    if (lookaheadResult < result) {
                        // We ran off the end of the string with a pending look-ahead match.
                        // Treat this as if the look-ahead condition had been met, and return
                        // the match at the / position from the look-ahead rule.
                        // NOTE(review): lookaheadResult starts at 0, so this branch is also
                        // taken when no look-ahead was ever recorded and result > 0 --
                        // confirm that is intended.
                        result = lookaheadResult;
                        lookaheadStatus = 0;
                    } else if (result == initialPosition) {
                        // Ran off start, no match found.
                        // Move one position (towards the start, since we are doing previous.)
                        fText.setIndex(initialPosition);
                        previous32(fText);
                    }
                    break mainLoop;
                }
                // Run the loop one more time with the pseudo {eof} category.
                mode = RBBI_END;
                category = 1;
            }

            if (mode == RBBI_RUN) {
                // look up the current character's category, which tells us
                // which column in the state table to look at.
                //
                category = (short) fRData.fTrie.getCodePointValue(c);

                // Check the dictionary bit in the character's category.
                // Counter is only used by dictionary based iterators (subclasses).
                // Chars that need to be handled by a dictionary have a flag bit set
                // in their category values.
                //
                if ((category & 0x4000) != 0) {
                    fDictionaryCharCount++;
                    // And off the dictionary flag bit.
                    category &= ~0x4000;
                }
            }


            if (TRACE) {
                System.out.print(" " + fText.getIndex() + " ");
                if (0x20 <= c && c < 0x7f) {
                    System.out.print(" " + c + " ");
                } else {
                    System.out.print(" " + Integer.toHexString(c) + " ");
                }
                System.out.println(" " + state + " " + category + " ");
            }

            // State Transition - move machine to its next state
            //
            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
            row = fRData.getRowIndex(state);

            if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
                // Match found, common case, could have lookahead so we move
                // on to check it
                result = fText.getIndex();
            }

            if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
                if (lookaheadStatus != 0
                        && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
                    // Lookahead match is completed.  Report the position that was
                    // saved when the '/' was passed.
                    result = lookaheadResult;
                    lookaheadStatus = 0;
                    // TODO: make a stand-alone hard break in a rule work.

                    if (lookAheadHardBreak) {
                        break mainLoop;
                    }
                    // Look-ahead completed, but other rules may match further.
                    // Continue on.
                    // TODO: junk this feature?  I don't think that it's used anywhere.
                    break innerBlock;
                }
                // Hit a possible look-ahead match. We are at the
                // position of the '/'. Remember this position.
                lookaheadResult = fText.getIndex();
                lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
                break innerBlock;
            }

            // not lookahead...
            if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
                // This is a plain (non-look-ahead) accepting state.
                if (!lookAheadHardBreak) {
                    // Clear out any pending look-ahead matches,
                    // but only if not doing the lookAheadHardBreak option
                    // which needs to force a break no matter what is going
                    // on with the rest of the match, i.e. we can't abandon
                    // a partially completed look-ahead match because
                    // some other rule matched further than the '/' position
                    // in the look-ahead match.
                    lookaheadStatus = 0;
                }
            }

        } // end of innerBlock.  "break innerBlock" in above code comes out here.


        if (state == STOP_STATE) {
            // Normal loop exit is here
            break mainLoop;
        }

        // then move iterator position backwards one character
        // (only in normal running; a beginning-of-input iteration consumes nothing
        //  and just switches to RBBI_RUN, and an {eof} iteration leaves c untouched)
        //
        if (mode == RBBI_RUN) {
            c = previous32(fText);
        } else {
            if (mode == RBBI_START) {
                mode = RBBI_RUN;
            }
        }


    } // End of the main loop.

    // The state machine is done.  Check whether it found a match...
    //
    // If the iterator failed to move in the match engine, force it back by one
    // code point (this is the backward direction, so previous32 is used).
    // (This really indicates a defect in the break rules.  They should always match
    // at least one character.)
    if (result == initialPosition) {
        // NOTE(review): the value assigned from setIndex() here is overwritten two
        // statements below; only the repositioning side effect matters.
        result = fText.setIndex(initialPosition);
        previous32(fText);
        result = fText.getIndex();
    }

    fText.setIndex(result);
    if (TRACE) {
        System.out.println("Result = " + result);
    }

    return result;
}
}