1 /*
2  *******************************************************************************
3  * Copyright (C) 2005-2015 International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7 
8 package com.ibm.icu.text;
9 
10 import static com.ibm.icu.impl.CharacterIteration.DONE32;
11 import static com.ibm.icu.impl.CharacterIteration.next32;
12 import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
13 import static com.ibm.icu.impl.CharacterIteration.previous32;
14 
15 import java.io.ByteArrayOutputStream;
16 import java.io.IOException;
17 import java.io.InputStream;
18 import java.io.OutputStream;
19 import java.nio.ByteBuffer;
20 import java.text.CharacterIterator;
21 import java.util.concurrent.ConcurrentHashMap;
22 
23 import com.ibm.icu.impl.Assert;
24 import com.ibm.icu.impl.CharTrie;
25 import com.ibm.icu.impl.CharacterIteration;
26 import com.ibm.icu.impl.ICUBinary;
27 import com.ibm.icu.impl.ICUDebug;
28 import com.ibm.icu.lang.UCharacter;
29 import com.ibm.icu.lang.UProperty;
30 import com.ibm.icu.lang.UScript;
31 
32 /**
33  * Rule Based Break Iterator
34  * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
35  *
36  * @stable ICU 2.0
37  */
38 public class RuleBasedBreakIterator extends BreakIterator {
39     //=======================================================================
40     // Constructors & Factories
41     //=======================================================================
42 
43     /**
44      * private constructor
45      */
RuleBasedBreakIterator()46     private RuleBasedBreakIterator() {
47         fLastStatusIndexValid = true;
48         fDictionaryCharCount  = 0;
49         fBreakEngines.put(-1, fUnhandledBreakEngine);
50     }
51 
52     /**
53      * Create a break iterator from a precompiled set of break rules.
54      *
55      * Creating a break iterator from the binary rules is much faster than
56      * creating one from source rules.
57      *
58      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
59      * Binary break iterator rules are not guaranteed to be compatible between
60      * different versions of ICU.
61      *
62      * @param is an input stream supplying the compiled binary rules.
63      * @throws IOException if there is an error while reading the rules from the InputStream.
64      * @see    #compileRules(String, OutputStream)
65      * @stable ICU 4.8
66      */
getInstanceFromCompiledRules(InputStream is)67     public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
68         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
69         This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
70         return This;
71     }
72 
73     /**
74      * Create a break iterator from a precompiled set of break rules.
75      *
76      * Creating a break iterator from the binary rules is much faster than
77      * creating one from source rules.
78      *
79      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
80      * Binary break iterator rules are not guaranteed to be compatible between
81      * different versions of ICU.
82      *
83      * @param bytes a buffer supplying the compiled binary rules.
84      * @throws IOException if there is an error while reading the rules from the buffer.
85      * @see    #compileRules(String, OutputStream)
86      * @internal
87      * @deprecated This API is ICU internal only.
88      */
89     @Deprecated
getInstanceFromCompiledRules(ByteBuffer bytes)90     public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
91         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
92         This.fRData = RBBIDataWrapper.get(bytes);
93         return This;
94     }
95 
96     /**
97      * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
98      * @param rules The break rules to be used.
99      * @stable ICU 2.2
100      */
RuleBasedBreakIterator(String rules)101     public RuleBasedBreakIterator(String rules)  {
102         this();
103         try {
104             ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
105             compileRules(rules, ruleOS);
106             fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
107         } catch (IOException e) {
108             ///CLOVER:OFF
109             // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
110             //  causing bogus compiled rules to be produced, but with no compile error raised.
111             RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
112                     + e.getMessage());
113             throw rte;
114             ///CLOVER:ON
115         }
116     }
117 
118     //=======================================================================
119     // Boilerplate
120     //=======================================================================
121 
122     /**
123      * Clones this iterator.
124      * @return A newly-constructed RuleBasedBreakIterator with the same
125      * behavior as this one.
126      * @stable ICU 2.0
127      */
clone()128     public Object clone()
129     {
130         RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone();
131         if (fText != null) {
132             result.fText = (CharacterIterator)(fText.clone());
133         }
134         return result;
135     }
136 
137     /**
138      * Returns true if both BreakIterators are of the same class, have the same
139      * rules, and iterate over the same text.
140      * @stable ICU 2.0
141      */
equals(Object that)142     public boolean equals(Object that) {
143         if (that == null) {
144             return false;
145         }
146         if (this == that) {
147             return true;
148         }
149         try {
150             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
151             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
152                 return false;
153             }
154             if (fRData != null && other.fRData != null &&
155                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
156                 return false;
157             }
158             if (fText == null && other.fText == null) {
159                 return true;
160             }
161             if (fText == null || other.fText == null) {
162                 return false;
163             }
164             return fText.equals(other.fText);
165         }
166         catch(ClassCastException e) {
167             return false;
168         }
169      }
170 
171     /**
172      * Returns the description (rules) used to create this iterator.
173      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
174      * @stable ICU 2.0
175      */
toString()176     public String toString() {
177         String retStr = "";
178         if (fRData != null) {
179             retStr =  fRData.fRuleSource;
180         }
181         return retStr;
182     }
183 
184     /**
185      * Compute a hashcode for this BreakIterator
186      * @return A hash code
187      * @stable ICU 2.0
188      */
hashCode()189     public int hashCode()
190     {
191         return fRData.fRuleSource.hashCode();
192     }
193 
194 
195     private static final int  START_STATE = 1;     // The state number of the starting state
196     private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"
197 
198     // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
199     //               of user text.  A variable with this enum type keeps track of where we
200     //               are.  The state machine only fetches user text input while in RUN mode.
201     private static final int  RBBI_START  = 0;
202     private static final int  RBBI_RUN    = 1;
203     private static final int  RBBI_END    = 2;
204 
205     /*
206      * The character iterator through which this BreakIterator accesses the text.
207      */
208     private CharacterIterator   fText = new java.text.StringCharacterIterator("");
209 
210     /**
211      * The rule data for this BreakIterator instance. Package private.
212      */
213     RBBIDataWrapper             fRData;
214 
215     /*
216      * Index of the Rule {tag} values for the most recent match.
217      */
218     private int                 fLastRuleStatusIndex;
219 
220     /*
221      * Rule tag value valid flag.
222      * Some iterator operations don't intrinsically set the correct tag value.
223      * This flag lets us lazily compute the value if we are ever asked for it.
224      */
225     private boolean             fLastStatusIndexValid;
226 
227     /**
228      * Counter for the number of characters encountered with the "dictionary"
229      *   flag set.  Normal RBBI iterators don't use it, although the code
230      *   for updating it is live.  Dictionary Based break iterators (a subclass
231      *   of us) access this field directly.
232      * @internal
233      */
234     private int fDictionaryCharCount;
235 
236     /*
237      * ICU debug argument name for RBBI
238      */
239     private static final String RBBI_DEBUG_ARG = "rbbi";
240 
241     /**
242      * Debugging flag.  Trace operation of state machine when true.
243      */
244     private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
245             && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
246 
247     /**
248      * What kind of break iterator this is. Set to KIND_LINE by default,
249      * since this produces sensible output.
250      */
251     private int fBreakType = KIND_LINE;
252 
253     /**
254      * The "default" break engine - just skips over ranges of dictionary words,
255      * producing no breaks. Should only be used if characters need to be handled
256      * by a dictionary but we have no dictionary implementation for them.
257      */
258     private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();
259 
260     /**
261      * when a range of characters is divided up using the dictionary, the break
262      * positions that are discovered are stored here, preventing us from having
263      * to use either the dictionary or the state table again until the iterator
264      * leaves this range of text
265      */
266     private int[] fCachedBreakPositions;
267 
268     /**
269      * if fCachedBreakPositions is not null, this indicates which item in the
270      * cache the current iteration position refers to
271      */
272     private int fPositionInCache;
273 
274 
275     private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines =
276             new ConcurrentHashMap<Integer, LanguageBreakEngine>();
277     /**
278      * Dumps caches and performs other actions associated with a complete change
279      * in text or iteration position.
280      */
reset()281     private void reset() {
282         fCachedBreakPositions = null;
283         // fNumCachedBreakPositions = 0;
284         fDictionaryCharCount = 0;
285         fPositionInCache = 0;
286 
287     }
288     /**
289      * Dump the contents of the state table and character classes for this break iterator.
290      * For debugging only.
291      * @internal
292      * @deprecated This API is ICU internal only.
293      */
294     @Deprecated
dump()295     public void dump() {
296         this.fRData.dump();
297     }
298 
299     /**
300      * Compile a set of source break rules into the binary state tables used
301      * by the break iterator engine.  Creating a break iterator from precompiled
302      * rules is much faster than creating one from source rules.
303      *
304      * Binary break rules are not guaranteed to be compatible between different
305      * versions of ICU.
306      *
307      *
308      * @param rules  The source form of the break rules
309      * @param ruleBinary  An output stream to receive the compiled rules.
310      * @throws IOException If there is an error writing the output.
311      * @see #getInstanceFromCompiledRules(InputStream)
312      * @stable ICU 4.8
313      */
compileRules(String rules, OutputStream ruleBinary)314     public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
315         RBBIRuleBuilder.compileRules(rules, ruleBinary);
316     }
317 
318     //=======================================================================
319     // BreakIterator overrides
320     //=======================================================================
321 
322     /**
323      * Sets the current iteration position to the beginning of the text.
324      * (i.e., the CharacterIterator's starting offset).
325      * @return The offset of the beginning of the text.
326      * @stable ICU 2.0
327      */
first()328     public int first() {
329         fCachedBreakPositions = null;
330         fDictionaryCharCount = 0;
331         fPositionInCache = 0;
332         fLastRuleStatusIndex  = 0;
333         fLastStatusIndexValid = true;
334         if (fText == null) {
335             return BreakIterator.DONE;
336         }
337         fText.first();
338         return fText.getIndex();
339     }
340 
341     /**
342      * Sets the current iteration position to the end of the text.
343      * (i.e., the CharacterIterator's ending offset).
344      * @return The text's past-the-end offset.
345      * @stable ICU 2.0
346      */
last()347     public int last() {
348         fCachedBreakPositions = null;
349         fDictionaryCharCount = 0;
350         fPositionInCache = 0;
351 
352         if (fText == null) {
353             fLastRuleStatusIndex  = 0;
354             fLastStatusIndexValid = true;
355             return BreakIterator.DONE;
356         }
357 
358         // t.last() returns the offset of the last character,
359         // rather than the past-the-end offset
360         // so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
361         // will work correctly.
362         fLastStatusIndexValid = false;
363         int pos = fText.getEndIndex();
364         fText.setIndex(pos);
365         return pos;
366     }
367 
368     /**
369      * Advances the iterator either forward or backward the specified number of steps.
370      * Negative values move backward, and positive values move forward.  This is
371      * equivalent to repeatedly calling next() or previous().
372      * @param n The number of steps to move.  The sign indicates the direction
373      * (negative is backwards, and positive is forwards).
374      * @return The character offset of the boundary position n boundaries away from
375      * the current one.
376      * @stable ICU 2.0
377      */
next(int n)378     public int next(int n) {
379         int result = current();
380         while (n > 0) {
381             result = next();
382             --n;
383         }
384         while (n < 0) {
385             result = previous();
386             ++n;
387         }
388         return result;
389     }
390 
391     /**
392      * Advances the iterator to the next boundary position.
393      * @return The position of the first boundary after this one.
394      * @stable ICU 2.0
395      */
next()396     public int next() {
397         // if we have cached break positions and we're still in the range
398         // covered by them, just move one step forward in the cache
399         if (fCachedBreakPositions != null) {
400             if (fPositionInCache < fCachedBreakPositions.length - 1) {
401                 ++fPositionInCache;
402                 int pos = fCachedBreakPositions[fPositionInCache];
403                 fText.setIndex(pos);
404                 return pos;
405             }
406             else {
407                 reset();
408             }
409         }
410 
411         int startPos = current();
412         fDictionaryCharCount = 0;
413         int result = handleNext(fRData.fFTable);
414         if (fDictionaryCharCount > 0) {
415             result = checkDictionary(startPos, result, false);
416         }
417         return result;
418     }
419 
420     /**
421       *  checkDictionary      This function handles all processing of characters in
422       *                       the "dictionary" set. It will determine the appropriate
423       *                       course of action, and possibly set up a cache in the
424       *                       process.
425       */
checkDictionary(int startPos, int endPos, boolean reverse)426     private int checkDictionary(int startPos, int endPos, boolean reverse) {
427 
428         // Reset the old break cache first.
429         reset();
430 
431         // note: code segment below assumes that dictionary chars are in the
432         // startPos-endPos range
433         // value returned should be next character in sequence
434         if ((endPos - startPos) <= 1) {
435             return (reverse ? startPos : endPos);
436         }
437 
438         // Starting from the starting point, scan towards the proposed result,
439         // looking for the first dictionary character (which may be the one
440         // we're on, if we're starting in the middle of a range).
441         fText.setIndex(reverse ? endPos : startPos);
442         if (reverse) {
443             CharacterIteration.previous32(fText);
444         }
445 
446         int  rangeStart = startPos;
447         int  rangeEnd = endPos;
448 
449         int    category;
450         int    current;
451         DictionaryBreakEngine.DequeI breaks = new DictionaryBreakEngine.DequeI();
452         int     foundBreakCount = 0;
453         int     c = CharacterIteration.current32(fText);
454         category = (short)fRData.fTrie.getCodePointValue(c);
455 
456         // Is the character we're starting on a dictionary character? If so, we
457         // need to back up to include the entire run; otherwise the results of
458         // the break algorithm will differ depending on where we start. Since
459         // the result is cached and there is typically a non-dictionary break
460         // within a small number of words, there should be little performance impact.
461         if ((category & 0x4000) != 0) {
462             if (reverse) {
463                 do {
464                     CharacterIteration.next32(fText);
465                     c = CharacterIteration.current32(fText);
466                     category = (short)fRData.fTrie.getCodePointValue(c);
467                 } while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0);
468 
469                 // Back up to the last dictionary character
470                 rangeEnd = fText.getIndex();
471                 if (c == CharacterIteration.DONE32) {
472                     // c = fText->last32();
473                     //   TODO:  why was this if needed?
474                     c = CharacterIteration.previous32(fText);
475                 }
476                 else {
477                     c = CharacterIteration.previous32(fText);
478                 }
479             }
480             else {
481                 do {
482                     c = CharacterIteration.previous32(fText);
483                     category = (short)fRData.fTrie.getCodePointValue(c);
484                 }
485                 while (c != CharacterIteration.DONE32 && ((category & 0x4000) != 0));
486                 // Back up to the last dictionary character
487                 if (c == CharacterIteration.DONE32) {
488                     // c = fText->first32();
489                     c = CharacterIteration.current32(fText);
490                 }
491                 else {
492                     CharacterIteration.next32(fText);
493                     c = CharacterIteration.current32(fText);
494                 }
495                 rangeStart = fText.getIndex();
496             }
497             category = (short)fRData.fTrie.getCodePointValue(c);
498         }
499 
500 
501         // Loop through the text, looking for ranges of dictionary characters.
502         // For each span, find the appropriate break engine, and ask it to find
503         // any breaks within the span.
504         // Note: we always do this in the forward direction, so that the break
505         // cache is built in the right order.
506         if (reverse) {
507             fText.setIndex(rangeStart);
508             c = CharacterIteration.current32(fText);
509             category = (short)fRData.fTrie.getCodePointValue(c);
510         }
511         LanguageBreakEngine lbe = null;
512         while(true) {
513             while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) {
514                 CharacterIteration.next32(fText);
515                 c = CharacterIteration.current32(fText);
516                 category = (short)fRData.fTrie.getCodePointValue(c);
517             }
518             if (current >= rangeEnd) {
519                 break;
520             }
521 
522             // We now have a dictionary character. Get the appropriate language object
523             // to deal with it.
524             lbe = getLanguageBreakEngine(c);
525 
526             // Ask the language object if there are any breaks. It will leave the text
527             // pointer on the other side of its range, ready to search for the next one.
528             if (lbe != null) {
529                 int startingIdx = fText.getIndex();
530                 foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, false, fBreakType, breaks);
531                 assert fText.getIndex() > startingIdx;
532             }
533 
534             // Reload the loop variables for the next go-round
535             c = CharacterIteration.current32(fText);
536             category = (short)fRData.fTrie.getCodePointValue(c);
537         }
538 
539         // If we found breaks, build a new break cache. The first and last entries must
540         // be the original starting and ending position.
541         if (foundBreakCount > 0) {
542             if (foundBreakCount != breaks.size()) {
543                 System.out.println("oops, foundBreakCount != breaks.size().  LBE = " + lbe.getClass());
544             }
545             assert foundBreakCount == breaks.size();
546             if (startPos < breaks.peekLast()) {
547                 breaks.offer(startPos);
548             }
549             if (endPos > breaks.peek()) {
550                 breaks.push(endPos);
551             }
552 
553             // TODO: get rid of this array, use results from the deque directly
554             fCachedBreakPositions = new int[breaks.size()];
555 
556             int i = 0;
557             while (breaks.size() > 0) {
558                 fCachedBreakPositions[i++] = breaks.pollLast();
559             }
560 
561             // If there are breaks, then by definition, we are replacing the original
562             // proposed break by one of the breaks we found. Use following() and
563             // preceding() to do the work. They should never recurse in this case.
564             if (reverse) {
565                 return preceding(endPos);
566             }
567             else {
568                 return following(startPos);
569             }
570         }
571 
572         // If we get here, there were no language-based breaks. Set the text pointer
573         // to the original proposed break.
574         fText.setIndex(reverse ? startPos : endPos);
575         return (reverse ? startPos : endPos);
576 
577         }
578 
579 
580     /**
581      * Moves the iterator backwards, to the last boundary preceding this one.
582      * @return The position of the last boundary position preceding this one.
583      * @stable ICU 2.0
584      */
previous()585     public int previous() {
586         int result;
587         int startPos;
588 
589         CharacterIterator text = getText();
590 
591         fLastStatusIndexValid = false;
592 
593         // if we have cached break positions and we're still in the range
594         // covered by them, just move one step backward in the cache
595         if (fCachedBreakPositions != null) {
596             if (fPositionInCache > 0) {
597                 --fPositionInCache;
598                 // If we're at the beginning of the cache, need to reevaluate the
599                 // rule status
600                 if (fPositionInCache <= 0) {
601                     fLastStatusIndexValid = false;
602                 }
603                 int pos = fCachedBreakPositions[fPositionInCache];
604                 text.setIndex(pos);
605                 return pos;
606             } else {
607                 reset();
608             }
609         }
610 
611         // if we're already sitting at the beginning of the text, return DONE
612         startPos = current();
613         if (fText == null || startPos == fText.getBeginIndex()) {
614             fLastRuleStatusIndex  = 0;
615             fLastStatusIndexValid = true;
616             return BreakIterator.DONE;
617         }
618 
619         // Rules with an exact reverse table are handled here.
620         if (fRData.fSRTable != null || fRData.fSFTable != null) {
621             result =  handlePrevious(fRData.fRTable);
622             if (fDictionaryCharCount > 0) {
623                 result = checkDictionary(result, startPos, true);
624             }
625             return result;
626         }
627 
628         // old rule syntax
629         // set things up.  handlePrevious() will back us up to some valid
630         // break position before the current position (we back our internal
631         // iterator up one step to prevent handlePrevious() from returning
632         // the current position), but not necessarily the last one before
633         // where we started
634 
635         int       start = current();
636 
637         previous32(fText);
638         int       lastResult    = handlePrevious(fRData.fRTable);
639         if (lastResult == BreakIterator.DONE) {
640             lastResult = fText.getBeginIndex();
641             fText.setIndex(lastResult);
642         }
643         result = lastResult;
644         int      lastTag       = 0;
645         boolean  breakTagValid = false;
646 
647         // iterate forward from the known break position until we pass our
648         // starting point.  The last break position before the starting
649         // point is our return value
650 
651         for (;;) {
652             result         = next();
653             if (result == BreakIterator.DONE || result >= start) {
654                 break;
655             }
656             lastResult     = result;
657             lastTag        = fLastRuleStatusIndex;
658             breakTagValid  = true;
659         }
660 
661         // fLastBreakTag wants to have the value for section of text preceding
662         // the result position that we are to return (in lastResult.)  If
663         // the backwards rules overshot and the above loop had to do two or more
664         // handleNext()s to move up to the desired return position, we will have a valid
665         // tag value. But, if handlePrevious() took us to exactly the correct result position,
666         // we wont have a tag value for that position, which is only set by handleNext().
667 
668         // Set the current iteration position to be the last break position
669         // before where we started, and then return that value.
670         fText.setIndex(lastResult);
671         fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
672         fLastStatusIndexValid = breakTagValid;
673         return lastResult;
674     }
675 
676     /**
677      * Sets the iterator to refer to the first boundary position following
678      * the specified position.
679      * @param offset The position from which to begin searching for a break position.
680      * @return The position of the first break after the current position.
681      * @stable ICU 2.0
682      */
following(int offset)683     public int following(int offset) {
684         CharacterIterator text = getText();
685 
686         // if we have no cached break positions, or if "offset" is outside the
687         // range covered by the cache, then dump the cache and call our
688         // inherited following() method.  This will call other methods in this
689         // class that may refresh the cache.
690         if (fCachedBreakPositions == null || offset < fCachedBreakPositions[0] ||
691                 offset >= fCachedBreakPositions[fCachedBreakPositions.length - 1]) {
692             fCachedBreakPositions = null;
693             return rulesFollowing(offset);
694         }
695 
696         // on the other hand, if "offset" is within the range covered by the
697         // cache, then just search the cache for the first break position
698         // after "offset"
699         else {
700             fPositionInCache = 0;
701             while (fPositionInCache < fCachedBreakPositions.length
702                    && offset >= fCachedBreakPositions[fPositionInCache])
703                 ++fPositionInCache;
704             text.setIndex(fCachedBreakPositions[fPositionInCache]);
705             return text.getIndex();
706         }
707     }
708 
rulesFollowing(int offset)709     private int rulesFollowing(int offset) {
710         // if the offset passed in is already past the end of the text,
711         // just return DONE; if it's before the beginning, return the
712         // text's starting offset
713         fLastRuleStatusIndex  = 0;
714         fLastStatusIndexValid = true;
715         if (fText == null || offset >= fText.getEndIndex()) {
716             last();
717             return next();
718         }
719         else if (offset < fText.getBeginIndex()) {
720             return first();
721         }
722 
723         // otherwise, set our internal iteration position (temporarily)
724         // to the position passed in.  If this is the _beginning_ position,
725         // then we can just use next() to get our return value
726 
727         int result = 0;
728 
729         if (fRData.fSRTable != null) {
730             // Safe Point Reverse rules exist.
731             //   This allows us to use the optimum algorithm.
732             fText.setIndex(offset);
733             // move forward one codepoint to prepare for moving back to a
734             // safe point.
735             // this handles offset being between a supplementary character
736             next32(fText);
737             // handlePrevious will move most of the time to < 1 boundary away
738             handlePrevious(fRData.fSRTable);
739             result = next();
740             while (result <= offset) {
741                 result = next();
742             }
743             return result;
744         }
745         if (fRData.fSFTable != null) {
746             // No Safe point reverse table, but there is a safe pt forward table.
747             //
748             fText.setIndex(offset);
749             previous32(fText);
750             // handle next will give result >= offset
751             handleNext(fRData.fSFTable);
752             // previous will give result 0 or 1 boundary away from offset,
753             // most of the time
754             // we have to
755             int oldresult = previous();
756             while (oldresult > offset) {
757                 result = previous();
758                 if (result <= offset) {
759                     return oldresult;
760                 }
761                 oldresult = result;
762             }
763             result = next();
764             if (result <= offset) {
765                 return next();
766             }
767             return result;
768         }
769         // otherwise, we have to sync up first.  Use handlePrevious() to back
770         // us up to a known break position before the specified position (if
771         // we can determine that the specified position is a break position,
772         // we don't back up at all).  This may or may not be the last break
773         // position at or before our starting position.  Advance forward
774         // from here until we've passed the starting position.  The position
775         // we stop on will be the first break position after the specified one.
776         // old rule syntax
777 
778         fText.setIndex(offset);
779         if (offset == fText.getBeginIndex()) {
780             return next();
781         }
782         result = previous();
783 
784         while (result != BreakIterator.DONE && result <= offset) {
785             result = next();
786         }
787 
788         return result;
789     }
790     /**
791      * Sets the iterator to refer to the last boundary position before the
792      * specified position.
793      * @param offset The position to begin searching for a break from.
794      * @return The position of the last boundary before the starting position.
795      * @stable ICU 2.0
796      */
preceding(int offset)797     public int preceding(int offset) {
798         CharacterIterator text = getText();
799 
800         // if we have no cached break positions, or "offset" is outside the
801         // range covered by the cache, we can just call the inherited routine
802         // (which will eventually call other routines in this class that may
803         // refresh the cache)
804         if (fCachedBreakPositions == null || offset <= fCachedBreakPositions[0] ||
805                 offset > fCachedBreakPositions[fCachedBreakPositions.length - 1]) {
806             fCachedBreakPositions = null;
807             return rulesPreceding(offset);
808         }
809 
810         // on the other hand, if "offset" is within the range covered by the cache,
811         // then all we have to do is search the cache for the last break position
812         // before "offset"
813         else {
814             fPositionInCache = 0;
815             while (fPositionInCache < fCachedBreakPositions.length
816                    && offset > fCachedBreakPositions[fPositionInCache])
817                 ++fPositionInCache;
818             --fPositionInCache;
819             text.setIndex(fCachedBreakPositions[fPositionInCache]);
820             return text.getIndex();
821         }
822     }
823 
rulesPreceding(int offset)824     private int rulesPreceding(int offset) {
825         // if the offset passed in is already past the end of the text,
826         // just return DONE; if it's before the beginning, return the
827 
828         // text's starting offset
829         if (fText == null || offset > fText.getEndIndex()) {
830             // return BreakIterator::DONE;
831             return last();
832         }
833         else if (offset < fText.getBeginIndex()) {
834             return first();
835         }
836 
837         // if we start by updating the current iteration position to the
838         // position specified by the caller, we can just use previous()
839         // to carry out this operation
840 
841         int  result;
842         if (fRData.fSFTable != null) {
843             /// todo synwee
844             // new rule syntax
845             fText.setIndex(offset);
846             // move backwards one codepoint to prepare for moving forwards to a
847             // safe point.
848             // this handles offset being between a supplementary character
849             previous32(fText);
850             handleNext(fRData.fSFTable);
851             result = previous();
852             while (result >= offset) {
853                 result = previous();
854             }
855             return result;
856         }
857         if (fRData.fSRTable != null) {
858             // backup plan if forward safe table is not available
859             fText.setIndex(offset);
860             next32(fText);
861             // handle previous will give result <= offset
862             handlePrevious(fRData.fSRTable);
863 
864             // next will give result 0 or 1 boundary away from offset,
865             // most of the time
866             // we have to
867             int oldresult = next();
868             while (oldresult < offset) {
869                 result = next();
870                 if (result >= offset) {
871                     return oldresult;
872                 }
873                 oldresult = result;
874             }
875             result = previous();
876             if (result >= offset) {
877                 return previous();
878             }
879             return result;
880         }
881 
882         // old rule syntax
883         fText.setIndex(offset);
884         return previous();
885     }
886 
887     /**
     * Throw IllegalArgumentException unless begin <= offset <= end.
889      * @stable ICU 2.0
890      */
checkOffset(int offset, CharacterIterator text)891     protected static final void checkOffset(int offset, CharacterIterator text) {
892         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
893             throw new IllegalArgumentException("offset out of bounds");
894         }
895     }
896 
897 
898     /**
899      * Returns true if the specified position is a boundary position.  As a side
900      * effect, leaves the iterator pointing to the first boundary position at
901      * or after "offset".
902      * @param offset the offset to check.
903      * @return True if "offset" is a boundary position.
904      * @stable ICU 2.0
905      */
isBoundary(int offset)906     public boolean isBoundary(int offset) {
907         checkOffset(offset, fText);
908 
909         // the beginning index of the iterator is always a boundary position by definition
910         if (offset == fText.getBeginIndex()) {
911             first();       // For side effects on current position, tag values.
912             return true;
913         }
914 
915         if (offset == fText.getEndIndex()) {
916             last();       // For side effects on current position, tag values.
917             return true;
918         }
919 
920         // otherwise, we can use following() on the position before the specified
921         // one and return true if the position we get back is the one the user
922         // specified
923 
924         // return following(offset - 1) == offset;
925         // TODO:  check whether it is safe to revert to the simpler offset-1 code
926         //         The safe rules may take care of unpaired surrogates ok.
927         fText.setIndex(offset);
928         previous32(fText);
929         int  pos = fText.getIndex();
930         boolean result = following(pos) == offset;
931         return result;
932     }
933 
934     /**
935      * Returns the current iteration position.
936      * @return The current iteration position.
937      * @stable ICU 2.0
938      */
current()939     public int current() {
940         return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
941     }
942 
makeRuleStatusValid()943     private void makeRuleStatusValid() {
944         if (fLastStatusIndexValid == false) {
945             //  No cached status is available.
946             int curr = current();
947             if (curr == BreakIterator.DONE || curr == fText.getBeginIndex()) {
948                 //  At start of text, or there is no text.  Status is always zero.
949                 fLastRuleStatusIndex = 0;
950                 fLastStatusIndexValid = true;
951             } else {
952                 //  Not at start of text.  Find status the tedious way.
953                 int pa = fText.getIndex();
954                 first();
955                 int pb = current();
956                 while (fText.getIndex() < pa) {
957                     pb = next();
958                 }
959                 Assert.assrt(pa == pb);
960             }
961             Assert.assrt(fLastStatusIndexValid == true);
962             Assert.assrt(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fRData.fStatusTable.length);
963         }
964     }
965 
966     /**
967      * Return the status tag from the break rule that determined the most recently
968      * returned break position.  The values appear in the rule source
969      * within brackets, {123}, for example.  For rules that do not specify a
970      * status, a default value of 0 is returned.  If more than one rule applies,
971      * the numerically largest of the possible status values is returned.
972      * <p>
973      * Of the standard types of ICU break iterators, only the word break
974      * iterator provides status values.  The values are defined in
975      * class RuleBasedBreakIterator, and allow distinguishing between words
976      * that contain alphabetic letters, "words" that appear to be numbers,
977      * punctuation and spaces, words containing ideographic characters, and
978      * more.  Call <code>getRuleStatus</code> after obtaining a boundary
     * position from <code>next()</code>, <code>previous()</code>, or
     * any other break iterator function that returns a boundary position.
981      * <p>
982      * @return the status from the break rule that determined the most recently
983      * returned break position.
984      *
985      * @draft ICU 3.0 (retain)
986      * @provisional This is a draft API and might change in a future release of ICU.
987      */
988 
getRuleStatus()989     public int  getRuleStatus() {
990         makeRuleStatusValid();
991         //   Status records have this form:
992         //           Count N         <--  fLastRuleStatusIndex points here.
993         //           Status val 0
994         //           Status val 1
995         //              ...
996         //           Status val N-1  <--  the value we need to return
997         //   The status values are sorted in ascending order.
998         //   This function returns the last (largest) of the array of status values.
999         int  idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
1000         int  tagVal = fRData.fStatusTable[idx];
1001         return tagVal;
1002     }
1003 
1004     /**
1005      * Get the status (tag) values from the break rule(s) that determined the most
1006      * recently returned break position.  The values appear in the rule source
1007      * within brackets, {123}, for example.  The default status value for rules
1008      * that do not explicitly provide one is zero.
1009      * <p>
1010      * The status values used by the standard ICU break rules are defined
1011      * as public constants in class RuleBasedBreakIterator.
1012      * <p>
1013      * If the size  of the output array is insufficient to hold the data,
1014      *  the output will be truncated to the available length.  No exception
1015      *  will be thrown.
1016      *
1017      * @param fillInArray an array to be filled in with the status values.
1018      * @return          The number of rule status values from rules that determined
1019      *                  the most recent boundary returned by the break iterator.
1020      *                  In the event that the array is too small, the return value
1021      *                  is the total number of status values that were available,
1022      *                  not the reduced number that were actually returned.
1023      * @draft ICU 3.0 (retain)
1024      * @provisional This is a draft API and might change in a future release of ICU.
1025      */
getRuleStatusVec(int[] fillInArray)1026     public int getRuleStatusVec(int[] fillInArray) {
1027         makeRuleStatusValid();
1028         int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
1029         if (fillInArray != null) {
1030             int numToCopy = Math.min(numStatusVals, fillInArray.length);
1031             for (int i=0; i<numToCopy; i++) {
1032                 fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
1033             }
1034         }
1035         return numStatusVals;
1036     }
1037 
1038     /**
1039      * Return a CharacterIterator over the text being analyzed.  This version
1040      * of this method returns the actual CharacterIterator we're using internally.
1041      * Changing the state of this iterator can have undefined consequences.  If
1042      * you need to change it, clone it first.
1043      * @return An iterator over the text being analyzed.
1044      * @stable ICU 2.0
1045      */
getText()1046     public CharacterIterator getText() {
1047         return fText;
1048     }
1049 
1050     /**
1051      * Set the iterator to analyze a new piece of text.  This function resets
1052      * the current iteration position to the beginning of the text.
1053      * @param newText An iterator over the text to analyze.
1054      * @stable ICU 2.0
1055      */
setText(CharacterIterator newText)1056     public void setText(CharacterIterator newText) {
1057         fText = newText;
1058         // first() resets the caches
1059         this.first();
1060     }
1061 
1062     /**
1063      * package private
1064      */
setBreakType(int type)1065     void setBreakType(int type) {
1066         fBreakType = type;
1067     }
1068 
1069     /**
1070      * package private
1071      */
getBreakType()1072     int getBreakType() {
1073         return fBreakType;
1074     }
1075 
1076     /**
1077      * Control debug, trace and dump options.
1078      * @internal
1079      */
    // Holds the value ICUDebug reports for RBBI_DEBUG_ARG, or null when that
    // debug option is not enabled.
    static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
                                        ICUDebug.value(RBBI_DEBUG_ARG) : null;
1082 
1083 
getLanguageBreakEngine(int c)1084     private LanguageBreakEngine getLanguageBreakEngine(int c) {
1085 
1086         // We have a dictionary character.
1087         // Does an already instantiated break engine handle it?
1088         for (LanguageBreakEngine candidate : fBreakEngines.values()) {
1089             if (candidate.handles(c, fBreakType)) {
1090                 return candidate;
1091             }
1092         }
1093 
1094         // if we don't have an existing engine, build one.
1095         int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
1096         if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
1097             // Katakana, Hiragana and Han are handled by the same dictionary engine.
1098             // Fold them together for mapping from script -> engine.
1099             script = UScript.HAN;
1100         }
1101 
1102         LanguageBreakEngine eng = fBreakEngines.get(script);
1103         /*
1104         if (eng != null && !eng.handles(c, fBreakType)) {
1105             fUnhandledBreakEngine.handleChar(c, getBreakType());
1106             eng = fUnhandledBreakEngine;
1107         } else  */  {
1108             try {
1109                 switch (script) {
1110                 case UScript.THAI:
1111                     eng = new ThaiBreakEngine();
1112                     break;
1113                 case UScript.LAO:
1114                     eng = new LaoBreakEngine();
1115                     break;
1116                 case UScript.MYANMAR:
1117                     eng = new BurmeseBreakEngine();
1118                     break;
1119                 case UScript.KHMER:
1120                     eng = new KhmerBreakEngine();
1121                     break;
1122                 case UScript.HAN:
1123                     if (getBreakType() == KIND_WORD) {
1124                         eng = new CjkBreakEngine(false);
1125                     }
1126                     else {
1127                         fUnhandledBreakEngine.handleChar(c, getBreakType());
1128                         eng = fUnhandledBreakEngine;
1129                     }
1130                     break;
1131                 case UScript.HANGUL:
1132                     if (getBreakType() == KIND_WORD) {
1133                         eng = new CjkBreakEngine(true);
1134                     } else {
1135                         fUnhandledBreakEngine.handleChar(c, getBreakType());
1136                         eng = fUnhandledBreakEngine;
1137                     }
1138                     break;
1139                 default:
1140                     fUnhandledBreakEngine.handleChar(c, getBreakType());
1141                     eng = fUnhandledBreakEngine;
1142                     break;
1143                 }
1144             } catch (IOException e) {
1145                 eng = null;
1146             }
1147         }
1148 
1149         if (eng != null && eng != fUnhandledBreakEngine) {
1150             LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng);
1151             if (existingEngine != null) {
1152                 // There was a race & another thread was first to register an engine for this script.
1153                 // Use theirs and discard the one we just created.
1154                 eng = existingEngine;
1155             }
1156             // assert eng.handles(c, fBreakType);
1157         }
1158         return eng;
1159     }
1160 
1161 
1162 
1163     /**
1164      * The State Machine Engine for moving forward is here.
1165      * This function is the heart of the RBBI run time engine.
1166      *
1167      * @param stateTable
1168      * @return the new iterator position
1169      *
1170      * A note on supplementary characters and the position of underlying
1171      * Java CharacterIterator:   Normally, a character iterator is positioned at
1172      * the char most recently returned by next().  Within this function, when
1173      * a supplementary char is being processed, the char iterator is left
1174      * sitting on the trail surrogate, in the middle of the code point.
1175      * This is different from everywhere else, where an iterator always
1176      * points at the lead surrogate of a supplementary.
1177      */
    private int handleNext(short stateTable[]) {
        if (TRACE) {
            System.out.println("Handle Next   pos      char  state category");
        }

        // No matter what, handleNext always correctly sets the break tag value.
        fLastStatusIndexValid = true;
        fLastRuleStatusIndex  = 0;

        // caches for quicker access
        CharacterIterator text = fText;
        CharTrie trie = fRData.fTrie;

        // Set up the starting char.  Values at or above the lead-surrogate range
        // are passed through nextTrail32(); a DONE32 result means there is no
        // character to start from, so there is no further boundary to report.
        int c               = text.current();
        if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
            c = nextTrail32(text, c);
            if (c == DONE32) {
                return BreakIterator.DONE;
            }
        }
        int initialPosition = text.getIndex();
        int result          = initialPosition;

        // Set the initial state for the state machine.
        // The initial category value of 3 is only a placeholder: in RBBI_RUN mode
        // the loop below always assigns a real category (or the end-of-input
        // category 1) before the first table lookup.  When the table requires a
        // beginning-of-input step, category 2 is fed in first instead.
        int state           = START_STATE;
        int row             = fRData.getRowIndex(state);
        short category      = 3;
        int flagsState      = fRData.getStateTableFlags(stateTable);
        int mode            = RBBI_RUN;
        if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
            category = 2;
            mode     = RBBI_START;
            if (TRACE) {
                System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
                System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
            }
        }
        // Bookkeeping for a look-ahead rule whose match is still pending:
        // the pending status value, its tag index, and the position to return
        // if the look-ahead completes.
        int lookaheadStatus = 0;
        int lookaheadTagIdx = 0;
        int lookaheadResult = 0;

        // loop until we reach the end of the text or transition to state 0
        while (state != STOP_STATE) {
            if (c == DONE32) {
                // Reached end of input string.
                if (mode == RBBI_END) {
                    // We have already run the loop one last time with the
                    // character set to the pseudo {eof} value. Now it is time
                    // to unconditionally bail out.

                    if (lookaheadResult > result) {
                        // We ran off the end of the string with a pending
                        // look-ahead match.
                        // Treat this as if the look-ahead condition had been
                        // met, and return
                        // the match at the / position from the look-ahead rule.
                        result = lookaheadResult;
                        fLastRuleStatusIndex = lookaheadTagIdx;
                    }
                    break;
                }
                // Run the loop one last time with the fake end-of-input character category
                mode = RBBI_END;
                category = 1;
            }
            else if (mode == RBBI_RUN) {
                // Get the char category.  An incoming category of 1 or 2 means that
                //      we are preset for doing the beginning or end of input, and
                //      that we shouldn't get a category from an actual text input character.
                //

                // look up the current character's character category, which tells us
                // which column in the state table to look at.
                //
                category = (short) trie.getCodePointValue(c);

                // Check the dictionary bit in the character's category.
                //    Counter is only used by dictionary based iterators (subclasses).
                //    Chars that need to be handled by a dictionary have a flag bit set
                //    in their category values.
                //
                if ((category & 0x4000) != 0)  {
                    fDictionaryCharCount++;
                    //  And off the dictionary flag bit.
                    category &= ~0x4000;
                }

                if (TRACE) {
                    System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
                    System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                    System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
                }

                // Advance to the next character.
                // If this is a beginning-of-input loop iteration, don't advance.
                //    The next iteration will be processing the first real input character.
                c = (int)text.next();
                if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
                    c = nextTrail32(text, c);
                }
            }
            else {
                // Beginning-of-input iteration (RBBI_START): the preset category is
                // used for this transition without consuming a character; switch to
                // normal running mode for the following iterations.
                mode = RBBI_RUN;
            }

            // look up a state transition in the state table
            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
            row   = fRData.getRowIndex(state);

            if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
                // Match found, common case
                result = text.getIndex();
                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                    // The iterator has been left in the middle of a surrogate pair.
                    // We want the start of it.
                    result--;
                }

                //  Remember the break status (tag) values.
                fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
            }

            if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
                if (lookaheadStatus != 0
                    && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
                    // Lookahead match is completed.  Set the result accordingly, but only
                    // if no other rule has matched further in the mean time.
                    result               = lookaheadResult;
                    fLastRuleStatusIndex = lookaheadTagIdx;
                    lookaheadStatus      = 0;
                    // TODO: make a standalone hard break in a rule work.
                    if ((flagsState & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0) {
                        text.setIndex(result);
                        return result;
                    }
                    // Look-ahead completed, but other rules may match further.  Continue on.
                    //   TODO:  junk this feature?  I don't think it's used anywhere.
                    continue;
                }

                // Start (or restart) a pending look-ahead: remember where the match
                // would end and which status / tag belong to it.
                lookaheadResult = text.getIndex();
                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                    // The iterator has been left in the middle of a surrogate pair.
                    // We want the beginning  of it.
                    lookaheadResult--;
                }
                lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
                lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
                continue;
            }

            if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
                // Because this is an accepting state, any in-progress look-ahead match
                //   is no longer relevant.  Clear out the pending lookahead status.
                lookaheadStatus = 0;
            }
        }        // End of state machine main loop

        // The state machine is done.  Check whether it found a match...

        // If the iterator failed to advance in the match engine force it ahead by one.
        // This indicates a defect in the break rules, which should always match
        // at least one character.

        if (result == initialPosition) {
            if (TRACE) {
                System.out.println("Iterator did not move. Advancing by 1.");
            }
            text.setIndex(initialPosition);
            next32(text);
            result = text.getIndex();
        }
        else {
            // Leave the iterator at our result position.
            //   (we may have advanced beyond the last accepting position chasing after
            //    longer matches that never completed.)
            text.setIndex(result);
        }
        if (TRACE) {
            System.out.println("result = " + result);
        }
        return result;
    }
1363 
handlePrevious(short stateTable[])1364     private int handlePrevious(short stateTable[]) {
1365         if (fText == null || stateTable == null) {
1366             return 0;
1367         }
1368 
1369         int            state;
1370         int            category           = 0;
1371         int            mode;
1372         int            row;
1373         int            c;
1374         int            lookaheadStatus    = 0;
1375         int            result             = 0;
1376         int            initialPosition    = 0;
1377         int            lookaheadResult    = 0;
1378         boolean        lookAheadHardBreak =
1379             (fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1380 
1381         // handlePrevious() never gets the rule status.
1382         // Flag the status as invalid; if the user ever asks for status, we will need
1383         // to back up, then re-find the break position using handleNext(), which does
1384         // get the status value.
1385         fLastStatusIndexValid = false;
1386         fLastRuleStatusIndex  = 0;
1387 
1388         // set up the starting char
1389         initialPosition = fText.getIndex();
1390         result          = initialPosition;
1391         c               = previous32(fText);
1392 
1393         // Set up the initial state for the state machine
1394         state = START_STATE;
1395         row = fRData.getRowIndex(state);
1396         category = 3;   // TODO:  obsolete?  from the old start/run mode scheme?
1397         mode     = RBBI_RUN;
1398         if ((fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
1399             category = 2;
1400             mode     = RBBI_START;
1401         }
1402 
1403         if (TRACE) {
1404             System.out.println("Handle Prev   pos   char  state category ");
1405         }
1406 
1407         // loop until we reach the beginning of the text or transition to state 0
1408         //
1409         mainLoop: for (;;) {
1410             innerBlock: {
1411                 if (c == DONE32) {
1412                     // Reached end of input string.
1413                     if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
1414                         // Either this is the old (ICU 3.2 and earlier) format data which
1415                         // does not support explicit support for matching {eof}, or
1416                         // we have already done the {eof} iteration.  Now is the time
1417                         // to unconditionally bail out.
1418                         if (lookaheadResult < result) {
1419                             // We ran off the end of the string with a pending look-ahead match.
1420                             // Treat this as if the look-ahead condition had been met, and return
1421                             //  the match at the / position from the look-ahead rule.
1422                             result = lookaheadResult;
1423                             lookaheadStatus = 0;
1424                         } else if (result == initialPosition) {
1425                             // Ran off start, no match found.
1426                             // Move one position (towards the start, since we are doing previous.)
1427                             fText.setIndex(initialPosition);
1428                             previous32(fText);
1429                         }
1430                         break mainLoop;
1431                     }
1432                     mode = RBBI_END;
1433                     category = 1;
1434                 }
1435 
1436                 if (mode == RBBI_RUN) {
1437                     // look up the current character's category, which tells us
1438                     // which column in the state table to look at.
1439                     //
1440                     category = (short) fRData.fTrie.getCodePointValue(c);
1441 
1442                     // Check the dictionary bit in the character's category.
1443                     //    Counter is only used by dictionary based iterators (subclasses).
1444                     //    Chars that need to be handled by a dictionary have a flag bit set
1445                     //    in their category values.
1446                     //
1447                     if ((category & 0x4000) != 0)  {
1448                         fDictionaryCharCount++;
1449                         //  And off the dictionary flag bit.
1450                         category &= ~0x4000;
1451                     }
1452                 }
1453 
1454 
1455                 if (TRACE) {
1456                     System.out.print("             " + fText.getIndex() + "   ");
1457                     if (0x20 <= c && c < 0x7f) {
1458                         System.out.print("  " + c + "  ");
1459                     } else {
1460                         System.out.print(" " + Integer.toHexString(c) + " ");
1461                     }
1462                     System.out.println(" " + state + "  " + category + " ");
1463                 }
1464 
1465                 // State Transition - move machine to its next state
1466                 //
1467                 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1468                 row = fRData.getRowIndex(state);
1469 
1470                 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
1471                     // Match found, common case, could have lookahead so we move
1472                     // on to check it
1473                     result = fText.getIndex();
1474                 }
1475 
1476                 if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
1477                     if (lookaheadStatus != 0
1478                             && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
1479                         // Lookahead match is completed. Set the result
1480                         // accordingly, but only
1481                         // if no other rule has matched further in the mean
1482                         // time.
1483                         result = lookaheadResult;
1484                         lookaheadStatus = 0;
1485                         // TODO: make a stand-alone hard break in a rule work.
1486 
1487                         if (lookAheadHardBreak) {
1488                             break mainLoop;
1489                         }
1490                         // Look-ahead completed, but other rules may match further.
1491                         // Continue on.
1492                         // TODO: junk this feature?  I don't think that it's used anywhere.
1493                         break innerBlock;
1494                     }
1495                     // Hit a possible look-ahead match. We are at the
1496                     // position of the '/'. Remember this position.
1497                     lookaheadResult = fText.getIndex();
1498                     lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
1499                     break innerBlock;
1500                 }
1501 
1502                 // not lookahead...
1503                 if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
1504                     // This is a plain (non-look-ahead) accepting state.
1505                     if (!lookAheadHardBreak) {
1506                         // Clear out any pending look-ahead matches,
1507                         // but only if not doing the lookAheadHardBreak option
1508                         // which needs to force a break no matter what is going
1509                         // on with the rest of the match, i.e. we can't abandon
1510                         // a partially completed look-ahead match because
1511                         // some other rule matched further than the '/' position
1512                         // in the look-ahead match.
1513                         lookaheadStatus = 0;
1514                     }
1515                 }
1516 
1517             } // end of innerBlock.  "break innerBlock" in above code comes out here.
1518 
1519 
1520             if (state == STOP_STATE) {
1521                 // Normal loop exit is here
1522                 break mainLoop;
1523             }
1524 
1525             // then move iterator position backwards one character
1526             //
1527             if (mode == RBBI_RUN) {
1528                 c = previous32(fText);
1529             } else {
1530                 if (mode == RBBI_START) {
1531                     mode = RBBI_RUN;
1532                 }
1533             }
1534 
1535 
1536         }   // End of the main loop.
1537 
1538         // The state machine is done.  Check whether it found a match...
1539         //
1540         // If the iterator failed to move in the match engine, force it back by one
1541         //   (towards the start).  (This really indicates a defect in the break rules.
1542         //    They should always match at least one character.)
1543         if (result == initialPosition) {
1544             result = fText.setIndex(initialPosition);
1545             previous32(fText);
1546             result = fText.getIndex();
1547         }
1548 
1549         fText.setIndex(result);
1550         if (TRACE) {
1551             System.out.println("Result = " + result);
1552         }
1553 
1554         return result;
1555     }
1556 }
1557 
1558