1 /*
2  *******************************************************************************
3  * Copyright (C) 1996-2014, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7 package com.ibm.icu.text;
8 
9 import java.text.CharacterIterator;
10 import java.text.StringCharacterIterator;
11 import java.util.Locale;
12 
13 import com.ibm.icu.util.ICUException;
14 import com.ibm.icu.util.ULocale;
15 
16 // Java porting note:
17 //
18 //        The ICU4C implementation contains dead code in many places.
19 //      While porting the ICU4C linear search implementation, this dead code
20 //      was not fully ported. The code blocks tagged by "// *** Boyer-Moore ***"
21 //      are those dead code blocks, still available in ICU4C.
22 
23 //        The ICU4C implementation does not seem to handle UCharacterIterator pointing
24 //      to a fragment of text properly. ICU4J uses CharacterIterator to navigate through
25 //      the input text. We need to carefully review the code ported from ICU4C
26 //      assuming the start index is 0.
27 
28 //        ICU4C implementation initializes pattern.CE and pattern.PCE. It looks like
29 //      CE is no longer used, except in a few places checking CELength. It looks like this
30 //      is a leftover from already-disabled Boyer-Moore search code. This Java implementation
31 //      preserves the code, but we should clean this up later.
32 
33 /**
34  *
35  * <tt>StringSearch</tt> is a {@link SearchIterator} that provides
36  * language-sensitive text searching based on the comparison rules defined
37  * in a {@link RuleBasedCollator} object.
38  * StringSearch ensures that language eccentricity can be
39  * handled, e.g. for the German collator, characters &szlig; and SS will be matched
40  * if case is chosen to be ignored.
41  * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm">
42  * "ICU Collation Design Document"</a> for more information.
43  * <p>
44  * There are 2 match options for selection:<br>
45  * Let S' be the sub-string of a text string S between the offsets start and
46  * end [start, end].
47  * <br>
48  * A pattern string P matches a text string S at the offsets [start, end]
49  * if
50  * <pre>
51  * option 1. Some canonical equivalent of P matches some canonical equivalent
52  *           of S'
53  * option 2. P matches S' and if P starts or ends with a combining mark,
54  *           there exists no non-ignorable combining mark before or after S?
55  *           in S respectively.
56  * </pre>
57  * Option 2. is the default.
58  * <p>
59  * This search has APIs similar to that of other text iteration mechanisms
60  * such as the break iterators in {@link BreakIterator}. Using these
61  * APIs, it is easy to scan through text looking for all occurrences of
62  * a given pattern. This search iterator allows changing of direction by
63  * calling a {@link #reset} followed by a {@link #next} or {@link #previous}.
64  * Though a direction change can occur without calling {@link #reset} first,
65  * this operation comes with some speed penalty.
66  * Match results in the forward direction will match the result matches in
67  * the backwards direction in the reverse order
68  * <p>
69  * {@link SearchIterator} provides APIs to specify the starting position
70  * within the text string to be searched, e.g. {@link SearchIterator#setIndex setIndex},
71  * {@link SearchIterator#preceding preceding} and {@link SearchIterator#following following}.
72  * Since the starting position will be set as it is specified, please take note that
73  * there are some danger points at which the search may render incorrect
74  * results:
75  * <ul>
76  * <li> In the midst of a substring that requires normalization.
77  * <li> If the following match is to be found, the position should not be the
78  *      second character which requires swapping with the preceding
79  *      character. Vice versa, if the preceding match is to be found, the
80  *      position to search from should not be the first character which
81  *      requires swapping with the next character. E.g certain Thai and
82  *      Lao characters require swapping.
83  * <li> If a following pattern match is to be found, any position within a
84  *      contracting sequence except the first will fail. Vice versa if a
85  *      preceding pattern match is to be found, an invalid starting point
86  *      would be any character within a contracting sequence except the last.
87  * </ul>
88  * <p>
89  * A {@link BreakIterator} can be used if only matches at logical breaks are desired.
90  * Using a {@link BreakIterator} will only give you results that exactly matches the
91  * boundaries given by the {@link BreakIterator}. For instance the pattern "e" will
92  * not be found in the string "\u00e9" if a character break iterator is used.
93  * <p>
94  * Options are provided to handle overlapping matches.
95  * E.g. In English, overlapping matches produces the result 0 and 2
96  * for the pattern "abab" in the text "ababab", where mutually
97  * exclusive matches only produces the result of 0.
98  * <p>
99  * Options are also provided to implement "asymmetric search" as described in
100  * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search">
101  * UTS #10 Unicode Collation Algorithm</a>, specifically the ElementComparisonType
102  * values.
103  * <p>
104  * Though collator attributes will be taken into consideration while
105  * performing matches, there are no APIs here for setting and getting the
106  * attributes. These attributes can be set by getting the collator
107  * from {@link #getCollator} and using the APIs in {@link RuleBasedCollator}.
108  * Lastly to update <tt>StringSearch</tt> to the new collator attributes,
109  * {@link #reset} has to be called.
110  * <p>
111  * Restriction: <br>
112  * Currently there are no composite characters that consists of a
113  * character with combining class > 0 before a character with combining
114  * class == 0. However, if such a character exists in the future,
115  * <tt>StringSearch</tt> does not guarantee the results for option 1.
116  * <p>
117  * Consult the {@link SearchIterator} documentation for information on
118  * and examples of how to use instances of this class to implement text
119  * searching.
120  * <p>
121  * Note, <tt>StringSearch</tt> is not to be subclassed.
122  * </p>
123  * @see SearchIterator
124  * @see RuleBasedCollator
125  * @author Laura Werner, synwee
126  * @stable ICU 2.0
127  */
128 // internal notes: all methods do not guarantee the correct status of the
129 // characteriterator. the caller has to maintain the original index position
130 // if necessary. methods could change the index position as it deems fit
131 public final class StringSearch extends SearchIterator {
132 
133     private Pattern pattern_;
134     private RuleBasedCollator collator_;
135 
136     // positions within the collation element iterator is used to determine
137     // if we are at the start of the text.
138     private CollationElementIterator textIter_;
139     private CollationPCE textProcessedIter_;
140 
141     // utility collation element, used throughout program for temporary
142     // iteration.
143     private CollationElementIterator utilIter_;
144 
145     private int strength_;
146     int ceMask_;
147     int variableTop_;
148 
149     private boolean toShift_;
150 
151     // *** Boyer-Moore ***
152     // private char[] canonicalPrefixAccents_;
153     // private char[] canonicalSuffixAccents_;
154 
155     /**
156      * Initializes the iterator to use the language-specific rules defined in
157      * the argument collator to search for argument pattern in the argument
158      * target text. The argument <code>breakiter</code> is used to define logical matches.
159      * See super class documentation for more details on the use of the target
160      * text and {@link BreakIterator}.
161      * @param pattern text to look for.
162      * @param target target text to search for pattern.
163      * @param collator {@link RuleBasedCollator} that defines the language rules
164      * @param breakiter A {@link BreakIterator} that is used to determine the
165      *                boundaries of a logical match. This argument can be null.
166      * @throws IllegalArgumentException thrown when argument target is null,
167      *            or of length 0
168      * @see BreakIterator
169      * @see RuleBasedCollator
170      * @stable ICU 2.0
171      */
StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator, BreakIterator breakiter)172     public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator,
173             BreakIterator breakiter) {
174 
175         // This implementation is ported from ICU4C usearch_open()
176 
177         super(target, breakiter);
178 
179         // string search does not really work when numeric collation is turned on
180         if (collator.getNumericCollation()) {
181             throw new UnsupportedOperationException("Numeric collation is not supported by StringSearch");
182         }
183 
184         collator_ = collator;
185         strength_ = collator.getStrength();
186         ceMask_ = getMask(strength_);
187         toShift_ = collator.isAlternateHandlingShifted();
188         variableTop_ = collator.getVariableTop();
189 
190         pattern_ = new Pattern(pattern);
191 
192         search_.setMatchedLength(0);
193         search_.matchedIndex_ = DONE;
194 
195         utilIter_ = null;
196         textIter_ = new CollationElementIterator(target, collator);
197 
198         textProcessedIter_ = null;
199 
200         // This is done by super class constructor
201         /*
202         search_.isOverlap_ = false;
203         search_.isCanonicalMatch_ = false;
204         search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
205         search_.isForwardSearching_ = true;
206         search_.reset_ = true;
207          */
208         ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE);
209         search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale);
210         search_.internalBreakIter_.setText((CharacterIterator)target.clone());  // We need to create a clone
211 
212         initialize();
213     }
214 
215     /**
216      * Initializes the iterator to use the language-specific rules defined in
217      * the argument collator to search for argument pattern in the argument
218      * target text. No {@link BreakIterator}s are set to test for logical matches.
219      * @param pattern text to look for.
220      * @param target target text to search for pattern.
221      * @param collator {@link RuleBasedCollator} that defines the language rules
222      * @throws IllegalArgumentException thrown when argument target is null,
223      *            or of length 0
224      * @see RuleBasedCollator
225      * @stable ICU 2.0
226      */
StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator)227     public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator) {
228         this(pattern, target, collator, null);
229     }
230 
231     /**
232      * Initializes the iterator to use the language-specific rules and
233      * break iterator rules defined in the argument locale to search for
234      * argument pattern in the argument target text.
235      * @param pattern text to look for.
236      * @param target target text to search for pattern.
237      * @param locale locale to use for language and break iterator rules
238      * @throws IllegalArgumentException thrown when argument target is null,
239      *            or of length 0. ClassCastException thrown if the collator for
240      *            the specified locale is not a RuleBasedCollator.
241      * @stable ICU 2.0
242      */
StringSearch(String pattern, CharacterIterator target, Locale locale)243     public StringSearch(String pattern, CharacterIterator target, Locale locale) {
244         this(pattern, target, ULocale.forLocale(locale));
245     }
246 
247     /**
248      * Initializes the iterator to use the language-specific rules and
249      * break iterator rules defined in the argument locale to search for
250      * argument pattern in the argument target text.
251      * See super class documentation for more details on the use of the target
252      * text and {@link BreakIterator}.
253      * @param pattern text to look for.
254      * @param target target text to search for pattern.
255      * @param locale locale to use for language and break iterator rules
256      * @throws IllegalArgumentException thrown when argument target is null,
257      *            or of length 0. ClassCastException thrown if the collator for
258      *            the specified locale is not a RuleBasedCollator.
259      * @see BreakIterator
260      * @see RuleBasedCollator
261      * @see SearchIterator
262      * @stable ICU 3.2
263      */
StringSearch(String pattern, CharacterIterator target, ULocale locale)264     public StringSearch(String pattern, CharacterIterator target, ULocale locale) {
265         this(pattern, target, (RuleBasedCollator) Collator.getInstance(locale), null);
266     }
267 
268     /**
269      * Initializes the iterator to use the language-specific rules and
270      * break iterator rules defined in the default locale to search for
271      * argument pattern in the argument target text.
272      * @param pattern text to look for.
273      * @param target target text to search for pattern.
274      * @throws IllegalArgumentException thrown when argument target is null,
275      *            or of length 0. ClassCastException thrown if the collator for
276      *            the default locale is not a RuleBasedCollator.
277      * @stable ICU 2.0
278      */
StringSearch(String pattern, String target)279     public StringSearch(String pattern, String target) {
280         this(pattern, new StringCharacterIterator(target),
281                 (RuleBasedCollator) Collator.getInstance(), null);
282     }
283 
284     /**
285      * Gets the {@link RuleBasedCollator} used for the language rules.
286      * <p>
287      * Since <tt>StringSearch</tt> depends on the returned {@link RuleBasedCollator}, any
288      * changes to the {@link RuleBasedCollator} result should follow with a call to
289      * either {@link #reset()} or {@link #setCollator(RuleBasedCollator)} to ensure the correct
290      * search behavior.
291      * </p>
292      * @return {@link RuleBasedCollator} used by this <tt>StringSearch</tt>
293      * @see RuleBasedCollator
294      * @see #setCollator
295      * @stable ICU 2.0
296      */
getCollator()297     public RuleBasedCollator getCollator() {
298         return collator_;
299     }
300 
301     /**
302      * Sets the {@link RuleBasedCollator} to be used for language-specific searching.
303      * <p>
304      * The iterator's position will not be changed by this method.
305      * @param collator to use for this <tt>StringSearch</tt>
306      * @throws IllegalArgumentException thrown when collator is null
307      * @see #getCollator
308      * @stable ICU 2.0
309      */
setCollator(RuleBasedCollator collator)310     public void setCollator(RuleBasedCollator collator) {
311         if (collator == null) {
312             throw new IllegalArgumentException("Collator can not be null");
313         }
314         collator_ = collator;
315         ceMask_ = getMask(collator_.getStrength());
316 
317         ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE);
318         search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale);
319         search_.internalBreakIter_.setText((CharacterIterator)search_.text().clone());  // We need to create a clone
320 
321         toShift_ = collator.isAlternateHandlingShifted();
322         variableTop_ = collator.getVariableTop();
323         textIter_ = new CollationElementIterator(pattern_.text_, collator);
324         utilIter_ = new CollationElementIterator(pattern_.text_, collator);
325 
326         // initialize() _after_ setting the iterators for the new collator.
327         initialize();
328     }
329 
330     /**
331      * Returns the pattern for which <tt>StringSearch</tt> is searching for.
332      * @return the pattern searched for
333      * @stable ICU 2.0
334      */
getPattern()335     public String getPattern() {
336         return pattern_.text_;
337     }
338 
339     /**
340      * Set the pattern to search for.
341      * The iterator's position will not be changed by this method.
342      * @param pattern for searching
343      * @see #getPattern
344      * @exception IllegalArgumentException thrown if pattern is null or of
345      *               length 0
346      * @stable ICU 2.0
347      */
setPattern(String pattern)348     public void setPattern(String pattern) {
349         if (pattern == null || pattern.length() <= 0) {
350             throw new IllegalArgumentException(
351                     "Pattern to search for can not be null or of length 0");
352         }
353         pattern_.text_ = pattern;
354         initialize();
355     }
356 
357     /**
358      * Determines whether canonical matches (option 1, as described in the
359      * class documentation) is set.
360      * See setCanonical(boolean) for more information.
361      * @see #setCanonical
362      * @return true if canonical matches is set, false otherwise
363      * @stable ICU 2.8
364      */
365     //TODO: hoist this to SearchIterator
isCanonical()366     public boolean isCanonical() {
367         return search_.isCanonicalMatch_;
368     }
369 
370     /**
371      * Set the canonical match mode. See class documentation for details.
372      * The default setting for this property is false.
373      * @param allowCanonical flag indicator if canonical matches are allowed
374      * @see #isCanonical
375      * @stable ICU 2.8
376      */
377     //TODO: hoist this to SearchIterator
setCanonical(boolean allowCanonical)378     public void setCanonical(boolean allowCanonical) {
379         search_.isCanonicalMatch_ = allowCanonical;
380     }
381 
382     /**
383      * {@inheritDoc}
384      * @stable ICU 2.8
385      */
386     @Override
setTarget(CharacterIterator text)387     public void setTarget(CharacterIterator text) {
388         super.setTarget(text);
389         textIter_.setText(text);
390     }
391 
392     /**
393      * {@inheritDoc}
394      * @stable ICU 2.8
395      */
396     @Override
getIndex()397     public int getIndex() {
398         int result = textIter_.getOffset();
399         if (isOutOfBounds(search_.beginIndex(), search_.endIndex(), result)) {
400             return DONE;
401         }
402         return result;
403     }
404 
405     /**
406      * {@inheritDoc}
407      * @stable ICU 2.8
408      */
409     @Override
setIndex(int position)410     public void setIndex(int position) {
411         // Java porting note: This method is equivalent to setOffset() in ICU4C.
412         // ICU4C SearchIterator::setOffset() is a pure virtual method, while
413         // ICU4J SearchIterator.setIndex() is not abstract method.
414 
415         super.setIndex(position);
416         textIter_.setOffset(position);
417     }
418 
419     /**
420      * {@inheritDoc}
421      * @stable ICU 2.8
422      */
423     @Override
reset()424     public void reset() {
425         // reset is setting the attributes that are already in
426         // string search, hence all attributes in the collator should
427         // be retrieved without any problems
428 
429         boolean sameCollAttribute = true;
430         int ceMask;
431         boolean shift;
432         int varTop;
433 
434         // **** hack to deal w/ how processed CEs encode quaternary ****
435         int newStrength = collator_.getStrength();
436         if ((strength_ < Collator.QUATERNARY && newStrength >= Collator.QUATERNARY)
437                 || (strength_ >= Collator.QUATERNARY && newStrength < Collator.QUATERNARY)) {
438             sameCollAttribute = false;
439         }
440 
441         strength_ = collator_.getStrength();
442         ceMask = getMask(strength_);
443         if (ceMask_ != ceMask) {
444             ceMask_ = ceMask;
445             sameCollAttribute = false;
446         }
447 
448         shift = collator_.isAlternateHandlingShifted();
449         if (toShift_ != shift) {
450             toShift_ = shift;
451             sameCollAttribute = false;
452         }
453 
454         varTop = collator_.getVariableTop();
455         if (variableTop_ != varTop) {
456             variableTop_ = varTop;
457             sameCollAttribute = false;
458         }
459 
460         if (!sameCollAttribute) {
461             initialize();
462         }
463 
464         textIter_.setText(search_.text());
465 
466         search_.setMatchedLength(0);
467         search_.matchedIndex_ = DONE;
468         search_.isOverlap_ = false;
469         search_.isCanonicalMatch_ = false;
470         search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
471         search_.isForwardSearching_ = true;
472         search_.reset_ = true;
473     }
474 
475     /**
476      * {@inheritDoc}
477      * @stable ICU 2.8
478      */
479     @Override
handleNext(int position)480     protected int handleNext(int position) {
481         if (pattern_.CELength_ == 0) {
482             search_.matchedIndex_ = search_.matchedIndex_ == DONE ?
483                                     getIndex() : search_.matchedIndex_ + 1;
484             search_.setMatchedLength(0);
485             textIter_.setOffset(search_.matchedIndex_);
486             if (search_.matchedIndex_ == search_.endIndex()) {
487                 search_.matchedIndex_ = DONE;
488             }
489         } else {
490             if (search_.matchedLength() <= 0) {
491                 // the flipping direction issue has already been handled
492                 // in next()
493                 // for boundary check purposes. this will ensure that the
494                 // next match will not preceed the current offset
495                 // note search_.matchedIndex_ will always be set to something
496                 // in the code
497                 search_.matchedIndex_ = position - 1;
498             }
499 
500             textIter_.setOffset(position);
501 
502             // ICU4C comment:
503             // if strsrch_->breakIter is always the same as m_breakiterator_
504             // then we don't need to check the match boundaries here because
505             // usearch_handleNextXXX will already have done it.
506             if (search_.isCanonicalMatch_) {
507                 // *could* actually use exact here 'cause no extra accents allowed...
508                 handleNextCanonical();
509             } else {
510                 handleNextExact();
511             }
512 
513             if (search_.matchedIndex_ == DONE) {
514                 textIter_.setOffset(search_.endIndex());
515             } else {
516                 textIter_.setOffset(search_.matchedIndex_);
517             }
518 
519             return search_.matchedIndex_;
520         }
521 
522         return DONE;
523     }
524 
525     /**
526      * {@inheritDoc}
527      * @stable ICU 2.8
528      */
529     @Override
handlePrevious(int position)530     protected int handlePrevious(int position) {
531         if (pattern_.CELength_ == 0) {
532             search_.matchedIndex_ =
533                     search_.matchedIndex_ == DONE ? getIndex() : search_.matchedIndex_;
534             if (search_.matchedIndex_ == search_.beginIndex()) {
535                 setMatchNotFound();
536             } else {
537                 search_.matchedIndex_--;
538                 textIter_.setOffset(search_.matchedIndex_);
539                 search_.setMatchedLength(0);
540             }
541         } else {
542             textIter_.setOffset(position);
543 
544             if (search_.isCanonicalMatch_) {
545                 // *could* use exact match here since extra accents *not* allowed!
546                 handlePreviousCanonical();
547             } else {
548                 handlePreviousExact();
549             }
550         }
551 
552         return search_.matchedIndex_;
553     }
554 
555     // ------------------ Internal implementation code ---------------------------
556 
557     private static final int INITIAL_ARRAY_SIZE_ = 256;
558 
559     // *** Boyer-Moore ***
560     // private static final Normalizer2Impl nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
561     // private static final int LAST_BYTE_MASK_ = 0xff;
562     // private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
563 
564     private static final int PRIMARYORDERMASK = 0xffff0000;
565     private static final int SECONDARYORDERMASK = 0x0000ff00;
566     private static final int TERTIARYORDERMASK = 0x000000ff;
567 
568     /**
569      * Getting the mask for collation strength
570      * @param strength collation strength
571      * @return collation element mask
572      */
getMask(int strength)573     private static int getMask(int strength) {
574         switch (strength) {
575         case Collator.PRIMARY:
576             return PRIMARYORDERMASK;
577         case Collator.SECONDARY:
578             return SECONDARYORDERMASK | PRIMARYORDERMASK;
579         default:
580             return TERTIARYORDERMASK | SECONDARYORDERMASK | PRIMARYORDERMASK;
581         }
582     }
583 
584 
585     // *** Boyer-Moore ***
586     /*
587     private final char getFCD(String str, int offset) {
588         char ch = str.charAt(offset);
589         if (ch < 0x180) {
590             return (char) nfcImpl_.getFCD16FromBelow180(ch);
591         } else if (nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
592             if (!Character.isHighSurrogate(ch)) {
593                 return (char) nfcImpl_.getFCD16FromNormData(ch);
594             } else {
595                 char c2;
596                 if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) {
597                     return (char) nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2));
598                 }
599             }
600         }
601         return 0;
602     }
603 
604     private final char getFCD(int c) {
605         return (char)nfcImpl_.getFCD16(c);
606     }
607     */
608 
609     /**
610      * Getting the modified collation elements taking into account the collation
611      * attributes.
612      *
613      * @param sourcece
614      * @return the modified collation element
615      */
getCE(int sourcece)616     private int getCE(int sourcece) {
617         // note for tertiary we can't use the collator->tertiaryMask, that
618         // is a preprocessed mask that takes into account case options. since
619         // we are only concerned with exact matches, we don't need that.
620         sourcece &= ceMask_;
621 
622         if (toShift_) {
623             // alternate handling here, since only the 16 most significant digits
624             // is only used, we can safely do a compare without masking
625             // if the ce is a variable, we mask and get only the primary values
626             // no shifting to quartenary is required since all primary values
627             // less than variabletop will need to be masked off anyway.
628             if (variableTop_ > sourcece) {
629                 if (strength_ >= Collator.QUATERNARY) {
630                     sourcece &= PRIMARYORDERMASK;
631                 } else {
632                     sourcece = CollationElementIterator.IGNORABLE;
633                 }
634             }
635         } else if (strength_ >= Collator.QUATERNARY && sourcece == CollationElementIterator.IGNORABLE) {
636             sourcece = 0xFFFF;
637         }
638 
639         return sourcece;
640     }
641 
642     /**
643      * Direct port of ICU4C static int32_t * addTouint32_tArray(...) in usearch.cpp.
644      * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should
645      * implement this in Pattern class.
646      *
647      * @param destination target array
648      * @param offset destination offset to add value
649      * @param destinationlength target array size
650      * @param value to be added
651      * @param increments incremental size expected
652      * @return new destination array, destination if there was no new allocation
653      */
addToIntArray(int[] destination, int offset, int destinationlength, int value, int increments)654     private static int[] addToIntArray(int[] destination, int offset, int destinationlength,
655             int value, int increments) {
656         int newlength = destinationlength;
657         if (offset + 1 == newlength) {
658             newlength += increments;
659             int temp[] = new int[newlength];
660             System.arraycopy(destination, 0, temp, 0, offset);
661             destination = temp;
662         }
663         destination[offset] = value;
664         return destination;
665     }
666 
667     /**
668      * Direct port of ICU4C static int64_t * addTouint64_tArray(...) in usearch.cpp.
669      * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should
670      * implement this in Pattern class.
671      *
672      * @param destination target array
673      * @param offset destination offset to add value
674      * @param destinationlength target array size
675      * @param value to be added
676      * @param increments incremental size expected
677      * @return new destination array, destination if there was no new allocation
678      */
addToLongArray(long[] destination, int offset, int destinationlength, long value, int increments)679     private static long[] addToLongArray(long[] destination, int offset, int destinationlength,
680             long value, int increments) {
681         int newlength = destinationlength;
682         if (offset + 1 == newlength) {
683             newlength += increments;
684             long temp[] = new long[newlength];
685             System.arraycopy(destination, 0, temp, 0, offset);
686             destination = temp;
687         }
688         destination[offset] = value;
689         return destination;
690     }
691 
692     /**
693      * Initializing the ce table for a pattern.
694      * Stores non-ignorable collation keys.
695      * Table size will be estimated by the size of the pattern text. Table
696      * expansion will be perform as we go along. Adding 1 to ensure that the table
697      * size definitely increases.
698      * @return total number of expansions
699      */
700     // TODO: We probably do not need Pattern CE table.
initializePatternCETable()701     private int initializePatternCETable() {
702         int[] cetable = new int[INITIAL_ARRAY_SIZE_];
703         int cetablesize = cetable.length;
704         int patternlength = pattern_.text_.length();
705         CollationElementIterator coleiter = utilIter_;
706 
707         if (coleiter == null) {
708             coleiter = new CollationElementIterator(pattern_.text_, collator_);
709             utilIter_ = coleiter;
710         } else {
711             coleiter.setText(pattern_.text_);
712         }
713 
714         int offset = 0;
715         int result = 0;
716         int ce;
717 
718         while ((ce = coleiter.next()) != CollationElementIterator.NULLORDER) {
719             int newce = getCE(ce);
720             if (newce != CollationElementIterator.IGNORABLE /* 0 */) {
721                 int[] temp = addToIntArray(cetable, offset, cetablesize, newce,
722                         patternlength - coleiter.getOffset() + 1);
723                 offset++;
724                 cetable = temp;
725             }
726             result += (coleiter.getMaxExpansion(ce) - 1);
727         }
728 
729         cetable[offset] = 0;
730         pattern_.CE_ = cetable;
731         pattern_.CELength_ = offset;
732 
733         return result;
734     }
735 
736     /**
737      * Initializing the pce table for a pattern.
738      * Stores non-ignorable collation keys.
739      * Table size will be estimated by the size of the pattern text. Table
740      * expansion will be perform as we go along. Adding 1 to ensure that the table
741      * size definitely increases.
742      * @return total number of expansions
743      */
initializePatternPCETable()744     private int initializePatternPCETable() {
745         long[] pcetable = new long[INITIAL_ARRAY_SIZE_];
746         int pcetablesize = pcetable.length;
747         int patternlength = pattern_.text_.length();
748         CollationElementIterator coleiter = utilIter_;
749 
750         if (coleiter == null) {
751             coleiter = new CollationElementIterator(pattern_.text_, collator_);
752             utilIter_ = coleiter;
753         } else {
754             coleiter.setText(pattern_.text_);
755         }
756 
757         int offset = 0;
758         int result = 0;
759         long pce;
760 
761         CollationPCE iter = new CollationPCE(coleiter);
762 
763         // ** Should processed CEs be signed or unsigned?
764         // ** (the rest of the code in this file seems to play fast-and-loose with
765         // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
766         while ((pce = iter.nextProcessed(null)) != CollationPCE.PROCESSED_NULLORDER) {
767             long[] temp = addToLongArray(pcetable, offset, pcetablesize, pce, patternlength - coleiter.getOffset() + 1);
768             offset++;
769             pcetable = temp;
770         }
771 
772         pcetable[offset] = 0;
773         pattern_.PCE_ = pcetable;
774         pattern_.PCELength_ = offset;
775 
776         return result;
777     }
778 
779     // TODO: This method only triggers initializePatternCETable(), which is probably no
780     //      longer needed.
initializePattern()781     private int initializePattern() {
782         // Since the strength is primary, accents are ignored in the pattern.
783 
784         // *** Boyer-Moore ***
785         /*
786         if (strength_ == Collator.PRIMARY) {
787             pattern_.hasPrefixAccents_ = false;
788             pattern_.hasSuffixAccents_ = false;
789         } else {
790             pattern_.hasPrefixAccents_ = (getFCD(pattern_.text_, 0) >>> SECOND_LAST_BYTE_SHIFT_) != 0;
791             pattern_.hasSuffixAccents_ = (getFCD(pattern_.text_.codePointBefore(pattern_.text_.length())) & LAST_BYTE_MASK_) != 0;
792         }
793         */
794 
795         pattern_.PCE_ = null;
796 
797         // since intializePattern is an internal method status is a success.
798         return initializePatternCETable();
799     }
800 
801     // *** Boyer-Moore ***
802     /*
803      private final void setShiftTable(char shift[],
804                                          char backshift[],
805                                          int cetable[], int cesize,
806                                          int expansionsize,
807                                          int defaultforward,
808                                          int defaultbackward) {
809          // No implementation
810      }
811      */
812 
813     // TODO: This method only triggers initializePattern(), which is probably no
814     //      longer needed.
initialize()815     private void initialize() {
816         /* int expandlength = */ initializePattern();
817 
818         // *** Boyer-Moore ***
819         /*
820         if (pattern_.CELength_ > 0) {
821             int cesize = pattern_.CELength_;
822             int minlength = cesize > expandlength ? cesize - expandlength : 1;
823             pattern_.defaultShiftSize_ = minlength;
824             setShiftTable(pattern_.shift_, pattern_.backShift_, pattern_.CE_, cesize,
825                     expandlength, minlength, minlength);
826             return;
827         }
828         return pattern_.defaultShiftSize_;
829         */
830     }
831 
832     /**
833      * @internal
834      * @deprecated This API is ICU internal only.
835      */
836     @Deprecated
setMatchNotFound()837     protected void setMatchNotFound() {
838         super.setMatchNotFound();
839         // SearchIterator#setMatchNotFound() does following:
840         //      search_.matchedIndex_ = DONE;
841         //      search_.setMatchedLength(0);
842         if (search_.isForwardSearching_) {
843             textIter_.setOffset(search_.text().getEndIndex());
844         } else {
845             textIter_.setOffset(0);
846         }
847     }
848 
849     /**
850      * Checks if the offset runs out of the text string range
851      * @param textstart offset of the first character in the range
852      * @param textlimit limit offset of the text string range
853      * @param offset to test
854      * @return true if offset is out of bounds, false otherwise
855      */
isOutOfBounds(int textstart, int textlimit, int offset)856     private static final boolean isOutOfBounds(int textstart, int textlimit, int offset) {
857         return offset < textstart || offset > textlimit;
858     }
859 
860     /**
861      * Checks for identical match
862      * @param start offset of possible match
863      * @param end offset of possible match
864      * @return TRUE if identical match is found
865      */
checkIdentical(int start, int end)866     private boolean checkIdentical(int start, int end) {
867         if (strength_ != Collator.IDENTICAL) {
868             return true;
869         }
870         // Note: We could use Normalizer::compare() or similar, but for short strings
871         // which may not be in FCD it might be faster to just NFD them.
872         String textstr = getString(targetText, start, end - start);
873         if (Normalizer.quickCheck(textstr, Normalizer.NFD, 0) == Normalizer.NO) {
874             textstr = Normalizer.decompose(textstr, false);
875         }
876         String patternstr = pattern_.text_;
877         if (Normalizer.quickCheck(patternstr, Normalizer.NFD, 0) == Normalizer.NO) {
878             patternstr = Normalizer.decompose(patternstr, false);
879         }
880         return textstr.equals(patternstr);
881     }
882 
initTextProcessedIter()883     private boolean initTextProcessedIter() {
884         if (textProcessedIter_ == null) {
885             textProcessedIter_ = new CollationPCE(textIter_);
886         } else {
887             textProcessedIter_.init(textIter_);
888         }
889         return true;
890     }
891 
892     /*
893      * Find the next break boundary after startIndex. If the UStringSearch object
894      * has an external break iterator, use that. Otherwise use the internal character
895      * break iterator.
896      */
nextBoundaryAfter(int startIndex)897     private int nextBoundaryAfter(int startIndex) {
898         BreakIterator breakiterator = search_.breakIter();
899 
900         if (breakiterator == null) {
901             breakiterator = search_.internalBreakIter_;
902         }
903 
904         if (breakiterator != null) {
905             return breakiterator.following(startIndex);
906         }
907 
908         return startIndex;
909     }
910 
911     /*
912      * Returns TRUE if index is on a break boundary. If the UStringSearch
913      * has an external break iterator, test using that, otherwise test
914      * using the internal character break iterator.
915      */
isBreakBoundary(int index)916     private boolean isBreakBoundary(int index) {
917         BreakIterator breakiterator = search_.breakIter();
918 
919         if (breakiterator == null) {
920             breakiterator = search_.internalBreakIter_;
921         }
922 
923         return (breakiterator != null && breakiterator.isBoundary(index));
924     }
925 
926 
927     // Java porting note: Followings are corresponding to UCompareCEsResult enum
928     private static final int CE_MATCH = -1;
929     private static final int CE_NO_MATCH = 0;
930     private static final int CE_SKIP_TARG = 1;
931     private static final int CE_SKIP_PATN = 2;
932 
933     private static int CE_LEVEL2_BASE = 0x00000005;
934     private static int CE_LEVEL3_BASE = 0x00050000;
935 
compareCE64s(long targCE, long patCE, ElementComparisonType compareType)936     private static int compareCE64s(long targCE, long patCE, ElementComparisonType compareType) {
937         if (targCE == patCE) {
938             return CE_MATCH;
939         }
940         if (compareType == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
941             return CE_NO_MATCH;
942         }
943 
944         long targCEshifted = targCE >>> 32;
945         long patCEshifted = patCE >>> 32;
946         long mask;
947 
948         mask = 0xFFFF0000L;
949         int targLev1 = (int)(targCEshifted & mask);
950         int patLev1 = (int)(patCEshifted & mask);
951         if (targLev1 != patLev1) {
952             if (targLev1 == 0) {
953                 return CE_SKIP_TARG;
954             }
955             if (patLev1 == 0
956                     && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) {
957                 return CE_SKIP_PATN;
958             }
959             return CE_NO_MATCH;
960         }
961 
962         mask = 0x0000FFFFL;
963         int targLev2 = (int)(targCEshifted & mask);
964         int patLev2 = (int)(patCEshifted & mask);
965         if (targLev2 != patLev2) {
966             if (targLev2 == 0) {
967                 return CE_SKIP_TARG;
968             }
969             if (patLev2 == 0
970                     && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) {
971                 return CE_SKIP_PATN;
972             }
973             return (patLev2 == CE_LEVEL2_BASE ||
974                     (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD &&
975                         targLev2 == CE_LEVEL2_BASE)) ? CE_MATCH : CE_NO_MATCH;
976         }
977 
978         mask = 0xFFFF0000L;
979         int targLev3 = (int)(targCE & mask);
980         int patLev3 = (int)(patCE & mask);
981         if (targLev3 != patLev3) {
982             return (patLev3 == CE_LEVEL3_BASE ||
983                     (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD &&
984                         targLev3 == CE_LEVEL3_BASE) )? CE_MATCH: CE_NO_MATCH;
985         }
986 
987         return CE_MATCH;
988     }
989 
990     /**
991      * An object used for receiving matched index in search() and
992      * searchBackwards().
993      */
994     private static class Match {
995         int start_ = -1;
996         int limit_ = -1;
997     }
998 
search(int startIdx, Match m)999     private boolean search(int startIdx, Match m) {
1000         // Input parameter sanity check.
1001         if (pattern_.CELength_ == 0
1002                 || startIdx < search_.beginIndex()
1003                 || startIdx > search_.endIndex()) {
1004             throw new IllegalArgumentException("search(" + startIdx + ", m) - expected position to be between " +
1005                     search_.beginIndex() + " and " + search_.endIndex());
1006         }
1007 
1008         if (pattern_.PCE_ == null) {
1009             initializePatternPCETable();
1010         }
1011 
1012         textIter_.setOffset(startIdx);
1013         CEBuffer ceb = new CEBuffer(this);
1014 
1015         int targetIx = 0;
1016         CEI targetCEI = null;
1017         int patIx;
1018         boolean found;
1019 
1020         int mStart = -1;
1021         int mLimit = -1;
1022         int minLimit;
1023         int maxLimit;
1024 
1025         // Outer loop moves over match starting positions in the
1026         //      target CE space.
1027         // Here we see the target as a sequence of collation elements, resulting from the following:
1028         // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
1029         //    (for example, digraphs such as IJ may be broken into two characters).
1030         // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
1031         //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
1032         //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
1033         //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
1034         //    then the CE is deleted, so the following code sees only CEs that are relevant.
1035         // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
1036         // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
1037         // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
1038         for (targetIx = 0; ; targetIx++) {
1039             found = true;
1040             // Inner loop checks for a match beginning at each
1041             // position from the outer loop.
1042             int targetIxOffset = 0;
1043             long patCE = 0;
1044             // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
1045             // (compared to the last CE fetched for the previous targetIx value) as we need to go
1046             // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
1047             CEI firstCEI = ceb.get(targetIx);
1048             if (firstCEI == null) {
1049                 throw new ICUException("CEBuffer.get(" + targetIx + ") returned null.");
1050             }
1051 
1052             for (patIx = 0; patIx < pattern_.PCELength_; patIx++) {
1053                 patCE = pattern_.PCE_[patIx];
1054                 targetCEI = ceb.get(targetIx + patIx + targetIxOffset);
1055                 // Compare CE from target string with CE from the pattern.
1056                 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
1057                 // which will fail the compare, below.
1058                 int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_);
1059                 if (ceMatch == CE_NO_MATCH) {
1060                     found = false;
1061                     break;
1062                 } else if (ceMatch > CE_NO_MATCH) {
1063                     if (ceMatch == CE_SKIP_TARG) {
1064                         // redo with same patCE, next targCE
1065                         patIx--;
1066                         targetIxOffset++;
1067                     } else { // ceMatch == CE_SKIP_PATN
1068                         // redo with same targCE, next patCE
1069                         targetIxOffset--;
1070                     }
1071                 }
1072             }
1073             targetIxOffset += pattern_.PCELength_; // this is now the offset in target CE space to end of the match so far
1074 
1075             if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) {
1076                 // No match at this targetIx.  Try again at the next.
1077                 continue;
1078             }
1079 
1080             if (!found) {
1081                 // No match at all, we have run off the end of the target text.
1082                 break;
1083             }
1084 
1085             // We have found a match in CE space.
1086             // Now determine the bounds in string index space.
1087             // There still is a chance of match failure if the CE range not correspond to
1088             // an acceptable character range.
1089             //
1090             CEI lastCEI = ceb.get(targetIx + targetIxOffset -1);
1091 
1092             mStart = firstCEI.lowIndex_;
1093             minLimit = lastCEI.lowIndex_;
1094 
1095             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
1096             // extended to the end of input, and the match is good.
1097 
1098             // Look at the high and low indices of the CE following the match. If
1099             // they are the same it means one of two things:
1100             //    1. The match extended to the last CE from the target text, which is OK, or
1101             //    2. The last CE that was part of the match is in an expansion that extends
1102             //       to the first CE after the match. In this case, we reject the match.
1103             CEI nextCEI = null;
1104             if (search_.elementComparisonType_ == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
1105                 nextCEI = ceb.get(targetIx + targetIxOffset);
1106                 maxLimit = nextCEI.lowIndex_;
1107                 if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) {
1108                     found = false;
1109                 }
1110             } else {
1111                 for (;; ++targetIxOffset) {
1112                     nextCEI = ceb.get(targetIx + targetIxOffset);
1113                     maxLimit = nextCEI.lowIndex_;
1114                     // If we are at the end of the target too, match succeeds
1115                     if (nextCEI.ce_ == CollationPCE.PROCESSED_NULLORDER) {
1116                         break;
1117                     }
1118                     // As long as the next CE has primary weight of 0,
1119                     // it is part of the last target element matched by the pattern;
1120                     // make sure it can be part of a match with the last patCE
1121                     if ((((nextCEI.ce_) >>> 32) & 0xFFFF0000L) == 0) {
1122                         int ceMatch = compareCE64s(nextCEI.ce_, patCE, search_.elementComparisonType_);
1123                         if (ceMatch == CE_NO_MATCH || ceMatch == CE_SKIP_PATN ) {
1124                             found = false;
1125                             break;
1126                         }
1127                     // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
1128                     // target element, but it has non-zero primary weight => match fails
1129                     } else if ( nextCEI.lowIndex_ == nextCEI.highIndex_ ) {
1130                         found = false;
1131                         break;
1132                     // Else the target CE is not part of an expansion of the last matched element, match succeeds
1133                     } else {
1134                         break;
1135                     }
1136                 }
1137             }
1138 
1139             // Check for the start of the match being within a combining sequence.
1140             // This can happen if the pattern itself begins with a combining char, and
1141             // the match found combining marks in the target text that were attached
1142             // to something else.
1143             // This type of match should be rejected for not completely consuming a
1144             // combining sequence.
1145             if (!isBreakBoundary(mStart)) {
1146                 found = false;
1147             }
1148 
1149             // Check for the start of the match being within an Collation Element Expansion,
1150             // meaning that the first char of the match is only partially matched.
1151             // With expansions, the first CE will report the index of the source
1152             // character, and all subsequent (expansions) CEs will report the source index of the
1153             // _following_ character.
1154             int secondIx = firstCEI.highIndex_;
1155             if (mStart == secondIx) {
1156                 found = false;
1157             }
1158 
1159             // Advance the match end position to the first acceptable match boundary.
1160             // This advances the index over any combining characters.
1161             mLimit = maxLimit;
1162             if (minLimit < maxLimit) {
1163                 // When the last CE's low index is same with its high index, the CE is likely
1164                 // a part of expansion. In this case, the index is located just after the
1165                 // character corresponding to the CEs compared above. If the index is right
1166                 // at the break boundary, move the position to the next boundary will result
1167                 // incorrect match length when there are ignorable characters exist between
1168                 // the position and the next character produces CE(s). See ticket#8482.
1169                 if (minLimit == lastCEI.highIndex_ && isBreakBoundary(minLimit)) {
1170                     mLimit = minLimit;
1171                 } else {
1172                     int nba = nextBoundaryAfter(minLimit);
1173                     if (nba >= lastCEI.highIndex_) {
1174                         mLimit = nba;
1175                     }
1176                 }
1177             }
1178 
1179             // If advancing to the end of a combining sequence in character indexing space
1180             // advanced us beyond the end of the match in CE space, reject this match.
1181             if (mLimit > maxLimit) {
1182                 found = false;
1183             }
1184 
1185             if (!isBreakBoundary(mLimit)) {
1186                 found = false;
1187             }
1188 
1189             if (!checkIdentical(mStart, mLimit)) {
1190                 found = false;
1191             }
1192 
1193             if (found) {
1194                 break;
1195             }
1196         }
1197 
1198         // All Done.  Store back the match bounds to the caller.
1199         //
1200         if (found == false) {
1201             mLimit = -1;
1202             mStart = -1;
1203         }
1204 
1205         if (m != null) {
1206             m.start_ = mStart;
1207             m.limit_ = mLimit;
1208         }
1209 
1210         return found;
1211     }
1212 
searchBackwards(int startIdx, Match m)1213     private boolean searchBackwards(int startIdx, Match m) {
1214         //ICU4C_TODO comment:  reject search patterns beginning with a combining char.
1215 
1216         // Input parameter sanity check.
1217         if (pattern_.CELength_ == 0
1218                 || startIdx < search_.beginIndex()
1219                 || startIdx > search_.endIndex()) {
1220             throw new IllegalArgumentException("searchBackwards(" + startIdx + ", m) - expected position to be between " +
1221                     search_.beginIndex() + " and " + search_.endIndex());
1222         }
1223 
1224         if (pattern_.PCE_ == null) {
1225             initializePatternPCETable();
1226         }
1227 
1228         CEBuffer ceb = new CEBuffer(this);
1229         int targetIx = 0;
1230 
1231         /*
1232          * Pre-load the buffer with the CE's for the grapheme
1233          * after our starting position so that we're sure that
1234          * we can look at the CE following the match when we
1235          * check the match boundaries.
1236          *
1237          * This will also pre-fetch the first CE that we'll
1238          * consider for the match.
1239          */
1240         if (startIdx < search_.endIndex()) {
1241             BreakIterator bi = search_.internalBreakIter_;
1242             int next = bi.following(startIdx);
1243 
1244             textIter_.setOffset(next);
1245 
1246             for (targetIx = 0; ; targetIx++) {
1247                 if (ceb.getPrevious(targetIx).lowIndex_ < startIdx) {
1248                     break;
1249                 }
1250             }
1251         } else {
1252             textIter_.setOffset(startIdx);
1253         }
1254 
1255         CEI targetCEI = null;
1256         int patIx;
1257         boolean found;
1258 
1259         int limitIx = targetIx;
1260         int mStart = -1;
1261         int mLimit = -1;
1262         int minLimit;
1263         int maxLimit;
1264 
1265         // Outer loop moves over match starting positions in the
1266         //      target CE space.
1267         // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
1268         // But  patIx is 0 at the beginning of the pattern and increases toward the end.
1269         // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
1270         // and the beginning of the base text.
1271         for (targetIx = limitIx; ; targetIx++) {
1272             found = true;
1273             // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
1274             // (compared to the last CE fetched for the previous targetIx value) as we need to go
1275             // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
1276             CEI lastCEI = ceb.getPrevious(targetIx);
1277             if (lastCEI == null) {
1278                 throw new ICUException("CEBuffer.getPrevious(" + targetIx + ") returned null.");
1279             }
1280             // Inner loop checks for a match beginning at each
1281             // position from the outer loop.
1282             int targetIxOffset = 0;
1283             for (patIx = pattern_.PCELength_ - 1; patIx >= 0; patIx--) {
1284                 long patCE = pattern_.PCE_[patIx];
1285 
1286                 targetCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 - patIx + targetIxOffset);
1287                 // Compare CE from target string with CE from the pattern.
1288                 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
1289                 // which will fail the compare, below.
1290                 int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_);
1291                 if (ceMatch == CE_NO_MATCH) {
1292                     found = false;
1293                     break;
1294                 } else if (ceMatch > CE_NO_MATCH) {
1295                     if (ceMatch == CE_SKIP_TARG) {
1296                         // redo with same patCE, next targCE
1297                         patIx++;
1298                         targetIxOffset++;
1299                     } else { // ceMatch == CE_SKIP_PATN
1300                         // redo with same targCE, next patCE
1301                         targetIxOffset--;
1302                     }
1303                 }
1304             }
1305 
1306             if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) {
1307                 // No match at this targetIx.  Try again at the next.
1308                 continue;
1309             }
1310 
1311             if (!found) {
1312                 // No match at all, we have run off the end of the target text.
1313                 break;
1314             }
1315 
1316             // We have found a match in CE space.
1317             // Now determine the bounds in string index space.
1318             // There still is a chance of match failure if the CE range not correspond to
1319             // an acceptable character range.
1320             //
1321             CEI firstCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 + targetIxOffset);
1322             mStart = firstCEI.lowIndex_;
1323 
1324             // Check for the start of the match being within a combining sequence.
1325             // This can happen if the pattern itself begins with a combining char, and
1326             // the match found combining marks in the target text that were attached
1327             // to something else.
1328             // This type of match should be rejected for not completely consuming a
1329             // combining sequence.
1330             if (!isBreakBoundary(mStart)) {
1331                 found = false;
1332             }
1333 
1334             // Look at the high index of the first CE in the match. If it's the same as the
1335             // low index, the first CE in the match is in the middle of an expansion.
1336             if (mStart == firstCEI.highIndex_) {
1337                 found = false;
1338             }
1339 
1340             minLimit = lastCEI.lowIndex_;
1341 
1342             if (targetIx > 0) {
1343                 // Look at the CE following the match.  If it is UCOL_NULLORDER the match
1344                 // extended to the end of input, and the match is good.
1345 
1346                 // Look at the high and low indices of the CE following the match. If
1347                 // they are the same it means one of two things:
1348                 //    1. The match extended to the last CE from the target text, which is OK, or
1349                 //    2. The last CE that was part of the match is in an expansion that extends
1350                 //       to the first CE after the match. In this case, we reject the match.
1351                 CEI nextCEI  = ceb.getPrevious(targetIx - 1);
1352 
1353                 if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) {
1354                     found = false;
1355                 }
1356 
1357                 mLimit = maxLimit = nextCEI.lowIndex_;
1358 
1359                 // Advance the match end position to the first acceptable match boundary.
1360                 // This advances the index over any combining charcters.
1361                 if (minLimit < maxLimit) {
1362                     int nba = nextBoundaryAfter(minLimit);
1363 
1364                     if (nba >= lastCEI.highIndex_) {
1365                         mLimit = nba;
1366                     }
1367                 }
1368 
1369                 // If advancing to the end of a combining sequence in character indexing space
1370                 // advanced us beyond the end of the match in CE space, reject this match.
1371                 if (mLimit > maxLimit) {
1372                     found = false;
1373                 }
1374 
1375                 // Make sure the end of the match is on a break boundary
1376                 if (!isBreakBoundary(mLimit)) {
1377                     found = false;
1378                 }
1379 
1380             } else {
1381                 // No non-ignorable CEs after this point.
1382                 // The maximum position is detected by boundary after
1383                 // the last non-ignorable CE. Combining sequence
1384                 // across the start index will be truncated.
1385                 int nba = nextBoundaryAfter(minLimit);
1386                 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
1387             }
1388 
1389             if (!checkIdentical(mStart, mLimit)) {
1390                 found = false;
1391             }
1392 
1393             if (found) {
1394                 break;
1395             }
1396         }
1397 
1398         // All Done.  Store back the match bounds to the caller.
1399         //
1400         if (found == false) {
1401             mLimit = -1;
1402             mStart = -1;
1403         }
1404 
1405         if (m != null) {
1406             m.start_ = mStart;
1407             m.limit_ = mLimit;
1408         }
1409 
1410         return found;
1411     }
1412 
1413     // Java porting note:
1414     //
1415     // ICU4C usearch_handleNextExact() is identical to usearch_handleNextCanonical()
1416     // for the linear search implementation. The differences are addressed in search().
1417     //
handleNextExact()1418     private boolean handleNextExact() {
1419         return handleNextCommonImpl();
1420     }
1421 
handleNextCanonical()1422     private boolean handleNextCanonical() {
1423         return handleNextCommonImpl();
1424     }
1425 
handleNextCommonImpl()1426     private boolean handleNextCommonImpl() {
1427         int textOffset = textIter_.getOffset();
1428         Match match = new Match();
1429 
1430         if (search(textOffset, match)) {
1431             search_.matchedIndex_ = match.start_;
1432             search_.setMatchedLength(match.limit_ - match.start_);
1433             return true;
1434         } else {
1435             setMatchNotFound();
1436             return false;
1437         }
1438     }
1439 
1440     // Java porting note:
1441     //
1442     // ICU4C usearch_handlePreviousExact() is identical to usearch_handlePreviousCanonical()
1443     // for the linear search implementation. The differences are addressed in searchBackwards().
1444     //
handlePreviousExact()1445     private boolean handlePreviousExact() {
1446         return handlePreviousCommonImpl();
1447     }
1448 
handlePreviousCanonical()1449     private boolean handlePreviousCanonical() {
1450         return handlePreviousCommonImpl();
1451     }
1452 
handlePreviousCommonImpl()1453     private boolean handlePreviousCommonImpl() {
1454         int textOffset;
1455 
1456         if (search_.isOverlap_) {
1457             if (search_.matchedIndex_ != DONE) {
1458                 textOffset = search_.matchedIndex_ + search_.matchedLength() - 1;
1459             } else {
1460                 // move the start position at the end of possible match
1461                 initializePatternPCETable();
1462                 if (!initTextProcessedIter()) {
1463                     setMatchNotFound();
1464                     return false;
1465                 }
1466                 for (int nPCEs = 0; nPCEs < pattern_.PCELength_ - 1; nPCEs++) {
1467                     long pce = textProcessedIter_.nextProcessed(null);
1468                     if (pce == CollationPCE.PROCESSED_NULLORDER) {
1469                         // at the end of the text
1470                         break;
1471                     }
1472                 }
1473                 textOffset = textIter_.getOffset();
1474             }
1475         } else {
1476             textOffset = textIter_.getOffset();
1477         }
1478 
1479         Match match = new Match();
1480         if (searchBackwards(textOffset, match)) {
1481             search_.matchedIndex_ = match.start_;
1482             search_.setMatchedLength(match.limit_ - match.start_);
1483             return true;
1484         } else {
1485             setMatchNotFound();
1486             return false;
1487         }
1488     }
1489 
1490     /**
1491      * Gets a substring out of a CharacterIterator
1492      *
1493      * Java porting note: Not available in ICU4C
1494      *
1495      * @param text CharacterIterator
1496      * @param start start offset
1497      * @param length of substring
1498      * @return substring from text starting at start and length length
1499      */
getString(CharacterIterator text, int start, int length)1500     private static final String getString(CharacterIterator text, int start, int length) {
1501         StringBuilder result = new StringBuilder(length);
1502         int offset = text.getIndex();
1503         text.setIndex(start);
1504         for (int i = 0; i < length; i++) {
1505             result.append(text.current());
1506             text.next();
1507         }
1508         text.setIndex(offset);
1509         return result.toString();
1510     }
1511 
1512     /**
1513      * Java port of ICU4C struct UPattern (usrchimp.h)
1514      */
1515     private static final class Pattern {
1516         /** Pattern string */
1517         String text_;
1518 
1519         long[] PCE_;
1520         int PCELength_ = 0;
1521 
1522         // TODO: We probably do not need CE_ / CELength_
1523         @SuppressWarnings("unused")
1524         int[] CE_;
1525         int CELength_ = 0;
1526 
1527         // *** Boyer-Moore ***
1528         // boolean hasPrefixAccents_ = false;
1529         // boolean hasSuffixAccents_ = false;
1530         // int defaultShiftSize_;
1531         // char[] shift_;
1532         // char[] backShift_;
1533 
Pattern(String pattern)1534         protected Pattern(String pattern) {
1535             text_ = pattern;
1536         }
1537     }
1538 
1539     /**
1540      * Java port of ICU4C UCollationPCE (usrchimp.h)
1541      */
1542     private static class CollationPCE {
1543         public static final long PROCESSED_NULLORDER = -1;
1544 
1545         private static final int DEFAULT_BUFFER_SIZE = 16;
1546         private static final int BUFFER_GROW = 8;
1547 
1548         // Note: PRIMARYORDERMASK is also duplicated in StringSearch class
1549         private static final int PRIMARYORDERMASK = 0xffff0000;
1550         private static final int CONTINUATION_MARKER = 0xc0;
1551 
1552         private PCEBuffer pceBuffer_ = new PCEBuffer();
1553         private CollationElementIterator cei_;
1554         private int strength_;
1555         private boolean toShift_;
1556         private boolean isShifted_;
1557         private int variableTop_;
1558 
CollationPCE(CollationElementIterator iter)1559         public CollationPCE(CollationElementIterator iter) {
1560             init(iter);
1561         }
1562 
init(CollationElementIterator iter)1563         public void init(CollationElementIterator iter) {
1564             cei_ = iter;
1565             init(iter.getRuleBasedCollator());
1566         }
1567 
init(RuleBasedCollator coll)1568         private void init(RuleBasedCollator coll) {
1569             strength_ = coll.getStrength();
1570             toShift_ = coll.isAlternateHandlingShifted();
1571             isShifted_ = false;
1572             variableTop_ = coll.getVariableTop();
1573         }
1574 
1575         @SuppressWarnings("fallthrough")
processCE(int ce)1576         private long processCE(int ce) {
1577             long primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
1578 
1579             // This is clean, but somewhat slow...
1580             // We could apply the mask to ce and then
1581             // just get all three orders...
1582             switch (strength_) {
1583             default:
1584                 tertiary = CollationElementIterator.tertiaryOrder(ce);
1585                 /* note fall-through */
1586 
1587             case Collator.SECONDARY:
1588                 secondary = CollationElementIterator.secondaryOrder(ce);
1589                 /* note fall-through */
1590 
1591             case Collator.PRIMARY:
1592                 primary = CollationElementIterator.primaryOrder(ce);
1593             }
1594 
1595             // **** This should probably handle continuations too. ****
1596             // **** That means that we need 24 bits for the primary ****
1597             // **** instead of the 16 that we're currently using. ****
1598             // **** So we can lay out the 64 bits as: 24.12.12.16. ****
1599             // **** Another complication with continuations is that ****
1600             // **** the *second* CE is marked as a continuation, so ****
1601             // **** we always have to peek ahead to know how long ****
1602             // **** the primary is... ****
1603             if ((toShift_ && variableTop_ > ce && primary != 0) || (isShifted_ && primary == 0)) {
1604 
1605                 if (primary == 0) {
1606                     return CollationElementIterator.IGNORABLE;
1607                 }
1608 
1609                 if (strength_ >= Collator.QUATERNARY) {
1610                     quaternary = primary;
1611                 }
1612 
1613                 primary = secondary = tertiary = 0;
1614                 isShifted_ = true;
1615             } else {
1616                 if (strength_ >= Collator.QUATERNARY) {
1617                     quaternary = 0xFFFF;
1618                 }
1619 
1620                 isShifted_ = false;
1621             }
1622 
1623             return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
1624         }
1625 
1626         /**
1627          * Get the processed ordering priority of the next collation element in the text.
1628          * A single character may contain more than one collation element.
1629          *
1630          * Note: This is equivalent to
1631          * UCollationPCE::nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
1632          *
1633          * @param range receiving the iterator index before/after fetching the CE.
1634          * @return The next collation elements ordering, otherwise returns PROCESSED_NULLORDER
1635          *         if an error has occurred or if the end of string has been reached
1636          */
nextProcessed(Range range)1637         public long nextProcessed(Range range) {
1638             long result = CollationElementIterator.IGNORABLE;
1639             int low = 0, high = 0;
1640 
1641             pceBuffer_.reset();
1642 
1643             do {
1644                 low = cei_.getOffset();
1645                 int ce = cei_.next();
1646                 high = cei_.getOffset();
1647 
1648                 if (ce == CollationElementIterator.NULLORDER) {
1649                      result = PROCESSED_NULLORDER;
1650                      break;
1651                 }
1652 
1653                 result = processCE(ce);
1654             } while (result == CollationElementIterator.IGNORABLE);
1655 
1656             if (range != null) {
1657                 range.ixLow_ = low;
1658                 range.ixHigh_ = high;
1659             }
1660 
1661             return result;
1662         }
1663 
1664         /**
1665          * Get the processed ordering priority of the previous collation element in the text.
1666          * A single character may contain more than one collation element.
1667          *
1668          * Note: This is equivalent to
1669          * UCollationPCE::previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
1670          *
1671          * @param range receiving the iterator index before/after fetching the CE.
1672          * @return The previous collation elements ordering, otherwise returns
1673          *         PROCESSED_NULLORDER if an error has occurred or if the start of
1674          *         string has been reached.
1675          */
previousProcessed(Range range)1676         public long previousProcessed(Range range) {
1677             long result = CollationElementIterator.IGNORABLE;
1678             int low = 0, high = 0;
1679 
1680             // pceBuffer_.reset();
1681 
1682             while (pceBuffer_.empty()) {
1683                 // buffer raw CEs up to non-ignorable primary
1684                 RCEBuffer rceb = new RCEBuffer();
1685                 int ce;
1686 
1687                 boolean finish = false;
1688 
1689                 // **** do we need to reset rceb, or will it always be empty at this point ****
1690                 do {
1691                     high = cei_.getOffset();
1692                     ce = cei_.previous();
1693                     low = cei_.getOffset();
1694 
1695                     if (ce == CollationElementIterator.NULLORDER) {
1696                         if (!rceb.empty()) {
1697                             break;
1698                         }
1699 
1700                         finish = true;
1701                         break;
1702                     }
1703 
1704                     rceb.put(ce, low, high);
1705                 } while ((ce & PRIMARYORDERMASK) == 0 || isContinuation(ce));
1706 
1707                 if (finish) {
1708                     break;
1709                 }
1710 
1711                 // process the raw CEs
1712                 while (!rceb.empty()) {
1713                     RCEI rcei = rceb.get();
1714 
1715                     result = processCE(rcei.ce_);
1716 
1717                     if (result != CollationElementIterator.IGNORABLE) {
1718                         pceBuffer_.put(result, rcei.low_, rcei.high_);
1719                     }
1720                 }
1721             }
1722 
1723             if (pceBuffer_.empty()) {
1724                 // **** Is -1 the right value for ixLow, ixHigh? ****
1725                 if (range != null) {
1726                     range.ixLow_ = -1;
1727                     range.ixHigh_ = -1;
1728                 }
1729                 return CollationElementIterator.NULLORDER;
1730             }
1731 
1732             PCEI pcei = pceBuffer_.get();
1733 
1734             if (range != null) {
1735                 range.ixLow_ = pcei.low_;
1736                 range.ixHigh_ = pcei.high_;
1737             }
1738 
1739             return pcei.ce_;
1740         }
1741 
isContinuation(int ce)1742         private static boolean isContinuation(int ce) {
1743             return ((ce & CONTINUATION_MARKER) == CONTINUATION_MARKER);
1744         }
1745 
1746         public static final class Range {
1747             int ixLow_;
1748             int ixHigh_;
1749         }
1750 
1751         /** Processed collation element buffer stuff ported from ICU4C ucoleitr.cpp */
1752         private static final class PCEI {
1753             long ce_;
1754             int low_;
1755             int high_;
1756         }
1757 
1758         private static final class PCEBuffer {
1759             private PCEI[] buffer_ = new PCEI[DEFAULT_BUFFER_SIZE];
1760             private int bufferIndex_ = 0;
1761 
reset()1762             void reset() {
1763                 bufferIndex_ = 0;
1764             }
1765 
empty()1766             boolean empty() {
1767                 return bufferIndex_ <= 0;
1768             }
1769 
put(long ce, int ixLow, int ixHigh)1770             void put(long ce, int ixLow, int ixHigh)
1771             {
1772                 if (bufferIndex_ >= buffer_.length) {
1773                     PCEI[] newBuffer = new PCEI[buffer_.length + BUFFER_GROW];
1774                     System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length);
1775                     buffer_ = newBuffer;
1776                 }
1777                 buffer_[bufferIndex_] = new PCEI();
1778                 buffer_[bufferIndex_].ce_ = ce;
1779                 buffer_[bufferIndex_].low_ = ixLow;
1780                 buffer_[bufferIndex_].high_ = ixHigh;
1781 
1782                 bufferIndex_ += 1;
1783             }
1784 
get()1785             PCEI get() {
1786                 if (bufferIndex_ > 0) {
1787                     return buffer_[--bufferIndex_];
1788                 }
1789                 return null;
1790             }
1791         }
1792 
1793         /** Raw collation element buffer stuff ported from ICU4C ucoleitr.cpp */
1794         private static final class RCEI {
1795             int ce_;
1796             int low_;
1797             int high_;
1798         }
1799 
1800         private static final class RCEBuffer {
1801             private RCEI[] buffer_ = new RCEI[DEFAULT_BUFFER_SIZE];
1802             private int bufferIndex_ = 0;
1803 
empty()1804             boolean empty() {
1805                 return bufferIndex_ <= 0;
1806             }
1807 
put(int ce, int ixLow, int ixHigh)1808             void put(int ce, int ixLow, int ixHigh) {
1809                 if (bufferIndex_ >= buffer_.length) {
1810                     RCEI[] newBuffer = new RCEI[buffer_.length + BUFFER_GROW];
1811                     System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length);
1812                     buffer_ = newBuffer;
1813                 }
1814                 buffer_[bufferIndex_] = new RCEI();
1815                 buffer_[bufferIndex_].ce_ = ce;
1816                 buffer_[bufferIndex_].low_ = ixLow;
1817                 buffer_[bufferIndex_].high_ = ixHigh;
1818 
1819                 bufferIndex_ += 1;
1820             }
1821 
get()1822             RCEI get() {
1823                 if (bufferIndex_ > 0) {
1824                     return buffer_[--bufferIndex_];
1825                 }
1826                 return null;
1827             }
1828         }
1829     }
1830 
1831     /**
1832      * Java port of ICU4C CEI (usearch.cpp)
1833      *
1834      * CEI  Collation Element + source text index.
1835      *      These structs are kept in the circular buffer.
1836      */
1837     private static class CEI {
1838         long ce_;
1839         int lowIndex_;
1840         int highIndex_;
1841     }
1842 
1843     /**
1844      * CEBuffer A circular buffer of CEs from the text being searched
1845      */
1846     private static class CEBuffer {
1847         // Java porting note: ICU4C uses the size for stack buffer
1848         // static final int DEFAULT_CEBUFFER_SIZE = 96;
1849 
1850         static final int CEBUFFER_EXTRA = 32;
1851         static final int MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L = 8;
1852         static final int MAX_TARGET_IGNORABLES_PER_PAT_OTHER = 3;
1853 
1854         CEI[] buf_;
1855         int bufSize_;
1856         int firstIx_;
1857         int limitIx_;
1858 
1859         // Java porting note: No references in ICU4C implementation
1860         // CollationElementIterator ceIter_;
1861 
1862         StringSearch strSearch_;
1863 
CEBuffer(StringSearch ss)1864         CEBuffer(StringSearch ss) {
1865             strSearch_ = ss;
1866             bufSize_ = ss.pattern_.PCELength_ + CEBUFFER_EXTRA;
1867             if (ss.search_.elementComparisonType_ != ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
1868                 String patText = ss.pattern_.text_;
1869                 if (patText != null) {
1870                     for (int i = 0; i < patText.length(); i++) {
1871                         char c = patText.charAt(i);
1872                         if (MIGHT_BE_JAMO_L(c)) {
1873                             bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
1874                         } else {
1875                             // No check for surrogates, we might allocate slightly more buffer than necessary.
1876                             bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
1877                         }
1878                     }
1879                 }
1880             }
1881 
1882             // Not used - see above
1883             // ceIter_ = ss.textIter_;
1884 
1885             firstIx_ = 0;
1886             limitIx_ = 0;
1887 
1888             if (!ss.initTextProcessedIter()) {
1889                 return;
1890             }
1891 
1892             buf_ = new CEI[bufSize_];
1893         }
1894 
1895         // Get the CE with the specified index.
1896         //   Index must be in the range
1897         //             n-history_size < index < n+1
1898         //   where n is the largest index to have been fetched by some previous call to this function.
1899         //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
1900         //
get(int index)1901         CEI get(int index) {
1902             int i = index % bufSize_;
1903 
1904             if (index >= firstIx_ && index < limitIx_) {
1905                 // The request was for an entry already in our buffer.
1906                 // Just return it.
1907                 return buf_[i];
1908             }
1909 
1910             // Caller is requesting a new, never accessed before, CE.
1911             // Verify that it is the next one in sequence, which is all
1912             // that is allowed.
1913             if (index != limitIx_) {
1914                 assert(false);
1915                 return null;
1916             }
1917 
1918             // Manage the circular CE buffer indexing
1919             limitIx_++;
1920 
1921             if (limitIx_ - firstIx_ >= bufSize_) {
1922                 // The buffer is full, knock out the lowest-indexed entry.
1923                 firstIx_++;
1924             }
1925 
1926             CollationPCE.Range range = new CollationPCE.Range();
1927             if (buf_[i] == null) {
1928                 buf_[i] = new CEI();
1929             }
1930             buf_[i].ce_ = strSearch_.textProcessedIter_.nextProcessed(range);
1931             buf_[i].lowIndex_ = range.ixLow_;
1932             buf_[i].highIndex_ = range.ixHigh_;
1933 
1934             return buf_[i];
1935         }
1936 
1937         // Get the CE with the specified index.
1938         //   Index must be in the range
1939         //             n-history_size < index < n+1
1940         //   where n is the largest index to have been fetched by some previous call to this function.
1941         //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
1942         //
getPrevious(int index)1943         CEI getPrevious(int index) {
1944             int i = index % bufSize_;
1945 
1946             if (index >= firstIx_ && index < limitIx_) {
1947                 // The request was for an entry already in our buffer.
1948                 // Just return it.
1949                 return buf_[i];
1950             }
1951 
1952             // Caller is requesting a new, never accessed before, CE.
1953             // Verify that it is the next one in sequence, which is all
1954             // that is allowed.
1955             if (index != limitIx_) {
1956                 assert(false);
1957                 return null;
1958             }
1959 
1960             // Manage the circular CE buffer indexing
1961             limitIx_++;
1962 
1963             if (limitIx_ - firstIx_ >= bufSize_) {
1964                 // The buffer is full, knock out the lowest-indexed entry.
1965                 firstIx_++;
1966             }
1967 
1968             CollationPCE.Range range = new CollationPCE.Range();
1969             if (buf_[i] == null) {
1970                 buf_[i] = new CEI();
1971             }
1972             buf_[i].ce_ = strSearch_.textProcessedIter_.previousProcessed(range);
1973             buf_[i].lowIndex_ = range.ixLow_;
1974             buf_[i].highIndex_ = range.ixHigh_;
1975 
1976             return buf_[i];
1977         }
1978 
MIGHT_BE_JAMO_L(char c)1979         static boolean MIGHT_BE_JAMO_L(char c) {
1980             return (c >= 0x1100 && c <= 0x115E)
1981                     || (c >= 0x3131 && c <= 0x314E)
1982                     || (c >= 0x3165 && c <= 0x3186);
1983         }
1984     }
1985 }
1986