1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 package com.ibm.icu.text; 8 9 import java.text.CharacterIterator; 10 import java.text.StringCharacterIterator; 11 import java.util.Locale; 12 13 import com.ibm.icu.util.ICUException; 14 import com.ibm.icu.util.ULocale; 15 16 // Java porting note: 17 // 18 // The ICU4C implementation contains dead code in many places. 19 // While porting the ICU4C linear search implementation, this dead code 20 // was not fully ported. The code blocks tagged by "// *** Boyer-Moore ***" 21 // are those dead code blocks, still available in ICU4C. 22 23 // The ICU4C implementation does not seem to handle UCharacterIterator pointing 24 // to a fragment of text properly. ICU4J uses CharacterIterator to navigate through 25 // the input text. We need to carefully review the code ported from ICU4C 26 // assuming the start index is 0. 27 28 // ICU4C implementation initializes pattern.CE and pattern.PCE. It looks like 29 // CE is no longer used, except in a few places checking CELength. It looks like this 30 // is a leftover from already-disabled Boyer-Moore search code. This Java implementation 31 // preserves the code, but we should clean this up later. 32 33 /** 34 * 35 * <tt>StringSearch</tt> is a {@link SearchIterator} that provides 36 * language-sensitive text searching based on the comparison rules defined 37 * in a {@link RuleBasedCollator} object. 38 * StringSearch ensures that language eccentricity can be 39 * handled, e.g. for the German collator, characters ß and SS will be matched 40 * if case is chosen to be ignored. 41 * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm"> 42 * "ICU Collation Design Document"</a> for more information. 43 * <p> 44 * There are 2 match options for selection:<br> 45 * Let S' be the sub-string of a text string S between the offsets start and 46 * end [start, end]. 47 * <br> 48 * A pattern string P matches a text string S at the offsets [start, end] 49 * if 50 * <pre> 51 * option 1. Some canonical equivalent of P matches some canonical equivalent 52 * of S' 53 * option 2. P matches S' and if P starts or ends with a combining mark, 54 * there exists no non-ignorable combining mark before or after S? 55 * in S respectively. 56 * </pre> 57 * Option 2. is the default. 58 * <p> 59 * This search has APIs similar to that of other text iteration mechanisms 60 * such as the break iterators in {@link BreakIterator}. Using these 61 * APIs, it is easy to scan through text looking for all occurrences of 62 * a given pattern. This search iterator allows changing of direction by 63 * calling a {@link #reset} followed by a {@link #next} or {@link #previous}. 64 * Though a direction change can occur without calling {@link #reset} first, 65 * this operation comes with some speed penalty. 66 * Match results in the forward direction will match the result matches in 67 * the backwards direction in the reverse order 68 * <p> 69 * {@link SearchIterator} provides APIs to specify the starting position 70 * within the text string to be searched, e.g. {@link SearchIterator#setIndex setIndex}, 71 * {@link SearchIterator#preceding preceding} and {@link SearchIterator#following following}. 72 * Since the starting position will be set as it is specified, please take note that 73 * there are some danger points at which the search may render incorrect 74 * results: 75 * <ul> 76 * <li> In the midst of a substring that requires normalization. 77 * <li> If the following match is to be found, the position should not be the 78 * second character which requires swapping with the preceding 79 * character. Vice versa, if the preceding match is to be found, the 80 * position to search from should not be the first character which 81 * requires swapping with the next character. E.g certain Thai and 82 * Lao characters require swapping. 83 * <li> If a following pattern match is to be found, any position within a 84 * contracting sequence except the first will fail. Vice versa if a 85 * preceding pattern match is to be found, an invalid starting point 86 * would be any character within a contracting sequence except the last. 87 * </ul> 88 * <p> 89 * A {@link BreakIterator} can be used if only matches at logical breaks are desired. 90 * Using a {@link BreakIterator} will only give you results that exactly matches the 91 * boundaries given by the {@link BreakIterator}. For instance the pattern "e" will 92 * not be found in the string "\u00e9" if a character break iterator is used. 93 * <p> 94 * Options are provided to handle overlapping matches. 95 * E.g. In English, overlapping matches produces the result 0 and 2 96 * for the pattern "abab" in the text "ababab", where mutually 97 * exclusive matches only produces the result of 0. 98 * <p> 99 * Options are also provided to implement "asymmetric search" as described in 100 * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search"> 101 * UTS #10 Unicode Collation Algorithm</a>, specifically the ElementComparisonType 102 * values. 103 * <p> 104 * Though collator attributes will be taken into consideration while 105 * performing matches, there are no APIs here for setting and getting the 106 * attributes. These attributes can be set by getting the collator 107 * from {@link #getCollator} and using the APIs in {@link RuleBasedCollator}. 108 * Lastly to update <tt>StringSearch</tt> to the new collator attributes, 109 * {@link #reset} has to be called. 110 * <p> 111 * Restriction: <br> 112 * Currently there are no composite characters that consists of a 113 * character with combining class > 0 before a character with combining 114 * class == 0. However, if such a character exists in the future, 115 * <tt>StringSearch</tt> does not guarantee the results for option 1. 116 * <p> 117 * Consult the {@link SearchIterator} documentation for information on 118 * and examples of how to use instances of this class to implement text 119 * searching. 120 * <p> 121 * Note, <tt>StringSearch</tt> is not to be subclassed. 122 * </p> 123 * @see SearchIterator 124 * @see RuleBasedCollator 125 * @author Laura Werner, synwee 126 * @stable ICU 2.0 127 */ 128 // internal notes: all methods do not guarantee the correct status of the 129 // characteriterator. the caller has to maintain the original index position 130 // if necessary. methods could change the index position as it deems fit 131 public final class StringSearch extends SearchIterator { 132 133 private Pattern pattern_; 134 private RuleBasedCollator collator_; 135 136 // positions within the collation element iterator is used to determine 137 // if we are at the start of the text. 138 private CollationElementIterator textIter_; 139 private CollationPCE textProcessedIter_; 140 141 // utility collation element, used throughout program for temporary 142 // iteration. 143 private CollationElementIterator utilIter_; 144 145 private int strength_; 146 int ceMask_; 147 int variableTop_; 148 149 private boolean toShift_; 150 151 // *** Boyer-Moore *** 152 // private char[] canonicalPrefixAccents_; 153 // private char[] canonicalSuffixAccents_; 154 155 /** 156 * Initializes the iterator to use the language-specific rules defined in 157 * the argument collator to search for argument pattern in the argument 158 * target text. The argument <code>breakiter</code> is used to define logical matches. 159 * See super class documentation for more details on the use of the target 160 * text and {@link BreakIterator}. 161 * @param pattern text to look for. 162 * @param target target text to search for pattern. 163 * @param collator {@link RuleBasedCollator} that defines the language rules 164 * @param breakiter A {@link BreakIterator} that is used to determine the 165 * boundaries of a logical match. This argument can be null. 166 * @throws IllegalArgumentException thrown when argument target is null, 167 * or of length 0 168 * @see BreakIterator 169 * @see RuleBasedCollator 170 * @stable ICU 2.0 171 */ StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator, BreakIterator breakiter)172 public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator, 173 BreakIterator breakiter) { 174 175 // This implementation is ported from ICU4C usearch_open() 176 177 super(target, breakiter); 178 179 // string search does not really work when numeric collation is turned on 180 if (collator.getNumericCollation()) { 181 throw new UnsupportedOperationException("Numeric collation is not supported by StringSearch"); 182 } 183 184 collator_ = collator; 185 strength_ = collator.getStrength(); 186 ceMask_ = getMask(strength_); 187 toShift_ = collator.isAlternateHandlingShifted(); 188 variableTop_ = collator.getVariableTop(); 189 190 pattern_ = new Pattern(pattern); 191 192 search_.setMatchedLength(0); 193 search_.matchedIndex_ = DONE; 194 195 utilIter_ = null; 196 textIter_ = new CollationElementIterator(target, collator); 197 198 textProcessedIter_ = null; 199 200 // This is done by super class constructor 201 /* 202 search_.isOverlap_ = false; 203 search_.isCanonicalMatch_ = false; 204 search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON; 205 search_.isForwardSearching_ = true; 206 search_.reset_ = true; 207 */ 208 ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE); 209 search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale); 210 search_.internalBreakIter_.setText((CharacterIterator)target.clone()); // We need to create a clone 211 212 initialize(); 213 } 214 215 /** 216 * Initializes the iterator to use the language-specific rules defined in 217 * the argument collator to search for argument pattern in the argument 218 * target text. No {@link BreakIterator}s are set to test for logical matches. 219 * @param pattern text to look for. 220 * @param target target text to search for pattern. 221 * @param collator {@link RuleBasedCollator} that defines the language rules 222 * @throws IllegalArgumentException thrown when argument target is null, 223 * or of length 0 224 * @see RuleBasedCollator 225 * @stable ICU 2.0 226 */ StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator)227 public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator) { 228 this(pattern, target, collator, null); 229 } 230 231 /** 232 * Initializes the iterator to use the language-specific rules and 233 * break iterator rules defined in the argument locale to search for 234 * argument pattern in the argument target text. 235 * @param pattern text to look for. 236 * @param target target text to search for pattern. 237 * @param locale locale to use for language and break iterator rules 238 * @throws IllegalArgumentException thrown when argument target is null, 239 * or of length 0. ClassCastException thrown if the collator for 240 * the specified locale is not a RuleBasedCollator. 241 * @stable ICU 2.0 242 */ StringSearch(String pattern, CharacterIterator target, Locale locale)243 public StringSearch(String pattern, CharacterIterator target, Locale locale) { 244 this(pattern, target, ULocale.forLocale(locale)); 245 } 246 247 /** 248 * Initializes the iterator to use the language-specific rules and 249 * break iterator rules defined in the argument locale to search for 250 * argument pattern in the argument target text. 251 * See super class documentation for more details on the use of the target 252 * text and {@link BreakIterator}. 253 * @param pattern text to look for. 254 * @param target target text to search for pattern. 255 * @param locale locale to use for language and break iterator rules 256 * @throws IllegalArgumentException thrown when argument target is null, 257 * or of length 0. ClassCastException thrown if the collator for 258 * the specified locale is not a RuleBasedCollator. 259 * @see BreakIterator 260 * @see RuleBasedCollator 261 * @see SearchIterator 262 * @stable ICU 3.2 263 */ StringSearch(String pattern, CharacterIterator target, ULocale locale)264 public StringSearch(String pattern, CharacterIterator target, ULocale locale) { 265 this(pattern, target, (RuleBasedCollator) Collator.getInstance(locale), null); 266 } 267 268 /** 269 * Initializes the iterator to use the language-specific rules and 270 * break iterator rules defined in the default locale to search for 271 * argument pattern in the argument target text. 272 * @param pattern text to look for. 273 * @param target target text to search for pattern. 274 * @throws IllegalArgumentException thrown when argument target is null, 275 * or of length 0. ClassCastException thrown if the collator for 276 * the default locale is not a RuleBasedCollator. 277 * @stable ICU 2.0 278 */ StringSearch(String pattern, String target)279 public StringSearch(String pattern, String target) { 280 this(pattern, new StringCharacterIterator(target), 281 (RuleBasedCollator) Collator.getInstance(), null); 282 } 283 284 /** 285 * Gets the {@link RuleBasedCollator} used for the language rules. 286 * <p> 287 * Since <tt>StringSearch</tt> depends on the returned {@link RuleBasedCollator}, any 288 * changes to the {@link RuleBasedCollator} result should follow with a call to 289 * either {@link #reset()} or {@link #setCollator(RuleBasedCollator)} to ensure the correct 290 * search behavior. 291 * </p> 292 * @return {@link RuleBasedCollator} used by this <tt>StringSearch</tt> 293 * @see RuleBasedCollator 294 * @see #setCollator 295 * @stable ICU 2.0 296 */ getCollator()297 public RuleBasedCollator getCollator() { 298 return collator_; 299 } 300 301 /** 302 * Sets the {@link RuleBasedCollator} to be used for language-specific searching. 303 * <p> 304 * The iterator's position will not be changed by this method. 305 * @param collator to use for this <tt>StringSearch</tt> 306 * @throws IllegalArgumentException thrown when collator is null 307 * @see #getCollator 308 * @stable ICU 2.0 309 */ setCollator(RuleBasedCollator collator)310 public void setCollator(RuleBasedCollator collator) { 311 if (collator == null) { 312 throw new IllegalArgumentException("Collator can not be null"); 313 } 314 collator_ = collator; 315 ceMask_ = getMask(collator_.getStrength()); 316 317 ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE); 318 search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale); 319 search_.internalBreakIter_.setText((CharacterIterator)search_.text().clone()); // We need to create a clone 320 321 toShift_ = collator.isAlternateHandlingShifted(); 322 variableTop_ = collator.getVariableTop(); 323 textIter_ = new CollationElementIterator(pattern_.text_, collator); 324 utilIter_ = new CollationElementIterator(pattern_.text_, collator); 325 326 // initialize() _after_ setting the iterators for the new collator. 327 initialize(); 328 } 329 330 /** 331 * Returns the pattern for which <tt>StringSearch</tt> is searching for. 332 * @return the pattern searched for 333 * @stable ICU 2.0 334 */ getPattern()335 public String getPattern() { 336 return pattern_.text_; 337 } 338 339 /** 340 * Set the pattern to search for. 341 * The iterator's position will not be changed by this method. 342 * @param pattern for searching 343 * @see #getPattern 344 * @exception IllegalArgumentException thrown if pattern is null or of 345 * length 0 346 * @stable ICU 2.0 347 */ setPattern(String pattern)348 public void setPattern(String pattern) { 349 if (pattern == null || pattern.length() <= 0) { 350 throw new IllegalArgumentException( 351 "Pattern to search for can not be null or of length 0"); 352 } 353 pattern_.text_ = pattern; 354 initialize(); 355 } 356 357 /** 358 * Determines whether canonical matches (option 1, as described in the 359 * class documentation) is set. 360 * See setCanonical(boolean) for more information. 361 * @see #setCanonical 362 * @return true if canonical matches is set, false otherwise 363 * @stable ICU 2.8 364 */ 365 //TODO: hoist this to SearchIterator isCanonical()366 public boolean isCanonical() { 367 return search_.isCanonicalMatch_; 368 } 369 370 /** 371 * Set the canonical match mode. See class documentation for details. 372 * The default setting for this property is false. 373 * @param allowCanonical flag indicator if canonical matches are allowed 374 * @see #isCanonical 375 * @stable ICU 2.8 376 */ 377 //TODO: hoist this to SearchIterator setCanonical(boolean allowCanonical)378 public void setCanonical(boolean allowCanonical) { 379 search_.isCanonicalMatch_ = allowCanonical; 380 } 381 382 /** 383 * {@inheritDoc} 384 * @stable ICU 2.8 385 */ 386 @Override setTarget(CharacterIterator text)387 public void setTarget(CharacterIterator text) { 388 super.setTarget(text); 389 textIter_.setText(text); 390 } 391 392 /** 393 * {@inheritDoc} 394 * @stable ICU 2.8 395 */ 396 @Override getIndex()397 public int getIndex() { 398 int result = textIter_.getOffset(); 399 if (isOutOfBounds(search_.beginIndex(), search_.endIndex(), result)) { 400 return DONE; 401 } 402 return result; 403 } 404 405 /** 406 * {@inheritDoc} 407 * @stable ICU 2.8 408 */ 409 @Override setIndex(int position)410 public void setIndex(int position) { 411 // Java porting note: This method is equivalent to setOffset() in ICU4C. 412 // ICU4C SearchIterator::setOffset() is a pure virtual method, while 413 // ICU4J SearchIterator.setIndex() is not abstract method. 414 415 super.setIndex(position); 416 textIter_.setOffset(position); 417 } 418 419 /** 420 * {@inheritDoc} 421 * @stable ICU 2.8 422 */ 423 @Override reset()424 public void reset() { 425 // reset is setting the attributes that are already in 426 // string search, hence all attributes in the collator should 427 // be retrieved without any problems 428 429 boolean sameCollAttribute = true; 430 int ceMask; 431 boolean shift; 432 int varTop; 433 434 // **** hack to deal w/ how processed CEs encode quaternary **** 435 int newStrength = collator_.getStrength(); 436 if ((strength_ < Collator.QUATERNARY && newStrength >= Collator.QUATERNARY) 437 || (strength_ >= Collator.QUATERNARY && newStrength < Collator.QUATERNARY)) { 438 sameCollAttribute = false; 439 } 440 441 strength_ = collator_.getStrength(); 442 ceMask = getMask(strength_); 443 if (ceMask_ != ceMask) { 444 ceMask_ = ceMask; 445 sameCollAttribute = false; 446 } 447 448 shift = collator_.isAlternateHandlingShifted(); 449 if (toShift_ != shift) { 450 toShift_ = shift; 451 sameCollAttribute = false; 452 } 453 454 varTop = collator_.getVariableTop(); 455 if (variableTop_ != varTop) { 456 variableTop_ = varTop; 457 sameCollAttribute = false; 458 } 459 460 if (!sameCollAttribute) { 461 initialize(); 462 } 463 464 textIter_.setText(search_.text()); 465 466 search_.setMatchedLength(0); 467 search_.matchedIndex_ = DONE; 468 search_.isOverlap_ = false; 469 search_.isCanonicalMatch_ = false; 470 search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON; 471 search_.isForwardSearching_ = true; 472 search_.reset_ = true; 473 } 474 475 /** 476 * {@inheritDoc} 477 * @stable ICU 2.8 478 */ 479 @Override handleNext(int position)480 protected int handleNext(int position) { 481 if (pattern_.CELength_ == 0) { 482 search_.matchedIndex_ = search_.matchedIndex_ == DONE ? 483 getIndex() : search_.matchedIndex_ + 1; 484 search_.setMatchedLength(0); 485 textIter_.setOffset(search_.matchedIndex_); 486 if (search_.matchedIndex_ == search_.endIndex()) { 487 search_.matchedIndex_ = DONE; 488 } 489 } else { 490 if (search_.matchedLength() <= 0) { 491 // the flipping direction issue has already been handled 492 // in next() 493 // for boundary check purposes. this will ensure that the 494 // next match will not preceed the current offset 495 // note search_.matchedIndex_ will always be set to something 496 // in the code 497 search_.matchedIndex_ = position - 1; 498 } 499 500 textIter_.setOffset(position); 501 502 // ICU4C comment: 503 // if strsrch_->breakIter is always the same as m_breakiterator_ 504 // then we don't need to check the match boundaries here because 505 // usearch_handleNextXXX will already have done it. 506 if (search_.isCanonicalMatch_) { 507 // *could* actually use exact here 'cause no extra accents allowed... 508 handleNextCanonical(); 509 } else { 510 handleNextExact(); 511 } 512 513 if (search_.matchedIndex_ == DONE) { 514 textIter_.setOffset(search_.endIndex()); 515 } else { 516 textIter_.setOffset(search_.matchedIndex_); 517 } 518 519 return search_.matchedIndex_; 520 } 521 522 return DONE; 523 } 524 525 /** 526 * {@inheritDoc} 527 * @stable ICU 2.8 528 */ 529 @Override handlePrevious(int position)530 protected int handlePrevious(int position) { 531 if (pattern_.CELength_ == 0) { 532 search_.matchedIndex_ = 533 search_.matchedIndex_ == DONE ? getIndex() : search_.matchedIndex_; 534 if (search_.matchedIndex_ == search_.beginIndex()) { 535 setMatchNotFound(); 536 } else { 537 search_.matchedIndex_--; 538 textIter_.setOffset(search_.matchedIndex_); 539 search_.setMatchedLength(0); 540 } 541 } else { 542 textIter_.setOffset(position); 543 544 if (search_.isCanonicalMatch_) { 545 // *could* use exact match here since extra accents *not* allowed! 546 handlePreviousCanonical(); 547 } else { 548 handlePreviousExact(); 549 } 550 } 551 552 return search_.matchedIndex_; 553 } 554 555 // ------------------ Internal implementation code --------------------------- 556 557 private static final int INITIAL_ARRAY_SIZE_ = 256; 558 559 // *** Boyer-Moore *** 560 // private static final Normalizer2Impl nfcImpl_ = Norm2AllModes.getNFCInstance().impl; 561 // private static final int LAST_BYTE_MASK_ = 0xff; 562 // private static final int SECOND_LAST_BYTE_SHIFT_ = 8; 563 564 private static final int PRIMARYORDERMASK = 0xffff0000; 565 private static final int SECONDARYORDERMASK = 0x0000ff00; 566 private static final int TERTIARYORDERMASK = 0x000000ff; 567 568 /** 569 * Getting the mask for collation strength 570 * @param strength collation strength 571 * @return collation element mask 572 */ getMask(int strength)573 private static int getMask(int strength) { 574 switch (strength) { 575 case Collator.PRIMARY: 576 return PRIMARYORDERMASK; 577 case Collator.SECONDARY: 578 return SECONDARYORDERMASK | PRIMARYORDERMASK; 579 default: 580 return TERTIARYORDERMASK | SECONDARYORDERMASK | PRIMARYORDERMASK; 581 } 582 } 583 584 585 // *** Boyer-Moore *** 586 /* 587 private final char getFCD(String str, int offset) { 588 char ch = str.charAt(offset); 589 if (ch < 0x180) { 590 return (char) nfcImpl_.getFCD16FromBelow180(ch); 591 } else if (nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) { 592 if (!Character.isHighSurrogate(ch)) { 593 return (char) nfcImpl_.getFCD16FromNormData(ch); 594 } else { 595 char c2; 596 if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) { 597 return (char) nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2)); 598 } 599 } 600 } 601 return 0; 602 } 603 604 private final char getFCD(int c) { 605 return (char)nfcImpl_.getFCD16(c); 606 } 607 */ 608 609 /** 610 * Getting the modified collation elements taking into account the collation 611 * attributes. 612 * 613 * @param sourcece 614 * @return the modified collation element 615 */ getCE(int sourcece)616 private int getCE(int sourcece) { 617 // note for tertiary we can't use the collator->tertiaryMask, that 618 // is a preprocessed mask that takes into account case options. since 619 // we are only concerned with exact matches, we don't need that. 620 sourcece &= ceMask_; 621 622 if (toShift_) { 623 // alternate handling here, since only the 16 most significant digits 624 // is only used, we can safely do a compare without masking 625 // if the ce is a variable, we mask and get only the primary values 626 // no shifting to quartenary is required since all primary values 627 // less than variabletop will need to be masked off anyway. 628 if (variableTop_ > sourcece) { 629 if (strength_ >= Collator.QUATERNARY) { 630 sourcece &= PRIMARYORDERMASK; 631 } else { 632 sourcece = CollationElementIterator.IGNORABLE; 633 } 634 } 635 } else if (strength_ >= Collator.QUATERNARY && sourcece == CollationElementIterator.IGNORABLE) { 636 sourcece = 0xFFFF; 637 } 638 639 return sourcece; 640 } 641 642 /** 643 * Direct port of ICU4C static int32_t * addTouint32_tArray(...) in usearch.cpp. 644 * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should 645 * implement this in Pattern class. 646 * 647 * @param destination target array 648 * @param offset destination offset to add value 649 * @param destinationlength target array size 650 * @param value to be added 651 * @param increments incremental size expected 652 * @return new destination array, destination if there was no new allocation 653 */ addToIntArray(int[] destination, int offset, int destinationlength, int value, int increments)654 private static int[] addToIntArray(int[] destination, int offset, int destinationlength, 655 int value, int increments) { 656 int newlength = destinationlength; 657 if (offset + 1 == newlength) { 658 newlength += increments; 659 int temp[] = new int[newlength]; 660 System.arraycopy(destination, 0, temp, 0, offset); 661 destination = temp; 662 } 663 destination[offset] = value; 664 return destination; 665 } 666 667 /** 668 * Direct port of ICU4C static int64_t * addTouint64_tArray(...) in usearch.cpp. 669 * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should 670 * implement this in Pattern class. 671 * 672 * @param destination target array 673 * @param offset destination offset to add value 674 * @param destinationlength target array size 675 * @param value to be added 676 * @param increments incremental size expected 677 * @return new destination array, destination if there was no new allocation 678 */ addToLongArray(long[] destination, int offset, int destinationlength, long value, int increments)679 private static long[] addToLongArray(long[] destination, int offset, int destinationlength, 680 long value, int increments) { 681 int newlength = destinationlength; 682 if (offset + 1 == newlength) { 683 newlength += increments; 684 long temp[] = new long[newlength]; 685 System.arraycopy(destination, 0, temp, 0, offset); 686 destination = temp; 687 } 688 destination[offset] = value; 689 return destination; 690 } 691 692 /** 693 * Initializing the ce table for a pattern. 694 * Stores non-ignorable collation keys. 695 * Table size will be estimated by the size of the pattern text. Table 696 * expansion will be perform as we go along. Adding 1 to ensure that the table 697 * size definitely increases. 698 * @return total number of expansions 699 */ 700 // TODO: We probably do not need Pattern CE table. initializePatternCETable()701 private int initializePatternCETable() { 702 int[] cetable = new int[INITIAL_ARRAY_SIZE_]; 703 int cetablesize = cetable.length; 704 int patternlength = pattern_.text_.length(); 705 CollationElementIterator coleiter = utilIter_; 706 707 if (coleiter == null) { 708 coleiter = new CollationElementIterator(pattern_.text_, collator_); 709 utilIter_ = coleiter; 710 } else { 711 coleiter.setText(pattern_.text_); 712 } 713 714 int offset = 0; 715 int result = 0; 716 int ce; 717 718 while ((ce = coleiter.next()) != CollationElementIterator.NULLORDER) { 719 int newce = getCE(ce); 720 if (newce != CollationElementIterator.IGNORABLE /* 0 */) { 721 int[] temp = addToIntArray(cetable, offset, cetablesize, newce, 722 patternlength - coleiter.getOffset() + 1); 723 offset++; 724 cetable = temp; 725 } 726 result += (coleiter.getMaxExpansion(ce) - 1); 727 } 728 729 cetable[offset] = 0; 730 pattern_.CE_ = cetable; 731 pattern_.CELength_ = offset; 732 733 return result; 734 } 735 736 /** 737 * Initializing the pce table for a pattern. 738 * Stores non-ignorable collation keys. 739 * Table size will be estimated by the size of the pattern text. Table 740 * expansion will be perform as we go along. Adding 1 to ensure that the table 741 * size definitely increases. 742 * @return total number of expansions 743 */ initializePatternPCETable()744 private int initializePatternPCETable() { 745 long[] pcetable = new long[INITIAL_ARRAY_SIZE_]; 746 int pcetablesize = pcetable.length; 747 int patternlength = pattern_.text_.length(); 748 CollationElementIterator coleiter = utilIter_; 749 750 if (coleiter == null) { 751 coleiter = new CollationElementIterator(pattern_.text_, collator_); 752 utilIter_ = coleiter; 753 } else { 754 coleiter.setText(pattern_.text_); 755 } 756 757 int offset = 0; 758 int result = 0; 759 long pce; 760 761 CollationPCE iter = new CollationPCE(coleiter); 762 763 // ** Should processed CEs be signed or unsigned? 764 // ** (the rest of the code in this file seems to play fast-and-loose with 765 // ** whether a CE is signed or unsigned. For example, look at routine above this one.) 766 while ((pce = iter.nextProcessed(null)) != CollationPCE.PROCESSED_NULLORDER) { 767 long[] temp = addToLongArray(pcetable, offset, pcetablesize, pce, patternlength - coleiter.getOffset() + 1); 768 offset++; 769 pcetable = temp; 770 } 771 772 pcetable[offset] = 0; 773 pattern_.PCE_ = pcetable; 774 pattern_.PCELength_ = offset; 775 776 return result; 777 } 778 779 // TODO: This method only triggers initializePatternCETable(), which is probably no 780 // longer needed. initializePattern()781 private int initializePattern() { 782 // Since the strength is primary, accents are ignored in the pattern. 783 784 // *** Boyer-Moore *** 785 /* 786 if (strength_ == Collator.PRIMARY) { 787 pattern_.hasPrefixAccents_ = false; 788 pattern_.hasSuffixAccents_ = false; 789 } else { 790 pattern_.hasPrefixAccents_ = (getFCD(pattern_.text_, 0) >>> SECOND_LAST_BYTE_SHIFT_) != 0; 791 pattern_.hasSuffixAccents_ = (getFCD(pattern_.text_.codePointBefore(pattern_.text_.length())) & LAST_BYTE_MASK_) != 0; 792 } 793 */ 794 795 pattern_.PCE_ = null; 796 797 // since intializePattern is an internal method status is a success. 798 return initializePatternCETable(); 799 } 800 801 // *** Boyer-Moore *** 802 /* 803 private final void setShiftTable(char shift[], 804 char backshift[], 805 int cetable[], int cesize, 806 int expansionsize, 807 int defaultforward, 808 int defaultbackward) { 809 // No implementation 810 } 811 */ 812 813 // TODO: This method only triggers initializePattern(), which is probably no 814 // longer needed. initialize()815 private void initialize() { 816 /* int expandlength = */ initializePattern(); 817 818 // *** Boyer-Moore *** 819 /* 820 if (pattern_.CELength_ > 0) { 821 int cesize = pattern_.CELength_; 822 int minlength = cesize > expandlength ? cesize - expandlength : 1; 823 pattern_.defaultShiftSize_ = minlength; 824 setShiftTable(pattern_.shift_, pattern_.backShift_, pattern_.CE_, cesize, 825 expandlength, minlength, minlength); 826 return; 827 } 828 return pattern_.defaultShiftSize_; 829 */ 830 } 831 832 /** 833 * @internal 834 * @deprecated This API is ICU internal only. 835 */ 836 @Deprecated setMatchNotFound()837 protected void setMatchNotFound() { 838 super.setMatchNotFound(); 839 // SearchIterator#setMatchNotFound() does following: 840 // search_.matchedIndex_ = DONE; 841 // search_.setMatchedLength(0); 842 if (search_.isForwardSearching_) { 843 textIter_.setOffset(search_.text().getEndIndex()); 844 } else { 845 textIter_.setOffset(0); 846 } 847 } 848 849 /** 850 * Checks if the offset runs out of the text string range 851 * @param textstart offset of the first character in the range 852 * @param textlimit limit offset of the text string range 853 * @param offset to test 854 * @return true if offset is out of bounds, false otherwise 855 */ isOutOfBounds(int textstart, int textlimit, int offset)856 private static final boolean isOutOfBounds(int textstart, int textlimit, int offset) { 857 return offset < textstart || offset > textlimit; 858 } 859 860 /** 861 * Checks for identical match 862 * @param start offset of possible match 863 * @param end offset of possible match 864 * @return TRUE if identical match is found 865 */ checkIdentical(int start, int end)866 private boolean checkIdentical(int start, int end) { 867 if (strength_ != Collator.IDENTICAL) { 868 return true; 869 } 870 // Note: We could use Normalizer::compare() or similar, but for short strings 871 // which may not be in FCD it might be faster to just NFD them. 872 String textstr = getString(targetText, start, end - start); 873 if (Normalizer.quickCheck(textstr, Normalizer.NFD, 0) == Normalizer.NO) { 874 textstr = Normalizer.decompose(textstr, false); 875 } 876 String patternstr = pattern_.text_; 877 if (Normalizer.quickCheck(patternstr, Normalizer.NFD, 0) == Normalizer.NO) { 878 patternstr = Normalizer.decompose(patternstr, false); 879 } 880 return textstr.equals(patternstr); 881 } 882 initTextProcessedIter()883 private boolean initTextProcessedIter() { 884 if (textProcessedIter_ == null) { 885 textProcessedIter_ = new CollationPCE(textIter_); 886 } else { 887 textProcessedIter_.init(textIter_); 888 } 889 return true; 890 } 891 892 /* 893 * Find the next break boundary after startIndex. If the UStringSearch object 894 * has an external break iterator, use that. Otherwise use the internal character 895 * break iterator. 896 */ nextBoundaryAfter(int startIndex)897 private int nextBoundaryAfter(int startIndex) { 898 BreakIterator breakiterator = search_.breakIter(); 899 900 if (breakiterator == null) { 901 breakiterator = search_.internalBreakIter_; 902 } 903 904 if (breakiterator != null) { 905 return breakiterator.following(startIndex); 906 } 907 908 return startIndex; 909 } 910 911 /* 912 * Returns TRUE if index is on a break boundary. If the UStringSearch 913 * has an external break iterator, test using that, otherwise test 914 * using the internal character break iterator. 915 */ isBreakBoundary(int index)916 private boolean isBreakBoundary(int index) { 917 BreakIterator breakiterator = search_.breakIter(); 918 919 if (breakiterator == null) { 920 breakiterator = search_.internalBreakIter_; 921 } 922 923 return (breakiterator != null && breakiterator.isBoundary(index)); 924 } 925 926 927 // Java porting note: Followings are corresponding to UCompareCEsResult enum 928 private static final int CE_MATCH = -1; 929 private static final int CE_NO_MATCH = 0; 930 private static final int CE_SKIP_TARG = 1; 931 private static final int CE_SKIP_PATN = 2; 932 933 private static int CE_LEVEL2_BASE = 0x00000005; 934 private static int CE_LEVEL3_BASE = 0x00050000; 935 compareCE64s(long targCE, long patCE, ElementComparisonType compareType)936 private static int compareCE64s(long targCE, long patCE, ElementComparisonType compareType) { 937 if (targCE == patCE) { 938 return CE_MATCH; 939 } 940 if (compareType == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { 941 return CE_NO_MATCH; 942 } 943 944 long targCEshifted = targCE >>> 32; 945 long patCEshifted = patCE >>> 32; 946 long mask; 947 948 mask = 0xFFFF0000L; 949 int targLev1 = (int)(targCEshifted & mask); 950 int patLev1 = (int)(patCEshifted & mask); 951 if (targLev1 != patLev1) { 952 if (targLev1 == 0) { 953 return CE_SKIP_TARG; 954 } 955 if (patLev1 == 0 956 && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) { 957 return CE_SKIP_PATN; 958 } 959 return CE_NO_MATCH; 960 } 961 962 mask = 0x0000FFFFL; 963 int targLev2 = (int)(targCEshifted & mask); 964 int patLev2 = (int)(patCEshifted & mask); 965 if (targLev2 != patLev2) { 966 if (targLev2 == 0) { 967 return CE_SKIP_TARG; 968 } 969 if (patLev2 == 0 970 && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) { 971 return CE_SKIP_PATN; 972 } 973 return (patLev2 == CE_LEVEL2_BASE || 974 (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD && 975 targLev2 == CE_LEVEL2_BASE)) ? CE_MATCH : CE_NO_MATCH; 976 } 977 978 mask = 0xFFFF0000L; 979 int targLev3 = (int)(targCE & mask); 980 int patLev3 = (int)(patCE & mask); 981 if (targLev3 != patLev3) { 982 return (patLev3 == CE_LEVEL3_BASE || 983 (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD && 984 targLev3 == CE_LEVEL3_BASE) )? CE_MATCH: CE_NO_MATCH; 985 } 986 987 return CE_MATCH; 988 } 989 990 /** 991 * An object used for receiving matched index in search() and 992 * searchBackwards(). 993 */ 994 private static class Match { 995 int start_ = -1; 996 int limit_ = -1; 997 } 998 search(int startIdx, Match m)999 private boolean search(int startIdx, Match m) { 1000 // Input parameter sanity check. 1001 if (pattern_.CELength_ == 0 1002 || startIdx < search_.beginIndex() 1003 || startIdx > search_.endIndex()) { 1004 throw new IllegalArgumentException("search(" + startIdx + ", m) - expected position to be between " + 1005 search_.beginIndex() + " and " + search_.endIndex()); 1006 } 1007 1008 if (pattern_.PCE_ == null) { 1009 initializePatternPCETable(); 1010 } 1011 1012 textIter_.setOffset(startIdx); 1013 CEBuffer ceb = new CEBuffer(this); 1014 1015 int targetIx = 0; 1016 CEI targetCEI = null; 1017 int patIx; 1018 boolean found; 1019 1020 int mStart = -1; 1021 int mLimit = -1; 1022 int minLimit; 1023 int maxLimit; 1024 1025 // Outer loop moves over match starting positions in the 1026 // target CE space. 1027 // Here we see the target as a sequence of collation elements, resulting from the following: 1028 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied 1029 // (for example, digraphs such as IJ may be broken into two characters). 1030 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next 1031 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these 1032 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t 1033 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary), 1034 // then the CE is deleted, so the following code sees only CEs that are relevant. 1035 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. 1036 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text 1037 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). 1038 for (targetIx = 0; ; targetIx++) { 1039 found = true; 1040 // Inner loop checks for a match beginning at each 1041 // position from the outer loop. 1042 int targetIxOffset = 0; 1043 long patCE = 0; 1044 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer 1045 // (compared to the last CE fetched for the previous targetIx value) as we need to go 1046 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK. 1047 CEI firstCEI = ceb.get(targetIx); 1048 if (firstCEI == null) { 1049 throw new ICUException("CEBuffer.get(" + targetIx + ") returned null."); 1050 } 1051 1052 for (patIx = 0; patIx < pattern_.PCELength_; patIx++) { 1053 patCE = pattern_.PCE_[patIx]; 1054 targetCEI = ceb.get(targetIx + patIx + targetIxOffset); 1055 // Compare CE from target string with CE from the pattern. 1056 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, 1057 // which will fail the compare, below. 1058 int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_); 1059 if (ceMatch == CE_NO_MATCH) { 1060 found = false; 1061 break; 1062 } else if (ceMatch > CE_NO_MATCH) { 1063 if (ceMatch == CE_SKIP_TARG) { 1064 // redo with same patCE, next targCE 1065 patIx--; 1066 targetIxOffset++; 1067 } else { // ceMatch == CE_SKIP_PATN 1068 // redo with same targCE, next patCE 1069 targetIxOffset--; 1070 } 1071 } 1072 } 1073 targetIxOffset += pattern_.PCELength_; // this is now the offset in target CE space to end of the match so far 1074 1075 if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) { 1076 // No match at this targetIx. Try again at the next. 1077 continue; 1078 } 1079 1080 if (!found) { 1081 // No match at all, we have run off the end of the target text. 1082 break; 1083 } 1084 1085 // We have found a match in CE space. 1086 // Now determine the bounds in string index space. 1087 // There still is a chance of match failure if the CE range not correspond to 1088 // an acceptable character range. 1089 // 1090 CEI lastCEI = ceb.get(targetIx + targetIxOffset -1); 1091 1092 mStart = firstCEI.lowIndex_; 1093 minLimit = lastCEI.lowIndex_; 1094 1095 // Look at the CE following the match. If it is UCOL_NULLORDER the match 1096 // extended to the end of input, and the match is good. 1097 1098 // Look at the high and low indices of the CE following the match. If 1099 // they are the same it means one of two things: 1100 // 1. The match extended to the last CE from the target text, which is OK, or 1101 // 2. The last CE that was part of the match is in an expansion that extends 1102 // to the first CE after the match. In this case, we reject the match. 1103 CEI nextCEI = null; 1104 if (search_.elementComparisonType_ == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { 1105 nextCEI = ceb.get(targetIx + targetIxOffset); 1106 maxLimit = nextCEI.lowIndex_; 1107 if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) { 1108 found = false; 1109 } 1110 } else { 1111 for (;; ++targetIxOffset) { 1112 nextCEI = ceb.get(targetIx + targetIxOffset); 1113 maxLimit = nextCEI.lowIndex_; 1114 // If we are at the end of the target too, match succeeds 1115 if (nextCEI.ce_ == CollationPCE.PROCESSED_NULLORDER) { 1116 break; 1117 } 1118 // As long as the next CE has primary weight of 0, 1119 // it is part of the last target element matched by the pattern; 1120 // make sure it can be part of a match with the last patCE 1121 if ((((nextCEI.ce_) >>> 32) & 0xFFFF0000L) == 0) { 1122 int ceMatch = compareCE64s(nextCEI.ce_, patCE, search_.elementComparisonType_); 1123 if (ceMatch == CE_NO_MATCH || ceMatch == CE_SKIP_PATN ) { 1124 found = false; 1125 break; 1126 } 1127 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched 1128 // target element, but it has non-zero primary weight => match fails 1129 } else if ( nextCEI.lowIndex_ == nextCEI.highIndex_ ) { 1130 found = false; 1131 break; 1132 // Else the target CE is not part of an expansion of the last matched element, match succeeds 1133 } else { 1134 break; 1135 } 1136 } 1137 } 1138 1139 // Check for the start of the match being within a combining sequence. 1140 // This can happen if the pattern itself begins with a combining char, and 1141 // the match found combining marks in the target text that were attached 1142 // to something else. 1143 // This type of match should be rejected for not completely consuming a 1144 // combining sequence. 1145 if (!isBreakBoundary(mStart)) { 1146 found = false; 1147 } 1148 1149 // Check for the start of the match being within an Collation Element Expansion, 1150 // meaning that the first char of the match is only partially matched. 1151 // With expansions, the first CE will report the index of the source 1152 // character, and all subsequent (expansions) CEs will report the source index of the 1153 // _following_ character. 1154 int secondIx = firstCEI.highIndex_; 1155 if (mStart == secondIx) { 1156 found = false; 1157 } 1158 1159 // Advance the match end position to the first acceptable match boundary. 1160 // This advances the index over any combining characters. 1161 mLimit = maxLimit; 1162 if (minLimit < maxLimit) { 1163 // When the last CE's low index is same with its high index, the CE is likely 1164 // a part of expansion. In this case, the index is located just after the 1165 // character corresponding to the CEs compared above. If the index is right 1166 // at the break boundary, move the position to the next boundary will result 1167 // incorrect match length when there are ignorable characters exist between 1168 // the position and the next character produces CE(s). See ticket#8482. 1169 if (minLimit == lastCEI.highIndex_ && isBreakBoundary(minLimit)) { 1170 mLimit = minLimit; 1171 } else { 1172 int nba = nextBoundaryAfter(minLimit); 1173 if (nba >= lastCEI.highIndex_) { 1174 mLimit = nba; 1175 } 1176 } 1177 } 1178 1179 // If advancing to the end of a combining sequence in character indexing space 1180 // advanced us beyond the end of the match in CE space, reject this match. 1181 if (mLimit > maxLimit) { 1182 found = false; 1183 } 1184 1185 if (!isBreakBoundary(mLimit)) { 1186 found = false; 1187 } 1188 1189 if (!checkIdentical(mStart, mLimit)) { 1190 found = false; 1191 } 1192 1193 if (found) { 1194 break; 1195 } 1196 } 1197 1198 // All Done. Store back the match bounds to the caller. 1199 // 1200 if (found == false) { 1201 mLimit = -1; 1202 mStart = -1; 1203 } 1204 1205 if (m != null) { 1206 m.start_ = mStart; 1207 m.limit_ = mLimit; 1208 } 1209 1210 return found; 1211 } 1212 searchBackwards(int startIdx, Match m)1213 private boolean searchBackwards(int startIdx, Match m) { 1214 //ICU4C_TODO comment: reject search patterns beginning with a combining char. 1215 1216 // Input parameter sanity check. 1217 if (pattern_.CELength_ == 0 1218 || startIdx < search_.beginIndex() 1219 || startIdx > search_.endIndex()) { 1220 throw new IllegalArgumentException("searchBackwards(" + startIdx + ", m) - expected position to be between " + 1221 search_.beginIndex() + " and " + search_.endIndex()); 1222 } 1223 1224 if (pattern_.PCE_ == null) { 1225 initializePatternPCETable(); 1226 } 1227 1228 CEBuffer ceb = new CEBuffer(this); 1229 int targetIx = 0; 1230 1231 /* 1232 * Pre-load the buffer with the CE's for the grapheme 1233 * after our starting position so that we're sure that 1234 * we can look at the CE following the match when we 1235 * check the match boundaries. 1236 * 1237 * This will also pre-fetch the first CE that we'll 1238 * consider for the match. 1239 */ 1240 if (startIdx < search_.endIndex()) { 1241 BreakIterator bi = search_.internalBreakIter_; 1242 int next = bi.following(startIdx); 1243 1244 textIter_.setOffset(next); 1245 1246 for (targetIx = 0; ; targetIx++) { 1247 if (ceb.getPrevious(targetIx).lowIndex_ < startIdx) { 1248 break; 1249 } 1250 } 1251 } else { 1252 textIter_.setOffset(startIdx); 1253 } 1254 1255 CEI targetCEI = null; 1256 int patIx; 1257 boolean found; 1258 1259 int limitIx = targetIx; 1260 int mStart = -1; 1261 int mLimit = -1; 1262 int minLimit; 1263 int maxLimit; 1264 1265 // Outer loop moves over match starting positions in the 1266 // target CE space. 1267 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). 1268 // But patIx is 0 at the beginning of the pattern and increases toward the end. 1269 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern 1270 // and the beginning of the base text. 1271 for (targetIx = limitIx; ; targetIx++) { 1272 found = true; 1273 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer 1274 // (compared to the last CE fetched for the previous targetIx value) as we need to go 1275 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK. 1276 CEI lastCEI = ceb.getPrevious(targetIx); 1277 if (lastCEI == null) { 1278 throw new ICUException("CEBuffer.getPrevious(" + targetIx + ") returned null."); 1279 } 1280 // Inner loop checks for a match beginning at each 1281 // position from the outer loop. 1282 int targetIxOffset = 0; 1283 for (patIx = pattern_.PCELength_ - 1; patIx >= 0; patIx--) { 1284 long patCE = pattern_.PCE_[patIx]; 1285 1286 targetCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 - patIx + targetIxOffset); 1287 // Compare CE from target string with CE from the pattern. 1288 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 1289 // which will fail the compare, below. 1290 int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_); 1291 if (ceMatch == CE_NO_MATCH) { 1292 found = false; 1293 break; 1294 } else if (ceMatch > CE_NO_MATCH) { 1295 if (ceMatch == CE_SKIP_TARG) { 1296 // redo with same patCE, next targCE 1297 patIx++; 1298 targetIxOffset++; 1299 } else { // ceMatch == CE_SKIP_PATN 1300 // redo with same targCE, next patCE 1301 targetIxOffset--; 1302 } 1303 } 1304 } 1305 1306 if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) { 1307 // No match at this targetIx. Try again at the next. 1308 continue; 1309 } 1310 1311 if (!found) { 1312 // No match at all, we have run off the end of the target text. 1313 break; 1314 } 1315 1316 // We have found a match in CE space. 1317 // Now determine the bounds in string index space. 1318 // There still is a chance of match failure if the CE range not correspond to 1319 // an acceptable character range. 1320 // 1321 CEI firstCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 + targetIxOffset); 1322 mStart = firstCEI.lowIndex_; 1323 1324 // Check for the start of the match being within a combining sequence. 1325 // This can happen if the pattern itself begins with a combining char, and 1326 // the match found combining marks in the target text that were attached 1327 // to something else. 1328 // This type of match should be rejected for not completely consuming a 1329 // combining sequence. 1330 if (!isBreakBoundary(mStart)) { 1331 found = false; 1332 } 1333 1334 // Look at the high index of the first CE in the match. If it's the same as the 1335 // low index, the first CE in the match is in the middle of an expansion. 1336 if (mStart == firstCEI.highIndex_) { 1337 found = false; 1338 } 1339 1340 minLimit = lastCEI.lowIndex_; 1341 1342 if (targetIx > 0) { 1343 // Look at the CE following the match. If it is UCOL_NULLORDER the match 1344 // extended to the end of input, and the match is good. 1345 1346 // Look at the high and low indices of the CE following the match. If 1347 // they are the same it means one of two things: 1348 // 1. The match extended to the last CE from the target text, which is OK, or 1349 // 2. The last CE that was part of the match is in an expansion that extends 1350 // to the first CE after the match. In this case, we reject the match. 1351 CEI nextCEI = ceb.getPrevious(targetIx - 1); 1352 1353 if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) { 1354 found = false; 1355 } 1356 1357 mLimit = maxLimit = nextCEI.lowIndex_; 1358 1359 // Advance the match end position to the first acceptable match boundary. 1360 // This advances the index over any combining charcters. 1361 if (minLimit < maxLimit) { 1362 int nba = nextBoundaryAfter(minLimit); 1363 1364 if (nba >= lastCEI.highIndex_) { 1365 mLimit = nba; 1366 } 1367 } 1368 1369 // If advancing to the end of a combining sequence in character indexing space 1370 // advanced us beyond the end of the match in CE space, reject this match. 1371 if (mLimit > maxLimit) { 1372 found = false; 1373 } 1374 1375 // Make sure the end of the match is on a break boundary 1376 if (!isBreakBoundary(mLimit)) { 1377 found = false; 1378 } 1379 1380 } else { 1381 // No non-ignorable CEs after this point. 1382 // The maximum position is detected by boundary after 1383 // the last non-ignorable CE. Combining sequence 1384 // across the start index will be truncated. 1385 int nba = nextBoundaryAfter(minLimit); 1386 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; 1387 } 1388 1389 if (!checkIdentical(mStart, mLimit)) { 1390 found = false; 1391 } 1392 1393 if (found) { 1394 break; 1395 } 1396 } 1397 1398 // All Done. Store back the match bounds to the caller. 1399 // 1400 if (found == false) { 1401 mLimit = -1; 1402 mStart = -1; 1403 } 1404 1405 if (m != null) { 1406 m.start_ = mStart; 1407 m.limit_ = mLimit; 1408 } 1409 1410 return found; 1411 } 1412 1413 // Java porting note: 1414 // 1415 // ICU4C usearch_handleNextExact() is identical to usearch_handleNextCanonical() 1416 // for the linear search implementation. The differences are addressed in search(). 1417 // handleNextExact()1418 private boolean handleNextExact() { 1419 return handleNextCommonImpl(); 1420 } 1421 handleNextCanonical()1422 private boolean handleNextCanonical() { 1423 return handleNextCommonImpl(); 1424 } 1425 handleNextCommonImpl()1426 private boolean handleNextCommonImpl() { 1427 int textOffset = textIter_.getOffset(); 1428 Match match = new Match(); 1429 1430 if (search(textOffset, match)) { 1431 search_.matchedIndex_ = match.start_; 1432 search_.setMatchedLength(match.limit_ - match.start_); 1433 return true; 1434 } else { 1435 setMatchNotFound(); 1436 return false; 1437 } 1438 } 1439 1440 // Java porting note: 1441 // 1442 // ICU4C usearch_handlePreviousExact() is identical to usearch_handlePreviousCanonical() 1443 // for the linear search implementation. The differences are addressed in searchBackwards(). 1444 // handlePreviousExact()1445 private boolean handlePreviousExact() { 1446 return handlePreviousCommonImpl(); 1447 } 1448 handlePreviousCanonical()1449 private boolean handlePreviousCanonical() { 1450 return handlePreviousCommonImpl(); 1451 } 1452 handlePreviousCommonImpl()1453 private boolean handlePreviousCommonImpl() { 1454 int textOffset; 1455 1456 if (search_.isOverlap_) { 1457 if (search_.matchedIndex_ != DONE) { 1458 textOffset = search_.matchedIndex_ + search_.matchedLength() - 1; 1459 } else { 1460 // move the start position at the end of possible match 1461 initializePatternPCETable(); 1462 if (!initTextProcessedIter()) { 1463 setMatchNotFound(); 1464 return false; 1465 } 1466 for (int nPCEs = 0; nPCEs < pattern_.PCELength_ - 1; nPCEs++) { 1467 long pce = textProcessedIter_.nextProcessed(null); 1468 if (pce == CollationPCE.PROCESSED_NULLORDER) { 1469 // at the end of the text 1470 break; 1471 } 1472 } 1473 textOffset = textIter_.getOffset(); 1474 } 1475 } else { 1476 textOffset = textIter_.getOffset(); 1477 } 1478 1479 Match match = new Match(); 1480 if (searchBackwards(textOffset, match)) { 1481 search_.matchedIndex_ = match.start_; 1482 search_.setMatchedLength(match.limit_ - match.start_); 1483 return true; 1484 } else { 1485 setMatchNotFound(); 1486 return false; 1487 } 1488 } 1489 1490 /** 1491 * Gets a substring out of a CharacterIterator 1492 * 1493 * Java porting note: Not available in ICU4C 1494 * 1495 * @param text CharacterIterator 1496 * @param start start offset 1497 * @param length of substring 1498 * @return substring from text starting at start and length length 1499 */ getString(CharacterIterator text, int start, int length)1500 private static final String getString(CharacterIterator text, int start, int length) { 1501 StringBuilder result = new StringBuilder(length); 1502 int offset = text.getIndex(); 1503 text.setIndex(start); 1504 for (int i = 0; i < length; i++) { 1505 result.append(text.current()); 1506 text.next(); 1507 } 1508 text.setIndex(offset); 1509 return result.toString(); 1510 } 1511 1512 /** 1513 * Java port of ICU4C struct UPattern (usrchimp.h) 1514 */ 1515 private static final class Pattern { 1516 /** Pattern string */ 1517 String text_; 1518 1519 long[] PCE_; 1520 int PCELength_ = 0; 1521 1522 // TODO: We probably do not need CE_ / CELength_ 1523 @SuppressWarnings("unused") 1524 int[] CE_; 1525 int CELength_ = 0; 1526 1527 // *** Boyer-Moore *** 1528 // boolean hasPrefixAccents_ = false; 1529 // boolean hasSuffixAccents_ = false; 1530 // int defaultShiftSize_; 1531 // char[] shift_; 1532 // char[] backShift_; 1533 Pattern(String pattern)1534 protected Pattern(String pattern) { 1535 text_ = pattern; 1536 } 1537 } 1538 1539 /** 1540 * Java port of ICU4C UCollationPCE (usrchimp.h) 1541 */ 1542 private static class CollationPCE { 1543 public static final long PROCESSED_NULLORDER = -1; 1544 1545 private static final int DEFAULT_BUFFER_SIZE = 16; 1546 private static final int BUFFER_GROW = 8; 1547 1548 // Note: PRIMARYORDERMASK is also duplicated in StringSearch class 1549 private static final int PRIMARYORDERMASK = 0xffff0000; 1550 private static final int CONTINUATION_MARKER = 0xc0; 1551 1552 private PCEBuffer pceBuffer_ = new PCEBuffer(); 1553 private CollationElementIterator cei_; 1554 private int strength_; 1555 private boolean toShift_; 1556 private boolean isShifted_; 1557 private int variableTop_; 1558 CollationPCE(CollationElementIterator iter)1559 public CollationPCE(CollationElementIterator iter) { 1560 init(iter); 1561 } 1562 init(CollationElementIterator iter)1563 public void init(CollationElementIterator iter) { 1564 cei_ = iter; 1565 init(iter.getRuleBasedCollator()); 1566 } 1567 init(RuleBasedCollator coll)1568 private void init(RuleBasedCollator coll) { 1569 strength_ = coll.getStrength(); 1570 toShift_ = coll.isAlternateHandlingShifted(); 1571 isShifted_ = false; 1572 variableTop_ = coll.getVariableTop(); 1573 } 1574 1575 @SuppressWarnings("fallthrough") processCE(int ce)1576 private long processCE(int ce) { 1577 long primary = 0, secondary = 0, tertiary = 0, quaternary = 0; 1578 1579 // This is clean, but somewhat slow... 1580 // We could apply the mask to ce and then 1581 // just get all three orders... 1582 switch (strength_) { 1583 default: 1584 tertiary = CollationElementIterator.tertiaryOrder(ce); 1585 /* note fall-through */ 1586 1587 case Collator.SECONDARY: 1588 secondary = CollationElementIterator.secondaryOrder(ce); 1589 /* note fall-through */ 1590 1591 case Collator.PRIMARY: 1592 primary = CollationElementIterator.primaryOrder(ce); 1593 } 1594 1595 // **** This should probably handle continuations too. **** 1596 // **** That means that we need 24 bits for the primary **** 1597 // **** instead of the 16 that we're currently using. **** 1598 // **** So we can lay out the 64 bits as: 24.12.12.16. **** 1599 // **** Another complication with continuations is that **** 1600 // **** the *second* CE is marked as a continuation, so **** 1601 // **** we always have to peek ahead to know how long **** 1602 // **** the primary is... **** 1603 if ((toShift_ && variableTop_ > ce && primary != 0) || (isShifted_ && primary == 0)) { 1604 1605 if (primary == 0) { 1606 return CollationElementIterator.IGNORABLE; 1607 } 1608 1609 if (strength_ >= Collator.QUATERNARY) { 1610 quaternary = primary; 1611 } 1612 1613 primary = secondary = tertiary = 0; 1614 isShifted_ = true; 1615 } else { 1616 if (strength_ >= Collator.QUATERNARY) { 1617 quaternary = 0xFFFF; 1618 } 1619 1620 isShifted_ = false; 1621 } 1622 1623 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; 1624 } 1625 1626 /** 1627 * Get the processed ordering priority of the next collation element in the text. 1628 * A single character may contain more than one collation element. 1629 * 1630 * Note: This is equivalent to 1631 * UCollationPCE::nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); 1632 * 1633 * @param range receiving the iterator index before/after fetching the CE. 1634 * @return The next collation elements ordering, otherwise returns PROCESSED_NULLORDER 1635 * if an error has occurred or if the end of string has been reached 1636 */ nextProcessed(Range range)1637 public long nextProcessed(Range range) { 1638 long result = CollationElementIterator.IGNORABLE; 1639 int low = 0, high = 0; 1640 1641 pceBuffer_.reset(); 1642 1643 do { 1644 low = cei_.getOffset(); 1645 int ce = cei_.next(); 1646 high = cei_.getOffset(); 1647 1648 if (ce == CollationElementIterator.NULLORDER) { 1649 result = PROCESSED_NULLORDER; 1650 break; 1651 } 1652 1653 result = processCE(ce); 1654 } while (result == CollationElementIterator.IGNORABLE); 1655 1656 if (range != null) { 1657 range.ixLow_ = low; 1658 range.ixHigh_ = high; 1659 } 1660 1661 return result; 1662 } 1663 1664 /** 1665 * Get the processed ordering priority of the previous collation element in the text. 1666 * A single character may contain more than one collation element. 1667 * 1668 * Note: This is equivalent to 1669 * UCollationPCE::previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); 1670 * 1671 * @param range receiving the iterator index before/after fetching the CE. 1672 * @return The previous collation elements ordering, otherwise returns 1673 * PROCESSED_NULLORDER if an error has occurred or if the start of 1674 * string has been reached. 1675 */ previousProcessed(Range range)1676 public long previousProcessed(Range range) { 1677 long result = CollationElementIterator.IGNORABLE; 1678 int low = 0, high = 0; 1679 1680 // pceBuffer_.reset(); 1681 1682 while (pceBuffer_.empty()) { 1683 // buffer raw CEs up to non-ignorable primary 1684 RCEBuffer rceb = new RCEBuffer(); 1685 int ce; 1686 1687 boolean finish = false; 1688 1689 // **** do we need to reset rceb, or will it always be empty at this point **** 1690 do { 1691 high = cei_.getOffset(); 1692 ce = cei_.previous(); 1693 low = cei_.getOffset(); 1694 1695 if (ce == CollationElementIterator.NULLORDER) { 1696 if (!rceb.empty()) { 1697 break; 1698 } 1699 1700 finish = true; 1701 break; 1702 } 1703 1704 rceb.put(ce, low, high); 1705 } while ((ce & PRIMARYORDERMASK) == 0 || isContinuation(ce)); 1706 1707 if (finish) { 1708 break; 1709 } 1710 1711 // process the raw CEs 1712 while (!rceb.empty()) { 1713 RCEI rcei = rceb.get(); 1714 1715 result = processCE(rcei.ce_); 1716 1717 if (result != CollationElementIterator.IGNORABLE) { 1718 pceBuffer_.put(result, rcei.low_, rcei.high_); 1719 } 1720 } 1721 } 1722 1723 if (pceBuffer_.empty()) { 1724 // **** Is -1 the right value for ixLow, ixHigh? **** 1725 if (range != null) { 1726 range.ixLow_ = -1; 1727 range.ixHigh_ = -1; 1728 } 1729 return CollationElementIterator.NULLORDER; 1730 } 1731 1732 PCEI pcei = pceBuffer_.get(); 1733 1734 if (range != null) { 1735 range.ixLow_ = pcei.low_; 1736 range.ixHigh_ = pcei.high_; 1737 } 1738 1739 return pcei.ce_; 1740 } 1741 isContinuation(int ce)1742 private static boolean isContinuation(int ce) { 1743 return ((ce & CONTINUATION_MARKER) == CONTINUATION_MARKER); 1744 } 1745 1746 public static final class Range { 1747 int ixLow_; 1748 int ixHigh_; 1749 } 1750 1751 /** Processed collation element buffer stuff ported from ICU4C ucoleitr.cpp */ 1752 private static final class PCEI { 1753 long ce_; 1754 int low_; 1755 int high_; 1756 } 1757 1758 private static final class PCEBuffer { 1759 private PCEI[] buffer_ = new PCEI[DEFAULT_BUFFER_SIZE]; 1760 private int bufferIndex_ = 0; 1761 reset()1762 void reset() { 1763 bufferIndex_ = 0; 1764 } 1765 empty()1766 boolean empty() { 1767 return bufferIndex_ <= 0; 1768 } 1769 put(long ce, int ixLow, int ixHigh)1770 void put(long ce, int ixLow, int ixHigh) 1771 { 1772 if (bufferIndex_ >= buffer_.length) { 1773 PCEI[] newBuffer = new PCEI[buffer_.length + BUFFER_GROW]; 1774 System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length); 1775 buffer_ = newBuffer; 1776 } 1777 buffer_[bufferIndex_] = new PCEI(); 1778 buffer_[bufferIndex_].ce_ = ce; 1779 buffer_[bufferIndex_].low_ = ixLow; 1780 buffer_[bufferIndex_].high_ = ixHigh; 1781 1782 bufferIndex_ += 1; 1783 } 1784 get()1785 PCEI get() { 1786 if (bufferIndex_ > 0) { 1787 return buffer_[--bufferIndex_]; 1788 } 1789 return null; 1790 } 1791 } 1792 1793 /** Raw collation element buffer stuff ported from ICU4C ucoleitr.cpp */ 1794 private static final class RCEI { 1795 int ce_; 1796 int low_; 1797 int high_; 1798 } 1799 1800 private static final class RCEBuffer { 1801 private RCEI[] buffer_ = new RCEI[DEFAULT_BUFFER_SIZE]; 1802 private int bufferIndex_ = 0; 1803 empty()1804 boolean empty() { 1805 return bufferIndex_ <= 0; 1806 } 1807 put(int ce, int ixLow, int ixHigh)1808 void put(int ce, int ixLow, int ixHigh) { 1809 if (bufferIndex_ >= buffer_.length) { 1810 RCEI[] newBuffer = new RCEI[buffer_.length + BUFFER_GROW]; 1811 System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length); 1812 buffer_ = newBuffer; 1813 } 1814 buffer_[bufferIndex_] = new RCEI(); 1815 buffer_[bufferIndex_].ce_ = ce; 1816 buffer_[bufferIndex_].low_ = ixLow; 1817 buffer_[bufferIndex_].high_ = ixHigh; 1818 1819 bufferIndex_ += 1; 1820 } 1821 get()1822 RCEI get() { 1823 if (bufferIndex_ > 0) { 1824 return buffer_[--bufferIndex_]; 1825 } 1826 return null; 1827 } 1828 } 1829 } 1830 1831 /** 1832 * Java port of ICU4C CEI (usearch.cpp) 1833 * 1834 * CEI Collation Element + source text index. 1835 * These structs are kept in the circular buffer. 1836 */ 1837 private static class CEI { 1838 long ce_; 1839 int lowIndex_; 1840 int highIndex_; 1841 } 1842 1843 /** 1844 * CEBuffer A circular buffer of CEs from the text being searched 1845 */ 1846 private static class CEBuffer { 1847 // Java porting note: ICU4C uses the size for stack buffer 1848 // static final int DEFAULT_CEBUFFER_SIZE = 96; 1849 1850 static final int CEBUFFER_EXTRA = 32; 1851 static final int MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L = 8; 1852 static final int MAX_TARGET_IGNORABLES_PER_PAT_OTHER = 3; 1853 1854 CEI[] buf_; 1855 int bufSize_; 1856 int firstIx_; 1857 int limitIx_; 1858 1859 // Java porting note: No references in ICU4C implementation 1860 // CollationElementIterator ceIter_; 1861 1862 StringSearch strSearch_; 1863 CEBuffer(StringSearch ss)1864 CEBuffer(StringSearch ss) { 1865 strSearch_ = ss; 1866 bufSize_ = ss.pattern_.PCELength_ + CEBUFFER_EXTRA; 1867 if (ss.search_.elementComparisonType_ != ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { 1868 String patText = ss.pattern_.text_; 1869 if (patText != null) { 1870 for (int i = 0; i < patText.length(); i++) { 1871 char c = patText.charAt(i); 1872 if (MIGHT_BE_JAMO_L(c)) { 1873 bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L; 1874 } else { 1875 // No check for surrogates, we might allocate slightly more buffer than necessary. 1876 bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_OTHER; 1877 } 1878 } 1879 } 1880 } 1881 1882 // Not used - see above 1883 // ceIter_ = ss.textIter_; 1884 1885 firstIx_ = 0; 1886 limitIx_ = 0; 1887 1888 if (!ss.initTextProcessedIter()) { 1889 return; 1890 } 1891 1892 buf_ = new CEI[bufSize_]; 1893 } 1894 1895 // Get the CE with the specified index. 1896 // Index must be in the range 1897 // n-history_size < index < n+1 1898 // where n is the largest index to have been fetched by some previous call to this function. 1899 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 1900 // get(int index)1901 CEI get(int index) { 1902 int i = index % bufSize_; 1903 1904 if (index >= firstIx_ && index < limitIx_) { 1905 // The request was for an entry already in our buffer. 1906 // Just return it. 1907 return buf_[i]; 1908 } 1909 1910 // Caller is requesting a new, never accessed before, CE. 1911 // Verify that it is the next one in sequence, which is all 1912 // that is allowed. 1913 if (index != limitIx_) { 1914 assert(false); 1915 return null; 1916 } 1917 1918 // Manage the circular CE buffer indexing 1919 limitIx_++; 1920 1921 if (limitIx_ - firstIx_ >= bufSize_) { 1922 // The buffer is full, knock out the lowest-indexed entry. 1923 firstIx_++; 1924 } 1925 1926 CollationPCE.Range range = new CollationPCE.Range(); 1927 if (buf_[i] == null) { 1928 buf_[i] = new CEI(); 1929 } 1930 buf_[i].ce_ = strSearch_.textProcessedIter_.nextProcessed(range); 1931 buf_[i].lowIndex_ = range.ixLow_; 1932 buf_[i].highIndex_ = range.ixHigh_; 1933 1934 return buf_[i]; 1935 } 1936 1937 // Get the CE with the specified index. 1938 // Index must be in the range 1939 // n-history_size < index < n+1 1940 // where n is the largest index to have been fetched by some previous call to this function. 1941 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 1942 // getPrevious(int index)1943 CEI getPrevious(int index) { 1944 int i = index % bufSize_; 1945 1946 if (index >= firstIx_ && index < limitIx_) { 1947 // The request was for an entry already in our buffer. 1948 // Just return it. 1949 return buf_[i]; 1950 } 1951 1952 // Caller is requesting a new, never accessed before, CE. 1953 // Verify that it is the next one in sequence, which is all 1954 // that is allowed. 1955 if (index != limitIx_) { 1956 assert(false); 1957 return null; 1958 } 1959 1960 // Manage the circular CE buffer indexing 1961 limitIx_++; 1962 1963 if (limitIx_ - firstIx_ >= bufSize_) { 1964 // The buffer is full, knock out the lowest-indexed entry. 1965 firstIx_++; 1966 } 1967 1968 CollationPCE.Range range = new CollationPCE.Range(); 1969 if (buf_[i] == null) { 1970 buf_[i] = new CEI(); 1971 } 1972 buf_[i].ce_ = strSearch_.textProcessedIter_.previousProcessed(range); 1973 buf_[i].lowIndex_ = range.ixLow_; 1974 buf_[i].highIndex_ = range.ixHigh_; 1975 1976 return buf_[i]; 1977 } 1978 MIGHT_BE_JAMO_L(char c)1979 static boolean MIGHT_BE_JAMO_L(char c) { 1980 return (c >= 0x1100 && c <= 0x115E) 1981 || (c >= 0x3131 && c <= 0x314E) 1982 || (c >= 0x3165 && c <= 0x3186); 1983 } 1984 } 1985 } 1986