1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2009-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 */ 11 12 package com.ibm.icu.impl; 13 14 import java.util.ArrayList; 15 16 import com.ibm.icu.text.UnicodeSet; 17 import com.ibm.icu.text.UnicodeSet.SpanCondition; 18 import com.ibm.icu.util.OutputInt; 19 20 /* 21 * Implement span() etc. for a set with strings. 22 * Avoid recursion because of its exponential complexity. 23 * Instead, try multiple paths at once and track them with an IndexList. 24 */ 25 public class UnicodeSetStringSpan { 26 27 /* 28 * Which span() variant will be used? The object is either built for one variant and used once, 29 * or built for all and may be used many times. 30 */ 31 public static final int WITH_COUNT = 0x40; // spanAndCount() may be called 32 public static final int FWD = 0x20; 33 public static final int BACK = 0x10; 34 // public static final int UTF16 = 8; 35 public static final int CONTAINED = 2; 36 public static final int NOT_CONTAINED = 1; 37 38 public static final int ALL = 0x7f; 39 40 public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED; 41 public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED; 42 public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED; 43 public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED; 44 45 /** 46 * Special spanLength short values. (since Java has not unsigned byte type) 47 * All code points in the string are contained in the parent set. 48 */ 49 static final short ALL_CP_CONTAINED = 0xff; 50 /** The spanLength is >=0xfe. */ 51 static final short LONG_SPAN = ALL_CP_CONTAINED - 1; 52 53 /** Set for span(). Same as parent but without strings. */ 54 private UnicodeSet spanSet; 55 56 /** 57 * Set for span(not contained). 58 * Same as spanSet, plus characters that start or end strings. 59 */ 60 private UnicodeSet spanNotSet; 61 62 /** The strings of the parent set. */ 63 private ArrayList<String> strings; 64 65 /** The lengths of span(), spanBack() etc. for each string. */ 66 private short[] spanLengths; 67 68 /** Maximum lengths of relevant strings. */ 69 private final int maxLength16; 70 71 /** Are there strings that are not fully contained in the code point set? */ 72 private boolean someRelevant; 73 74 /** Set up for all variants of span()? */ 75 private boolean all; 76 77 /** Span helper */ 78 private OffsetList offsets; 79 80 /** 81 * Constructs for all variants of span(), or only for any one variant. 82 * Initializes as little as possible, for single use. 83 */ UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which)84 public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) { 85 spanSet = new UnicodeSet(0, 0x10ffff); 86 // TODO: With Java 6, just take the parent set's strings as is, 87 // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings. 88 // Then iterate via the first() and higher() methods. 89 // (We do not want to create multiple Iterator objects in each span().) 90 // See ICU ticket #7454. 91 strings = setStrings; 92 all = (which == ALL); 93 spanSet.retainAll(set); 94 if (0 != (which & NOT_CONTAINED)) { 95 // Default to the same sets. 96 // addToSpanNotSet() will create a separate set if necessary. 97 spanNotSet = spanSet; 98 } 99 offsets = new OffsetList(); 100 101 // Determine if the strings even need to be taken into account at all for span() etc. 102 // If any string is relevant, then all strings need to be used for 103 // span(longest match) but only the relevant ones for span(while contained). 104 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 105 // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 106 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 107 // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 108 int stringsLength = strings.size(); 109 110 int i, spanLength; 111 int maxLength16 = 0; 112 someRelevant = false; 113 for (i = 0; i < stringsLength; ++i) { 114 String string = strings.get(i); 115 int length16 = string.length(); 116 spanLength = spanSet.span(string, SpanCondition.CONTAINED); 117 if (spanLength < length16) { // Relevant string. 118 someRelevant = true; 119 } 120 if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) { 121 maxLength16 = length16; 122 } 123 } 124 this.maxLength16 = maxLength16; 125 if (!someRelevant && (which & WITH_COUNT) == 0) { 126 return; 127 } 128 129 // Freeze after checking for the need to use strings at all because freezing 130 // a set takes some time and memory which are wasted if there are no relevant strings. 131 if (all) { 132 spanSet.freeze(); 133 } 134 135 int spanBackLengthsOffset; 136 137 // Allocate a block of meta data. 138 int allocSize; 139 if (all) { 140 // 2 sets of span lengths 141 allocSize = stringsLength * (2); 142 } else { 143 allocSize = stringsLength; // One set of span lengths. 144 } 145 spanLengths = new short[allocSize]; 146 147 if (all) { 148 // Store span lengths for all span() variants. 149 spanBackLengthsOffset = stringsLength; 150 } else { 151 // Store span lengths for only one span() variant. 152 spanBackLengthsOffset = 0; 153 } 154 155 // Set the meta data and spanNotSet and write the UTF-8 strings. 156 157 for (i = 0; i < stringsLength; ++i) { 158 String string = strings.get(i); 159 int length16 = string.length(); 160 spanLength = spanSet.span(string, SpanCondition.CONTAINED); 161 if (spanLength < length16) { // Relevant string. 162 if (true /* 0 != (which & UTF16) */) { 163 if (0 != (which & CONTAINED)) { 164 if (0 != (which & FWD)) { 165 spanLengths[i] = makeSpanLengthByte(spanLength); 166 } 167 if (0 != (which & BACK)) { 168 spanLength = length16 169 - spanSet.spanBack(string, length16, SpanCondition.CONTAINED); 170 spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength); 171 } 172 } else /* not CONTAINED, not all, but NOT_CONTAINED */{ 173 spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant 174 // flag. 175 } 176 } 177 if (0 != (which & NOT_CONTAINED)) { 178 // Add string start and end code points to the spanNotSet so that 179 // a span(while not contained) stops before any string. 180 int c; 181 if (0 != (which & FWD)) { 182 c = string.codePointAt(0); 183 addToSpanNotSet(c); 184 } 185 if (0 != (which & BACK)) { 186 c = string.codePointBefore(length16); 187 addToSpanNotSet(c); 188 } 189 } 190 } else { // Irrelevant string. 191 if (all) { 192 spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED; 193 } else { 194 // All spanXYZLengths pointers contain the same address. 195 spanLengths[i] = ALL_CP_CONTAINED; 196 } 197 } 198 } 199 200 // Finish. 201 if (all) { 202 spanNotSet.freeze(); 203 } 204 } 205 206 /** 207 * Constructs a copy of an existing UnicodeSetStringSpan. 208 * Assumes which==ALL for a frozen set. 209 */ UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, final ArrayList<String> newParentSetStrings)210 public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, 211 final ArrayList<String> newParentSetStrings) { 212 spanSet = otherStringSpan.spanSet; 213 strings = newParentSetStrings; 214 maxLength16 = otherStringSpan.maxLength16; 215 someRelevant = otherStringSpan.someRelevant; 216 all = true; 217 if (Utility.sameObjects(otherStringSpan.spanNotSet, otherStringSpan.spanSet)) { 218 spanNotSet = spanSet; 219 } else { 220 spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone(); 221 } 222 offsets = new OffsetList(); 223 224 spanLengths = otherStringSpan.spanLengths.clone(); 225 } 226 227 /** 228 * Do the strings need to be checked in span() etc.? 229 * 230 * @return true if strings need to be checked (call span() here), 231 * false if not (use a BMPSet for best performance). 232 */ needsStringSpanUTF16()233 public boolean needsStringSpanUTF16() { 234 return someRelevant; 235 } 236 237 /** For fast UnicodeSet::contains(c). */ contains(int c)238 public boolean contains(int c) { 239 return spanSet.contains(c); 240 } 241 242 /** 243 * Adds a starting or ending string character to the spanNotSet 244 * so that a character span ends before any string. 245 */ addToSpanNotSet(int c)246 private void addToSpanNotSet(int c) { 247 if (Utility.sameObjects(spanNotSet, null) || Utility.sameObjects(spanNotSet, spanSet)) { 248 if (spanSet.contains(c)) { 249 return; // Nothing to do. 250 } 251 spanNotSet = spanSet.cloneAsThawed(); 252 } 253 spanNotSet.add(c); 254 } 255 256 /* 257 * Note: In span() when spanLength==0 258 * (after a string match, or at the beginning after an empty code point span) 259 * and in spanNot() and spanNotUTF8(), 260 * string matching could use a binary search because all string matches are done 261 * from the same start index. 262 * 263 * For UTF-8, this would require a comparison function that returns UTF-16 order. 264 * 265 * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets 266 * with strings have very few very short strings. For cases with many strings, it might be better to use a different 267 * API and implementation with a DFA (state machine). 268 */ 269 270 /* 271 * Algorithm for span(SpanCondition.CONTAINED) 272 * 273 * Theoretical algorithm: 274 * - Iterate through the string, and at each code point boundary: 275 * + If the code point there is in the set, then remember to continue after it. 276 * + If a set string matches at the current position, then remember to continue after it. 277 * + Either recursively span for each code point or string match, or recursively span 278 * for all but the shortest one and iteratively continue the span with the shortest local match. 279 * + Remember the longest recursive span (the farthest end point). 280 * + If there is no match at the current position, 281 * neither for the code point there nor for any set string, 282 * then stop and return the longest recursive span length. 283 * 284 * Optimized implementation: 285 * 286 * (We assume that most sets will have very few very short strings. 287 * A span using a string-less set is extremely fast.) 288 * 289 * Create and cache a spanSet which contains all of the single code points of the original set 290 * but none of its strings. 291 * 292 * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). 293 * - Loop: 294 * + Try to match each set string at the end of the spanLength. 295 * ~ Set strings that start with set-contained code points 296 * must be matched with a partial overlap 297 * because the recursive algorithm would have tried to match them at every position. 298 * ~ Set strings that entirely consist of set-contained code points 299 * are irrelevant for span(SpanCondition.CONTAINED) 300 * because the recursive algorithm would continue after them anyway and 301 * find the longest recursive match from their end. 302 * ~ Rather than recursing, note each end point of a set string match. 303 * + If no set string matched after spanSet.span(), 304 * then return with where the spanSet.span() ended. 305 * + If at least one set string matched after spanSet.span(), 306 * then pop the shortest string match end point and continue the loop, 307 * trying to match all set strings from there. 308 * + If at least one more set string matched after a previous string match, then test if the 309 * code point after the previous string match is also contained in the set. 310 * Continue the loop with the shortest end point of 311 * either this code point or a matching set string. 312 * + If no more set string matched after a previous string match, 313 * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). 314 * Stop if spanLength==0, otherwise continue the loop. 315 * 316 * By noting each end point of a set string match, the function visits each string position at most once and 317 * finishes in linear time. 318 * 319 * The recursive algorithm may visit the same string position many times 320 * if multiple paths lead to it and finishes in exponential time. 321 */ 322 323 /* 324 * Algorithm for span(SIMPLE) 325 * 326 * Theoretical algorithm: 327 * - Iterate through the string, and at each code point boundary: 328 * + If the code point there is in the set, then remember to continue after it. 329 * + If a set string matches at the current position, then remember to continue after it. 330 * + Continue from the farthest match position and ignore all others. 331 * + If there is no match at the current position, then stop and return the current position. 332 * 333 * Optimized implementation: 334 * 335 * (Same assumption and spanSet as above.) 336 * 337 * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). 338 * - Loop: 339 * + Try to match each set string at the end of the spanLength. 340 * ~ Set strings that start with set-contained code points 341 * must be matched with a partial overlap 342 * because the standard algorithm would have tried to match them earlier. 343 * ~ Set strings that entirely consist of set-contained code points 344 * must be matched with a full overlap because the longest-match algorithm 345 * would hide set string matches that end earlier. 346 * Such set strings need not be matched earlier inside the code point span 347 * because the standard algorithm would then have 348 * continued after the set string match anyway. 349 * ~ Remember the longest set string match (farthest end point) 350 * from the earliest starting point. 351 * + If no set string matched after spanSet.span(), 352 * then return with where the spanSet.span() ended. 353 * + If at least one set string matched, 354 * then continue the loop after the longest match from the earliest position. 355 * + If no more set string matched after a previous string match, 356 * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). 357 * Stop if spanLength==0, otherwise continue the loop. 358 */ 359 /** 360 * Spans a string. 361 * 362 * @param s The string to be spanned 363 * @param start The start index that the span begins 364 * @param spanCondition The span condition 365 * @return the limit (exclusive end) of the span 366 */ span(CharSequence s, int start, SpanCondition spanCondition)367 public int span(CharSequence s, int start, SpanCondition spanCondition) { 368 if (spanCondition == SpanCondition.NOT_CONTAINED) { 369 return spanNot(s, start, null); 370 } 371 int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); 372 if (spanLimit == s.length()) { 373 return spanLimit; 374 } 375 return spanWithStrings(s, start, spanLimit, spanCondition); 376 } 377 378 /** 379 * Synchronized method for complicated spans using the offsets. 380 * Avoids synchronization for simple cases. 381 * 382 * @param spanLimit = spanSet.span(s, start, CONTAINED) 383 */ spanWithStrings(CharSequence s, int start, int spanLimit, SpanCondition spanCondition)384 private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, 385 SpanCondition spanCondition) { 386 // Consider strings; they may overlap with the span. 387 int initSize = 0; 388 if (spanCondition == SpanCondition.CONTAINED) { 389 // Use offset list to try all possibilities. 390 initSize = maxLength16; 391 } 392 offsets.setMaxLength(initSize); 393 int length = s.length(); 394 int pos = spanLimit, rest = length - spanLimit; 395 int spanLength = spanLimit - start; 396 int i, stringsLength = strings.size(); 397 for (;;) { 398 if (spanCondition == SpanCondition.CONTAINED) { 399 for (i = 0; i < stringsLength; ++i) { 400 int overlap = spanLengths[i]; 401 if (overlap == ALL_CP_CONTAINED) { 402 continue; // Irrelevant string. 403 } 404 String string = strings.get(i); 405 406 int length16 = string.length(); 407 408 // Try to match this string at pos-overlap..pos. 409 if (overlap >= LONG_SPAN) { 410 overlap = length16; 411 // While contained: No point matching fully inside the code point span. 412 overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code 413 // point. 414 } 415 if (overlap > spanLength) { 416 overlap = spanLength; 417 } 418 int inc = length16 - overlap; // Keep overlap+inc==length16. 419 for (;;) { 420 if (inc > rest) { 421 break; 422 } 423 // Try to match if the increment is not listed already. 424 if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) { 425 if (inc == rest) { 426 return length; // Reached the end of the string. 427 } 428 offsets.addOffset(inc); 429 } 430 if (overlap == 0) { 431 break; 432 } 433 --overlap; 434 ++inc; 435 } 436 } 437 } else /* SIMPLE */{ 438 int maxInc = 0, maxOverlap = 0; 439 for (i = 0; i < stringsLength; ++i) { 440 int overlap = spanLengths[i]; 441 // For longest match, we do need to try to match even an all-contained string 442 // to find the match from the earliest start. 443 444 String string = strings.get(i); 445 446 int length16 = string.length(); 447 448 // Try to match this string at pos-overlap..pos. 449 if (overlap >= LONG_SPAN) { 450 overlap = length16; 451 // Longest match: Need to match fully inside the code point span 452 // to find the match from the earliest start. 453 } 454 if (overlap > spanLength) { 455 overlap = spanLength; 456 } 457 int inc = length16 - overlap; // Keep overlap+inc==length16. 458 for (;;) { 459 if (inc > rest || overlap < maxOverlap) { 460 break; 461 } 462 // Try to match if the string is longer or starts earlier. 463 if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc) 464 && matches16CPB(s, pos - overlap, length, string, length16)) { 465 maxInc = inc; // Longest match from earliest start. 466 maxOverlap = overlap; 467 break; 468 } 469 --overlap; 470 ++inc; 471 } 472 } 473 474 if (maxInc != 0 || maxOverlap != 0) { 475 // Longest-match algorithm, and there was a string match. 476 // Simply continue after it. 477 pos += maxInc; 478 rest -= maxInc; 479 if (rest == 0) { 480 return length; // Reached the end of the string. 481 } 482 spanLength = 0; // Match strings from after a string match. 483 continue; 484 } 485 } 486 // Finished trying to match all strings at pos. 487 488 if (spanLength != 0 || pos == 0) { 489 // The position is after an unlimited code point span (spanLength!=0), 490 // not after a string match. 491 // The only position where spanLength==0 after a span is pos==0. 492 // Otherwise, an unlimited code point span is only tried again when no 493 // strings match, and if such a non-initial span fails we stop. 494 if (offsets.isEmpty()) { 495 return pos; // No strings matched after a span. 496 } 497 // Match strings from after the next string match. 498 } else { 499 // The position is after a string match (or a single code point). 500 if (offsets.isEmpty()) { 501 // No more strings matched after a previous string match. 502 // Try another code point span from after the last string match. 503 spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); 504 spanLength = spanLimit - pos; 505 if (spanLength == rest || // Reached the end of the string, or 506 spanLength == 0 // neither strings nor span progressed. 507 ) { 508 return spanLimit; 509 } 510 pos += spanLength; 511 rest -= spanLength; 512 continue; // spanLength>0: Match strings from after a span. 513 } else { 514 // Try to match only one code point from after a string match if some 515 // string matched beyond it, so that we try all possible positions 516 // and don't overshoot. 517 spanLength = spanOne(spanSet, s, pos, rest); 518 if (spanLength > 0) { 519 if (spanLength == rest) { 520 return length; // Reached the end of the string. 521 } 522 // Match strings after this code point. 523 // There cannot be any increments below it because UnicodeSet strings 524 // contain multiple code points. 525 pos += spanLength; 526 rest -= spanLength; 527 offsets.shift(spanLength); 528 spanLength = 0; 529 continue; // Match strings from after a single code point. 530 } 531 // Match strings from after the next string match. 532 } 533 } 534 int minOffset = offsets.popMinimum(null); 535 pos += minOffset; 536 rest -= minOffset; 537 spanLength = 0; // Match strings from after a string match. 538 } 539 } 540 541 /** 542 * Spans a string and counts the smallest number of set elements on any path across the span. 543 * 544 * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans. 545 * 546 * <p>If the set does not have any fully-contained strings, then we could optimize this 547 * like span(), but such sets are likely rare, and this is at least still linear. 548 * 549 * @param s The string to be spanned 550 * @param start The start index that the span begins 551 * @param spanCondition The span condition 552 * @param outCount The count 553 * @return the limit (exclusive end) of the span 554 */ spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)555 public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, 556 OutputInt outCount) { 557 if (spanCondition == SpanCondition.NOT_CONTAINED) { 558 return spanNot(s, start, outCount); 559 } 560 // Consider strings; they may overlap with the span, 561 // and they may result in a smaller count that with just code points. 562 if (spanCondition == SpanCondition.CONTAINED) { 563 return spanContainedAndCount(s, start, outCount); 564 } 565 // SIMPLE (not synchronized, does not use offsets) 566 int stringsLength = strings.size(); 567 int length = s.length(); 568 int pos = start; 569 int rest = length - start; 570 int count = 0; 571 while (rest != 0) { 572 // Try to match the next code point. 573 int cpLength = spanOne(spanSet, s, pos, rest); 574 int maxInc = (cpLength > 0) ? cpLength : 0; 575 // Try to match all of the strings. 576 for (int i = 0; i < stringsLength; ++i) { 577 String string = strings.get(i); 578 int length16 = string.length(); 579 if (maxInc < length16 && length16 <= rest && 580 matches16CPB(s, pos, length, string, length16)) { 581 maxInc = length16; 582 } 583 } 584 // We are done if there is no match beyond pos. 585 if (maxInc == 0) { 586 outCount.value = count; 587 return pos; 588 } 589 // Continue from the longest match. 590 ++count; 591 pos += maxInc; 592 rest -= maxInc; 593 } 594 outCount.value = count; 595 return pos; 596 } 597 spanContainedAndCount(CharSequence s, int start, OutputInt outCount)598 private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { 599 // Use offset list to try all possibilities. 600 offsets.setMaxLength(maxLength16); 601 int stringsLength = strings.size(); 602 int length = s.length(); 603 int pos = start; 604 int rest = length - start; 605 int count = 0; 606 while (rest != 0) { 607 // Try to match the next code point. 608 int cpLength = spanOne(spanSet, s, pos, rest); 609 if (cpLength > 0) { 610 offsets.addOffsetAndCount(cpLength, count + 1); 611 } 612 // Try to match all of the strings. 613 for (int i = 0; i < stringsLength; ++i) { 614 String string = strings.get(i); 615 int length16 = string.length(); 616 // Note: If the strings were sorted by length, then we could also 617 // avoid trying to match if there is already a match of the same length. 618 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && 619 matches16CPB(s, pos, length, string, length16)) { 620 offsets.addOffsetAndCount(length16, count + 1); 621 } 622 } 623 // We are done if there is no match beyond pos. 624 if (offsets.isEmpty()) { 625 outCount.value = count; 626 return pos; 627 } 628 // Continue from the nearest match. 629 int minOffset = offsets.popMinimum(outCount); 630 count = outCount.value; 631 pos += minOffset; 632 rest -= minOffset; 633 } 634 outCount.value = count; 635 return pos; 636 } 637 638 /** 639 * Span a string backwards. 640 * 641 * @param s The string to be spanned 642 * @param spanCondition The span condition 643 * @return The string index which starts the span (i.e. inclusive). 644 */ spanBack(CharSequence s, int length, SpanCondition spanCondition)645 public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) { 646 if (spanCondition == SpanCondition.NOT_CONTAINED) { 647 return spanNotBack(s, length); 648 } 649 int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); 650 if (pos == 0) { 651 return 0; 652 } 653 int spanLength = length - pos; 654 655 // Consider strings; they may overlap with the span. 656 int initSize = 0; 657 if (spanCondition == SpanCondition.CONTAINED) { 658 // Use offset list to try all possibilities. 659 initSize = maxLength16; 660 } 661 offsets.setMaxLength(initSize); 662 int i, stringsLength = strings.size(); 663 int spanBackLengthsOffset = 0; 664 if (all) { 665 spanBackLengthsOffset = stringsLength; 666 } 667 for (;;) { 668 if (spanCondition == SpanCondition.CONTAINED) { 669 for (i = 0; i < stringsLength; ++i) { 670 int overlap = spanLengths[spanBackLengthsOffset + i]; 671 if (overlap == ALL_CP_CONTAINED) { 672 continue; // Irrelevant string. 673 } 674 String string = strings.get(i); 675 676 int length16 = string.length(); 677 678 // Try to match this string at pos-(length16-overlap)..pos-length16. 679 if (overlap >= LONG_SPAN) { 680 overlap = length16; 681 // While contained: No point matching fully inside the code point span. 682 int len1 = 0; 683 len1 = string.offsetByCodePoints(0, 1); 684 overlap -= len1; // Length of the string minus the first code point. 685 } 686 if (overlap > spanLength) { 687 overlap = spanLength; 688 } 689 int dec = length16 - overlap; // Keep dec+overlap==length16. 690 for (;;) { 691 if (dec > pos) { 692 break; 693 } 694 // Try to match if the decrement is not listed already. 695 if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) { 696 if (dec == pos) { 697 return 0; // Reached the start of the string. 698 } 699 offsets.addOffset(dec); 700 } 701 if (overlap == 0) { 702 break; 703 } 704 --overlap; 705 ++dec; 706 } 707 } 708 } else /* SIMPLE */{ 709 int maxDec = 0, maxOverlap = 0; 710 for (i = 0; i < stringsLength; ++i) { 711 int overlap = spanLengths[spanBackLengthsOffset + i]; 712 // For longest match, we do need to try to match even an all-contained string 713 // to find the match from the latest end. 714 715 String string = strings.get(i); 716 717 int length16 = string.length(); 718 719 // Try to match this string at pos-(length16-overlap)..pos-length16. 720 if (overlap >= LONG_SPAN) { 721 overlap = length16; 722 // Longest match: Need to match fully inside the code point span 723 // to find the match from the latest end. 724 } 725 if (overlap > spanLength) { 726 overlap = spanLength; 727 } 728 int dec = length16 - overlap; // Keep dec+overlap==length16. 729 for (;;) { 730 if (dec > pos || overlap < maxOverlap) { 731 break; 732 } 733 // Try to match if the string is longer or ends later. 734 if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec) 735 && matches16CPB(s, pos - dec, length, string, length16)) { 736 maxDec = dec; // Longest match from latest end. 737 maxOverlap = overlap; 738 break; 739 } 740 --overlap; 741 ++dec; 742 } 743 } 744 745 if (maxDec != 0 || maxOverlap != 0) { 746 // Longest-match algorithm, and there was a string match. 747 // Simply continue before it. 748 pos -= maxDec; 749 if (pos == 0) { 750 return 0; // Reached the start of the string. 751 } 752 spanLength = 0; // Match strings from before a string match. 753 continue; 754 } 755 } 756 // Finished trying to match all strings at pos. 757 758 if (spanLength != 0 || pos == length) { 759 // The position is before an unlimited code point span (spanLength!=0), 760 // not before a string match. 761 // The only position where spanLength==0 before a span is pos==length. 762 // Otherwise, an unlimited code point span is only tried again when no 763 // strings match, and if such a non-initial span fails we stop. 764 if (offsets.isEmpty()) { 765 return pos; // No strings matched before a span. 766 } 767 // Match strings from before the next string match. 768 } else { 769 // The position is before a string match (or a single code point). 770 if (offsets.isEmpty()) { 771 // No more strings matched before a previous string match. 772 // Try another code point span from before the last string match. 773 int oldPos = pos; 774 pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED); 775 spanLength = oldPos - pos; 776 if (pos == 0 || // Reached the start of the string, or 777 spanLength == 0 // neither strings nor span progressed. 778 ) { 779 return pos; 780 } 781 continue; // spanLength>0: Match strings from before a span. 782 } else { 783 // Try to match only one code point from before a string match if some 784 // string matched beyond it, so that we try all possible positions 785 // and don't overshoot. 786 spanLength = spanOneBack(spanSet, s, pos); 787 if (spanLength > 0) { 788 if (spanLength == pos) { 789 return 0; // Reached the start of the string. 790 } 791 // Match strings before this code point. 792 // There cannot be any decrements below it because UnicodeSet strings 793 // contain multiple code points. 794 pos -= spanLength; 795 offsets.shift(spanLength); 796 spanLength = 0; 797 continue; // Match strings from before a single code point. 798 } 799 // Match strings from before the next string match. 800 } 801 } 802 pos -= offsets.popMinimum(null); 803 spanLength = 0; // Match strings from before a string match. 804 } 805 } 806 807 /** 808 * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED) 809 * 810 * Theoretical algorithm: 811 * - Iterate through the string, and at each code point boundary: 812 * + If the code point there is in the set, then return with the current position. 813 * + If a set string matches at the current position, then return with the current position. 814 * 815 * Optimized implementation: 816 * 817 * (Same assumption as for span() above.) 818 * 819 * Create and cache a spanNotSet which contains 820 * all of the single code points of the original set but none of its strings. 821 * For each set string add its initial code point to the spanNotSet. 822 * (Also add its final code point for spanNotBack().) 823 * 824 * - Loop: 825 * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). 826 * + If the current code point is in the original set, then return the current position. 827 * + If any set string matches at the current position, then return the current position. 828 * + If there is no match at the current position, neither for the code point 829 * there nor for any set string, then skip this code point and continue the loop. 830 * This happens for set-string-initial code points that were added to spanNotSet 831 * when there is not actually a match for such a set string. 832 * 833 * @param s The string to be spanned 834 * @param start The start index that the span begins 835 * @param outCount If not null: Receives the number of code points across the span. 836 * @return the limit (exclusive end) of the span 837 */ spanNot(CharSequence s, int start, OutputInt outCount)838 private int spanNot(CharSequence s, int start, OutputInt outCount) { 839 int length = s.length(); 840 int pos = start, rest = length - start; 841 int stringsLength = strings.size(); 842 int count = 0; 843 do { 844 // Span until we find a code point from the set, 845 // or a code point that starts or ends some string. 846 int spanLimit; 847 if (outCount == null) { 848 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); 849 } else { 850 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); 851 outCount.value = count = count + outCount.value; 852 } 853 if (spanLimit == length) { 854 return length; // Reached the end of the string. 855 } 856 pos = spanLimit; 857 rest = length - spanLimit; 858 859 // Check whether the current code point is in the original set, 860 // without the string starts and ends. 861 int cpLength = spanOne(spanSet, s, pos, rest); 862 if (cpLength > 0) { 863 return pos; // There is a set element at pos. 864 } 865 866 // Try to match the strings at pos. 867 for (int i = 0; i < stringsLength; ++i) { 868 if (spanLengths[i] == ALL_CP_CONTAINED) { 869 continue; // Irrelevant string. 870 } 871 String string = strings.get(i); 872 873 int length16 = string.length(); 874 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { 875 return pos; // There is a set element at pos. 876 } 877 } 878 879 // The span(while not contained) ended on a string start/end which is 880 // not in the original set. Skip this code point and continue. 881 // cpLength<0 882 pos -= cpLength; 883 rest += cpLength; 884 ++count; 885 } while (rest != 0); 886 if (outCount != null) { 887 outCount.value = count; 888 } 889 return length; // Reached the end of the string. 890 } 891 spanNotBack(CharSequence s, int length)892 private int spanNotBack(CharSequence s, int length) { 893 int pos = length; 894 int i, stringsLength = strings.size(); 895 do { 896 // Span until we find a code point from the set, 897 // or a code point that starts or ends some string. 898 pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); 899 if (pos == 0) { 900 return 0; // Reached the start of the string. 901 } 902 903 // Check whether the current code point is in the original set, 904 // without the string starts and ends. 905 int cpLength = spanOneBack(spanSet, s, pos); 906 if (cpLength > 0) { 907 return pos; // There is a set element at pos. 908 } 909 910 // Try to match the strings at pos. 911 for (i = 0; i < stringsLength; ++i) { 912 // Use spanLengths rather than a spanLengths pointer because 913 // it is easier and we only need to know whether the string is irrelevant 914 // which is the same in either array. 915 if (spanLengths[i] == ALL_CP_CONTAINED) { 916 continue; // Irrelevant string. 917 } 918 String string = strings.get(i); 919 920 int length16 = string.length(); 921 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { 922 return pos; // There is a set element at pos. 923 } 924 } 925 926 // The span(while not contained) ended on a string start/end which is 927 // not in the original set. Skip this code point and continue. 928 // cpLength<0 929 pos += cpLength; 930 } while (pos != 0); 931 return 0; // Reached the start of the string. 932 } 933 makeSpanLengthByte(int spanLength)934 static short makeSpanLengthByte(int spanLength) { 935 // 0xfe==UnicodeSetStringSpan::LONG_SPAN 936 return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; 937 } 938 939 // Compare strings without any argument checks. Requires length>0. matches16(CharSequence s, int start, final String t, int length)940 private static boolean matches16(CharSequence s, int start, final String t, int length) { 941 int end = start + length; 942 while (length-- > 0) { 943 if (s.charAt(--end) != t.charAt(length)) { 944 return false; 945 } 946 } 947 return true; 948 } 949 950 /** 951 * Compare 16-bit Unicode strings (which may be malformed UTF-16) 952 * at code point boundaries. 953 * That is, each edge of a match must not be in the middle of a surrogate pair. 954 * @param s The string to match in. 955 * @param start The start index of s. 956 * @param limit The limit of the subsequence of s being spanned. 957 * @param t The substring to be matched in s. 958 * @param tlength The length of t. 959 */ matches16CPB(CharSequence s, int start, int limit, final String t, int tlength)960 static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { 961 return matches16(s, start, t, tlength) 962 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && 963 Character.isLowSurrogate(s.charAt(start))) 964 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && 965 Character.isLowSurrogate(s.charAt(start + tlength))); 966 } 967 968 /** 969 * Does the set contain the next code point? 970 * If so, return its length; otherwise return its negative length. 971 */ spanOne(final UnicodeSet set, CharSequence s, int start, int length)972 static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) { 973 char c = s.charAt(start); 974 if (c >= 0xd800 && c <= 0xdbff && length >= 2) { 975 char c2 = s.charAt(start + 1); 976 if (com.ibm.icu.text.UTF16.isTrailSurrogate(c2)) { 977 int supplementary = Character.toCodePoint(c, c2); 978 return set.contains(supplementary) ? 2 : -2; 979 } 980 } 981 return set.contains(c) ? 1 : -1; 982 } 983 spanOneBack(final UnicodeSet set, CharSequence s, int length)984 static int spanOneBack(final UnicodeSet set, CharSequence s, int length) { 985 char c = s.charAt(length - 1); 986 if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { 987 char c2 = s.charAt(length - 2); 988 if (com.ibm.icu.text.UTF16.isLeadSurrogate(c2)) { 989 int supplementary = Character.toCodePoint(c2, c); 990 return set.contains(supplementary) ? 2 : -2; 991 } 992 } 993 return set.contains(c) ? 1 : -1; 994 } 995 996 /** 997 * Helper class for UnicodeSetStringSpan. 998 * 999 * <p>List of offsets from the current position from where to try matching 1000 * a code point or a string. 1001 * Stores offsets rather than indexes to simplify the code and use the same list 1002 * for both increments (in span()) and decrements (in spanBack()). 1003 * 1004 * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time 1005 * are relatively dense, that is, 1006 * there are normally no gaps of hundreds or thousands of offset values. 1007 * 1008 * <p>This class optionally also tracks the minimum non-negative count for each position, 1009 * intended to count the smallest number of elements of any path leading to that position. 1010 * 1011 * <p>The implementation uses a circular buffer of count integers, 1012 * each indicating whether the corresponding offset is in the list, 1013 * and its path element count. 1014 * This avoids inserting into a sorted list of offsets (or absolute indexes) 1015 * and physically moving part of the list. 1016 * 1017 * <p>Note: In principle, the caller should setMaxLength() to 1018 * the maximum of the max string length and U16_LENGTH/U8_LENGTH 1019 * to account for "long" single code points. 1020 * 1021 * <p>Note: An earlier version did not track counts and stored only byte flags. 1022 * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64, 1023 * the list could be stored as bit flags in a single integer. 1024 * Rather than handling a circular buffer with a start list index, 1025 * the integer would simply be shifted when lower offsets are removed. 1026 * UnicodeSet does not have a limit on the lengths of strings. 1027 */ 1028 private static final class OffsetList { 1029 private int[] list; 1030 private int length; 1031 private int start; 1032 OffsetList()1033 public OffsetList() { 1034 list = new int[16]; // default size 1035 } 1036 setMaxLength(int maxLength)1037 public void setMaxLength(int maxLength) { 1038 if (maxLength > list.length) { 1039 list = new int[maxLength]; 1040 } 1041 clear(); 1042 } 1043 clear()1044 public void clear() { 1045 for (int i = list.length; i-- > 0;) { 1046 list[i] = 0; 1047 } 1048 start = length = 0; 1049 } 1050 isEmpty()1051 public boolean isEmpty() { 1052 return (length == 0); 1053 } 1054 1055 /** 1056 * Reduces all stored offsets by delta, used when the current position moves by delta. 1057 * There must not be any offsets lower than delta. 1058 * If there is an offset equal to delta, it is removed. 1059 * 1060 * @param delta [1..maxLength] 1061 */ shift(int delta)1062 public void shift(int delta) { 1063 int i = start + delta; 1064 if (i >= list.length) { 1065 i -= list.length; 1066 } 1067 if (list[i] != 0) { 1068 list[i] = 0; 1069 --length; 1070 } 1071 start = i; 1072 } 1073 1074 /** 1075 * Adds an offset. The list must not contain it yet. 1076 * @param offset [1..maxLength] 1077 */ addOffset(int offset)1078 public void addOffset(int offset) { 1079 int i = start + offset; 1080 if (i >= list.length) { 1081 i -= list.length; 1082 } 1083 assert list[i] == 0; 1084 list[i] = 1; 1085 ++length; 1086 } 1087 1088 /** 1089 * Adds an offset and updates its count. 1090 * The list may already contain the offset. 1091 * @param offset [1..maxLength] 1092 */ addOffsetAndCount(int offset, int count)1093 public void addOffsetAndCount(int offset, int count) { 1094 assert count > 0; 1095 int i = start + offset; 1096 if (i >= list.length) { 1097 i -= list.length; 1098 } 1099 if (list[i] == 0) { 1100 list[i] = count; 1101 ++length; 1102 } else if (count < list[i]) { 1103 list[i] = count; 1104 } 1105 } 1106 1107 /** 1108 * @param offset [1..maxLength] 1109 */ containsOffset(int offset)1110 public boolean containsOffset(int offset) { 1111 int i = start + offset; 1112 if (i >= list.length) { 1113 i -= list.length; 1114 } 1115 return list[i] != 0; 1116 } 1117 1118 /** 1119 * @param offset [1..maxLength] 1120 */ hasCountAtOffset(int offset, int count)1121 public boolean hasCountAtOffset(int offset, int count) { 1122 int i = start + offset; 1123 if (i >= list.length) { 1124 i -= list.length; 1125 } 1126 int oldCount = list[i]; 1127 return oldCount != 0 && oldCount <= count; 1128 } 1129 1130 /** 1131 * Finds the lowest stored offset from a non-empty list, removes it, 1132 * and reduces all other offsets by this minimum. 1133 * @return min=[1..maxLength] 1134 */ popMinimum(OutputInt outCount)1135 public int popMinimum(OutputInt outCount) { 1136 // Look for the next offset in list[start+1..list.length-1]. 1137 int i = start, result; 1138 while (++i < list.length) { 1139 int count = list[i]; 1140 if (count != 0) { 1141 list[i] = 0; 1142 --length; 1143 result = i - start; 1144 start = i; 1145 if (outCount != null) { outCount.value = count; } 1146 return result; 1147 } 1148 } 1149 // i==list.length 1150 1151 // Wrap around and look for the next offset in list[0..start]. 1152 // Since the list is not empty, there will be one. 1153 result = list.length - start; 1154 i = 0; 1155 int count; 1156 while ((count = list[i]) == 0) { 1157 ++i; 1158 } 1159 list[i] = 0; 1160 --length; 1161 start = i; 1162 if (outCount != null) { outCount.value = count; } 1163 return result + i; 1164 } 1165 } 1166 } 1167