1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  ******************************************************************************
5  *
6  *   Copyright (C) 2009-2015, International Business Machines
7  *   Corporation and others.  All Rights Reserved.
8  *
9  ******************************************************************************
10  */
11 
12 package com.ibm.icu.impl;
13 
14 import java.util.ArrayList;
15 
16 import com.ibm.icu.text.UnicodeSet;
17 import com.ibm.icu.text.UnicodeSet.SpanCondition;
18 import com.ibm.icu.util.OutputInt;
19 
20 /*
21  * Implement span() etc. for a set with strings.
22  * Avoid recursion because of its exponential complexity.
23  * Instead, try multiple paths at once and track them with an IndexList.
24  */
25 public class UnicodeSetStringSpan {
26 
27     /*
28      * Which span() variant will be used? The object is either built for one variant and used once,
29      * or built for all and may be used many times.
30      */
31     public static final int WITH_COUNT    = 0x40;  // spanAndCount() may be called
32     public static final int FWD           = 0x20;
33     public static final int BACK          = 0x10;
34     // public static final int UTF16      = 8;
35     public static final int CONTAINED     = 2;
36     public static final int NOT_CONTAINED = 1;
37 
38     public static final int ALL = 0x7f;
39 
40     public static final int FWD_UTF16_CONTAINED      = FWD  | /* UTF16 | */    CONTAINED;
41     public static final int FWD_UTF16_NOT_CONTAINED  = FWD  | /* UTF16 | */NOT_CONTAINED;
42     public static final int BACK_UTF16_CONTAINED     = BACK | /* UTF16 | */    CONTAINED;
43     public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
44 
45     /**
46      * Special spanLength short values. (since Java has not unsigned byte type)
47      * All code points in the string are contained in the parent set.
48      */
49     static final short ALL_CP_CONTAINED = 0xff;
50     /** The spanLength is >=0xfe. */
51     static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
52 
53     /** Set for span(). Same as parent but without strings. */
54     private UnicodeSet spanSet;
55 
56     /**
57      * Set for span(not contained).
58      * Same as spanSet, plus characters that start or end strings.
59      */
60     private UnicodeSet spanNotSet;
61 
62     /** The strings of the parent set. */
63     private ArrayList<String> strings;
64 
65     /** The lengths of span(), spanBack() etc. for each string. */
66     private short[] spanLengths;
67 
68     /** Maximum lengths of relevant strings. */
69     private final int maxLength16;
70 
71     /** Are there strings that are not fully contained in the code point set? */
72     private boolean someRelevant;
73 
74     /** Set up for all variants of span()? */
75     private boolean all;
76 
77     /** Span helper */
78     private OffsetList offsets;
79 
80     /**
81      * Constructs for all variants of span(), or only for any one variant.
82      * Initializes as little as possible, for single use.
83      */
UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which)84     public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
85         spanSet = new UnicodeSet(0, 0x10ffff);
86         // TODO: With Java 6, just take the parent set's strings as is,
87         // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
88         // Then iterate via the first() and higher() methods.
89         // (We do not want to create multiple Iterator objects in each span().)
90         // See ICU ticket #7454.
91         strings = setStrings;
92         all = (which == ALL);
93         spanSet.retainAll(set);
94         if (0 != (which & NOT_CONTAINED)) {
95             // Default to the same sets.
96             // addToSpanNotSet() will create a separate set if necessary.
97             spanNotSet = spanSet;
98         }
99         offsets = new OffsetList();
100 
101         // Determine if the strings even need to be taken into account at all for span() etc.
102         // If any string is relevant, then all strings need to be used for
103         // span(longest match) but only the relevant ones for span(while contained).
104         // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
105         // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
106         // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
107         // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
108         int stringsLength = strings.size();
109 
110         int i, spanLength;
111         int maxLength16 = 0;
112         someRelevant = false;
113         for (i = 0; i < stringsLength; ++i) {
114             String string = strings.get(i);
115             int length16 = string.length();
116             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
117             if (spanLength < length16) { // Relevant string.
118                 someRelevant = true;
119             }
120             if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
121                 maxLength16 = length16;
122             }
123         }
124         this.maxLength16 = maxLength16;
125         if (!someRelevant && (which & WITH_COUNT) == 0) {
126             return;
127         }
128 
129         // Freeze after checking for the need to use strings at all because freezing
130         // a set takes some time and memory which are wasted if there are no relevant strings.
131         if (all) {
132             spanSet.freeze();
133         }
134 
135         int spanBackLengthsOffset;
136 
137         // Allocate a block of meta data.
138         int allocSize;
139         if (all) {
140             // 2 sets of span lengths
141             allocSize = stringsLength * (2);
142         } else {
143             allocSize = stringsLength; // One set of span lengths.
144         }
145         spanLengths = new short[allocSize];
146 
147         if (all) {
148             // Store span lengths for all span() variants.
149             spanBackLengthsOffset = stringsLength;
150         } else {
151             // Store span lengths for only one span() variant.
152             spanBackLengthsOffset = 0;
153         }
154 
155         // Set the meta data and spanNotSet and write the UTF-8 strings.
156 
157         for (i = 0; i < stringsLength; ++i) {
158             String string = strings.get(i);
159             int length16 = string.length();
160             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
161             if (spanLength < length16) { // Relevant string.
162                 if (true /* 0 != (which & UTF16) */) {
163                     if (0 != (which & CONTAINED)) {
164                         if (0 != (which & FWD)) {
165                             spanLengths[i] = makeSpanLengthByte(spanLength);
166                         }
167                         if (0 != (which & BACK)) {
168                             spanLength = length16
169                                     - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
170                             spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
171                         }
172                     } else /* not CONTAINED, not all, but NOT_CONTAINED */{
173                         spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
174                                                                                      // flag.
175                     }
176                 }
177                 if (0 != (which & NOT_CONTAINED)) {
178                     // Add string start and end code points to the spanNotSet so that
179                     // a span(while not contained) stops before any string.
180                     int c;
181                     if (0 != (which & FWD)) {
182                         c = string.codePointAt(0);
183                         addToSpanNotSet(c);
184                     }
185                     if (0 != (which & BACK)) {
186                         c = string.codePointBefore(length16);
187                         addToSpanNotSet(c);
188                     }
189                 }
190             } else { // Irrelevant string.
191                 if (all) {
192                     spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
193                 } else {
194                     // All spanXYZLengths pointers contain the same address.
195                     spanLengths[i] = ALL_CP_CONTAINED;
196                 }
197             }
198         }
199 
200         // Finish.
201         if (all) {
202             spanNotSet.freeze();
203         }
204     }
205 
206     /**
207      * Constructs a copy of an existing UnicodeSetStringSpan.
208      * Assumes which==ALL for a frozen set.
209      */
UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, final ArrayList<String> newParentSetStrings)210     public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan,
211             final ArrayList<String> newParentSetStrings) {
212         spanSet = otherStringSpan.spanSet;
213         strings = newParentSetStrings;
214         maxLength16 = otherStringSpan.maxLength16;
215         someRelevant = otherStringSpan.someRelevant;
216         all = true;
217         if (Utility.sameObjects(otherStringSpan.spanNotSet, otherStringSpan.spanSet)) {
218             spanNotSet = spanSet;
219         } else {
220             spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone();
221         }
222         offsets = new OffsetList();
223 
224         spanLengths = otherStringSpan.spanLengths.clone();
225     }
226 
227     /**
228      * Do the strings need to be checked in span() etc.?
229      *
230      * @return true if strings need to be checked (call span() here),
231      *         false if not (use a BMPSet for best performance).
232      */
needsStringSpanUTF16()233     public boolean needsStringSpanUTF16() {
234         return someRelevant;
235     }
236 
237     /** For fast UnicodeSet::contains(c). */
contains(int c)238     public boolean contains(int c) {
239         return spanSet.contains(c);
240     }
241 
242     /**
243      * Adds a starting or ending string character to the spanNotSet
244      * so that a character span ends before any string.
245      */
addToSpanNotSet(int c)246     private void addToSpanNotSet(int c) {
247         if (Utility.sameObjects(spanNotSet, null) || Utility.sameObjects(spanNotSet, spanSet)) {
248             if (spanSet.contains(c)) {
249                 return; // Nothing to do.
250             }
251             spanNotSet = spanSet.cloneAsThawed();
252         }
253         spanNotSet.add(c);
254     }
255 
256     /*
257      * Note: In span() when spanLength==0
258      * (after a string match, or at the beginning after an empty code point span)
259      * and in spanNot() and spanNotUTF8(),
260      * string matching could use a binary search because all string matches are done
261      * from the same start index.
262      *
263      * For UTF-8, this would require a comparison function that returns UTF-16 order.
264      *
265      * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
266      * with strings have very few very short strings. For cases with many strings, it might be better to use a different
267      * API and implementation with a DFA (state machine).
268      */
269 
270     /*
271      * Algorithm for span(SpanCondition.CONTAINED)
272      *
273      * Theoretical algorithm:
274      * - Iterate through the string, and at each code point boundary:
275      *   + If the code point there is in the set, then remember to continue after it.
276      *   + If a set string matches at the current position, then remember to continue after it.
277      *   + Either recursively span for each code point or string match, or recursively span
278      *     for all but the shortest one and iteratively continue the span with the shortest local match.
279      *   + Remember the longest recursive span (the farthest end point).
280      *   + If there is no match at the current position,
281      *     neither for the code point there nor for any set string,
282      *     then stop and return the longest recursive span length.
283      *
284      * Optimized implementation:
285      *
286      * (We assume that most sets will have very few very short strings.
287      * A span using a string-less set is extremely fast.)
288      *
289      * Create and cache a spanSet which contains all of the single code points of the original set
290      * but none of its strings.
291      *
292      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
293      * - Loop:
294      *   + Try to match each set string at the end of the spanLength.
295      *     ~ Set strings that start with set-contained code points
296      *       must be matched with a partial overlap
297      *       because the recursive algorithm would have tried to match them at every position.
298      *     ~ Set strings that entirely consist of set-contained code points
299      *       are irrelevant for span(SpanCondition.CONTAINED)
300      *       because the recursive algorithm would continue after them anyway and
301      *       find the longest recursive match from their end.
302      *     ~ Rather than recursing, note each end point of a set string match.
303      *   + If no set string matched after spanSet.span(),
304      *     then return with where the spanSet.span() ended.
305      *   + If at least one set string matched after spanSet.span(),
306      *     then pop the shortest string match end point and continue the loop,
307      *     trying to match all set strings from there.
308      *   + If at least one more set string matched after a previous string match, then test if the
309      *     code point after the previous string match is also contained in the set.
310      *     Continue the loop with the shortest end point of
311      *     either this code point or a matching set string.
312      *   + If no more set string matched after a previous string match,
313      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
314      *     Stop if spanLength==0, otherwise continue the loop.
315      *
316      * By noting each end point of a set string match, the function visits each string position at most once and
317      * finishes in linear time.
318      *
319      * The recursive algorithm may visit the same string position many times
320      * if multiple paths lead to it and finishes in exponential time.
321      */
322 
323     /*
324      * Algorithm for span(SIMPLE)
325      *
326      * Theoretical algorithm:
327      * - Iterate through the string, and at each code point boundary:
328      *   + If the code point there is in the set, then remember to continue after it.
329      *   + If a set string matches at the current position, then remember to continue after it.
330      *   + Continue from the farthest match position and ignore all others.
331      *   + If there is no match at the current position, then stop and return the current position.
332      *
333      * Optimized implementation:
334      *
335      * (Same assumption and spanSet as above.)
336      *
337      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
338      * - Loop:
339      *   + Try to match each set string at the end of the spanLength.
340      *     ~ Set strings that start with set-contained code points
341      *       must be matched with a partial overlap
342      *       because the standard algorithm would have tried to match them earlier.
343      *     ~ Set strings that entirely consist of set-contained code points
344      *       must be matched with a full overlap because the longest-match algorithm
345      *       would hide set string matches that end earlier.
346      *       Such set strings need not be matched earlier inside the code point span
347      *       because the standard algorithm would then have
348      *       continued after the set string match anyway.
349      *     ~ Remember the longest set string match (farthest end point)
350      *       from the earliest starting point.
351      *   + If no set string matched after spanSet.span(),
352      *     then return with where the spanSet.span() ended.
353      *   + If at least one set string matched,
354      *     then continue the loop after the longest match from the earliest position.
355      *   + If no more set string matched after a previous string match,
356      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
357      *     Stop if spanLength==0, otherwise continue the loop.
358      */
359     /**
360      * Spans a string.
361      *
362      * @param s The string to be spanned
363      * @param start The start index that the span begins
364      * @param spanCondition The span condition
365      * @return the limit (exclusive end) of the span
366      */
span(CharSequence s, int start, SpanCondition spanCondition)367     public int span(CharSequence s, int start, SpanCondition spanCondition) {
368         if (spanCondition == SpanCondition.NOT_CONTAINED) {
369             return spanNot(s, start, null);
370         }
371         int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
372         if (spanLimit == s.length()) {
373             return spanLimit;
374         }
375         return spanWithStrings(s, start, spanLimit, spanCondition);
376     }
377 
378     /**
379      * Synchronized method for complicated spans using the offsets.
380      * Avoids synchronization for simple cases.
381      *
382      * @param spanLimit = spanSet.span(s, start, CONTAINED)
383      */
spanWithStrings(CharSequence s, int start, int spanLimit, SpanCondition spanCondition)384     private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
385             SpanCondition spanCondition) {
386         // Consider strings; they may overlap with the span.
387         int initSize = 0;
388         if (spanCondition == SpanCondition.CONTAINED) {
389             // Use offset list to try all possibilities.
390             initSize = maxLength16;
391         }
392         offsets.setMaxLength(initSize);
393         int length = s.length();
394         int pos = spanLimit, rest = length - spanLimit;
395         int spanLength = spanLimit - start;
396         int i, stringsLength = strings.size();
397         for (;;) {
398             if (spanCondition == SpanCondition.CONTAINED) {
399                 for (i = 0; i < stringsLength; ++i) {
400                     int overlap = spanLengths[i];
401                     if (overlap == ALL_CP_CONTAINED) {
402                         continue; // Irrelevant string.
403                     }
404                     String string = strings.get(i);
405 
406                     int length16 = string.length();
407 
408                     // Try to match this string at pos-overlap..pos.
409                     if (overlap >= LONG_SPAN) {
410                         overlap = length16;
411                         // While contained: No point matching fully inside the code point span.
412                         overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
413                                                                           // point.
414                     }
415                     if (overlap > spanLength) {
416                         overlap = spanLength;
417                     }
418                     int inc = length16 - overlap; // Keep overlap+inc==length16.
419                     for (;;) {
420                         if (inc > rest) {
421                             break;
422                         }
423                         // Try to match if the increment is not listed already.
424                         if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
425                             if (inc == rest) {
426                                 return length; // Reached the end of the string.
427                             }
428                             offsets.addOffset(inc);
429                         }
430                         if (overlap == 0) {
431                             break;
432                         }
433                         --overlap;
434                         ++inc;
435                     }
436                 }
437             } else /* SIMPLE */{
438                 int maxInc = 0, maxOverlap = 0;
439                 for (i = 0; i < stringsLength; ++i) {
440                     int overlap = spanLengths[i];
441                     // For longest match, we do need to try to match even an all-contained string
442                     // to find the match from the earliest start.
443 
444                     String string = strings.get(i);
445 
446                     int length16 = string.length();
447 
448                     // Try to match this string at pos-overlap..pos.
449                     if (overlap >= LONG_SPAN) {
450                         overlap = length16;
451                         // Longest match: Need to match fully inside the code point span
452                         // to find the match from the earliest start.
453                     }
454                     if (overlap > spanLength) {
455                         overlap = spanLength;
456                     }
457                     int inc = length16 - overlap; // Keep overlap+inc==length16.
458                     for (;;) {
459                         if (inc > rest || overlap < maxOverlap) {
460                             break;
461                         }
462                         // Try to match if the string is longer or starts earlier.
463                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
464                                 && matches16CPB(s, pos - overlap, length, string, length16)) {
465                             maxInc = inc; // Longest match from earliest start.
466                             maxOverlap = overlap;
467                             break;
468                         }
469                         --overlap;
470                         ++inc;
471                     }
472                 }
473 
474                 if (maxInc != 0 || maxOverlap != 0) {
475                     // Longest-match algorithm, and there was a string match.
476                     // Simply continue after it.
477                     pos += maxInc;
478                     rest -= maxInc;
479                     if (rest == 0) {
480                         return length; // Reached the end of the string.
481                     }
482                     spanLength = 0; // Match strings from after a string match.
483                     continue;
484                 }
485             }
486             // Finished trying to match all strings at pos.
487 
488             if (spanLength != 0 || pos == 0) {
489                 // The position is after an unlimited code point span (spanLength!=0),
490                 // not after a string match.
491                 // The only position where spanLength==0 after a span is pos==0.
492                 // Otherwise, an unlimited code point span is only tried again when no
493                 // strings match, and if such a non-initial span fails we stop.
494                 if (offsets.isEmpty()) {
495                     return pos; // No strings matched after a span.
496                 }
497                 // Match strings from after the next string match.
498             } else {
499                 // The position is after a string match (or a single code point).
500                 if (offsets.isEmpty()) {
501                     // No more strings matched after a previous string match.
502                     // Try another code point span from after the last string match.
503                     spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
504                     spanLength = spanLimit - pos;
505                     if (spanLength == rest || // Reached the end of the string, or
506                             spanLength == 0 // neither strings nor span progressed.
507                     ) {
508                         return spanLimit;
509                     }
510                     pos += spanLength;
511                     rest -= spanLength;
512                     continue; // spanLength>0: Match strings from after a span.
513                 } else {
514                     // Try to match only one code point from after a string match if some
515                     // string matched beyond it, so that we try all possible positions
516                     // and don't overshoot.
517                     spanLength = spanOne(spanSet, s, pos, rest);
518                     if (spanLength > 0) {
519                         if (spanLength == rest) {
520                             return length; // Reached the end of the string.
521                         }
522                         // Match strings after this code point.
523                         // There cannot be any increments below it because UnicodeSet strings
524                         // contain multiple code points.
525                         pos += spanLength;
526                         rest -= spanLength;
527                         offsets.shift(spanLength);
528                         spanLength = 0;
529                         continue; // Match strings from after a single code point.
530                     }
531                     // Match strings from after the next string match.
532                 }
533             }
534             int minOffset = offsets.popMinimum(null);
535             pos += minOffset;
536             rest -= minOffset;
537             spanLength = 0; // Match strings from after a string match.
538         }
539     }
540 
541     /**
542      * Spans a string and counts the smallest number of set elements on any path across the span.
543      *
544      * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
545      *
546      * <p>If the set does not have any fully-contained strings, then we could optimize this
547      * like span(), but such sets are likely rare, and this is at least still linear.
548      *
549      * @param s The string to be spanned
550      * @param start The start index that the span begins
551      * @param spanCondition The span condition
552      * @param outCount The count
553      * @return the limit (exclusive end) of the span
554      */
spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)555     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
556             OutputInt outCount) {
557         if (spanCondition == SpanCondition.NOT_CONTAINED) {
558             return spanNot(s, start, outCount);
559         }
560         // Consider strings; they may overlap with the span,
561         // and they may result in a smaller count that with just code points.
562         if (spanCondition == SpanCondition.CONTAINED) {
563             return spanContainedAndCount(s, start, outCount);
564         }
565         // SIMPLE (not synchronized, does not use offsets)
566         int stringsLength = strings.size();
567         int length = s.length();
568         int pos = start;
569         int rest = length - start;
570         int count = 0;
571         while (rest != 0) {
572             // Try to match the next code point.
573             int cpLength = spanOne(spanSet, s, pos, rest);
574             int maxInc = (cpLength > 0) ? cpLength : 0;
575             // Try to match all of the strings.
576             for (int i = 0; i < stringsLength; ++i) {
577                 String string = strings.get(i);
578                 int length16 = string.length();
579                 if (maxInc < length16 && length16 <= rest &&
580                         matches16CPB(s, pos, length, string, length16)) {
581                     maxInc = length16;
582                 }
583             }
584             // We are done if there is no match beyond pos.
585             if (maxInc == 0) {
586                 outCount.value = count;
587                 return pos;
588             }
589             // Continue from the longest match.
590             ++count;
591             pos += maxInc;
592             rest -= maxInc;
593         }
594         outCount.value = count;
595         return pos;
596     }
597 
spanContainedAndCount(CharSequence s, int start, OutputInt outCount)598     private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
599         // Use offset list to try all possibilities.
600         offsets.setMaxLength(maxLength16);
601         int stringsLength = strings.size();
602         int length = s.length();
603         int pos = start;
604         int rest = length - start;
605         int count = 0;
606         while (rest != 0) {
607             // Try to match the next code point.
608             int cpLength = spanOne(spanSet, s, pos, rest);
609             if (cpLength > 0) {
610                 offsets.addOffsetAndCount(cpLength, count + 1);
611             }
612             // Try to match all of the strings.
613             for (int i = 0; i < stringsLength; ++i) {
614                 String string = strings.get(i);
615                 int length16 = string.length();
616                 // Note: If the strings were sorted by length, then we could also
617                 // avoid trying to match if there is already a match of the same length.
618                 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
619                         matches16CPB(s, pos, length, string, length16)) {
620                     offsets.addOffsetAndCount(length16, count + 1);
621                 }
622             }
623             // We are done if there is no match beyond pos.
624             if (offsets.isEmpty()) {
625                 outCount.value = count;
626                 return pos;
627             }
628             // Continue from the nearest match.
629             int minOffset = offsets.popMinimum(outCount);
630             count = outCount.value;
631             pos += minOffset;
632             rest -= minOffset;
633         }
634         outCount.value = count;
635         return pos;
636     }
637 
638     /**
639      * Span a string backwards.
640      *
641      * @param s The string to be spanned
642      * @param spanCondition The span condition
643      * @return The string index which starts the span (i.e. inclusive).
644      */
spanBack(CharSequence s, int length, SpanCondition spanCondition)645     public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
646         if (spanCondition == SpanCondition.NOT_CONTAINED) {
647             return spanNotBack(s, length);
648         }
649         int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
650         if (pos == 0) {
651             return 0;
652         }
653         int spanLength = length - pos;
654 
655         // Consider strings; they may overlap with the span.
656         int initSize = 0;
657         if (spanCondition == SpanCondition.CONTAINED) {
658             // Use offset list to try all possibilities.
659             initSize = maxLength16;
660         }
661         offsets.setMaxLength(initSize);
662         int i, stringsLength = strings.size();
663         int spanBackLengthsOffset = 0;
664         if (all) {
665             spanBackLengthsOffset = stringsLength;
666         }
667         for (;;) {
668             if (spanCondition == SpanCondition.CONTAINED) {
669                 for (i = 0; i < stringsLength; ++i) {
670                     int overlap = spanLengths[spanBackLengthsOffset + i];
671                     if (overlap == ALL_CP_CONTAINED) {
672                         continue; // Irrelevant string.
673                     }
674                     String string = strings.get(i);
675 
676                     int length16 = string.length();
677 
678                     // Try to match this string at pos-(length16-overlap)..pos-length16.
679                     if (overlap >= LONG_SPAN) {
680                         overlap = length16;
681                         // While contained: No point matching fully inside the code point span.
682                         int len1 = 0;
683                         len1 = string.offsetByCodePoints(0, 1);
684                         overlap -= len1; // Length of the string minus the first code point.
685                     }
686                     if (overlap > spanLength) {
687                         overlap = spanLength;
688                     }
689                     int dec = length16 - overlap; // Keep dec+overlap==length16.
690                     for (;;) {
691                         if (dec > pos) {
692                             break;
693                         }
694                         // Try to match if the decrement is not listed already.
695                         if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
696                             if (dec == pos) {
697                                 return 0; // Reached the start of the string.
698                             }
699                             offsets.addOffset(dec);
700                         }
701                         if (overlap == 0) {
702                             break;
703                         }
704                         --overlap;
705                         ++dec;
706                     }
707                 }
708             } else /* SIMPLE */{
709                 int maxDec = 0, maxOverlap = 0;
710                 for (i = 0; i < stringsLength; ++i) {
711                     int overlap = spanLengths[spanBackLengthsOffset + i];
712                     // For longest match, we do need to try to match even an all-contained string
713                     // to find the match from the latest end.
714 
715                     String string = strings.get(i);
716 
717                     int length16 = string.length();
718 
719                     // Try to match this string at pos-(length16-overlap)..pos-length16.
720                     if (overlap >= LONG_SPAN) {
721                         overlap = length16;
722                         // Longest match: Need to match fully inside the code point span
723                         // to find the match from the latest end.
724                     }
725                     if (overlap > spanLength) {
726                         overlap = spanLength;
727                     }
728                     int dec = length16 - overlap; // Keep dec+overlap==length16.
729                     for (;;) {
730                         if (dec > pos || overlap < maxOverlap) {
731                             break;
732                         }
733                         // Try to match if the string is longer or ends later.
734                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
735                                 && matches16CPB(s, pos - dec, length, string, length16)) {
736                             maxDec = dec; // Longest match from latest end.
737                             maxOverlap = overlap;
738                             break;
739                         }
740                         --overlap;
741                         ++dec;
742                     }
743                 }
744 
745                 if (maxDec != 0 || maxOverlap != 0) {
746                     // Longest-match algorithm, and there was a string match.
747                     // Simply continue before it.
748                     pos -= maxDec;
749                     if (pos == 0) {
750                         return 0; // Reached the start of the string.
751                     }
752                     spanLength = 0; // Match strings from before a string match.
753                     continue;
754                 }
755             }
756             // Finished trying to match all strings at pos.
757 
758             if (spanLength != 0 || pos == length) {
759                 // The position is before an unlimited code point span (spanLength!=0),
760                 // not before a string match.
761                 // The only position where spanLength==0 before a span is pos==length.
762                 // Otherwise, an unlimited code point span is only tried again when no
763                 // strings match, and if such a non-initial span fails we stop.
764                 if (offsets.isEmpty()) {
765                     return pos; // No strings matched before a span.
766                 }
767                 // Match strings from before the next string match.
768             } else {
769                 // The position is before a string match (or a single code point).
770                 if (offsets.isEmpty()) {
771                     // No more strings matched before a previous string match.
772                     // Try another code point span from before the last string match.
773                     int oldPos = pos;
774                     pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
775                     spanLength = oldPos - pos;
776                     if (pos == 0 || // Reached the start of the string, or
777                             spanLength == 0 // neither strings nor span progressed.
778                     ) {
779                         return pos;
780                     }
781                     continue; // spanLength>0: Match strings from before a span.
782                 } else {
783                     // Try to match only one code point from before a string match if some
784                     // string matched beyond it, so that we try all possible positions
785                     // and don't overshoot.
786                     spanLength = spanOneBack(spanSet, s, pos);
787                     if (spanLength > 0) {
788                         if (spanLength == pos) {
789                             return 0; // Reached the start of the string.
790                         }
791                         // Match strings before this code point.
792                         // There cannot be any decrements below it because UnicodeSet strings
793                         // contain multiple code points.
794                         pos -= spanLength;
795                         offsets.shift(spanLength);
796                         spanLength = 0;
797                         continue; // Match strings from before a single code point.
798                     }
799                     // Match strings from before the next string match.
800                 }
801             }
802             pos -= offsets.popMinimum(null);
803             spanLength = 0; // Match strings from before a string match.
804         }
805     }
806 
807     /**
808      * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
809      *
810      * Theoretical algorithm:
811      * - Iterate through the string, and at each code point boundary:
812      *   + If the code point there is in the set, then return with the current position.
813      *   + If a set string matches at the current position, then return with the current position.
814      *
815      * Optimized implementation:
816      *
817      * (Same assumption as for span() above.)
818      *
819      * Create and cache a spanNotSet which contains
820      * all of the single code points of the original set but none of its strings.
821      * For each set string add its initial code point to the spanNotSet.
822      * (Also add its final code point for spanNotBack().)
823      *
824      * - Loop:
825      *   + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
826      *   + If the current code point is in the original set, then return the current position.
827      *   + If any set string matches at the current position, then return the current position.
828      *   + If there is no match at the current position, neither for the code point
829      *     there nor for any set string, then skip this code point and continue the loop.
830      *     This happens for set-string-initial code points that were added to spanNotSet
831      *     when there is not actually a match for such a set string.
832      *
833      * @param s The string to be spanned
834      * @param start The start index that the span begins
835      * @param outCount If not null: Receives the number of code points across the span.
836      * @return the limit (exclusive end) of the span
837      */
spanNot(CharSequence s, int start, OutputInt outCount)838     private int spanNot(CharSequence s, int start, OutputInt outCount) {
839         int length = s.length();
840         int pos = start, rest = length - start;
841         int stringsLength = strings.size();
842         int count = 0;
843         do {
844             // Span until we find a code point from the set,
845             // or a code point that starts or ends some string.
846             int spanLimit;
847             if (outCount == null) {
848                 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
849             } else {
850                 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
851                 outCount.value = count = count + outCount.value;
852             }
853             if (spanLimit == length) {
854                 return length; // Reached the end of the string.
855             }
856             pos = spanLimit;
857             rest = length - spanLimit;
858 
859             // Check whether the current code point is in the original set,
860             // without the string starts and ends.
861             int cpLength = spanOne(spanSet, s, pos, rest);
862             if (cpLength > 0) {
863                 return pos; // There is a set element at pos.
864             }
865 
866             // Try to match the strings at pos.
867             for (int i = 0; i < stringsLength; ++i) {
868                 if (spanLengths[i] == ALL_CP_CONTAINED) {
869                     continue; // Irrelevant string.
870                 }
871                 String string = strings.get(i);
872 
873                 int length16 = string.length();
874                 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
875                     return pos; // There is a set element at pos.
876                 }
877             }
878 
879             // The span(while not contained) ended on a string start/end which is
880             // not in the original set. Skip this code point and continue.
881             // cpLength<0
882             pos -= cpLength;
883             rest += cpLength;
884             ++count;
885         } while (rest != 0);
886         if (outCount != null) {
887             outCount.value = count;
888         }
889         return length; // Reached the end of the string.
890     }
891 
spanNotBack(CharSequence s, int length)892     private int spanNotBack(CharSequence s, int length) {
893         int pos = length;
894         int i, stringsLength = strings.size();
895         do {
896             // Span until we find a code point from the set,
897             // or a code point that starts or ends some string.
898             pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
899             if (pos == 0) {
900                 return 0; // Reached the start of the string.
901             }
902 
903             // Check whether the current code point is in the original set,
904             // without the string starts and ends.
905             int cpLength = spanOneBack(spanSet, s, pos);
906             if (cpLength > 0) {
907                 return pos; // There is a set element at pos.
908             }
909 
910             // Try to match the strings at pos.
911             for (i = 0; i < stringsLength; ++i) {
912                 // Use spanLengths rather than a spanLengths pointer because
913                 // it is easier and we only need to know whether the string is irrelevant
914                 // which is the same in either array.
915                 if (spanLengths[i] == ALL_CP_CONTAINED) {
916                     continue; // Irrelevant string.
917                 }
918                 String string = strings.get(i);
919 
920                 int length16 = string.length();
921                 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
922                     return pos; // There is a set element at pos.
923                 }
924             }
925 
926             // The span(while not contained) ended on a string start/end which is
927             // not in the original set. Skip this code point and continue.
928             // cpLength<0
929             pos += cpLength;
930         } while (pos != 0);
931         return 0; // Reached the start of the string.
932     }
933 
makeSpanLengthByte(int spanLength)934     static short makeSpanLengthByte(int spanLength) {
935         // 0xfe==UnicodeSetStringSpan::LONG_SPAN
936         return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
937     }
938 
939     // Compare strings without any argument checks. Requires length>0.
matches16(CharSequence s, int start, final String t, int length)940     private static boolean matches16(CharSequence s, int start, final String t, int length) {
941         int end = start + length;
942         while (length-- > 0) {
943             if (s.charAt(--end) != t.charAt(length)) {
944                 return false;
945             }
946         }
947         return true;
948     }
949 
950     /**
951      * Compare 16-bit Unicode strings (which may be malformed UTF-16)
952      * at code point boundaries.
953      * That is, each edge of a match must not be in the middle of a surrogate pair.
954      * @param s       The string to match in.
955      * @param start   The start index of s.
956      * @param limit   The limit of the subsequence of s being spanned.
957      * @param t       The substring to be matched in s.
958      * @param tlength The length of t.
959      */
matches16CPB(CharSequence s, int start, int limit, final String t, int tlength)960     static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
961         return matches16(s, start, t, tlength)
962                 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
963                         Character.isLowSurrogate(s.charAt(start)))
964                 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
965                         Character.isLowSurrogate(s.charAt(start + tlength)));
966     }
967 
968     /**
969      * Does the set contain the next code point?
970      * If so, return its length; otherwise return its negative length.
971      */
spanOne(final UnicodeSet set, CharSequence s, int start, int length)972     static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
973         char c = s.charAt(start);
974         if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
975             char c2 = s.charAt(start + 1);
976             if (com.ibm.icu.text.UTF16.isTrailSurrogate(c2)) {
977                 int supplementary = Character.toCodePoint(c, c2);
978                 return set.contains(supplementary) ? 2 : -2;
979             }
980         }
981         return set.contains(c) ? 1 : -1;
982     }
983 
spanOneBack(final UnicodeSet set, CharSequence s, int length)984     static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
985         char c = s.charAt(length - 1);
986         if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
987             char c2 = s.charAt(length - 2);
988             if (com.ibm.icu.text.UTF16.isLeadSurrogate(c2)) {
989                 int supplementary = Character.toCodePoint(c2, c);
990                 return set.contains(supplementary) ? 2 : -2;
991             }
992         }
993         return set.contains(c) ? 1 : -1;
994     }
995 
996     /**
997      * Helper class for UnicodeSetStringSpan.
998      *
999      * <p>List of offsets from the current position from where to try matching
1000      * a code point or a string.
1001      * Stores offsets rather than indexes to simplify the code and use the same list
1002      * for both increments (in span()) and decrements (in spanBack()).
1003      *
1004      * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
1005      * are relatively dense, that is,
1006      * there are normally no gaps of hundreds or thousands of offset values.
1007      *
1008      * <p>This class optionally also tracks the minimum non-negative count for each position,
1009      * intended to count the smallest number of elements of any path leading to that position.
1010      *
1011      * <p>The implementation uses a circular buffer of count integers,
1012      * each indicating whether the corresponding offset is in the list,
1013      * and its path element count.
1014      * This avoids inserting into a sorted list of offsets (or absolute indexes)
1015      * and physically moving part of the list.
1016      *
1017      * <p>Note: In principle, the caller should setMaxLength() to
1018      * the maximum of the max string length and U16_LENGTH/U8_LENGTH
1019      * to account for "long" single code points.
1020      *
1021      * <p>Note: An earlier version did not track counts and stored only byte flags.
1022      * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
1023      * the list could be stored as bit flags in a single integer.
1024      * Rather than handling a circular buffer with a start list index,
1025      * the integer would simply be shifted when lower offsets are removed.
1026      * UnicodeSet does not have a limit on the lengths of strings.
1027      */
1028     private static final class OffsetList {
1029         private int[] list;
1030         private int length;
1031         private int start;
1032 
OffsetList()1033         public OffsetList() {
1034             list = new int[16];  // default size
1035         }
1036 
setMaxLength(int maxLength)1037         public void setMaxLength(int maxLength) {
1038             if (maxLength > list.length) {
1039                 list = new int[maxLength];
1040             }
1041             clear();
1042         }
1043 
clear()1044         public void clear() {
1045             for (int i = list.length; i-- > 0;) {
1046                 list[i] = 0;
1047             }
1048             start = length = 0;
1049         }
1050 
isEmpty()1051         public boolean isEmpty() {
1052             return (length == 0);
1053         }
1054 
1055         /**
1056          * Reduces all stored offsets by delta, used when the current position moves by delta.
1057          * There must not be any offsets lower than delta.
1058          * If there is an offset equal to delta, it is removed.
1059          *
1060          * @param delta [1..maxLength]
1061          */
shift(int delta)1062         public void shift(int delta) {
1063             int i = start + delta;
1064             if (i >= list.length) {
1065                 i -= list.length;
1066             }
1067             if (list[i] != 0) {
1068                 list[i] = 0;
1069                 --length;
1070             }
1071             start = i;
1072         }
1073 
1074         /**
1075          * Adds an offset. The list must not contain it yet.
1076          * @param offset [1..maxLength]
1077          */
addOffset(int offset)1078         public void addOffset(int offset) {
1079             int i = start + offset;
1080             if (i >= list.length) {
1081                 i -= list.length;
1082             }
1083             assert list[i] == 0;
1084             list[i] = 1;
1085             ++length;
1086         }
1087 
1088         /**
1089          * Adds an offset and updates its count.
1090          * The list may already contain the offset.
1091          * @param offset [1..maxLength]
1092          */
addOffsetAndCount(int offset, int count)1093         public void addOffsetAndCount(int offset, int count) {
1094             assert count > 0;
1095             int i = start + offset;
1096             if (i >= list.length) {
1097                 i -= list.length;
1098             }
1099             if (list[i] == 0) {
1100                 list[i] = count;
1101                 ++length;
1102             } else if (count < list[i]) {
1103                 list[i] = count;
1104             }
1105         }
1106 
1107         /**
1108          * @param offset [1..maxLength]
1109          */
containsOffset(int offset)1110         public boolean containsOffset(int offset) {
1111             int i = start + offset;
1112             if (i >= list.length) {
1113                 i -= list.length;
1114             }
1115             return list[i] != 0;
1116         }
1117 
1118         /**
1119          * @param offset [1..maxLength]
1120          */
hasCountAtOffset(int offset, int count)1121         public boolean hasCountAtOffset(int offset, int count) {
1122             int i = start + offset;
1123             if (i >= list.length) {
1124                 i -= list.length;
1125             }
1126             int oldCount = list[i];
1127             return oldCount != 0 && oldCount <= count;
1128         }
1129 
1130         /**
1131          * Finds the lowest stored offset from a non-empty list, removes it,
1132          * and reduces all other offsets by this minimum.
1133          * @return min=[1..maxLength]
1134          */
popMinimum(OutputInt outCount)1135         public int popMinimum(OutputInt outCount) {
1136             // Look for the next offset in list[start+1..list.length-1].
1137             int i = start, result;
1138             while (++i < list.length) {
1139                 int count = list[i];
1140                 if (count != 0) {
1141                     list[i] = 0;
1142                     --length;
1143                     result = i - start;
1144                     start = i;
1145                     if (outCount != null) { outCount.value = count; }
1146                     return result;
1147                 }
1148             }
1149             // i==list.length
1150 
1151             // Wrap around and look for the next offset in list[0..start].
1152             // Since the list is not empty, there will be one.
1153             result = list.length - start;
1154             i = 0;
1155             int count;
1156             while ((count = list[i]) == 0) {
1157                 ++i;
1158             }
1159             list[i] = 0;
1160             --length;
1161             start = i;
1162             if (outCount != null) { outCount.value = count; }
1163             return result + i;
1164         }
1165     }
1166 }
1167