1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2007-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: unisetspan.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2007mar01
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ustring.h"
20 #include "unicode/utf8.h"
21 #include "unicode/utf16.h"
22 #include "cmemory.h"
23 #include "uvector.h"
24 #include "unisetspan.h"
25
26 U_NAMESPACE_BEGIN
27
28 /*
29 * List of offsets from the current position from where to try matching
30 * a code point or a string.
31 * Store offsets rather than indexes to simplify the code and use the same list
32 * for both increments (in span()) and decrements (in spanBack()).
33 *
34 * Assumption: The maximum offset is limited, and the offsets that are stored
35 * at any one time are relatively dense, that is, there are normally no gaps of
36 * hundreds or thousands of offset values.
37 *
38 * The implementation uses a circular buffer of byte flags,
39 * each indicating whether the corresponding offset is in the list.
40 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
41 * physically moving part of the list.
42 *
43 * Note: In principle, the caller should setMaxLength() to the maximum of the
44 * max string length and U16_LENGTH/U8_LENGTH to account for
45 * "long" single code points.
46 * However, this implementation uses at least a staticList with more than
47 * U8_LENGTH entries anyway.
48 *
49 * Note: If maxLength were guaranteed to be no more than 32 or 64,
50 * the list could be stored as bit flags in a single integer.
51 * Rather than handling a circular buffer with a start list index,
52 * the integer would simply be shifted when lower offsets are removed.
53 * UnicodeSet does not have a limit on the lengths of strings.
54 */
55 class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
56 public:
OffsetList()57 OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
58
~OffsetList()59 ~OffsetList() {
60 if(list!=staticList) {
61 uprv_free(list);
62 }
63 }
64
65 // Call exactly once if the list is to be used.
setMaxLength(int32_t maxLength)66 void setMaxLength(int32_t maxLength) {
67 if(maxLength<=(int32_t)sizeof(staticList)) {
68 capacity=(int32_t)sizeof(staticList);
69 } else {
70 UBool *l=(UBool *)uprv_malloc(maxLength);
71 if(l!=NULL) {
72 list=l;
73 capacity=maxLength;
74 }
75 }
76 uprv_memset(list, 0, capacity);
77 }
78
clear()79 void clear() {
80 uprv_memset(list, 0, capacity);
81 start=length=0;
82 }
83
isEmpty() const84 UBool isEmpty() const {
85 return (UBool)(length==0);
86 }
87
88 // Reduce all stored offsets by delta, used when the current position
89 // moves by delta.
90 // There must not be any offsets lower than delta.
91 // If there is an offset equal to delta, it is removed.
92 // delta=[1..maxLength]
shift(int32_t delta)93 void shift(int32_t delta) {
94 int32_t i=start+delta;
95 if(i>=capacity) {
96 i-=capacity;
97 }
98 if(list[i]) {
99 list[i]=FALSE;
100 --length;
101 }
102 start=i;
103 }
104
105 // Add an offset. The list must not contain it yet.
106 // offset=[1..maxLength]
addOffset(int32_t offset)107 void addOffset(int32_t offset) {
108 int32_t i=start+offset;
109 if(i>=capacity) {
110 i-=capacity;
111 }
112 list[i]=TRUE;
113 ++length;
114 }
115
116 // offset=[1..maxLength]
containsOffset(int32_t offset) const117 UBool containsOffset(int32_t offset) const {
118 int32_t i=start+offset;
119 if(i>=capacity) {
120 i-=capacity;
121 }
122 return list[i];
123 }
124
125 // Find the lowest stored offset from a non-empty list, remove it,
126 // and reduce all other offsets by this minimum.
127 // Returns [1..maxLength].
popMinimum()128 int32_t popMinimum() {
129 // Look for the next offset in list[start+1..capacity-1].
130 int32_t i=start, result;
131 while(++i<capacity) {
132 if(list[i]) {
133 list[i]=FALSE;
134 --length;
135 result=i-start;
136 start=i;
137 return result;
138 }
139 }
140 // i==capacity
141
142 // Wrap around and look for the next offset in list[0..start].
143 // Since the list is not empty, there will be one.
144 result=capacity-start;
145 i=0;
146 while(!list[i]) {
147 ++i;
148 }
149 list[i]=FALSE;
150 --length;
151 start=i;
152 return result+=i;
153 }
154
155 private:
156 UBool *list;
157 int32_t capacity;
158 int32_t length;
159 int32_t start;
160
161 UBool staticList[16];
162 };
163
164 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
165 static int32_t
getUTF8Length(const UChar * s,int32_t length)166 getUTF8Length(const UChar *s, int32_t length) {
167 UErrorCode errorCode=U_ZERO_ERROR;
168 int32_t length8=0;
169 u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
170 if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
171 return length8;
172 } else {
173 // The string contains an unpaired surrogate.
174 // Ignore this string.
175 return 0;
176 }
177 }
178
179 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
180 static int32_t
appendUTF8(const UChar * s,int32_t length,uint8_t * t,int32_t capacity)181 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
182 UErrorCode errorCode=U_ZERO_ERROR;
183 int32_t length8=0;
184 u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
185 if(U_SUCCESS(errorCode)) {
186 return length8;
187 } else {
188 // The string contains an unpaired surrogate.
189 // Ignore this string.
190 return 0;
191 }
192 }
193
194 static inline uint8_t
makeSpanLengthByte(int32_t spanLength)195 makeSpanLengthByte(int32_t spanLength) {
196 // 0xfe==UnicodeSetStringSpan::LONG_SPAN
197 return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
198 }
199
200 // Construct for all variants of span(), or only for any one variant.
201 // Initialize as little as possible, for single use.
UnicodeSetStringSpan(const UnicodeSet & set,const UVector & setStrings,uint32_t which)202 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
203 const UVector &setStrings,
204 uint32_t which)
205 : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
206 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
207 utf8Length(0),
208 maxLength16(0), maxLength8(0),
209 all((UBool)(which==ALL)) {
210 spanSet.retainAll(set);
211 if(which&NOT_CONTAINED) {
212 // Default to the same sets.
213 // addToSpanNotSet() will create a separate set if necessary.
214 pSpanNotSet=&spanSet;
215 }
216
217 // Determine if the strings even need to be taken into account at all for span() etc.
218 // If any string is relevant, then all strings need to be used for
219 // span(longest match) but only the relevant ones for span(while contained).
220 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
221 // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
222 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
223 // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
224 int32_t stringsLength=strings.size();
225
226 int32_t i, spanLength;
227 UBool someRelevant=FALSE;
228 for(i=0; i<stringsLength; ++i) {
229 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
230 const UChar *s16=string.getBuffer();
231 int32_t length16=string.length();
232 UBool thisRelevant;
233 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
234 if(spanLength<length16) { // Relevant string.
235 someRelevant=thisRelevant=TRUE;
236 } else {
237 thisRelevant=FALSE;
238 }
239 if((which&UTF16) && length16>maxLength16) {
240 maxLength16=length16;
241 }
242 if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
243 int32_t length8=getUTF8Length(s16, length16);
244 utf8Length+=length8;
245 if(length8>maxLength8) {
246 maxLength8=length8;
247 }
248 }
249 }
250 if(!someRelevant) {
251 maxLength16=maxLength8=0;
252 return;
253 }
254
255 // Freeze after checking for the need to use strings at all because freezing
256 // a set takes some time and memory which are wasted if there are no relevant strings.
257 if(all) {
258 spanSet.freeze();
259 }
260
261 uint8_t *spanBackLengths;
262 uint8_t *spanUTF8Lengths;
263 uint8_t *spanBackUTF8Lengths;
264
265 // Allocate a block of meta data.
266 int32_t allocSize;
267 if(all) {
268 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
269 allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
270 } else {
271 allocSize=stringsLength; // One set of span lengths.
272 if(which&UTF8) {
273 // UTF-8 lengths and UTF-8 strings.
274 allocSize+=stringsLength*4+utf8Length;
275 }
276 }
277 if(allocSize<=(int32_t)sizeof(staticLengths)) {
278 utf8Lengths=staticLengths;
279 } else {
280 utf8Lengths=(int32_t *)uprv_malloc(allocSize);
281 if(utf8Lengths==NULL) {
282 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
283 return; // Out of memory.
284 }
285 }
286
287 if(all) {
288 // Store span lengths for all span() variants.
289 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
290 spanBackLengths=spanLengths+stringsLength;
291 spanUTF8Lengths=spanBackLengths+stringsLength;
292 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
293 utf8=spanBackUTF8Lengths+stringsLength;
294 } else {
295 // Store span lengths for only one span() variant.
296 if(which&UTF8) {
297 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
298 utf8=spanLengths+stringsLength;
299 } else {
300 spanLengths=(uint8_t *)utf8Lengths;
301 }
302 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
303 }
304
305 // Set the meta data and pSpanNotSet and write the UTF-8 strings.
306 int32_t utf8Count=0; // Count UTF-8 bytes written so far.
307
308 for(i=0; i<stringsLength; ++i) {
309 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
310 const UChar *s16=string.getBuffer();
311 int32_t length16=string.length();
312 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
313 if(spanLength<length16) { // Relevant string.
314 if(which&UTF16) {
315 if(which&CONTAINED) {
316 if(which&FWD) {
317 spanLengths[i]=makeSpanLengthByte(spanLength);
318 }
319 if(which&BACK) {
320 spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
321 spanBackLengths[i]=makeSpanLengthByte(spanLength);
322 }
323 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
324 spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
325 }
326 }
327 if(which&UTF8) {
328 uint8_t *s8=utf8+utf8Count;
329 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
330 utf8Count+=utf8Lengths[i]=length8;
331 if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
332 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
333 } else { // Relevant for UTF-8.
334 if(which&CONTAINED) {
335 if(which&FWD) {
336 spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
337 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
338 }
339 if(which&BACK) {
340 spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
341 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
342 }
343 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
344 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
345 }
346 }
347 }
348 if(which&NOT_CONTAINED) {
349 // Add string start and end code points to the spanNotSet so that
350 // a span(while not contained) stops before any string.
351 UChar32 c;
352 if(which&FWD) {
353 int32_t len=0;
354 U16_NEXT(s16, len, length16, c);
355 addToSpanNotSet(c);
356 }
357 if(which&BACK) {
358 int32_t len=length16;
359 U16_PREV(s16, 0, len, c);
360 addToSpanNotSet(c);
361 }
362 }
363 } else { // Irrelevant string.
364 if(which&UTF8) {
365 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
366 uint8_t *s8=utf8+utf8Count;
367 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
368 utf8Count+=utf8Lengths[i]=length8;
369 } else {
370 utf8Lengths[i]=0;
371 }
372 }
373 if(all) {
374 spanLengths[i]=spanBackLengths[i]=
375 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
376 (uint8_t)ALL_CP_CONTAINED;
377 } else {
378 // All spanXYZLengths pointers contain the same address.
379 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
380 }
381 }
382 }
383
384 // Finish.
385 if(all) {
386 pSpanNotSet->freeze();
387 }
388 }
389
390 // Copy constructor. Assumes which==ALL for a frozen set.
UnicodeSetStringSpan(const UnicodeSetStringSpan & otherStringSpan,const UVector & newParentSetStrings)391 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
392 const UVector &newParentSetStrings)
393 : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
394 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
395 utf8Length(otherStringSpan.utf8Length),
396 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
397 all(TRUE) {
398 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
399 pSpanNotSet=&spanSet;
400 } else {
401 pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
402 }
403
404 // Allocate a block of meta data.
405 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
406 int32_t stringsLength=strings.size();
407 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
408 if(allocSize<=(int32_t)sizeof(staticLengths)) {
409 utf8Lengths=staticLengths;
410 } else {
411 utf8Lengths=(int32_t *)uprv_malloc(allocSize);
412 if(utf8Lengths==NULL) {
413 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
414 return; // Out of memory.
415 }
416 }
417
418 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
419 utf8=spanLengths+stringsLength*4;
420 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
421 }
422
~UnicodeSetStringSpan()423 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
424 if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
425 delete pSpanNotSet;
426 }
427 if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
428 uprv_free(utf8Lengths);
429 }
430 }
431
addToSpanNotSet(UChar32 c)432 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
433 if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
434 if(spanSet.contains(c)) {
435 return; // Nothing to do.
436 }
437 UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
438 if(newSet==NULL) {
439 return; // Out of memory.
440 } else {
441 pSpanNotSet=newSet;
442 }
443 }
444 pSpanNotSet->add(c);
445 }
446
447 // Compare strings without any argument checks. Requires length>0.
448 static inline UBool
matches16(const UChar * s,const UChar * t,int32_t length)449 matches16(const UChar *s, const UChar *t, int32_t length) {
450 do {
451 if(*s++!=*t++) {
452 return FALSE;
453 }
454 } while(--length>0);
455 return TRUE;
456 }
457
458 static inline UBool
matches8(const uint8_t * s,const uint8_t * t,int32_t length)459 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
460 do {
461 if(*s++!=*t++) {
462 return FALSE;
463 }
464 } while(--length>0);
465 return TRUE;
466 }
467
468 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
469 // at code point boundaries.
470 // That is, each edge of a match must not be in the middle of a surrogate pair.
471 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UChar * t,int32_t length)472 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
473 s+=start;
474 limit-=start;
475 return matches16(s, t, length) &&
476 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
477 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
478 }
479
480 // Does the set contain the next code point?
481 // If so, return its length; otherwise return its negative length.
482 static inline int32_t
spanOne(const UnicodeSet & set,const UChar * s,int32_t length)483 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
484 UChar c=*s, c2;
485 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
486 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
487 }
488 return set.contains(c) ? 1 : -1;
489 }
490
491 static inline int32_t
spanOneBack(const UnicodeSet & set,const UChar * s,int32_t length)492 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
493 UChar c=s[length-1], c2;
494 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
495 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
496 }
497 return set.contains(c) ? 1 : -1;
498 }
499
500 static inline int32_t
spanOneUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)501 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
502 UChar32 c=*s;
503 if((int8_t)c>=0) {
504 return set.contains(c) ? 1 : -1;
505 }
506 // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
507 int32_t i=0;
508 U8_NEXT_OR_FFFD(s, i, length, c);
509 return set.contains(c) ? i : -i;
510 }
511
512 static inline int32_t
spanOneBackUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)513 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
514 UChar32 c=s[length-1];
515 if((int8_t)c>=0) {
516 return set.contains(c) ? 1 : -1;
517 }
518 int32_t i=length-1;
519 c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
520 length-=i;
521 return set.contains(c) ? length : -length;
522 }
523
524 /*
525 * Note: In span() when spanLength==0 (after a string match, or at the beginning
526 * after an empty code point span) and in spanNot() and spanNotUTF8(),
527 * string matching could use a binary search
528 * because all string matches are done from the same start index.
529 *
530 * For UTF-8, this would require a comparison function that returns UTF-16 order.
531 *
532 * This optimization should not be necessary for normal UnicodeSets because
533 * most sets have no strings, and most sets with strings have
534 * very few very short strings.
535 * For cases with many strings, it might be better to use a different API
536 * and implementation with a DFA (state machine).
537 */
538
539 /*
540 * Algorithm for span(USET_SPAN_CONTAINED)
541 *
542 * Theoretical algorithm:
543 * - Iterate through the string, and at each code point boundary:
544 * + If the code point there is in the set, then remember to continue after it.
545 * + If a set string matches at the current position, then remember to continue after it.
546 * + Either recursively span for each code point or string match,
547 * or recursively span for all but the shortest one and
548 * iteratively continue the span with the shortest local match.
549 * + Remember the longest recursive span (the farthest end point).
550 * + If there is no match at the current position, neither for the code point there
551 * nor for any set string, then stop and return the longest recursive span length.
552 *
553 * Optimized implementation:
554 *
555 * (We assume that most sets will have very few very short strings.
556 * A span using a string-less set is extremely fast.)
557 *
558 * Create and cache a spanSet which contains all of the single code points
559 * of the original set but none of its strings.
560 *
561 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
562 * - Loop:
563 * + Try to match each set string at the end of the spanLength.
564 * ~ Set strings that start with set-contained code points must be matched
565 * with a partial overlap because the recursive algorithm would have tried
566 * to match them at every position.
567 * ~ Set strings that entirely consist of set-contained code points
568 * are irrelevant for span(USET_SPAN_CONTAINED) because the
569 * recursive algorithm would continue after them anyway
570 * and find the longest recursive match from their end.
571 * ~ Rather than recursing, note each end point of a set string match.
572 * + If no set string matched after spanSet.span(), then return
573 * with where the spanSet.span() ended.
574 * + If at least one set string matched after spanSet.span(), then
575 * pop the shortest string match end point and continue
576 * the loop, trying to match all set strings from there.
577 * + If at least one more set string matched after a previous string match,
578 * then test if the code point after the previous string match is also
579 * contained in the set.
580 * Continue the loop with the shortest end point of either this code point
581 * or a matching set string.
582 * + If no more set string matched after a previous string match,
583 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
584 * Stop if spanLength==0, otherwise continue the loop.
585 *
586 * By noting each end point of a set string match,
587 * the function visits each string position at most once and finishes
588 * in linear time.
589 *
590 * The recursive algorithm may visit the same string position many times
591 * if multiple paths lead to it and finishes in exponential time.
592 */
593
594 /*
595 * Algorithm for span(USET_SPAN_SIMPLE)
596 *
597 * Theoretical algorithm:
598 * - Iterate through the string, and at each code point boundary:
599 * + If the code point there is in the set, then remember to continue after it.
600 * + If a set string matches at the current position, then remember to continue after it.
601 * + Continue from the farthest match position and ignore all others.
602 * + If there is no match at the current position,
603 * then stop and return the current position.
604 *
605 * Optimized implementation:
606 *
607 * (Same assumption and spanSet as above.)
608 *
609 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
610 * - Loop:
611 * + Try to match each set string at the end of the spanLength.
612 * ~ Set strings that start with set-contained code points must be matched
613 * with a partial overlap because the standard algorithm would have tried
614 * to match them earlier.
615 * ~ Set strings that entirely consist of set-contained code points
616 * must be matched with a full overlap because the longest-match algorithm
617 * would hide set string matches that end earlier.
618 * Such set strings need not be matched earlier inside the code point span
619 * because the standard algorithm would then have continued after
620 * the set string match anyway.
621 * ~ Remember the longest set string match (farthest end point) from the earliest
622 * starting point.
623 * + If no set string matched after spanSet.span(), then return
624 * with where the spanSet.span() ended.
625 * + If at least one set string matched, then continue the loop after the
626 * longest match from the earliest position.
627 * + If no more set string matched after a previous string match,
628 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
629 * Stop if spanLength==0, otherwise continue the loop.
630 */
631
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const632 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
633 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
634 return spanNot(s, length);
635 }
636 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
637 if(spanLength==length) {
638 return length;
639 }
640
641 // Consider strings; they may overlap with the span.
642 OffsetList offsets;
643 if(spanCondition==USET_SPAN_CONTAINED) {
644 // Use offset list to try all possibilities.
645 offsets.setMaxLength(maxLength16);
646 }
647 int32_t pos=spanLength, rest=length-pos;
648 int32_t i, stringsLength=strings.size();
649 for(;;) {
650 if(spanCondition==USET_SPAN_CONTAINED) {
651 for(i=0; i<stringsLength; ++i) {
652 int32_t overlap=spanLengths[i];
653 if(overlap==ALL_CP_CONTAINED) {
654 continue; // Irrelevant string.
655 }
656 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
657 const UChar *s16=string.getBuffer();
658 int32_t length16=string.length();
659
660 // Try to match this string at pos-overlap..pos.
661 if(overlap>=LONG_SPAN) {
662 overlap=length16;
663 // While contained: No point matching fully inside the code point span.
664 U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
665 }
666 if(overlap>spanLength) {
667 overlap=spanLength;
668 }
669 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
670 for(;;) {
671 if(inc>rest) {
672 break;
673 }
674 // Try to match if the increment is not listed already.
675 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
676 if(inc==rest) {
677 return length; // Reached the end of the string.
678 }
679 offsets.addOffset(inc);
680 }
681 if(overlap==0) {
682 break;
683 }
684 --overlap;
685 ++inc;
686 }
687 }
688 } else /* USET_SPAN_SIMPLE */ {
689 int32_t maxInc=0, maxOverlap=0;
690 for(i=0; i<stringsLength; ++i) {
691 int32_t overlap=spanLengths[i];
692 // For longest match, we do need to try to match even an all-contained string
693 // to find the match from the earliest start.
694
695 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
696 const UChar *s16=string.getBuffer();
697 int32_t length16=string.length();
698
699 // Try to match this string at pos-overlap..pos.
700 if(overlap>=LONG_SPAN) {
701 overlap=length16;
702 // Longest match: Need to match fully inside the code point span
703 // to find the match from the earliest start.
704 }
705 if(overlap>spanLength) {
706 overlap=spanLength;
707 }
708 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
709 for(;;) {
710 if(inc>rest || overlap<maxOverlap) {
711 break;
712 }
713 // Try to match if the string is longer or starts earlier.
714 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
715 matches16CPB(s, pos-overlap, length, s16, length16)
716 ) {
717 maxInc=inc; // Longest match from earliest start.
718 maxOverlap=overlap;
719 break;
720 }
721 --overlap;
722 ++inc;
723 }
724 }
725
726 if(maxInc!=0 || maxOverlap!=0) {
727 // Longest-match algorithm, and there was a string match.
728 // Simply continue after it.
729 pos+=maxInc;
730 rest-=maxInc;
731 if(rest==0) {
732 return length; // Reached the end of the string.
733 }
734 spanLength=0; // Match strings from after a string match.
735 continue;
736 }
737 }
738 // Finished trying to match all strings at pos.
739
740 if(spanLength!=0 || pos==0) {
741 // The position is after an unlimited code point span (spanLength!=0),
742 // not after a string match.
743 // The only position where spanLength==0 after a span is pos==0.
744 // Otherwise, an unlimited code point span is only tried again when no
745 // strings match, and if such a non-initial span fails we stop.
746 if(offsets.isEmpty()) {
747 return pos; // No strings matched after a span.
748 }
749 // Match strings from after the next string match.
750 } else {
751 // The position is after a string match (or a single code point).
752 if(offsets.isEmpty()) {
753 // No more strings matched after a previous string match.
754 // Try another code point span from after the last string match.
755 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
756 if( spanLength==rest || // Reached the end of the string, or
757 spanLength==0 // neither strings nor span progressed.
758 ) {
759 return pos+spanLength;
760 }
761 pos+=spanLength;
762 rest-=spanLength;
763 continue; // spanLength>0: Match strings from after a span.
764 } else {
765 // Try to match only one code point from after a string match if some
766 // string matched beyond it, so that we try all possible positions
767 // and don't overshoot.
768 spanLength=spanOne(spanSet, s+pos, rest);
769 if(spanLength>0) {
770 if(spanLength==rest) {
771 return length; // Reached the end of the string.
772 }
773 // Match strings after this code point.
774 // There cannot be any increments below it because UnicodeSet strings
775 // contain multiple code points.
776 pos+=spanLength;
777 rest-=spanLength;
778 offsets.shift(spanLength);
779 spanLength=0;
780 continue; // Match strings from after a single code point.
781 }
782 // Match strings from after the next string match.
783 }
784 }
785 int32_t minOffset=offsets.popMinimum();
786 pos+=minOffset;
787 rest-=minOffset;
788 spanLength=0; // Match strings from after a string match.
789 }
790 }
791
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const792 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
793 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
794 return spanNotBack(s, length);
795 }
796 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
797 if(pos==0) {
798 return 0;
799 }
800 int32_t spanLength=length-pos;
801
802 // Consider strings; they may overlap with the span.
803 OffsetList offsets;
804 if(spanCondition==USET_SPAN_CONTAINED) {
805 // Use offset list to try all possibilities.
806 offsets.setMaxLength(maxLength16);
807 }
808 int32_t i, stringsLength=strings.size();
809 uint8_t *spanBackLengths=spanLengths;
810 if(all) {
811 spanBackLengths+=stringsLength;
812 }
813 for(;;) {
814 if(spanCondition==USET_SPAN_CONTAINED) {
815 for(i=0; i<stringsLength; ++i) {
816 int32_t overlap=spanBackLengths[i];
817 if(overlap==ALL_CP_CONTAINED) {
818 continue; // Irrelevant string.
819 }
820 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
821 const UChar *s16=string.getBuffer();
822 int32_t length16=string.length();
823
824 // Try to match this string at pos-(length16-overlap)..pos-length16.
825 if(overlap>=LONG_SPAN) {
826 overlap=length16;
827 // While contained: No point matching fully inside the code point span.
828 int32_t len1=0;
829 U16_FWD_1(s16, len1, overlap);
830 overlap-=len1; // Length of the string minus the first code point.
831 }
832 if(overlap>spanLength) {
833 overlap=spanLength;
834 }
835 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
836 for(;;) {
837 if(dec>pos) {
838 break;
839 }
840 // Try to match if the decrement is not listed already.
841 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
842 if(dec==pos) {
843 return 0; // Reached the start of the string.
844 }
845 offsets.addOffset(dec);
846 }
847 if(overlap==0) {
848 break;
849 }
850 --overlap;
851 ++dec;
852 }
853 }
854 } else /* USET_SPAN_SIMPLE */ {
855 int32_t maxDec=0, maxOverlap=0;
856 for(i=0; i<stringsLength; ++i) {
857 int32_t overlap=spanBackLengths[i];
858 // For longest match, we do need to try to match even an all-contained string
859 // to find the match from the latest end.
860
861 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
862 const UChar *s16=string.getBuffer();
863 int32_t length16=string.length();
864
865 // Try to match this string at pos-(length16-overlap)..pos-length16.
866 if(overlap>=LONG_SPAN) {
867 overlap=length16;
868 // Longest match: Need to match fully inside the code point span
869 // to find the match from the latest end.
870 }
871 if(overlap>spanLength) {
872 overlap=spanLength;
873 }
874 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
875 for(;;) {
876 if(dec>pos || overlap<maxOverlap) {
877 break;
878 }
879 // Try to match if the string is longer or ends later.
880 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
881 matches16CPB(s, pos-dec, length, s16, length16)
882 ) {
883 maxDec=dec; // Longest match from latest end.
884 maxOverlap=overlap;
885 break;
886 }
887 --overlap;
888 ++dec;
889 }
890 }
891
892 if(maxDec!=0 || maxOverlap!=0) {
893 // Longest-match algorithm, and there was a string match.
894 // Simply continue before it.
895 pos-=maxDec;
896 if(pos==0) {
897 return 0; // Reached the start of the string.
898 }
899 spanLength=0; // Match strings from before a string match.
900 continue;
901 }
902 }
903 // Finished trying to match all strings at pos.
904
905 if(spanLength!=0 || pos==length) {
906 // The position is before an unlimited code point span (spanLength!=0),
907 // not before a string match.
908 // The only position where spanLength==0 before a span is pos==length.
909 // Otherwise, an unlimited code point span is only tried again when no
910 // strings match, and if such a non-initial span fails we stop.
911 if(offsets.isEmpty()) {
912 return pos; // No strings matched before a span.
913 }
914 // Match strings from before the next string match.
915 } else {
916 // The position is before a string match (or a single code point).
917 if(offsets.isEmpty()) {
918 // No more strings matched before a previous string match.
919 // Try another code point span from before the last string match.
920 int32_t oldPos=pos;
921 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
922 spanLength=oldPos-pos;
923 if( pos==0 || // Reached the start of the string, or
924 spanLength==0 // neither strings nor span progressed.
925 ) {
926 return pos;
927 }
928 continue; // spanLength>0: Match strings from before a span.
929 } else {
930 // Try to match only one code point from before a string match if some
931 // string matched beyond it, so that we try all possible positions
932 // and don't overshoot.
933 spanLength=spanOneBack(spanSet, s, pos);
934 if(spanLength>0) {
935 if(spanLength==pos) {
936 return 0; // Reached the start of the string.
937 }
938 // Match strings before this code point.
939 // There cannot be any decrements below it because UnicodeSet strings
940 // contain multiple code points.
941 pos-=spanLength;
942 offsets.shift(spanLength);
943 spanLength=0;
944 continue; // Match strings from before a single code point.
945 }
946 // Match strings from before the next string match.
947 }
948 }
949 pos-=offsets.popMinimum();
950 spanLength=0; // Match strings from before a string match.
951 }
952 }
953
spanUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const954 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
955 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
956 return spanNotUTF8(s, length);
957 }
958 int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
959 if(spanLength==length) {
960 return length;
961 }
962
963 // Consider strings; they may overlap with the span.
964 OffsetList offsets;
965 if(spanCondition==USET_SPAN_CONTAINED) {
966 // Use offset list to try all possibilities.
967 offsets.setMaxLength(maxLength8);
968 }
969 int32_t pos=spanLength, rest=length-pos;
970 int32_t i, stringsLength=strings.size();
971 uint8_t *spanUTF8Lengths=spanLengths;
972 if(all) {
973 spanUTF8Lengths+=2*stringsLength;
974 }
975 for(;;) {
976 const uint8_t *s8=utf8;
977 int32_t length8;
978 if(spanCondition==USET_SPAN_CONTAINED) {
979 for(i=0; i<stringsLength; ++i) {
980 length8=utf8Lengths[i];
981 if(length8==0) {
982 continue; // String not representable in UTF-8.
983 }
984 int32_t overlap=spanUTF8Lengths[i];
985 if(overlap==ALL_CP_CONTAINED) {
986 s8+=length8;
987 continue; // Irrelevant string.
988 }
989
990 // Try to match this string at pos-overlap..pos.
991 if(overlap>=LONG_SPAN) {
992 overlap=length8;
993 // While contained: No point matching fully inside the code point span.
994 U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
995 }
996 if(overlap>spanLength) {
997 overlap=spanLength;
998 }
999 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1000 for(;;) {
1001 if(inc>rest) {
1002 break;
1003 }
1004 // Try to match if the increment is not listed already.
1005 // Match at code point boundaries. (The UTF-8 strings were converted
1006 // from UTF-16 and are guaranteed to be well-formed.)
1007 if( !U8_IS_TRAIL(s[pos-overlap]) &&
1008 !offsets.containsOffset(inc) &&
1009 matches8(s+pos-overlap, s8, length8)
1010
1011 ) {
1012 if(inc==rest) {
1013 return length; // Reached the end of the string.
1014 }
1015 offsets.addOffset(inc);
1016 }
1017 if(overlap==0) {
1018 break;
1019 }
1020 --overlap;
1021 ++inc;
1022 }
1023 s8+=length8;
1024 }
1025 } else /* USET_SPAN_SIMPLE */ {
1026 int32_t maxInc=0, maxOverlap=0;
1027 for(i=0; i<stringsLength; ++i) {
1028 length8=utf8Lengths[i];
1029 if(length8==0) {
1030 continue; // String not representable in UTF-8.
1031 }
1032 int32_t overlap=spanUTF8Lengths[i];
1033 // For longest match, we do need to try to match even an all-contained string
1034 // to find the match from the earliest start.
1035
1036 // Try to match this string at pos-overlap..pos.
1037 if(overlap>=LONG_SPAN) {
1038 overlap=length8;
1039 // Longest match: Need to match fully inside the code point span
1040 // to find the match from the earliest start.
1041 }
1042 if(overlap>spanLength) {
1043 overlap=spanLength;
1044 }
1045 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1046 for(;;) {
1047 if(inc>rest || overlap<maxOverlap) {
1048 break;
1049 }
1050 // Try to match if the string is longer or starts earlier.
1051 // Match at code point boundaries. (The UTF-8 strings were converted
1052 // from UTF-16 and are guaranteed to be well-formed.)
1053 if( !U8_IS_TRAIL(s[pos-overlap]) &&
1054 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1055 matches8(s+pos-overlap, s8, length8)
1056
1057 ) {
1058 maxInc=inc; // Longest match from earliest start.
1059 maxOverlap=overlap;
1060 break;
1061 }
1062 --overlap;
1063 ++inc;
1064 }
1065 s8+=length8;
1066 }
1067
1068 if(maxInc!=0 || maxOverlap!=0) {
1069 // Longest-match algorithm, and there was a string match.
1070 // Simply continue after it.
1071 pos+=maxInc;
1072 rest-=maxInc;
1073 if(rest==0) {
1074 return length; // Reached the end of the string.
1075 }
1076 spanLength=0; // Match strings from after a string match.
1077 continue;
1078 }
1079 }
1080 // Finished trying to match all strings at pos.
1081
1082 if(spanLength!=0 || pos==0) {
1083 // The position is after an unlimited code point span (spanLength!=0),
1084 // not after a string match.
1085 // The only position where spanLength==0 after a span is pos==0.
1086 // Otherwise, an unlimited code point span is only tried again when no
1087 // strings match, and if such a non-initial span fails we stop.
1088 if(offsets.isEmpty()) {
1089 return pos; // No strings matched after a span.
1090 }
1091 // Match strings from after the next string match.
1092 } else {
1093 // The position is after a string match (or a single code point).
1094 if(offsets.isEmpty()) {
1095 // No more strings matched after a previous string match.
1096 // Try another code point span from after the last string match.
1097 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1098 if( spanLength==rest || // Reached the end of the string, or
1099 spanLength==0 // neither strings nor span progressed.
1100 ) {
1101 return pos+spanLength;
1102 }
1103 pos+=spanLength;
1104 rest-=spanLength;
1105 continue; // spanLength>0: Match strings from after a span.
1106 } else {
1107 // Try to match only one code point from after a string match if some
1108 // string matched beyond it, so that we try all possible positions
1109 // and don't overshoot.
1110 spanLength=spanOneUTF8(spanSet, s+pos, rest);
1111 if(spanLength>0) {
1112 if(spanLength==rest) {
1113 return length; // Reached the end of the string.
1114 }
1115 // Match strings after this code point.
1116 // There cannot be any increments below it because UnicodeSet strings
1117 // contain multiple code points.
1118 pos+=spanLength;
1119 rest-=spanLength;
1120 offsets.shift(spanLength);
1121 spanLength=0;
1122 continue; // Match strings from after a single code point.
1123 }
1124 // Match strings from after the next string match.
1125 }
1126 }
1127 int32_t minOffset=offsets.popMinimum();
1128 pos+=minOffset;
1129 rest-=minOffset;
1130 spanLength=0; // Match strings from after a string match.
1131 }
1132 }
1133
spanBackUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const1134 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1135 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1136 return spanNotBackUTF8(s, length);
1137 }
1138 int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1139 if(pos==0) {
1140 return 0;
1141 }
1142 int32_t spanLength=length-pos;
1143
1144 // Consider strings; they may overlap with the span.
1145 OffsetList offsets;
1146 if(spanCondition==USET_SPAN_CONTAINED) {
1147 // Use offset list to try all possibilities.
1148 offsets.setMaxLength(maxLength8);
1149 }
1150 int32_t i, stringsLength=strings.size();
1151 uint8_t *spanBackUTF8Lengths=spanLengths;
1152 if(all) {
1153 spanBackUTF8Lengths+=3*stringsLength;
1154 }
1155 for(;;) {
1156 const uint8_t *s8=utf8;
1157 int32_t length8;
1158 if(spanCondition==USET_SPAN_CONTAINED) {
1159 for(i=0; i<stringsLength; ++i) {
1160 length8=utf8Lengths[i];
1161 if(length8==0) {
1162 continue; // String not representable in UTF-8.
1163 }
1164 int32_t overlap=spanBackUTF8Lengths[i];
1165 if(overlap==ALL_CP_CONTAINED) {
1166 s8+=length8;
1167 continue; // Irrelevant string.
1168 }
1169
1170 // Try to match this string at pos-(length8-overlap)..pos-length8.
1171 if(overlap>=LONG_SPAN) {
1172 overlap=length8;
1173 // While contained: No point matching fully inside the code point span.
1174 int32_t len1=0;
1175 U8_FWD_1(s8, len1, overlap);
1176 overlap-=len1; // Length of the string minus the first code point.
1177 }
1178 if(overlap>spanLength) {
1179 overlap=spanLength;
1180 }
1181 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1182 for(;;) {
1183 if(dec>pos) {
1184 break;
1185 }
1186 // Try to match if the decrement is not listed already.
1187 // Match at code point boundaries. (The UTF-8 strings were converted
1188 // from UTF-16 and are guaranteed to be well-formed.)
1189 if( !U8_IS_TRAIL(s[pos-dec]) &&
1190 !offsets.containsOffset(dec) &&
1191 matches8(s+pos-dec, s8, length8)
1192 ) {
1193 if(dec==pos) {
1194 return 0; // Reached the start of the string.
1195 }
1196 offsets.addOffset(dec);
1197 }
1198 if(overlap==0) {
1199 break;
1200 }
1201 --overlap;
1202 ++dec;
1203 }
1204 s8+=length8;
1205 }
1206 } else /* USET_SPAN_SIMPLE */ {
1207 int32_t maxDec=0, maxOverlap=0;
1208 for(i=0; i<stringsLength; ++i) {
1209 length8=utf8Lengths[i];
1210 if(length8==0) {
1211 continue; // String not representable in UTF-8.
1212 }
1213 int32_t overlap=spanBackUTF8Lengths[i];
1214 // For longest match, we do need to try to match even an all-contained string
1215 // to find the match from the latest end.
1216
1217 // Try to match this string at pos-(length8-overlap)..pos-length8.
1218 if(overlap>=LONG_SPAN) {
1219 overlap=length8;
1220 // Longest match: Need to match fully inside the code point span
1221 // to find the match from the latest end.
1222 }
1223 if(overlap>spanLength) {
1224 overlap=spanLength;
1225 }
1226 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1227 for(;;) {
1228 if(dec>pos || overlap<maxOverlap) {
1229 break;
1230 }
1231 // Try to match if the string is longer or ends later.
1232 // Match at code point boundaries. (The UTF-8 strings were converted
1233 // from UTF-16 and are guaranteed to be well-formed.)
1234 if( !U8_IS_TRAIL(s[pos-dec]) &&
1235 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1236 matches8(s+pos-dec, s8, length8)
1237 ) {
1238 maxDec=dec; // Longest match from latest end.
1239 maxOverlap=overlap;
1240 break;
1241 }
1242 --overlap;
1243 ++dec;
1244 }
1245 s8+=length8;
1246 }
1247
1248 if(maxDec!=0 || maxOverlap!=0) {
1249 // Longest-match algorithm, and there was a string match.
1250 // Simply continue before it.
1251 pos-=maxDec;
1252 if(pos==0) {
1253 return 0; // Reached the start of the string.
1254 }
1255 spanLength=0; // Match strings from before a string match.
1256 continue;
1257 }
1258 }
1259 // Finished trying to match all strings at pos.
1260
1261 if(spanLength!=0 || pos==length) {
1262 // The position is before an unlimited code point span (spanLength!=0),
1263 // not before a string match.
1264 // The only position where spanLength==0 before a span is pos==length.
1265 // Otherwise, an unlimited code point span is only tried again when no
1266 // strings match, and if such a non-initial span fails we stop.
1267 if(offsets.isEmpty()) {
1268 return pos; // No strings matched before a span.
1269 }
1270 // Match strings from before the next string match.
1271 } else {
1272 // The position is before a string match (or a single code point).
1273 if(offsets.isEmpty()) {
1274 // No more strings matched before a previous string match.
1275 // Try another code point span from before the last string match.
1276 int32_t oldPos=pos;
1277 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1278 spanLength=oldPos-pos;
1279 if( pos==0 || // Reached the start of the string, or
1280 spanLength==0 // neither strings nor span progressed.
1281 ) {
1282 return pos;
1283 }
1284 continue; // spanLength>0: Match strings from before a span.
1285 } else {
1286 // Try to match only one code point from before a string match if some
1287 // string matched beyond it, so that we try all possible positions
1288 // and don't overshoot.
1289 spanLength=spanOneBackUTF8(spanSet, s, pos);
1290 if(spanLength>0) {
1291 if(spanLength==pos) {
1292 return 0; // Reached the start of the string.
1293 }
1294 // Match strings before this code point.
1295 // There cannot be any decrements below it because UnicodeSet strings
1296 // contain multiple code points.
1297 pos-=spanLength;
1298 offsets.shift(spanLength);
1299 spanLength=0;
1300 continue; // Match strings from before a single code point.
1301 }
1302 // Match strings from before the next string match.
1303 }
1304 }
1305 pos-=offsets.popMinimum();
1306 spanLength=0; // Match strings from before a string match.
1307 }
1308 }
1309
1310 /*
1311 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1312 *
1313 * Theoretical algorithm:
1314 * - Iterate through the string, and at each code point boundary:
1315 * + If the code point there is in the set, then return with the current position.
1316 * + If a set string matches at the current position, then return with the current position.
1317 *
1318 * Optimized implementation:
1319 *
1320 * (Same assumption as for span() above.)
1321 *
1322 * Create and cache a spanNotSet which contains all of the single code points
1323 * of the original set but none of its strings.
1324 * For each set string add its initial code point to the spanNotSet.
1325 * (Also add its final code point for spanNotBack().)
1326 *
1327 * - Loop:
1328 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1329 * + If the current code point is in the original set, then
1330 * return the current position.
1331 * + If any set string matches at the current position, then
1332 * return the current position.
1333 * + If there is no match at the current position, neither for the code point there
1334 * nor for any set string, then skip this code point and continue the loop.
1335 * This happens for set-string-initial code points that were added to spanNotSet
1336 * when there is not actually a match for such a set string.
1337 */
1338
spanNot(const UChar * s,int32_t length) const1339 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1340 int32_t pos=0, rest=length;
1341 int32_t i, stringsLength=strings.size();
1342 do {
1343 // Span until we find a code point from the set,
1344 // or a code point that starts or ends some string.
1345 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1346 if(i==rest) {
1347 return length; // Reached the end of the string.
1348 }
1349 pos+=i;
1350 rest-=i;
1351
1352 // Check whether the current code point is in the original set,
1353 // without the string starts and ends.
1354 int32_t cpLength=spanOne(spanSet, s+pos, rest);
1355 if(cpLength>0) {
1356 return pos; // There is a set element at pos.
1357 }
1358
1359 // Try to match the strings at pos.
1360 for(i=0; i<stringsLength; ++i) {
1361 if(spanLengths[i]==ALL_CP_CONTAINED) {
1362 continue; // Irrelevant string.
1363 }
1364 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1365 const UChar *s16=string.getBuffer();
1366 int32_t length16=string.length();
1367 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1368 return pos; // There is a set element at pos.
1369 }
1370 }
1371
1372 // The span(while not contained) ended on a string start/end which is
1373 // not in the original set. Skip this code point and continue.
1374 // cpLength<0
1375 pos-=cpLength;
1376 rest+=cpLength;
1377 } while(rest!=0);
1378 return length; // Reached the end of the string.
1379 }
1380
spanNotBack(const UChar * s,int32_t length) const1381 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1382 int32_t pos=length;
1383 int32_t i, stringsLength=strings.size();
1384 do {
1385 // Span until we find a code point from the set,
1386 // or a code point that starts or ends some string.
1387 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1388 if(pos==0) {
1389 return 0; // Reached the start of the string.
1390 }
1391
1392 // Check whether the current code point is in the original set,
1393 // without the string starts and ends.
1394 int32_t cpLength=spanOneBack(spanSet, s, pos);
1395 if(cpLength>0) {
1396 return pos; // There is a set element at pos.
1397 }
1398
1399 // Try to match the strings at pos.
1400 for(i=0; i<stringsLength; ++i) {
1401 // Use spanLengths rather than a spanBackLengths pointer because
1402 // it is easier and we only need to know whether the string is irrelevant
1403 // which is the same in either array.
1404 if(spanLengths[i]==ALL_CP_CONTAINED) {
1405 continue; // Irrelevant string.
1406 }
1407 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1408 const UChar *s16=string.getBuffer();
1409 int32_t length16=string.length();
1410 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1411 return pos; // There is a set element at pos.
1412 }
1413 }
1414
1415 // The span(while not contained) ended on a string start/end which is
1416 // not in the original set. Skip this code point and continue.
1417 // cpLength<0
1418 pos+=cpLength;
1419 } while(pos!=0);
1420 return 0; // Reached the start of the string.
1421 }
1422
spanNotUTF8(const uint8_t * s,int32_t length) const1423 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1424 int32_t pos=0, rest=length;
1425 int32_t i, stringsLength=strings.size();
1426 uint8_t *spanUTF8Lengths=spanLengths;
1427 if(all) {
1428 spanUTF8Lengths+=2*stringsLength;
1429 }
1430 do {
1431 // Span until we find a code point from the set,
1432 // or a code point that starts or ends some string.
1433 i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1434 if(i==rest) {
1435 return length; // Reached the end of the string.
1436 }
1437 pos+=i;
1438 rest-=i;
1439
1440 // Check whether the current code point is in the original set,
1441 // without the string starts and ends.
1442 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1443 if(cpLength>0) {
1444 return pos; // There is a set element at pos.
1445 }
1446
1447 // Try to match the strings at pos.
1448 const uint8_t *s8=utf8;
1449 int32_t length8;
1450 for(i=0; i<stringsLength; ++i) {
1451 length8=utf8Lengths[i];
1452 // ALL_CP_CONTAINED: Irrelevant string.
1453 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1454 return pos; // There is a set element at pos.
1455 }
1456 s8+=length8;
1457 }
1458
1459 // The span(while not contained) ended on a string start/end which is
1460 // not in the original set. Skip this code point and continue.
1461 // cpLength<0
1462 pos-=cpLength;
1463 rest+=cpLength;
1464 } while(rest!=0);
1465 return length; // Reached the end of the string.
1466 }
1467
spanNotBackUTF8(const uint8_t * s,int32_t length) const1468 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1469 int32_t pos=length;
1470 int32_t i, stringsLength=strings.size();
1471 uint8_t *spanBackUTF8Lengths=spanLengths;
1472 if(all) {
1473 spanBackUTF8Lengths+=3*stringsLength;
1474 }
1475 do {
1476 // Span until we find a code point from the set,
1477 // or a code point that starts or ends some string.
1478 pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1479 if(pos==0) {
1480 return 0; // Reached the start of the string.
1481 }
1482
1483 // Check whether the current code point is in the original set,
1484 // without the string starts and ends.
1485 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1486 if(cpLength>0) {
1487 return pos; // There is a set element at pos.
1488 }
1489
1490 // Try to match the strings at pos.
1491 const uint8_t *s8=utf8;
1492 int32_t length8;
1493 for(i=0; i<stringsLength; ++i) {
1494 length8=utf8Lengths[i];
1495 // ALL_CP_CONTAINED: Irrelevant string.
1496 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1497 return pos; // There is a set element at pos.
1498 }
1499 s8+=length8;
1500 }
1501
1502 // The span(while not contained) ended on a string start/end which is
1503 // not in the original set. Skip this code point and continue.
1504 // cpLength<0
1505 pos+=cpLength;
1506 } while(pos!=0);
1507 return 0; // Reached the start of the string.
1508 }
1509
1510 U_NAMESPACE_END
1511