1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1999-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   10/20/99    alan        Creation.
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 #include "unicode/parsepos.h"
15 #include "unicode/symtable.h"
16 #include "unicode/uniset.h"
17 #include "unicode/utf8.h"
18 #include "unicode/utf16.h"
19 #include "ruleiter.h"
20 #include "cmemory.h"
21 #include "cstring.h"
22 #include "patternprops.h"
23 #include "uelement.h"
24 #include "util.h"
25 #include "uvector.h"
26 #include "charstr.h"
27 #include "ustrfmt.h"
28 #include "uassert.h"
29 #include "bmpset.h"
30 #include "unisetspan.h"
31 
32 // Define UChar constants using hex for EBCDIC compatibility
33 // Used #define to reduce private static exports and memory access time.
34 #define SET_OPEN        ((UChar)0x005B) /*[*/
35 #define SET_CLOSE       ((UChar)0x005D) /*]*/
36 #define HYPHEN          ((UChar)0x002D) /*-*/
37 #define COMPLEMENT      ((UChar)0x005E) /*^*/
38 #define COLON           ((UChar)0x003A) /*:*/
39 #define BACKSLASH       ((UChar)0x005C) /*\*/
40 #define INTERSECTION    ((UChar)0x0026) /*&*/
41 #define UPPER_U         ((UChar)0x0055) /*U*/
42 #define LOWER_U         ((UChar)0x0075) /*u*/
43 #define OPEN_BRACE      ((UChar)123)    /*{*/
44 #define CLOSE_BRACE     ((UChar)125)    /*}*/
45 #define UPPER_P         ((UChar)0x0050) /*P*/
46 #define LOWER_P         ((UChar)0x0070) /*p*/
47 #define UPPER_N         ((UChar)78)     /*N*/
48 #define EQUALS          ((UChar)0x003D) /*=*/
49 
50 // HIGH_VALUE > all valid values. 110000 for codepoints
51 #define UNICODESET_HIGH 0x0110000
52 
53 // LOW <= all valid values. ZERO for codepoints
54 #define UNICODESET_LOW 0x000000
55 
56 // initial storage. Must be >= 0
57 #define START_EXTRA 16
58 
59 // extra amount for growth. Must be >= 0
60 #define GROW_EXTRA START_EXTRA
61 
62 U_NAMESPACE_BEGIN
63 
~SymbolTable()64 SymbolTable::~SymbolTable() {}
65 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)66 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
67 
68 /**
69  * Modify the given UChar32 variable so that it is in range, by
70  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
71  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
72  * It modifies its argument in-place and also returns it.
73  */
74 static inline UChar32 pinCodePoint(UChar32& c) {
75     if (c < UNICODESET_LOW) {
76         c = UNICODESET_LOW;
77     } else if (c > (UNICODESET_HIGH-1)) {
78         c = (UNICODESET_HIGH-1);
79     }
80     return c;
81 }
82 
83 //----------------------------------------------------------------
84 // Debugging
85 //----------------------------------------------------------------
86 
87 // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
88 // To enable the debugging, define the symbol DEBUG_MEM in the line
89 // below.  This will result in text being sent to stdout that looks
90 // like this:
91 //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
92 //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
93 // Each line lists a construction (ct) or destruction (dt) event, the
94 // object address, the number of outstanding objects after the event,
95 // and the pattern of the object in question.
96 
97 // #define DEBUG_MEM
98 
99 #ifdef DEBUG_MEM
100 #include <stdio.h>
101 static int32_t _dbgCount = 0;
102 
_dbgct(UnicodeSet * set)103 static inline void _dbgct(UnicodeSet* set) {
104     UnicodeString str;
105     set->toPattern(str, TRUE);
106     char buf[40];
107     str.extract(0, 39, buf, "");
108     printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
109 }
110 
_dbgdt(UnicodeSet * set)111 static inline void _dbgdt(UnicodeSet* set) {
112     UnicodeString str;
113     set->toPattern(str, TRUE);
114     char buf[40];
115     str.extract(0, 39, buf, "");
116     printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
117 }
118 
119 #else
120 
121 #define _dbgct(set)
122 #define _dbgdt(set)
123 
124 #endif
125 
126 //----------------------------------------------------------------
127 // UnicodeString in UVector support
128 //----------------------------------------------------------------
129 
cloneUnicodeString(UElement * dst,UElement * src)130 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
131     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
132 }
133 
compareUnicodeString(UElement t1,UElement t2)134 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
135     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
136     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
137     return a.compare(b);
138 }
139 
140 //----------------------------------------------------------------
141 // Constructors &c
142 //----------------------------------------------------------------
143 
144 /**
145  * Constructs an empty set.
146  */
UnicodeSet()147 UnicodeSet::UnicodeSet() :
148     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
149     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
150     fFlags(0)
151 {
152     UErrorCode status = U_ZERO_ERROR;
153     allocateStrings(status);
154     if (U_FAILURE(status)) {
155         return;
156     }
157     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
158     if(list!=NULL){
159         list[0] = UNICODESET_HIGH;
160     } else { // If memory allocation failed, set to bogus state.
161         setToBogus();
162         return;
163     }
164     _dbgct(this);
165 }
166 
167 /**
168  * Constructs a set containing the given range. If <code>end >
169  * start</code> then an empty set is created.
170  *
171  * @param start first character, inclusive, of range
172  * @param end last character, inclusive, of range
173  */
UnicodeSet(UChar32 start,UChar32 end)174 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
175     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
176     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
177     fFlags(0)
178 {
179     UErrorCode status = U_ZERO_ERROR;
180     allocateStrings(status);
181     if (U_FAILURE(status)) {
182         return;
183     }
184     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
185     if(list!=NULL){
186         list[0] = UNICODESET_HIGH;
187         complement(start, end);
188     } else { // If memory allocation failed, set to bogus state.
189         setToBogus();
190         return;
191     }
192     _dbgct(this);
193 }
194 
195 /**
196  * Constructs a set that is identical to the given UnicodeSet.
197  */
UnicodeSet(const UnicodeSet & o)198 UnicodeSet::UnicodeSet(const UnicodeSet& o) :
199     UnicodeFilter(o),
200     len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
201     bmpSet(0),
202     buffer(0), bufferCapacity(0),
203     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
204     fFlags(0)
205 {
206     UErrorCode status = U_ZERO_ERROR;
207     allocateStrings(status);
208     if (U_FAILURE(status)) {
209         return;
210     }
211     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
212     if(list!=NULL){
213         *this = o;
214     } else { // If memory allocation failed, set to bogus state.
215         setToBogus();
216         return;
217     }
218     _dbgct(this);
219 }
220 
221 // Copy-construct as thawed.
UnicodeSet(const UnicodeSet & o,UBool)222 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
223     UnicodeFilter(o),
224     len(0), capacity(o.len + GROW_EXTRA), list(0),
225     bmpSet(0),
226     buffer(0), bufferCapacity(0),
227     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
228     fFlags(0)
229 {
230     UErrorCode status = U_ZERO_ERROR;
231     allocateStrings(status);
232     if (U_FAILURE(status)) {
233         return;
234     }
235     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
236     if(list!=NULL){
237         // *this = o except for bmpSet and stringSpan
238         len = o.len;
239         uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
240         if (strings != NULL && o.strings != NULL) {
241             strings->assign(*o.strings, cloneUnicodeString, status);
242         } else { // Invalid strings.
243             setToBogus();
244             return;
245         }
246         if (o.pat) {
247             setPattern(UnicodeString(o.pat, o.patLen));
248         }
249     } else { // If memory allocation failed, set to bogus state.
250         setToBogus();
251         return;
252     }
253     _dbgct(this);
254 }
255 
256 /**
257  * Destructs the set.
258  */
~UnicodeSet()259 UnicodeSet::~UnicodeSet() {
260     _dbgdt(this); // first!
261     uprv_free(list);
262     delete bmpSet;
263     if (buffer) {
264         uprv_free(buffer);
265     }
266     delete strings;
267     delete stringSpan;
268     releasePattern();
269 }
270 
271 /**
272  * Assigns this object to be a copy of another.
273  */
operator =(const UnicodeSet & o)274 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
275     if (this == &o) {
276         return *this;
277     }
278     if (isFrozen()) {
279         return *this;
280     }
281     if (o.isBogus()) {
282         setToBogus();
283         return *this;
284     }
285     UErrorCode ec = U_ZERO_ERROR;
286     ensureCapacity(o.len, ec);
287     if (U_FAILURE(ec)) {
288         return *this; // There is no way to report this error :-(
289     }
290     len = o.len;
291     uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
292     if (o.bmpSet == NULL) {
293         bmpSet = NULL;
294     } else {
295         bmpSet = new BMPSet(*o.bmpSet, list, len);
296         if (bmpSet == NULL) { // Check for memory allocation error.
297             setToBogus();
298             return *this;
299         }
300     }
301     if (strings != NULL && o.strings != NULL) {
302         strings->assign(*o.strings, cloneUnicodeString, ec);
303     } else { // Invalid strings.
304         setToBogus();
305         return *this;
306     }
307     if (o.stringSpan == NULL) {
308         stringSpan = NULL;
309     } else {
310         stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
311         if (stringSpan == NULL) { // Check for memory allocation error.
312             setToBogus();
313             return *this;
314         }
315     }
316     releasePattern();
317     if (o.pat) {
318         setPattern(UnicodeString(o.pat, o.patLen));
319     }
320     return *this;
321 }
322 
323 /**
324  * Returns a copy of this object.  All UnicodeMatcher objects have
325  * to support cloning in order to allow classes using
326  * UnicodeMatchers, such as Transliterator, to implement cloning.
327  */
clone() const328 UnicodeFunctor* UnicodeSet::clone() const {
329     return new UnicodeSet(*this);
330 }
331 
cloneAsThawed() const332 UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
333     return new UnicodeSet(*this, TRUE);
334 }
335 
336 /**
337  * Compares the specified object with this set for equality.  Returns
338  * <tt>true</tt> if the two sets
339  * have the same size, and every member of the specified set is
340  * contained in this set (or equivalently, every member of this set is
341  * contained in the specified set).
342  *
343  * @param o set to be compared for equality with this set.
344  * @return <tt>true</tt> if the specified set is equal to this set.
345  */
operator ==(const UnicodeSet & o) const346 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
347     if (len != o.len) return FALSE;
348     for (int32_t i = 0; i < len; ++i) {
349         if (list[i] != o.list[i]) return FALSE;
350     }
351     if (*strings != *o.strings) return FALSE;
352     return TRUE;
353 }
354 
355 /**
356  * Returns the hash code value for this set.
357  *
358  * @return the hash code value for this set.
359  * @see Object#hashCode()
360  */
hashCode(void) const361 int32_t UnicodeSet::hashCode(void) const {
362     int32_t result = len;
363     for (int32_t i = 0; i < len; ++i) {
364         result *= 1000003;
365         result += list[i];
366     }
367     return result;
368 }
369 
370 //----------------------------------------------------------------
371 // Public API
372 //----------------------------------------------------------------
373 
374 /**
375  * Returns the number of elements in this set (its cardinality),
376  * Note than the elements of a set may include both individual
377  * codepoints and strings.
378  *
379  * @return the number of elements in this set (its cardinality).
380  */
size(void) const381 int32_t UnicodeSet::size(void) const {
382     int32_t n = 0;
383     int32_t count = getRangeCount();
384     for (int32_t i = 0; i < count; ++i) {
385         n += getRangeEnd(i) - getRangeStart(i) + 1;
386     }
387     return n + strings->size();
388 }
389 
390 /**
391  * Returns <tt>true</tt> if this set contains no elements.
392  *
393  * @return <tt>true</tt> if this set contains no elements.
394  */
isEmpty(void) const395 UBool UnicodeSet::isEmpty(void) const {
396     return len == 1 && strings->size() == 0;
397 }
398 
399 /**
400  * Returns true if this set contains the given character.
401  * @param c character to be checked for containment
402  * @return true if the test condition is met
403  */
contains(UChar32 c) const404 UBool UnicodeSet::contains(UChar32 c) const {
405     // Set i to the index of the start item greater than ch
406     // We know we will terminate without length test!
407     // LATER: for large sets, add binary search
408     //int32_t i = -1;
409     //for (;;) {
410     //    if (c < list[++i]) break;
411     //}
412     if (bmpSet != NULL) {
413         return bmpSet->contains(c);
414     }
415     if (stringSpan != NULL) {
416         return stringSpan->contains(c);
417     }
418     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
419         return FALSE;
420     }
421     int32_t i = findCodePoint(c);
422     return (UBool)(i & 1); // return true if odd
423 }
424 
425 /**
426  * Returns the smallest value i such that c < list[i].  Caller
427  * must ensure that c is a legal value or this method will enter
428  * an infinite loop.  This method performs a binary search.
429  * @param c a character in the range MIN_VALUE..MAX_VALUE
430  * inclusive
431  * @return the smallest integer i in the range 0..len-1,
432  * inclusive, such that c < list[i]
433  */
findCodePoint(UChar32 c) const434 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
435     /* Examples:
436                                        findCodePoint(c)
437        set              list[]         c=0 1 3 4 7 8
438        ===              ==============   ===========
439        []               [110000]         0 0 0 0 0 0
440        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
441        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
442        [:Any:]          [0, 110000]      1 1 1 1 1 1
443      */
444 
445     // Return the smallest i such that c < list[i].  Assume
446     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
447     if (c < list[0])
448         return 0;
449     // High runner test.  c is often after the last range, so an
450     // initial check for this condition pays off.
451     int32_t lo = 0;
452     int32_t hi = len - 1;
453     if (lo >= hi || c >= list[hi-1])
454         return hi;
455     // invariant: c >= list[lo]
456     // invariant: c < list[hi]
457     for (;;) {
458         int32_t i = (lo + hi) >> 1;
459         if (i == lo) {
460             break; // Found!
461         } else if (c < list[i]) {
462             hi = i;
463         } else {
464             lo = i;
465         }
466     }
467     return hi;
468 }
469 
470 /**
471  * Returns true if this set contains every character
472  * of the given range.
473  * @param start first character, inclusive, of the range
474  * @param end last character, inclusive, of the range
475  * @return true if the test condition is met
476  */
contains(UChar32 start,UChar32 end) const477 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
478     //int32_t i = -1;
479     //for (;;) {
480     //    if (start < list[++i]) break;
481     //}
482     int32_t i = findCodePoint(start);
483     return ((i & 1) != 0 && end < list[i]);
484 }
485 
486 /**
487  * Returns <tt>true</tt> if this set contains the given
488  * multicharacter string.
489  * @param s string to be checked for containment
490  * @return <tt>true</tt> if this set contains the specified string
491  */
contains(const UnicodeString & s) const492 UBool UnicodeSet::contains(const UnicodeString& s) const {
493     if (s.length() == 0) return FALSE;
494     int32_t cp = getSingleCP(s);
495     if (cp < 0) {
496         return strings->contains((void*) &s);
497     } else {
498         return contains((UChar32) cp);
499     }
500 }
501 
502 /**
503  * Returns true if this set contains all the characters and strings
504  * of the given set.
505  * @param c set to be checked for containment
506  * @return true if the test condition is met
507  */
containsAll(const UnicodeSet & c) const508 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
509     // The specified set is a subset if all of its pairs are contained in
510     // this set.  It's possible to code this more efficiently in terms of
511     // direct manipulation of the inversion lists if the need arises.
512     int32_t n = c.getRangeCount();
513     for (int i=0; i<n; ++i) {
514         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
515             return FALSE;
516         }
517     }
518     if (!strings->containsAll(*c.strings)) return FALSE;
519     return TRUE;
520 }
521 
522 /**
523  * Returns true if this set contains all the characters
524  * of the given string.
525  * @param s string containing characters to be checked for containment
526  * @return true if the test condition is met
527  */
containsAll(const UnicodeString & s) const528 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
529     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
530                    s.length());
531 }
532 
533 /**
534  * Returns true if this set contains none of the characters
535  * of the given range.
536  * @param start first character, inclusive, of the range
537  * @param end last character, inclusive, of the range
538  * @return true if the test condition is met
539  */
containsNone(UChar32 start,UChar32 end) const540 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
541     //int32_t i = -1;
542     //for (;;) {
543     //    if (start < list[++i]) break;
544     //}
545     int32_t i = findCodePoint(start);
546     return ((i & 1) == 0 && end < list[i]);
547 }
548 
549 /**
550  * Returns true if this set contains none of the characters and strings
551  * of the given set.
552  * @param c set to be checked for containment
553  * @return true if the test condition is met
554  */
containsNone(const UnicodeSet & c) const555 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
556     // The specified set is a subset if all of its pairs are contained in
557     // this set.  It's possible to code this more efficiently in terms of
558     // direct manipulation of the inversion lists if the need arises.
559     int32_t n = c.getRangeCount();
560     for (int32_t i=0; i<n; ++i) {
561         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
562             return FALSE;
563         }
564     }
565     if (!strings->containsNone(*c.strings)) return FALSE;
566     return TRUE;
567 }
568 
569 /**
570  * Returns true if this set contains none of the characters
571  * of the given string.
572  * @param s string containing characters to be checked for containment
573  * @return true if the test condition is met
574  */
containsNone(const UnicodeString & s) const575 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
576     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
577                    s.length());
578 }
579 
580 /**
581  * Returns <tt>true</tt> if this set contains any character whose low byte
582  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
583  * indexing.
584  */
matchesIndexValue(uint8_t v) const585 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
586     /* The index value v, in the range [0,255], is contained in this set if
587      * it is contained in any pair of this set.  Pairs either have the high
588      * bytes equal, or unequal.  If the high bytes are equal, then we have
589      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
590      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
591      * Then v is contained if xx <= v || v <= yy.  (This is identical to the
592      * time zone month containment logic.)
593      */
594     int32_t i;
595     int32_t rangeCount=getRangeCount();
596     for (i=0; i<rangeCount; ++i) {
597         UChar32 low = getRangeStart(i);
598         UChar32 high = getRangeEnd(i);
599         if ((low & ~0xFF) == (high & ~0xFF)) {
600             if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
601                 return TRUE;
602             }
603         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
604             return TRUE;
605         }
606     }
607     if (strings->size() != 0) {
608         for (i=0; i<strings->size(); ++i) {
609             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
610             //if (s.length() == 0) {
611             //    // Empty strings match everything
612             //    return TRUE;
613             //}
614             // assert(s.length() != 0); // We enforce this elsewhere
615             UChar32 c = s.char32At(0);
616             if ((c & 0xFF) == v) {
617                 return TRUE;
618             }
619         }
620     }
621     return FALSE;
622 }
623 
624 /**
625  * Implementation of UnicodeMatcher::matches().  Always matches the
626  * longest possible multichar string.
627  */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)628 UMatchDegree UnicodeSet::matches(const Replaceable& text,
629                                  int32_t& offset,
630                                  int32_t limit,
631                                  UBool incremental) {
632     if (offset == limit) {
633         // Strings, if any, have length != 0, so we don't worry
634         // about them here.  If we ever allow zero-length strings
635         // we much check for them here.
636         if (contains(U_ETHER)) {
637             return incremental ? U_PARTIAL_MATCH : U_MATCH;
638         } else {
639             return U_MISMATCH;
640         }
641     } else {
642         if (strings->size() != 0) { // try strings first
643 
644             // might separate forward and backward loops later
645             // for now they are combined
646 
647             // TODO Improve efficiency of this, at least in the forward
648             // direction, if not in both.  In the forward direction we
649             // can assume the strings are sorted.
650 
651             int32_t i;
652             UBool forward = offset < limit;
653 
654             // firstChar is the leftmost char to match in the
655             // forward direction or the rightmost char to match in
656             // the reverse direction.
657             UChar firstChar = text.charAt(offset);
658 
659             // If there are multiple strings that can match we
660             // return the longest match.
661             int32_t highWaterLength = 0;
662 
663             for (i=0; i<strings->size(); ++i) {
664                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
665 
666                 //if (trial.length() == 0) {
667                 //    return U_MATCH; // null-string always matches
668                 //}
669                 // assert(trial.length() != 0); // We ensure this elsewhere
670 
671                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
672 
673                 // Strings are sorted, so we can optimize in the
674                 // forward direction.
675                 if (forward && c > firstChar) break;
676                 if (c != firstChar) continue;
677 
678                 int32_t matchLen = matchRest(text, offset, limit, trial);
679 
680                 if (incremental) {
681                     int32_t maxLen = forward ? limit-offset : offset-limit;
682                     if (matchLen == maxLen) {
683                         // We have successfully matched but only up to limit.
684                         return U_PARTIAL_MATCH;
685                     }
686                 }
687 
688                 if (matchLen == trial.length()) {
689                     // We have successfully matched the whole string.
690                     if (matchLen > highWaterLength) {
691                         highWaterLength = matchLen;
692                     }
693                     // In the forward direction we know strings
694                     // are sorted so we can bail early.
695                     if (forward && matchLen < highWaterLength) {
696                         break;
697                     }
698                     continue;
699                 }
700             }
701 
702             // We've checked all strings without a partial match.
703             // If we have full matches, return the longest one.
704             if (highWaterLength != 0) {
705                 offset += forward ? highWaterLength : -highWaterLength;
706                 return U_MATCH;
707             }
708         }
709         return UnicodeFilter::matches(text, offset, limit, incremental);
710     }
711 }
712 
713 /**
714  * Returns the longest match for s in text at the given position.
715  * If limit > start then match forward from start+1 to limit
716  * matching all characters except s.charAt(0).  If limit < start,
717  * go backward starting from start-1 matching all characters
718  * except s.charAt(s.length()-1).  This method assumes that the
719  * first character, text.charAt(start), matches s, so it does not
720  * check it.
721  * @param text the text to match
722  * @param start the first character to match.  In the forward
723  * direction, text.charAt(start) is matched against s.charAt(0).
724  * In the reverse direction, it is matched against
725  * s.charAt(s.length()-1).
726  * @param limit the limit offset for matching, either last+1 in
727  * the forward direction, or last-1 in the reverse direction,
728  * where last is the index of the last character to match.
729  * @return If part of s matches up to the limit, return |limit -
730  * start|.  If all of s matches before reaching the limit, return
731  * s.length().  If there is a mismatch between s and text, return
732  * 0
733  */
matchRest(const Replaceable & text,int32_t start,int32_t limit,const UnicodeString & s)734 int32_t UnicodeSet::matchRest(const Replaceable& text,
735                               int32_t start, int32_t limit,
736                               const UnicodeString& s) {
737     int32_t i;
738     int32_t maxLen;
739     int32_t slen = s.length();
740     if (start < limit) {
741         maxLen = limit - start;
742         if (maxLen > slen) maxLen = slen;
743         for (i = 1; i < maxLen; ++i) {
744             if (text.charAt(start + i) != s.charAt(i)) return 0;
745         }
746     } else {
747         maxLen = start - limit;
748         if (maxLen > slen) maxLen = slen;
749         --slen; // <=> slen = s.length() - 1;
750         for (i = 1; i < maxLen; ++i) {
751             if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
752         }
753     }
754     return maxLen;
755 }
756 
757 /**
758  * Implement of UnicodeMatcher
759  */
addMatchSetTo(UnicodeSet & toUnionTo) const760 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
761     toUnionTo.addAll(*this);
762 }
763 
764 /**
765  * Returns the index of the given character within this set, where
766  * the set is ordered by ascending code point.  If the character
767  * is not in this set, return -1.  The inverse of this method is
768  * <code>charAt()</code>.
769  * @return an index from 0..size()-1, or -1
770  */
indexOf(UChar32 c) const771 int32_t UnicodeSet::indexOf(UChar32 c) const {
772     if (c < MIN_VALUE || c > MAX_VALUE) {
773         return -1;
774     }
775     int32_t i = 0;
776     int32_t n = 0;
777     for (;;) {
778         UChar32 start = list[i++];
779         if (c < start) {
780             return -1;
781         }
782         UChar32 limit = list[i++];
783         if (c < limit) {
784             return n + c - start;
785         }
786         n += limit - start;
787     }
788 }
789 
790 /**
791  * Returns the character at the given index within this set, where
792  * the set is ordered by ascending code point.  If the index is
793  * out of range, return (UChar32)-1.  The inverse of this method is
794  * <code>indexOf()</code>.
795  * @param index an index from 0..size()-1
796  * @return the character at the given index, or (UChar32)-1.
797  */
charAt(int32_t index) const798 UChar32 UnicodeSet::charAt(int32_t index) const {
799     if (index >= 0) {
800         // len2 is the largest even integer <= len, that is, it is len
801         // for even values and len-1 for odd values.  With odd values
802         // the last entry is UNICODESET_HIGH.
803         int32_t len2 = len & ~1;
804         for (int32_t i=0; i < len2;) {
805             UChar32 start = list[i++];
806             int32_t count = list[i++] - start;
807             if (index < count) {
808                 return (UChar32)(start + index);
809             }
810             index -= count;
811         }
812     }
813     return (UChar32)-1;
814 }
815 
816 /**
817  * Make this object represent the range <code>start - end</code>.
818  * If <code>end > start</code> then this object is set to an
819  * an empty range.
820  *
821  * @param start first character in the set, inclusive
822  * @rparam end last character in the set, inclusive
823  */
set(UChar32 start,UChar32 end)824 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
825     clear();
826     complement(start, end);
827     return *this;
828 }
829 
830 /**
831  * Adds the specified range to this set if it is not already
832  * present.  If this set already contains the specified range,
833  * the call leaves this set unchanged.  If <code>end > start</code>
834  * then an empty range is added, leaving the set unchanged.
835  *
836  * @param start first character, inclusive, of range to be added
837  * to this set.
838  * @param end last character, inclusive, of range to be added
839  * to this set.
840  */
add(UChar32 start,UChar32 end)841 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
842     if (pinCodePoint(start) < pinCodePoint(end)) {
843         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
844         add(range, 2, 0);
845     } else if (start == end) {
846         add(start);
847     }
848     return *this;
849 }
850 
851 // #define DEBUG_US_ADD
852 
853 #ifdef DEBUG_US_ADD
854 #include <stdio.h>
dump(UChar32 c)855 void dump(UChar32 c) {
856     if (c <= 0xFF) {
857         printf("%c", (char)c);
858     } else {
859         printf("U+%04X", c);
860     }
861 }
dump(const UChar32 * list,int32_t len)862 void dump(const UChar32* list, int32_t len) {
863     printf("[");
864     for (int32_t i=0; i<len; ++i) {
865         if (i != 0) printf(", ");
866         dump(list[i]);
867     }
868     printf("]");
869 }
870 #endif
871 
872 /**
873  * Adds the specified character to this set if it is not already
874  * present.  If this set already contains the specified character,
875  * the call leaves this set unchanged.
876  */
add(UChar32 c)877 UnicodeSet& UnicodeSet::add(UChar32 c) {
878     // find smallest i such that c < list[i]
879     // if odd, then it is IN the set
880     // if even, then it is OUT of the set
881     int32_t i = findCodePoint(pinCodePoint(c));
882 
883     // already in set?
884     if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
885 
886     // HIGH is 0x110000
887     // assert(list[len-1] == HIGH);
888 
889     // empty = [HIGH]
890     // [start_0, limit_0, start_1, limit_1, HIGH]
891 
892     // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
893     //                             ^
894     //                             list[i]
895 
896     // i == 0 means c is before the first range
897 
898 #ifdef DEBUG_US_ADD
899     printf("Add of ");
900     dump(c);
901     printf(" found at %d", i);
902     printf(": ");
903     dump(list, len);
904     printf(" => ");
905 #endif
906 
907     if (c == list[i]-1) {
908         // c is before start of next range
909         list[i] = c;
910         // if we touched the HIGH mark, then add a new one
911         if (c == (UNICODESET_HIGH - 1)) {
912             UErrorCode status = U_ZERO_ERROR;
913             ensureCapacity(len+1, status);
914             if (U_FAILURE(status)) {
915                 return *this; // There is no way to report this error :-(
916             }
917             list[len++] = UNICODESET_HIGH;
918         }
919         if (i > 0 && c == list[i-1]) {
920             // collapse adjacent ranges
921 
922             // [..., start_k-1, c, c, limit_k, ..., HIGH]
923             //                     ^
924             //                     list[i]
925 
926             //for (int32_t k=i-1; k<len-2; ++k) {
927             //    list[k] = list[k+2];
928             //}
929             UChar32* dst = list + i - 1;
930             UChar32* src = dst + 2;
931             UChar32* srclimit = list + len;
932             while (src < srclimit) *(dst++) = *(src++);
933 
934             len -= 2;
935         }
936     }
937 
938     else if (i > 0 && c == list[i-1]) {
939         // c is after end of prior range
940         list[i-1]++;
941         // no need to check for collapse here
942     }
943 
944     else {
945         // At this point we know the new char is not adjacent to
946         // any existing ranges, and it is not 10FFFF.
947 
948 
949         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
950         //                             ^
951         //                             list[i]
952 
953         // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
954         //                             ^
955         //                             list[i]
956 
957         UErrorCode status = U_ZERO_ERROR;
958         ensureCapacity(len+2, status);
959         if (U_FAILURE(status)) {
960             return *this; // There is no way to report this error :-(
961         }
962 
963         //for (int32_t k=len-1; k>=i; --k) {
964         //    list[k+2] = list[k];
965         //}
966         UChar32* src = list + len;
967         UChar32* dst = src + 2;
968         UChar32* srclimit = list + i;
969         while (src > srclimit) *(--dst) = *(--src);
970 
971         list[i] = c;
972         list[i+1] = c+1;
973         len += 2;
974     }
975 
976 #ifdef DEBUG_US_ADD
977     dump(list, len);
978     printf("\n");
979 
980     for (i=1; i<len; ++i) {
981         if (list[i] <= list[i-1]) {
982             // Corrupt array!
983             printf("ERROR: list has been corrupted\n");
984             exit(1);
985         }
986     }
987 #endif
988 
989     releasePattern();
990     return *this;
991 }
992 
993 /**
994  * Adds the specified multicharacter to this set if it is not already
995  * present.  If this set already contains the multicharacter,
996  * the call leaves this set unchanged.
997  * Thus "ch" => {"ch"}
998  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
999  * @param s the source string
1000  * @return the modified set, for chaining
1001  */
add(const UnicodeString & s)1002 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
1003     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1004     int32_t cp = getSingleCP(s);
1005     if (cp < 0) {
1006         if (!strings->contains((void*) &s)) {
1007             _add(s);
1008             releasePattern();
1009         }
1010     } else {
1011         add((UChar32)cp);
1012     }
1013     return *this;
1014 }
1015 
1016 /**
1017  * Adds the given string, in order, to 'strings'.  The given string
1018  * must have been checked by the caller to not be empty and to not
1019  * already be in 'strings'.
1020  */
_add(const UnicodeString & s)1021 void UnicodeSet::_add(const UnicodeString& s) {
1022     if (isFrozen() || isBogus()) {
1023         return;
1024     }
1025     UnicodeString* t = new UnicodeString(s);
1026     if (t == NULL) { // Check for memory allocation error.
1027         setToBogus();
1028         return;
1029     }
1030     UErrorCode ec = U_ZERO_ERROR;
1031     strings->sortedInsert(t, compareUnicodeString, ec);
1032     if (U_FAILURE(ec)) {
1033         setToBogus();
1034         delete t;
1035     }
1036 }
1037 
1038 /**
1039  * @return a code point IF the string consists of a single one.
1040  * otherwise returns -1.
1041  * @param string to test
1042  */
getSingleCP(const UnicodeString & s)1043 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1044     //if (s.length() < 1) {
1045     //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1046     //}
1047     if (s.length() > 2) return -1;
1048     if (s.length() == 1) return s.charAt(0);
1049 
1050     // at this point, len = 2
1051     UChar32 cp = s.char32At(0);
1052     if (cp > 0xFFFF) { // is surrogate pair
1053         return cp;
1054     }
1055     return -1;
1056 }
1057 
1058 /**
1059  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1060  * If this set already any particular character, it has no effect on that character.
1061  * @param the source string
1062  * @return the modified set, for chaining
1063  */
addAll(const UnicodeString & s)1064 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1065     UChar32 cp;
1066     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1067         cp = s.char32At(i);
1068         add(cp);
1069     }
1070     return *this;
1071 }
1072 
1073 /**
1074  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1075  * If this set already any particular character, it has no effect on that character.
1076  * @param the source string
1077  * @return the modified set, for chaining
1078  */
retainAll(const UnicodeString & s)1079 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1080     UnicodeSet set;
1081     set.addAll(s);
1082     retainAll(set);
1083     return *this;
1084 }
1085 
1086 /**
1087  * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1088  * If this set already any particular character, it has no effect on that character.
1089  * @param the source string
1090  * @return the modified set, for chaining
1091  */
complementAll(const UnicodeString & s)1092 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1093     UnicodeSet set;
1094     set.addAll(s);
1095     complementAll(set);
1096     return *this;
1097 }
1098 
1099 /**
1100  * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1101  * If this set already any particular character, it has no effect on that character.
1102  * @param the source string
1103  * @return the modified set, for chaining
1104  */
removeAll(const UnicodeString & s)1105 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1106     UnicodeSet set;
1107     set.addAll(s);
1108     removeAll(set);
1109     return *this;
1110 }
1111 
removeAllStrings()1112 UnicodeSet& UnicodeSet::removeAllStrings() {
1113     strings->removeAllElements();
1114     return *this;
1115 }
1116 
1117 
1118 /**
1119  * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1120  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1121  * @param the source string
1122  * @return a newly created set containing the given string
1123  */
createFrom(const UnicodeString & s)1124 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1125     UnicodeSet *set = new UnicodeSet();
1126     if (set != NULL) { // Check for memory allocation error.
1127         set->add(s);
1128     }
1129     return set;
1130 }
1131 
1132 
1133 /**
1134  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1135  * @param the source string
1136  * @return a newly created set containing the given characters
1137  */
createFromAll(const UnicodeString & s)1138 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1139     UnicodeSet *set = new UnicodeSet();
1140     if (set != NULL) { // Check for memory allocation error.
1141         set->addAll(s);
1142     }
1143     return set;
1144 }
1145 
1146 /**
1147  * Retain only the elements in this set that are contained in the
1148  * specified range.  If <code>end > start</code> then an empty range is
1149  * retained, leaving the set empty.
1150  *
1151  * @param start first character, inclusive, of range to be retained
1152  * to this set.
1153  * @param end last character, inclusive, of range to be retained
1154  * to this set.
1155  */
retain(UChar32 start,UChar32 end)1156 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1157     if (pinCodePoint(start) <= pinCodePoint(end)) {
1158         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1159         retain(range, 2, 0);
1160     } else {
1161         clear();
1162     }
1163     return *this;
1164 }
1165 
retain(UChar32 c)1166 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1167     return retain(c, c);
1168 }
1169 
1170 /**
1171  * Removes the specified range from this set if it is present.
1172  * The set will not contain the specified range once the call
1173  * returns.  If <code>end > start</code> then an empty range is
1174  * removed, leaving the set unchanged.
1175  *
1176  * @param start first character, inclusive, of range to be removed
1177  * from this set.
1178  * @param end last character, inclusive, of range to be removed
1179  * from this set.
1180  */
remove(UChar32 start,UChar32 end)1181 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1182     if (pinCodePoint(start) <= pinCodePoint(end)) {
1183         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1184         retain(range, 2, 2);
1185     }
1186     return *this;
1187 }
1188 
1189 /**
1190  * Removes the specified character from this set if it is present.
1191  * The set will not contain the specified range once the call
1192  * returns.
1193  */
remove(UChar32 c)1194 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1195     return remove(c, c);
1196 }
1197 
1198 /**
1199  * Removes the specified string from this set if it is present.
1200  * The set will not contain the specified character once the call
1201  * returns.
1202  * @param the source string
1203  * @return the modified set, for chaining
1204  */
remove(const UnicodeString & s)1205 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1206     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1207     int32_t cp = getSingleCP(s);
1208     if (cp < 0) {
1209         strings->removeElement((void*) &s);
1210         releasePattern();
1211     } else {
1212         remove((UChar32)cp, (UChar32)cp);
1213     }
1214     return *this;
1215 }
1216 
1217 /**
1218  * Complements the specified range in this set.  Any character in
1219  * the range will be removed if it is in this set, or will be
1220  * added if it is not in this set.  If <code>end > start</code>
1221  * then an empty range is xor'ed, leaving the set unchanged.
1222  *
1223  * @param start first character, inclusive, of range to be removed
1224  * from this set.
1225  * @param end last character, inclusive, of range to be removed
1226  * from this set.
1227  */
complement(UChar32 start,UChar32 end)1228 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1229     if (isFrozen() || isBogus()) {
1230         return *this;
1231     }
1232     if (pinCodePoint(start) <= pinCodePoint(end)) {
1233         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1234         exclusiveOr(range, 2, 0);
1235     }
1236     releasePattern();
1237     return *this;
1238 }
1239 
complement(UChar32 c)1240 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1241     return complement(c, c);
1242 }
1243 
1244 /**
1245  * This is equivalent to
1246  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1247  */
complement(void)1248 UnicodeSet& UnicodeSet::complement(void) {
1249     if (isFrozen() || isBogus()) {
1250         return *this;
1251     }
1252     UErrorCode status = U_ZERO_ERROR;
1253     if (list[0] == UNICODESET_LOW) {
1254         ensureBufferCapacity(len-1, status);
1255         if (U_FAILURE(status)) {
1256             return *this;
1257         }
1258         uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32));
1259         --len;
1260     } else {
1261         ensureBufferCapacity(len+1, status);
1262         if (U_FAILURE(status)) {
1263             return *this;
1264         }
1265         uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32));
1266         buffer[0] = UNICODESET_LOW;
1267         ++len;
1268     }
1269     swapBuffers();
1270     releasePattern();
1271     return *this;
1272 }
1273 
1274 /**
1275  * Complement the specified string in this set.
1276  * The set will not contain the specified string once the call
1277  * returns.
1278  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1279  * @param s the string to complement
1280  * @return this object, for chaining
1281  */
complement(const UnicodeString & s)1282 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1283     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1284     int32_t cp = getSingleCP(s);
1285     if (cp < 0) {
1286         if (strings->contains((void*) &s)) {
1287             strings->removeElement((void*) &s);
1288         } else {
1289             _add(s);
1290         }
1291         releasePattern();
1292     } else {
1293         complement((UChar32)cp, (UChar32)cp);
1294     }
1295     return *this;
1296 }
1297 
1298 /**
1299  * Adds all of the elements in the specified set to this set if
1300  * they're not already present.  This operation effectively
1301  * modifies this set so that its value is the <i>union</i> of the two
1302  * sets.  The behavior of this operation is unspecified if the specified
1303  * collection is modified while the operation is in progress.
1304  *
1305  * @param c set whose elements are to be added to this set.
1306  * @see #add(char, char)
1307  */
addAll(const UnicodeSet & c)1308 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1309     if ( c.len>0 && c.list!=NULL ) {
1310         add(c.list, c.len, 0);
1311     }
1312 
1313     // Add strings in order
1314     if ( c.strings!=NULL ) {
1315         for (int32_t i=0; i<c.strings->size(); ++i) {
1316             const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1317             if (!strings->contains((void*) s)) {
1318                 _add(*s);
1319             }
1320         }
1321     }
1322     return *this;
1323 }
1324 
1325 /**
1326  * Retains only the elements in this set that are contained in the
1327  * specified set.  In other words, removes from this set all of
1328  * its elements that are not contained in the specified set.  This
1329  * operation effectively modifies this set so that its value is
1330  * the <i>intersection</i> of the two sets.
1331  *
1332  * @param c set that defines which elements this set will retain.
1333  */
retainAll(const UnicodeSet & c)1334 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1335     if (isFrozen() || isBogus()) {
1336         return *this;
1337     }
1338     retain(c.list, c.len, 0);
1339     strings->retainAll(*c.strings);
1340     return *this;
1341 }
1342 
1343 /**
1344  * Removes from this set all of its elements that are contained in the
1345  * specified set.  This operation effectively modifies this
1346  * set so that its value is the <i>asymmetric set difference</i> of
1347  * the two sets.
1348  *
1349  * @param c set that defines which elements will be removed from
1350  *          this set.
1351  */
removeAll(const UnicodeSet & c)1352 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1353     if (isFrozen() || isBogus()) {
1354         return *this;
1355     }
1356     retain(c.list, c.len, 2);
1357     strings->removeAll(*c.strings);
1358     return *this;
1359 }
1360 
1361 /**
1362  * Complements in this set all elements contained in the specified
1363  * set.  Any character in the other set will be removed if it is
1364  * in this set, or will be added if it is not in this set.
1365  *
1366  * @param c set that defines which elements will be xor'ed from
1367  *          this set.
1368  */
complementAll(const UnicodeSet & c)1369 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1370     if (isFrozen() || isBogus()) {
1371         return *this;
1372     }
1373     exclusiveOr(c.list, c.len, 0);
1374 
1375     for (int32_t i=0; i<c.strings->size(); ++i) {
1376         void* e = c.strings->elementAt(i);
1377         if (!strings->removeElement(e)) {
1378             _add(*(const UnicodeString*)e);
1379         }
1380     }
1381     return *this;
1382 }
1383 
1384 /**
1385  * Removes all of the elements from this set.  This set will be
1386  * empty after this call returns.
1387  */
clear(void)1388 UnicodeSet& UnicodeSet::clear(void) {
1389     if (isFrozen()) {
1390         return *this;
1391     }
1392     if (list != NULL) {
1393         list[0] = UNICODESET_HIGH;
1394     }
1395     len = 1;
1396     releasePattern();
1397     if (strings != NULL) {
1398         strings->removeAllElements();
1399     }
1400     if (list != NULL && strings != NULL) {
1401         // Remove bogus
1402         fFlags = 0;
1403     }
1404     return *this;
1405 }
1406 
1407 /**
1408  * Iteration method that returns the number of ranges contained in
1409  * this set.
1410  * @see #getRangeStart
1411  * @see #getRangeEnd
1412  */
getRangeCount() const1413 int32_t UnicodeSet::getRangeCount() const {
1414     return len/2;
1415 }
1416 
1417 /**
1418  * Iteration method that returns the first character in the
1419  * specified range of this set.
1420  * @see #getRangeCount
1421  * @see #getRangeEnd
1422  */
getRangeStart(int32_t index) const1423 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1424     return list[index*2];
1425 }
1426 
1427 /**
1428  * Iteration method that returns the last character in the
1429  * specified range of this set.
1430  * @see #getRangeStart
1431  * @see #getRangeEnd
1432  */
getRangeEnd(int32_t index) const1433 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1434     return list[index*2 + 1] - 1;
1435 }
1436 
getStringCount() const1437 int32_t UnicodeSet::getStringCount() const {
1438     return strings->size();
1439 }
1440 
getString(int32_t index) const1441 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1442     return (const UnicodeString*) strings->elementAt(index);
1443 }
1444 
1445 /**
1446  * Reallocate this objects internal structures to take up the least
1447  * possible space, without changing this object's value.
1448  */
compact()1449 UnicodeSet& UnicodeSet::compact() {
1450     if (isFrozen() || isBogus()) {
1451         return *this;
1452     }
1453     // Delete buffer first to defragment memory less.
1454     if (buffer != NULL) {
1455         uprv_free(buffer);
1456         buffer = NULL;
1457     }
1458     if (len < capacity) {
1459         // Make the capacity equal to len or 1.
1460         // We don't want to realloc of 0 size.
1461         int32_t newCapacity = len + (len == 0);
1462         UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
1463         if (temp) {
1464             list = temp;
1465             capacity = newCapacity;
1466         }
1467         // else what the heck happened?! We allocated less memory!
1468         // Oh well. We'll keep our original array.
1469     }
1470     return *this;
1471 }
1472 
1473 #ifdef DEBUG_SERIALIZE
1474 #include <stdio.h>
1475 #endif
1476 
1477 /**
1478  * Deserialize constructor.
1479  */
UnicodeSet(const uint16_t data[],int32_t dataLen,ESerialization serialization,UErrorCode & ec)1480 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec)
1481   : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0),
1482     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
1483     fFlags(0) {
1484 
1485   if(U_FAILURE(ec)) {
1486     setToBogus();
1487     return;
1488   }
1489 
1490   if( (serialization != kSerialized)
1491       || (data==NULL)
1492       || (dataLen < 1)) {
1493     ec = U_ILLEGAL_ARGUMENT_ERROR;
1494     setToBogus();
1495     return;
1496   }
1497 
1498   allocateStrings(ec);
1499   if (U_FAILURE(ec)) {
1500     setToBogus();
1501     return;
1502   }
1503 
1504   // bmp?
1505   int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1506   int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1507 
1508   len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1509 #ifdef DEBUG_SERIALIZE
1510   printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]);
1511 #endif
1512   capacity = len+1;
1513   list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1514   if(!list || U_FAILURE(ec)) {
1515     setToBogus();
1516     return;
1517   }
1518   // copy bmp
1519   int32_t i;
1520   for(i = 0; i< bmpLength;i++) {
1521     list[i] = data[i+headerSize];
1522 #ifdef DEBUG_SERIALIZE
1523     printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1524 #endif
1525   }
1526   // copy smp
1527   for(i=bmpLength;i<len;i++) {
1528     list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1529               ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1530 #ifdef DEBUG_SERIALIZE
1531     printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1532 #endif
1533   }
1534   // terminator
1535   list[len++]=UNICODESET_HIGH;
1536 }
1537 
1538 
serialize(uint16_t * dest,int32_t destCapacity,UErrorCode & ec) const1539 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1540     int32_t bmpLength, length, destLength;
1541 
1542     if (U_FAILURE(ec)) {
1543         return 0;
1544     }
1545 
1546     if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1547         ec=U_ILLEGAL_ARGUMENT_ERROR;
1548         return 0;
1549     }
1550 
1551     /* count necessary 16-bit units */
1552     length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1553     // assert(length>=0);
1554     if (length==0) {
1555         /* empty set */
1556         if (destCapacity>0) {
1557             *dest=0;
1558         } else {
1559             ec=U_BUFFER_OVERFLOW_ERROR;
1560         }
1561         return 1;
1562     }
1563     /* now length>0 */
1564 
1565     if (this->list[length-1]<=0xffff) {
1566         /* all BMP */
1567         bmpLength=length;
1568     } else if (this->list[0]>=0x10000) {
1569         /* all supplementary */
1570         bmpLength=0;
1571         length*=2;
1572     } else {
1573         /* some BMP, some supplementary */
1574         for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1575         length=bmpLength+2*(length-bmpLength);
1576     }
1577 #ifdef DEBUG_SERIALIZE
1578     printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1579 #endif
1580     /* length: number of 16-bit array units */
1581     if (length>0x7fff) {
1582         /* there are only 15 bits for the length in the first serialized word */
1583         ec=U_INDEX_OUTOFBOUNDS_ERROR;
1584         return 0;
1585     }
1586 
1587     /*
1588      * total serialized length:
1589      * number of 16-bit array units (length) +
1590      * 1 length unit (always) +
1591      * 1 bmpLength unit (if there are supplementary values)
1592      */
1593     destLength=length+((length>bmpLength)?2:1);
1594     if (destLength<=destCapacity) {
1595         const UChar32 *p;
1596         int32_t i;
1597 
1598 #ifdef DEBUG_SERIALIZE
1599         printf("writeHdr\n");
1600 #endif
1601         *dest=(uint16_t)length;
1602         if (length>bmpLength) {
1603             *dest|=0x8000;
1604             *++dest=(uint16_t)bmpLength;
1605         }
1606         ++dest;
1607 
1608         /* write the BMP part of the array */
1609         p=this->list;
1610         for (i=0; i<bmpLength; ++i) {
1611 #ifdef DEBUG_SERIALIZE
1612           printf("writebmp: %x\n", (int)*p);
1613 #endif
1614             *dest++=(uint16_t)*p++;
1615         }
1616 
1617         /* write the supplementary part of the array */
1618         for (; i<length; i+=2) {
1619 #ifdef DEBUG_SERIALIZE
1620           printf("write32: %x\n", (int)*p);
1621 #endif
1622             *dest++=(uint16_t)(*p>>16);
1623             *dest++=(uint16_t)*p++;
1624         }
1625     } else {
1626         ec=U_BUFFER_OVERFLOW_ERROR;
1627     }
1628     return destLength;
1629 }
1630 
1631 //----------------------------------------------------------------
1632 // Implementation: Utility methods
1633 //----------------------------------------------------------------
1634 
1635 /**
1636  * Allocate our strings vector and return TRUE if successful.
1637  */
allocateStrings(UErrorCode & status)1638 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1639     if (U_FAILURE(status)) {
1640         return FALSE;
1641     }
1642     strings = new UVector(uprv_deleteUObject,
1643                           uhash_compareUnicodeString, 1, status);
1644     if (strings == NULL) { // Check for memory allocation error.
1645         status = U_MEMORY_ALLOCATION_ERROR;
1646         return FALSE;
1647     }
1648     if (U_FAILURE(status)) {
1649         delete strings;
1650         strings = NULL;
1651         return FALSE;
1652     }
1653     return TRUE;
1654 }
1655 
ensureCapacity(int32_t newLen,UErrorCode & ec)1656 void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
1657     if (newLen <= capacity)
1658         return;
1659     UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
1660     if (temp == NULL) {
1661         ec = U_MEMORY_ALLOCATION_ERROR;
1662         setToBogus();
1663         return;
1664     }
1665     list = temp;
1666     capacity = newLen + GROW_EXTRA;
1667     // else we keep the original contents on the memory failure.
1668 }
1669 
ensureBufferCapacity(int32_t newLen,UErrorCode & ec)1670 void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
1671     if (buffer != NULL && newLen <= bufferCapacity)
1672         return;
1673     UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
1674     if (temp == NULL) {
1675         ec = U_MEMORY_ALLOCATION_ERROR;
1676         setToBogus();
1677         return;
1678     }
1679     buffer = temp;
1680     bufferCapacity = newLen + GROW_EXTRA;
1681     // else we keep the original contents on the memory failure.
1682 }
1683 
1684 /**
1685  * Swap list and buffer.
1686  */
swapBuffers(void)1687 void UnicodeSet::swapBuffers(void) {
1688     // swap list and buffer
1689     UChar32* temp = list;
1690     list = buffer;
1691     buffer = temp;
1692 
1693     int32_t c = capacity;
1694     capacity = bufferCapacity;
1695     bufferCapacity = c;
1696 }
1697 
setToBogus()1698 void UnicodeSet::setToBogus() {
1699     clear(); // Remove everything in the set.
1700     fFlags = kIsBogus;
1701 }
1702 
1703 //----------------------------------------------------------------
1704 // Implementation: Fundamental operators
1705 //----------------------------------------------------------------
1706 
max(UChar32 a,UChar32 b)1707 static inline UChar32 max(UChar32 a, UChar32 b) {
1708     return (a > b) ? a : b;
1709 }
1710 
1711 // polarity = 0, 3 is normal: x xor y
1712 // polarity = 1, 2: x xor ~y == x === y
1713 
exclusiveOr(const UChar32 * other,int32_t otherLen,int8_t polarity)1714 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1715     if (isFrozen() || isBogus()) {
1716         return;
1717     }
1718     UErrorCode status = U_ZERO_ERROR;
1719     ensureBufferCapacity(len + otherLen, status);
1720     if (U_FAILURE(status)) {
1721         return;
1722     }
1723 
1724     int32_t i = 0, j = 0, k = 0;
1725     UChar32 a = list[i++];
1726     UChar32 b;
1727     if (polarity == 1 || polarity == 2) {
1728         b = UNICODESET_LOW;
1729         if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1730             ++j;
1731             b = other[j];
1732         }
1733     } else {
1734         b = other[j++];
1735     }
1736     // simplest of all the routines
1737     // sort the values, discarding identicals!
1738     for (;;) {
1739         if (a < b) {
1740             buffer[k++] = a;
1741             a = list[i++];
1742         } else if (b < a) {
1743             buffer[k++] = b;
1744             b = other[j++];
1745         } else if (a != UNICODESET_HIGH) { // at this point, a == b
1746             // discard both values!
1747             a = list[i++];
1748             b = other[j++];
1749         } else { // DONE!
1750             buffer[k++] = UNICODESET_HIGH;
1751             len = k;
1752             break;
1753         }
1754     }
1755     swapBuffers();
1756     releasePattern();
1757 }
1758 
1759 // polarity = 0 is normal: x union y
1760 // polarity = 2: x union ~y
1761 // polarity = 1: ~x union y
1762 // polarity = 3: ~x union ~y
1763 
add(const UChar32 * other,int32_t otherLen,int8_t polarity)1764 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1765     if (isFrozen() || isBogus() || other==NULL) {
1766         return;
1767     }
1768     UErrorCode status = U_ZERO_ERROR;
1769     ensureBufferCapacity(len + otherLen, status);
1770     if (U_FAILURE(status)) {
1771         return;
1772     }
1773 
1774     int32_t i = 0, j = 0, k = 0;
1775     UChar32 a = list[i++];
1776     UChar32 b = other[j++];
1777     // change from xor is that we have to check overlapping pairs
1778     // polarity bit 1 means a is second, bit 2 means b is.
1779     for (;;) {
1780         switch (polarity) {
1781           case 0: // both first; take lower if unequal
1782             if (a < b) { // take a
1783                 // Back up over overlapping ranges in buffer[]
1784                 if (k > 0 && a <= buffer[k-1]) {
1785                     // Pick latter end value in buffer[] vs. list[]
1786                     a = max(list[i], buffer[--k]);
1787                 } else {
1788                     // No overlap
1789                     buffer[k++] = a;
1790                     a = list[i];
1791                 }
1792                 i++; // Common if/else code factored out
1793                 polarity ^= 1;
1794             } else if (b < a) { // take b
1795                 if (k > 0 && b <= buffer[k-1]) {
1796                     b = max(other[j], buffer[--k]);
1797                 } else {
1798                     buffer[k++] = b;
1799                     b = other[j];
1800                 }
1801                 j++;
1802                 polarity ^= 2;
1803             } else { // a == b, take a, drop b
1804                 if (a == UNICODESET_HIGH) goto loop_end;
1805                 // This is symmetrical; it doesn't matter if
1806                 // we backtrack with a or b. - liu
1807                 if (k > 0 && a <= buffer[k-1]) {
1808                     a = max(list[i], buffer[--k]);
1809                 } else {
1810                     // No overlap
1811                     buffer[k++] = a;
1812                     a = list[i];
1813                 }
1814                 i++;
1815                 polarity ^= 1;
1816                 b = other[j++];
1817                 polarity ^= 2;
1818             }
1819             break;
1820           case 3: // both second; take higher if unequal, and drop other
1821             if (b <= a) { // take a
1822                 if (a == UNICODESET_HIGH) goto loop_end;
1823                 buffer[k++] = a;
1824             } else { // take b
1825                 if (b == UNICODESET_HIGH) goto loop_end;
1826                 buffer[k++] = b;
1827             }
1828             a = list[i++];
1829             polarity ^= 1;   // factored common code
1830             b = other[j++];
1831             polarity ^= 2;
1832             break;
1833           case 1: // a second, b first; if b < a, overlap
1834             if (a < b) { // no overlap, take a
1835                 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1836             } else if (b < a) { // OVERLAP, drop b
1837                 b = other[j++];
1838                 polarity ^= 2;
1839             } else { // a == b, drop both!
1840                 if (a == UNICODESET_HIGH) goto loop_end;
1841                 a = list[i++];
1842                 polarity ^= 1;
1843                 b = other[j++];
1844                 polarity ^= 2;
1845             }
1846             break;
1847           case 2: // a first, b second; if a < b, overlap
1848             if (b < a) { // no overlap, take b
1849                 buffer[k++] = b;
1850                 b = other[j++];
1851                 polarity ^= 2;
1852             } else  if (a < b) { // OVERLAP, drop a
1853                 a = list[i++];
1854                 polarity ^= 1;
1855             } else { // a == b, drop both!
1856                 if (a == UNICODESET_HIGH) goto loop_end;
1857                 a = list[i++];
1858                 polarity ^= 1;
1859                 b = other[j++];
1860                 polarity ^= 2;
1861             }
1862             break;
1863         }
1864     }
1865  loop_end:
1866     buffer[k++] = UNICODESET_HIGH;    // terminate
1867     len = k;
1868     swapBuffers();
1869     releasePattern();
1870 }
1871 
1872 // polarity = 0 is normal: x intersect y
1873 // polarity = 2: x intersect ~y == set-minus
1874 // polarity = 1: ~x intersect y
1875 // polarity = 3: ~x intersect ~y
1876 
retain(const UChar32 * other,int32_t otherLen,int8_t polarity)1877 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1878     if (isFrozen() || isBogus()) {
1879         return;
1880     }
1881     UErrorCode status = U_ZERO_ERROR;
1882     ensureBufferCapacity(len + otherLen, status);
1883     if (U_FAILURE(status)) {
1884         return;
1885     }
1886 
1887     int32_t i = 0, j = 0, k = 0;
1888     UChar32 a = list[i++];
1889     UChar32 b = other[j++];
1890     // change from xor is that we have to check overlapping pairs
1891     // polarity bit 1 means a is second, bit 2 means b is.
1892     for (;;) {
1893         switch (polarity) {
1894           case 0: // both first; drop the smaller
1895             if (a < b) { // drop a
1896                 a = list[i++];
1897                 polarity ^= 1;
1898             } else if (b < a) { // drop b
1899                 b = other[j++];
1900                 polarity ^= 2;
1901             } else { // a == b, take one, drop other
1902                 if (a == UNICODESET_HIGH) goto loop_end;
1903                 buffer[k++] = a;
1904                 a = list[i++];
1905                 polarity ^= 1;
1906                 b = other[j++];
1907                 polarity ^= 2;
1908             }
1909             break;
1910           case 3: // both second; take lower if unequal
1911             if (a < b) { // take a
1912                 buffer[k++] = a;
1913                 a = list[i++];
1914                 polarity ^= 1;
1915             } else if (b < a) { // take b
1916                 buffer[k++] = b;
1917                 b = other[j++];
1918                 polarity ^= 2;
1919             } else { // a == b, take one, drop other
1920                 if (a == UNICODESET_HIGH) goto loop_end;
1921                 buffer[k++] = a;
1922                 a = list[i++];
1923                 polarity ^= 1;
1924                 b = other[j++];
1925                 polarity ^= 2;
1926             }
1927             break;
1928           case 1: // a second, b first;
1929             if (a < b) { // NO OVERLAP, drop a
1930                 a = list[i++];
1931                 polarity ^= 1;
1932             } else if (b < a) { // OVERLAP, take b
1933                 buffer[k++] = b;
1934                 b = other[j++];
1935                 polarity ^= 2;
1936             } else { // a == b, drop both!
1937                 if (a == UNICODESET_HIGH) goto loop_end;
1938                 a = list[i++];
1939                 polarity ^= 1;
1940                 b = other[j++];
1941                 polarity ^= 2;
1942             }
1943             break;
1944           case 2: // a first, b second; if a < b, overlap
1945             if (b < a) { // no overlap, drop b
1946                 b = other[j++];
1947                 polarity ^= 2;
1948             } else  if (a < b) { // OVERLAP, take a
1949                 buffer[k++] = a;
1950                 a = list[i++];
1951                 polarity ^= 1;
1952             } else { // a == b, drop both!
1953                 if (a == UNICODESET_HIGH) goto loop_end;
1954                 a = list[i++];
1955                 polarity ^= 1;
1956                 b = other[j++];
1957                 polarity ^= 2;
1958             }
1959             break;
1960         }
1961     }
1962  loop_end:
1963     buffer[k++] = UNICODESET_HIGH;    // terminate
1964     len = k;
1965     swapBuffers();
1966     releasePattern();
1967 }
1968 
1969 /**
1970  * Append the <code>toPattern()</code> representation of a
1971  * string to the given <code>StringBuffer</code>.
1972  */
_appendToPat(UnicodeString & buf,const UnicodeString & s,UBool escapeUnprintable)1973 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1974 escapeUnprintable) {
1975     UChar32 cp;
1976     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1977         _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1978     }
1979 }
1980 
1981 /**
1982  * Append the <code>toPattern()</code> representation of a
1983  * character to the given <code>StringBuffer</code>.
1984  */
_appendToPat(UnicodeString & buf,UChar32 c,UBool escapeUnprintable)1985 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1986 escapeUnprintable) {
1987     if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1988         // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1989         // unprintable
1990         if (ICU_Utility::escapeUnprintable(buf, c)) {
1991             return;
1992         }
1993     }
1994     // Okay to let ':' pass through
1995     switch (c) {
1996     case SET_OPEN:
1997     case SET_CLOSE:
1998     case HYPHEN:
1999     case COMPLEMENT:
2000     case INTERSECTION:
2001     case BACKSLASH:
2002     case OPEN_BRACE:
2003     case CLOSE_BRACE:
2004     case COLON:
2005     case SymbolTable::SYMBOL_REF:
2006         buf.append(BACKSLASH);
2007         break;
2008     default:
2009         // Escape whitespace
2010         if (PatternProps::isWhiteSpace(c)) {
2011             buf.append(BACKSLASH);
2012         }
2013         break;
2014     }
2015     buf.append(c);
2016 }
2017 
2018 /**
2019  * Append a string representation of this set to result.  This will be
2020  * a cleaned version of the string passed to applyPattern(), if there
2021  * is one.  Otherwise it will be generated.
2022  */
_toPattern(UnicodeString & result,UBool escapeUnprintable) const2023 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2024                                       UBool escapeUnprintable) const
2025 {
2026     if (pat != NULL) {
2027         int32_t i;
2028         int32_t backslashCount = 0;
2029         for (i=0; i<patLen; ) {
2030             UChar32 c;
2031             U16_NEXT(pat, i, patLen, c);
2032             if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
2033                 // If the unprintable character is preceded by an odd
2034                 // number of backslashes, then it has been escaped.
2035                 // Before unescaping it, we delete the final
2036                 // backslash.
2037                 if ((backslashCount % 2) == 1) {
2038                     result.truncate(result.length() - 1);
2039                 }
2040                 ICU_Utility::escapeUnprintable(result, c);
2041                 backslashCount = 0;
2042             } else {
2043                 result.append(c);
2044                 if (c == BACKSLASH) {
2045                     ++backslashCount;
2046                 } else {
2047                     backslashCount = 0;
2048                 }
2049             }
2050         }
2051         return result;
2052     }
2053 
2054     return _generatePattern(result, escapeUnprintable);
2055 }
2056 
2057 /**
2058  * Returns a string representation of this set.  If the result of
2059  * calling this function is passed to a UnicodeSet constructor, it
2060  * will produce another set that is equal to this one.
2061  */
toPattern(UnicodeString & result,UBool escapeUnprintable) const2062 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2063                                      UBool escapeUnprintable) const
2064 {
2065     result.truncate(0);
2066     return _toPattern(result, escapeUnprintable);
2067 }
2068 
2069 /**
2070  * Generate and append a string representation of this set to result.
2071  * This does not use this.pat, the cleaned up copy of the string
2072  * passed to applyPattern().
2073  */
_generatePattern(UnicodeString & result,UBool escapeUnprintable) const2074 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2075                                             UBool escapeUnprintable) const
2076 {
2077     result.append(SET_OPEN);
2078 
2079 //  // Check against the predefined categories.  We implicitly build
2080 //  // up ALL category sets the first time toPattern() is called.
2081 //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2082 //      if (*this == getCategorySet(cat)) {
2083 //          result.append(COLON);
2084 //          result.append(CATEGORY_NAMES, cat*2, 2);
2085 //          return result.append(CATEGORY_CLOSE);
2086 //      }
2087 //  }
2088 
2089     int32_t count = getRangeCount();
2090 
2091     // If the set contains at least 2 intervals and includes both
2092     // MIN_VALUE and MAX_VALUE, then the inverse representation will
2093     // be more economical.
2094     if (count > 1 &&
2095         getRangeStart(0) == MIN_VALUE &&
2096         getRangeEnd(count-1) == MAX_VALUE) {
2097 
2098         // Emit the inverse
2099         result.append(COMPLEMENT);
2100 
2101         for (int32_t i = 1; i < count; ++i) {
2102             UChar32 start = getRangeEnd(i-1)+1;
2103             UChar32 end = getRangeStart(i)-1;
2104             _appendToPat(result, start, escapeUnprintable);
2105             if (start != end) {
2106                 if ((start+1) != end) {
2107                     result.append(HYPHEN);
2108                 }
2109                 _appendToPat(result, end, escapeUnprintable);
2110             }
2111         }
2112     }
2113 
2114     // Default; emit the ranges as pairs
2115     else {
2116         for (int32_t i = 0; i < count; ++i) {
2117             UChar32 start = getRangeStart(i);
2118             UChar32 end = getRangeEnd(i);
2119             _appendToPat(result, start, escapeUnprintable);
2120             if (start != end) {
2121                 if ((start+1) != end) {
2122                     result.append(HYPHEN);
2123                 }
2124                 _appendToPat(result, end, escapeUnprintable);
2125             }
2126         }
2127     }
2128 
2129     for (int32_t i = 0; i<strings->size(); ++i) {
2130         result.append(OPEN_BRACE);
2131         _appendToPat(result,
2132                      *(const UnicodeString*) strings->elementAt(i),
2133                      escapeUnprintable);
2134         result.append(CLOSE_BRACE);
2135     }
2136     return result.append(SET_CLOSE);
2137 }
2138 
2139 /**
2140 * Release existing cached pattern
2141 */
releasePattern()2142 void UnicodeSet::releasePattern() {
2143     if (pat) {
2144         uprv_free(pat);
2145         pat = NULL;
2146         patLen = 0;
2147     }
2148 }
2149 
2150 /**
2151 * Set the new pattern to cache.
2152 */
setPattern(const UnicodeString & newPat)2153 void UnicodeSet::setPattern(const UnicodeString& newPat) {
2154     releasePattern();
2155     int32_t newPatLen = newPat.length();
2156     pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2157     if (pat) {
2158         patLen = newPatLen;
2159         newPat.extractBetween(0, patLen, pat);
2160         pat[patLen] = 0;
2161     }
2162     // else we don't care if malloc failed. This was just a nice cache.
2163     // We can regenerate an equivalent pattern later when requested.
2164 }
2165 
freeze()2166 UnicodeFunctor *UnicodeSet::freeze() {
2167     if(!isFrozen() && !isBogus()) {
2168         // Do most of what compact() does before freezing because
2169         // compact() will not work when the set is frozen.
2170         // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2171 
2172         // Delete buffer first to defragment memory less.
2173         if (buffer != NULL) {
2174             uprv_free(buffer);
2175             buffer = NULL;
2176         }
2177         if (capacity > (len + GROW_EXTRA)) {
2178             // Make the capacity equal to len or 1.
2179             // We don't want to realloc of 0 size.
2180             capacity = len + (len == 0);
2181             list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
2182             if (list == NULL) { // Check for memory allocation error.
2183                 setToBogus();
2184                 return this;
2185             }
2186         }
2187 
2188         // Optimize contains() and span() and similar functions.
2189         if (!strings->isEmpty()) {
2190             stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2191             if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
2192                 // All strings are irrelevant for span() etc. because
2193                 // all of each string's code points are contained in this set.
2194                 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2195                 // many relevant strings as UTF-16.
2196                 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2197                 delete stringSpan;
2198                 stringSpan = NULL;
2199             }
2200         }
2201         if (stringSpan == NULL) {
2202             // No span-relevant strings: Optimize for code point spans.
2203             bmpSet=new BMPSet(list, len);
2204             if (bmpSet == NULL) { // Check for memory allocation error.
2205                 setToBogus();
2206             }
2207         }
2208     }
2209     return this;
2210 }
2211 
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2212 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2213     if(length>0 && bmpSet!=NULL) {
2214         return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2215     }
2216     if(length<0) {
2217         length=u_strlen(s);
2218     }
2219     if(length==0) {
2220         return 0;
2221     }
2222     if(stringSpan!=NULL) {
2223         return stringSpan->span(s, length, spanCondition);
2224     } else if(!strings->isEmpty()) {
2225         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2226                             UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2227                             UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2228         UnicodeSetStringSpan strSpan(*this, *strings, which);
2229         if(strSpan.needsStringSpanUTF16()) {
2230             return strSpan.span(s, length, spanCondition);
2231         }
2232     }
2233 
2234     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2235         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2236     }
2237 
2238     UChar32 c;
2239     int32_t start=0, prev=0;
2240     do {
2241         U16_NEXT(s, start, length, c);
2242         if(spanCondition!=contains(c)) {
2243             break;
2244         }
2245     } while((prev=start)<length);
2246     return prev;
2247 }
2248 
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2249 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2250     if(length>0 && bmpSet!=NULL) {
2251         return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2252     }
2253     if(length<0) {
2254         length=u_strlen(s);
2255     }
2256     if(length==0) {
2257         return 0;
2258     }
2259     if(stringSpan!=NULL) {
2260         return stringSpan->spanBack(s, length, spanCondition);
2261     } else if(!strings->isEmpty()) {
2262         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2263                             UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2264                             UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2265         UnicodeSetStringSpan strSpan(*this, *strings, which);
2266         if(strSpan.needsStringSpanUTF16()) {
2267             return strSpan.spanBack(s, length, spanCondition);
2268         }
2269     }
2270 
2271     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2272         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2273     }
2274 
2275     UChar32 c;
2276     int32_t prev=length;
2277     do {
2278         U16_PREV(s, 0, length, c);
2279         if(spanCondition!=contains(c)) {
2280             break;
2281         }
2282     } while((prev=length)>0);
2283     return prev;
2284 }
2285 
spanUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2286 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2287     if(length>0 && bmpSet!=NULL) {
2288         const uint8_t *s0=(const uint8_t *)s;
2289         return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2290     }
2291     if(length<0) {
2292         length=(int32_t)uprv_strlen(s);
2293     }
2294     if(length==0) {
2295         return 0;
2296     }
2297     if(stringSpan!=NULL) {
2298         return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2299     } else if(!strings->isEmpty()) {
2300         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2301                             UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2302                             UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2303         UnicodeSetStringSpan strSpan(*this, *strings, which);
2304         if(strSpan.needsStringSpanUTF8()) {
2305             return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2306         }
2307     }
2308 
2309     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2310         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2311     }
2312 
2313     UChar32 c;
2314     int32_t start=0, prev=0;
2315     do {
2316         U8_NEXT_OR_FFFD(s, start, length, c);
2317         if(spanCondition!=contains(c)) {
2318             break;
2319         }
2320     } while((prev=start)<length);
2321     return prev;
2322 }
2323 
spanBackUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2324 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2325     if(length>0 && bmpSet!=NULL) {
2326         const uint8_t *s0=(const uint8_t *)s;
2327         return bmpSet->spanBackUTF8(s0, length, spanCondition);
2328     }
2329     if(length<0) {
2330         length=(int32_t)uprv_strlen(s);
2331     }
2332     if(length==0) {
2333         return 0;
2334     }
2335     if(stringSpan!=NULL) {
2336         return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2337     } else if(!strings->isEmpty()) {
2338         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2339                             UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2340                             UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2341         UnicodeSetStringSpan strSpan(*this, *strings, which);
2342         if(strSpan.needsStringSpanUTF8()) {
2343             return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2344         }
2345     }
2346 
2347     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2348         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2349     }
2350 
2351     UChar32 c;
2352     int32_t prev=length;
2353     do {
2354         U8_PREV_OR_FFFD(s, 0, length, c);
2355         if(spanCondition!=contains(c)) {
2356             break;
2357         }
2358     } while((prev=length)>0);
2359     return prev;
2360 }
2361 
2362 U_NAMESPACE_END
2363