1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2007, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  unisetspan.h
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2007mar01
14 *   created by: Markus W. Scherer
15 */
16 
17 #ifndef __UNISETSPAN_H__
18 #define __UNISETSPAN_H__
19 
20 #include "unicode/utypes.h"
21 #include "unicode/uniset.h"
22 
23 U_NAMESPACE_BEGIN
24 
25 /*
26  * Implement span() etc. for a set with strings.
27  * Avoid recursion because of its exponential complexity.
28  * Instead, try multiple paths at once and track them with an IndexList.
29  */
30 class UnicodeSetStringSpan : public UMemory {
31 public:
32     /*
33      * Which span() variant will be used?
34      * The object is either built for one variant and used once,
35      * or built for all and may be used many times.
36      */
37     enum {
38         FWD             = 0x20,
39         BACK            = 0x10,
40         UTF16           = 8,
41         UTF8            = 4,
42         CONTAINED       = 2,
43         NOT_CONTAINED   = 1,
44 
45         ALL             = 0x3f,
46 
47         FWD_UTF16_CONTAINED     = FWD  | UTF16 |     CONTAINED,
48         FWD_UTF16_NOT_CONTAINED = FWD  | UTF16 | NOT_CONTAINED,
49         FWD_UTF8_CONTAINED      = FWD  | UTF8  |     CONTAINED,
50         FWD_UTF8_NOT_CONTAINED  = FWD  | UTF8  | NOT_CONTAINED,
51         BACK_UTF16_CONTAINED    = BACK | UTF16 |     CONTAINED,
52         BACK_UTF16_NOT_CONTAINED= BACK | UTF16 | NOT_CONTAINED,
53         BACK_UTF8_CONTAINED     = BACK | UTF8  |     CONTAINED,
54         BACK_UTF8_NOT_CONTAINED = BACK | UTF8  | NOT_CONTAINED
55     };
56 
57     UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings, uint32_t which);
58 
59     // Copy constructor. Assumes which==ALL for a frozen set.
60     UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, const UVector &newParentSetStrings);
61 
62     ~UnicodeSetStringSpan();
63 
64     /*
65      * Do the strings need to be checked in span() etc.?
66      * @return TRUE if strings need to be checked (call span() here),
67      *         FALSE if not (use a BMPSet for best performance).
68      */
69     inline UBool needsStringSpanUTF16();
70     inline UBool needsStringSpanUTF8();
71 
72     // For fast UnicodeSet::contains(c).
73     inline UBool contains(UChar32 c) const;
74 
75     int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
76 
77     int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
78 
79     int32_t spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
80 
81     int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
82 
83 private:
84     // Special spanLength byte values.
85     enum {
86         // The spanLength is >=0xfe.
87         LONG_SPAN=0xfe,
88         // All code points in the string are contained in the parent set.
89         ALL_CP_CONTAINED=0xff
90     };
91 
92     // Add a starting or ending string character to the spanNotSet
93     // so that a character span ends before any string.
94     void addToSpanNotSet(UChar32 c);
95 
96     int32_t spanNot(const UChar *s, int32_t length) const;
97     int32_t spanNotBack(const UChar *s, int32_t length) const;
98     int32_t spanNotUTF8(const uint8_t *s, int32_t length) const;
99     int32_t spanNotBackUTF8(const uint8_t *s, int32_t length) const;
100 
101     // Set for span(). Same as parent but without strings.
102     UnicodeSet spanSet;
103 
104     // Set for span(not contained).
105     // Same as spanSet, plus characters that start or end strings.
106     UnicodeSet *pSpanNotSet;
107 
108     // The strings of the parent set.
109     const UVector &strings;
110 
111     // Pointer to the UTF-8 string lengths.
112     // Also pointer to further allocated storage for meta data and
113     // UTF-8 string contents as necessary.
114     int32_t *utf8Lengths;
115 
116     // Pointer to the part of the (utf8Lengths) memory block that stores
117     // the lengths of span(), spanBack() etc. for each string.
118     uint8_t *spanLengths;
119 
120     // Pointer to the part of the (utf8Lengths) memory block that stores
121     // the UTF-8 versions of the parent set's strings.
122     uint8_t *utf8;
123 
124     // Number of bytes for all UTF-8 versions of strings together.
125     int32_t utf8Length;
126 
127     // Maximum lengths of relevant strings.
128     int32_t maxLength16;
129     int32_t maxLength8;
130 
131     // Set up for all variants of span()?
132     UBool all;
133 
134     // Memory for small numbers and lengths of strings.
135     // For example, for 8 strings:
136     // 8 UTF-8 lengths, 8*4 bytes span lengths, 8*2 3-byte UTF-8 characters
137     // = 112 bytes = int32_t[28].
138     int32_t staticLengths[32];
139 };
140 
needsStringSpanUTF16()141 UBool UnicodeSetStringSpan::needsStringSpanUTF16() {
142     return (UBool)(maxLength16!=0);
143 }
144 
needsStringSpanUTF8()145 UBool UnicodeSetStringSpan::needsStringSpanUTF8() {
146     return (UBool)(maxLength8!=0);
147 }
148 
contains(UChar32 c)149 UBool UnicodeSetStringSpan::contains(UChar32 c) const {
150     return spanSet.contains(c);
151 }
152 
153 U_NAMESPACE_END
154 
155 #endif
156