1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 2007, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *   file name:  unisetspan.h
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2007mar01
16 *   created by: Markus W. Scherer
17 */
18 
19 #ifndef __UNISETSPAN_H__
20 #define __UNISETSPAN_H__
21 
22 #include "unicode/utypes.h"
23 #include "unicode/uniset.h"
24 
25 U_NAMESPACE_BEGIN
26 
27 /*
28  * Implement span() etc. for a set with strings.
29  * Avoid recursion because of its exponential complexity.
30  * Instead, try multiple paths at once and track them with an IndexList.
31  */
32 class UnicodeSetStringSpan : public UMemory {
33 public:
34     /*
35      * Which span() variant will be used?
36      * The object is either built for one variant and used once,
37      * or built for all and may be used many times.
38      */
39     enum {
40         FWD             = 0x20,
41         BACK            = 0x10,
42         UTF16           = 8,
43         UTF8            = 4,
44         CONTAINED       = 2,
45         NOT_CONTAINED   = 1,
46 
47         ALL             = 0x3f,
48 
49         FWD_UTF16_CONTAINED     = FWD  | UTF16 |     CONTAINED,
50         FWD_UTF16_NOT_CONTAINED = FWD  | UTF16 | NOT_CONTAINED,
51         FWD_UTF8_CONTAINED      = FWD  | UTF8  |     CONTAINED,
52         FWD_UTF8_NOT_CONTAINED  = FWD  | UTF8  | NOT_CONTAINED,
53         BACK_UTF16_CONTAINED    = BACK | UTF16 |     CONTAINED,
54         BACK_UTF16_NOT_CONTAINED= BACK | UTF16 | NOT_CONTAINED,
55         BACK_UTF8_CONTAINED     = BACK | UTF8  |     CONTAINED,
56         BACK_UTF8_NOT_CONTAINED = BACK | UTF8  | NOT_CONTAINED
57     };
58 
59     UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings, uint32_t which);
60 
61     // Copy constructor. Assumes which==ALL for a frozen set.
62     UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, const UVector &newParentSetStrings);
63 
64     ~UnicodeSetStringSpan();
65 
66     /*
67      * Do the strings need to be checked in span() etc.?
68      * @return TRUE if strings need to be checked (call span() here),
69      *         FALSE if not (use a BMPSet for best performance).
70      */
71     inline UBool needsStringSpanUTF16();
72     inline UBool needsStringSpanUTF8();
73 
74     // For fast UnicodeSet::contains(c).
75     inline UBool contains(UChar32 c) const;
76 
77     int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
78 
79     int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
80 
81     int32_t spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
82 
83     int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
84 
85 private:
86     // Special spanLength byte values.
87     enum {
88         // The spanLength is >=0xfe.
89         LONG_SPAN=0xfe,
90         // All code points in the string are contained in the parent set.
91         ALL_CP_CONTAINED=0xff
92     };
93 
94     // Add a starting or ending string character to the spanNotSet
95     // so that a character span ends before any string.
96     void addToSpanNotSet(UChar32 c);
97 
98     int32_t spanNot(const UChar *s, int32_t length) const;
99     int32_t spanNotBack(const UChar *s, int32_t length) const;
100     int32_t spanNotUTF8(const uint8_t *s, int32_t length) const;
101     int32_t spanNotBackUTF8(const uint8_t *s, int32_t length) const;
102 
103     // Set for span(). Same as parent but without strings.
104     UnicodeSet spanSet;
105 
106     // Set for span(not contained).
107     // Same as spanSet, plus characters that start or end strings.
108     UnicodeSet *pSpanNotSet;
109 
110     // The strings of the parent set.
111     const UVector &strings;
112 
113     // Pointer to the UTF-8 string lengths.
114     // Also pointer to further allocated storage for meta data and
115     // UTF-8 string contents as necessary.
116     int32_t *utf8Lengths;
117 
118     // Pointer to the part of the (utf8Lengths) memory block that stores
119     // the lengths of span(), spanBack() etc. for each string.
120     uint8_t *spanLengths;
121 
122     // Pointer to the part of the (utf8Lengths) memory block that stores
123     // the UTF-8 versions of the parent set's strings.
124     uint8_t *utf8;
125 
126     // Number of bytes for all UTF-8 versions of strings together.
127     int32_t utf8Length;
128 
129     // Maximum lengths of relevant strings.
130     int32_t maxLength16;
131     int32_t maxLength8;
132 
133     // Set up for all variants of span()?
134     UBool all;
135 
136     // Memory for small numbers and lengths of strings.
137     // For example, for 8 strings:
138     // 8 UTF-8 lengths, 8*4 bytes span lengths, 8*2 3-byte UTF-8 characters
139     // = 112 bytes = int32_t[28].
140     int32_t staticLengths[32];
141 };
142 
needsStringSpanUTF16()143 UBool UnicodeSetStringSpan::needsStringSpanUTF16() {
144     return (UBool)(maxLength16!=0);
145 }
146 
needsStringSpanUTF8()147 UBool UnicodeSetStringSpan::needsStringSpanUTF8() {
148     return (UBool)(maxLength8!=0);
149 }
150 
contains(UChar32 c)151 UBool UnicodeSetStringSpan::contains(UChar32 c) const {
152     return spanSet.contains(c);
153 }
154 
155 U_NAMESPACE_END
156 
157 #endif
158