1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2010-2012,2014, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   file name:  stringtriebuilder.h
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2010dec24
14 *   created by: Markus W. Scherer
15 */
16 
17 #ifndef __STRINGTRIEBUILDER_H__
18 #define __STRINGTRIEBUILDER_H__
19 
20 #include "unicode/utypes.h"
21 #include "unicode/uobject.h"
22 
23 /**
24  * \file
25  * \brief C++ API: Builder API for trie builders
26  */
27 
28 // Forward declaration.
29 struct UHashtable;
30 typedef struct UHashtable UHashtable;
31 
32 /**
33  * Build options for BytesTrieBuilder and CharsTrieBuilder.
34  * @stable ICU 4.8
35  */
36 enum UStringTrieBuildOption {
37     /**
38      * Builds a trie quickly.
39      * @stable ICU 4.8
40      */
41     USTRINGTRIE_BUILD_FAST,
42     /**
43      * Builds a trie more slowly, attempting to generate
44      * a shorter but equivalent serialization.
45      * This build option also uses more memory.
46      *
47      * This option can be effective when many integer values are the same
48      * and string/byte sequence suffixes can be shared.
49      * Runtime speed is not expected to improve.
50      * @stable ICU 4.8
51      */
52     USTRINGTRIE_BUILD_SMALL
53 };
54 
55 U_NAMESPACE_BEGIN
56 
57 /**
58  * Base class for string trie builder classes.
59  *
60  * This class is not intended for public subclassing.
61  * @stable ICU 4.8
62  */
63 class U_COMMON_API StringTrieBuilder : public UObject {
64 public:
65 #ifndef U_HIDE_INTERNAL_API
66     /** @internal */
67     static UBool hashNode(const void *node);
68     /** @internal */
69     static UBool equalNodes(const void *left, const void *right);
70 #endif  /* U_HIDE_INTERNAL_API */
71 
72 protected:
73     // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
74     // or else the compiler will create a public default constructor.
75     /** @internal */
76     StringTrieBuilder();
77     /** @internal */
78     virtual ~StringTrieBuilder();
79 
80 #ifndef U_HIDE_INTERNAL_API
81     /** @internal */
82     void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
83     /** @internal */
84     void deleteCompactBuilder();
85 
86     /** @internal */
87     void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
88 
89     /** @internal */
90     int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
91     /** @internal */
92     int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
93 #endif  /* U_HIDE_INTERNAL_API */
94 
95     class Node;
96 
97 #ifndef U_HIDE_INTERNAL_API
98     /** @internal */
99     Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
100     /** @internal */
101     Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
102                             int32_t length, UErrorCode &errorCode);
103 #endif  /* U_HIDE_INTERNAL_API */
104 
105     /** @internal */
106     virtual int32_t getElementStringLength(int32_t i) const = 0;
107     /** @internal */
108     virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
109     /** @internal */
110     virtual int32_t getElementValue(int32_t i) const = 0;
111 
112     // Finds the first unit index after this one where
113     // the first and last element have different units again.
114     /** @internal */
115     virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
116 
117     // Number of different units at unitIndex.
118     /** @internal */
119     virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
120     /** @internal */
121     virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
122     /** @internal */
123     virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
124 
125     /** @internal */
126     virtual UBool matchNodesCanHaveValues() const = 0;
127 
128     /** @internal */
129     virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
130     /** @internal */
131     virtual int32_t getMinLinearMatch() const = 0;
132     /** @internal */
133     virtual int32_t getMaxLinearMatchLength() const = 0;
134 
135 #ifndef U_HIDE_INTERNAL_API
136     // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
137     /** @internal */
138     static const int32_t kMaxBranchLinearSubNodeLength=5;
139 
140     // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
141     // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
142     /** @internal */
143     static const int32_t kMaxSplitBranchLevels=14;
144 
145     /**
146      * Makes sure that there is only one unique node registered that is
147      * equivalent to newNode.
148      * @param newNode Input node. The builder takes ownership.
149      * @param errorCode ICU in/out UErrorCode.
150                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
151      * @return newNode if it is the first of its kind, or
152      *         an equivalent node if newNode is a duplicate.
153      * @internal
154      */
155     Node *registerNode(Node *newNode, UErrorCode &errorCode);
156     /**
157      * Makes sure that there is only one unique FinalValueNode registered
158      * with this value.
159      * Avoids creating a node if the value is a duplicate.
160      * @param value A final value.
161      * @param errorCode ICU in/out UErrorCode.
162                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
163      * @return A FinalValueNode with the given value.
164      * @internal
165      */
166     Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
167 #endif  /* U_HIDE_INTERNAL_API */
168 
169     /*
170      * C++ note:
171      * registerNode() and registerFinalValue() take ownership of their input nodes,
172      * and only return owned nodes.
173      * If they see a failure UErrorCode, they will delete the input node.
174      * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
175      * If there is a failure, they return NULL.
176      *
177      * NULL Node pointers can be safely passed into other Nodes because
178      * they call the static Node::hashCode() which checks for a NULL pointer first.
179      *
180      * Therefore, as long as builder functions register a new node,
181      * they need to check for failures only before explicitly dereferencing
182      * a Node pointer, or before setting a new UErrorCode.
183      */
184 
185     // Hash set of nodes, maps from nodes to integer 1.
186     /** @internal */
187     UHashtable *nodes;
188 
189     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
190     // it is needed for layout of other objects.
191     /** @internal */
192     class Node : public UObject {
193     public:
Node(int32_t initialHash)194         Node(int32_t initialHash) : hash(initialHash), offset(0) {}
hashCode()195         inline int32_t hashCode() const { return hash; }
196         // Handles node==NULL.
hashCode(const Node * node)197         static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
198         // Base class operator==() compares the actual class types.
199         virtual UBool operator==(const Node &other) const;
200         inline UBool operator!=(const Node &other) const { return !operator==(other); }
201         /**
202          * Traverses the Node graph and numbers branch edges, with rightmost edges first.
203          * This is to avoid writing a duplicate node twice.
204          *
205          * Branch nodes in this trie data structure are not symmetric.
206          * Most branch edges "jump" to other nodes but the rightmost branch edges
207          * just continue without a jump.
208          * Therefore, write() must write the rightmost branch edge last
209          * (trie units are written backwards), and must write it at that point even if
210          * it is a duplicate of a node previously written elsewhere.
211          *
212          * This function visits and marks right branch edges first.
213          * Edges are numbered with increasingly negative values because we share the
214          * offset field which gets positive values when nodes are written.
215          * A branch edge also remembers the first number for any of its edges.
216          *
217          * When a further-left branch edge has a number in the range of the rightmost
218          * edge's numbers, then it will be written as part of the required right edge
219          * and we can avoid writing it first.
220          *
221          * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
222          * edge numbers.
223          *
224          * @param edgeNumber The first edge number for this node and its sub-nodes.
225          * @return An edge number that is at least the maximum-negative
226          *         of the input edge number and the numbers of this node and all of its sub-nodes.
227          */
228         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
229         // write() must set the offset to a positive value.
230         virtual void write(StringTrieBuilder &builder) = 0;
231         // See markRightEdgesFirst.
writeUnlessInsideRightEdge(int32_t firstRight,int32_t lastRight,StringTrieBuilder & builder)232         inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
233                                                StringTrieBuilder &builder) {
234             // Note: Edge numbers are negative, lastRight<=firstRight.
235             // If offset>0 then this node and its sub-nodes have been written already
236             // and we need not write them again.
237             // If this node is part of the unwritten right branch edge,
238             // then we wait until that is written.
239             if(offset<0 && (offset<lastRight || firstRight<offset)) {
240                 write(builder);
241             }
242         }
getOffset()243         inline int32_t getOffset() const { return offset; }
244     protected:
245         int32_t hash;
246         int32_t offset;
247     };
248 
249 #ifndef U_HIDE_INTERNAL_API
250     // This class should not be overridden because
251     // registerFinalValue() compares a stack-allocated FinalValueNode
252     // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
253     // with the input node, and the
254     // !Node::operator==(other) used inside FinalValueNode::operator==(other)
255     // will be false if the typeid's are different.
256     /** @internal */
257     class FinalValueNode : public Node {
258     public:
FinalValueNode(int32_t v)259         FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
260         virtual UBool operator==(const Node &other) const;
261         virtual void write(StringTrieBuilder &builder);
262     protected:
263         int32_t value;
264     };
265 #endif  /* U_HIDE_INTERNAL_API */
266 
267     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
268     // it is needed for layout of other objects.
269     /**
270      * @internal
271      */
272     class ValueNode : public Node {
273     public:
ValueNode(int32_t initialHash)274         ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
275         virtual UBool operator==(const Node &other) const;
setValue(int32_t v)276         void setValue(int32_t v) {
277             hasValue=TRUE;
278             value=v;
279             hash=hash*37+v;
280         }
281     protected:
282         UBool hasValue;
283         int32_t value;
284     };
285 
286 #ifndef U_HIDE_INTERNAL_API
287     /**
288      * @internal
289      */
290     class IntermediateValueNode : public ValueNode {
291     public:
IntermediateValueNode(int32_t v,Node * nextNode)292         IntermediateValueNode(int32_t v, Node *nextNode)
293                 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
294         virtual UBool operator==(const Node &other) const;
295         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
296         virtual void write(StringTrieBuilder &builder);
297     protected:
298         Node *next;
299     };
300 #endif  /* U_HIDE_INTERNAL_API */
301 
302     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
303     // it is needed for layout of other objects.
304     /**
305      * @internal
306      */
307     class LinearMatchNode : public ValueNode {
308     public:
LinearMatchNode(int32_t len,Node * nextNode)309         LinearMatchNode(int32_t len, Node *nextNode)
310                 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
311                   length(len), next(nextNode) {}
312         virtual UBool operator==(const Node &other) const;
313         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
314     protected:
315         int32_t length;
316         Node *next;
317     };
318 
319 #ifndef U_HIDE_INTERNAL_API
320     /**
321      * @internal
322      */
323     class BranchNode : public Node {
324     public:
BranchNode(int32_t initialHash)325         BranchNode(int32_t initialHash) : Node(initialHash) {}
326     protected:
327         int32_t firstEdgeNumber;
328     };
329 
330     /**
331      * @internal
332      */
333     class ListBranchNode : public BranchNode {
334     public:
ListBranchNode()335         ListBranchNode() : BranchNode(0x444444), length(0) {}
336         virtual UBool operator==(const Node &other) const;
337         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
338         virtual void write(StringTrieBuilder &builder);
339         // Adds a unit with a final value.
add(int32_t c,int32_t value)340         void add(int32_t c, int32_t value) {
341             units[length]=(UChar)c;
342             equal[length]=NULL;
343             values[length]=value;
344             ++length;
345             hash=(hash*37+c)*37+value;
346         }
347         // Adds a unit which leads to another match node.
add(int32_t c,Node * node)348         void add(int32_t c, Node *node) {
349             units[length]=(UChar)c;
350             equal[length]=node;
351             values[length]=0;
352             ++length;
353             hash=(hash*37+c)*37+hashCode(node);
354         }
355     protected:
356         Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
357         int32_t length;
358         int32_t values[kMaxBranchLinearSubNodeLength];
359         UChar units[kMaxBranchLinearSubNodeLength];
360     };
361 
362     /**
363      * @internal
364      */
365     class SplitBranchNode : public BranchNode {
366     public:
SplitBranchNode(UChar middleUnit,Node * lessThanNode,Node * greaterOrEqualNode)367         SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
368                 : BranchNode(((0x555555*37+middleUnit)*37+
369                               hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
370                   unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
371         virtual UBool operator==(const Node &other) const;
372         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
373         virtual void write(StringTrieBuilder &builder);
374     protected:
375         UChar unit;
376         Node *lessThan;
377         Node *greaterOrEqual;
378     };
379 
380     // Branch head node, for writing the actual node lead unit.
381     /** @internal */
382     class BranchHeadNode : public ValueNode {
383     public:
BranchHeadNode(int32_t len,Node * subNode)384         BranchHeadNode(int32_t len, Node *subNode)
385                 : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
386                   length(len), next(subNode) {}
387         virtual UBool operator==(const Node &other) const;
388         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
389         virtual void write(StringTrieBuilder &builder);
390     protected:
391         int32_t length;
392         Node *next;  // A branch sub-node.
393     };
394 #endif  /* U_HIDE_INTERNAL_API */
395 
396     /** @internal */
397     virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
398                                         Node *nextNode) const = 0;
399 
400     /** @internal */
401     virtual int32_t write(int32_t unit) = 0;
402     /** @internal */
403     virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
404     /** @internal */
405     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
406     /** @internal */
407     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
408     /** @internal */
409     virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
410 };
411 
412 U_NAMESPACE_END
413 
414 #endif  // __STRINGTRIEBUILDER_H__
415