1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2010-2012,2014, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   file name:  stringtriebuilder.h
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2010dec24
14 *   created by: Markus W. Scherer
15 */
16 
17 #ifndef __STRINGTRIEBUILDER_H__
18 #define __STRINGTRIEBUILDER_H__
19 
20 #include "unicode/utypes.h"
21 #include "unicode/uobject.h"
22 
23 /**
24  * \file
25  * \brief C++ API: Builder API for trie builders
26  */
27 
28 // Forward declaration.
29 /// \cond
30 struct UHashtable;
31 typedef struct UHashtable UHashtable;
32 /// \endcond
33 
34 /**
35  * Build options for BytesTrieBuilder and CharsTrieBuilder.
36  * @stable ICU 4.8
37  */
38 enum UStringTrieBuildOption {
39     /**
40      * Builds a trie quickly.
41      * @stable ICU 4.8
42      */
43     USTRINGTRIE_BUILD_FAST,
44     /**
45      * Builds a trie more slowly, attempting to generate
46      * a shorter but equivalent serialization.
47      * This build option also uses more memory.
48      *
49      * This option can be effective when many integer values are the same
50      * and string/byte sequence suffixes can be shared.
51      * Runtime speed is not expected to improve.
52      * @stable ICU 4.8
53      */
54     USTRINGTRIE_BUILD_SMALL
55 };
56 
57 U_NAMESPACE_BEGIN
58 
59 /**
60  * Base class for string trie builder classes.
61  *
62  * This class is not intended for public subclassing.
63  * @stable ICU 4.8
64  */
65 class U_COMMON_API StringTrieBuilder : public UObject {
66 public:
67 #ifndef U_HIDE_INTERNAL_API
68     /** @internal */
69     static int32_t hashNode(const void *node);
70     /** @internal */
71     static UBool equalNodes(const void *left, const void *right);
72 #endif  /* U_HIDE_INTERNAL_API */
73 
74 protected:
75     // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
76     // or else the compiler will create a public default constructor.
77     /** @internal */
78     StringTrieBuilder();
79     /** @internal */
80     virtual ~StringTrieBuilder();
81 
82 #ifndef U_HIDE_INTERNAL_API
83     /** @internal */
84     void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
85     /** @internal */
86     void deleteCompactBuilder();
87 
88     /** @internal */
89     void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
90 
91     /** @internal */
92     int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
93     /** @internal */
94     int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
95 #endif  /* U_HIDE_INTERNAL_API */
96 
97     class Node;
98 
99 #ifndef U_HIDE_INTERNAL_API
100     /** @internal */
101     Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
102     /** @internal */
103     Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
104                             int32_t length, UErrorCode &errorCode);
105 #endif  /* U_HIDE_INTERNAL_API */
106 
107     /** @internal */
108     virtual int32_t getElementStringLength(int32_t i) const = 0;
109     /** @internal */
110     virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0;
111     /** @internal */
112     virtual int32_t getElementValue(int32_t i) const = 0;
113 
114     // Finds the first unit index after this one where
115     // the first and last element have different units again.
116     /** @internal */
117     virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
118 
119     // Number of different units at unitIndex.
120     /** @internal */
121     virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
122     /** @internal */
123     virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
124     /** @internal */
125     virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0;
126 
127     /** @internal */
128     virtual UBool matchNodesCanHaveValues() const = 0;
129 
130     /** @internal */
131     virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
132     /** @internal */
133     virtual int32_t getMinLinearMatch() const = 0;
134     /** @internal */
135     virtual int32_t getMaxLinearMatchLength() const = 0;
136 
137 #ifndef U_HIDE_INTERNAL_API
138     // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
139     /** @internal */
140     static const int32_t kMaxBranchLinearSubNodeLength=5;
141 
142     // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units.
143     // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
144     /** @internal */
145     static const int32_t kMaxSplitBranchLevels=14;
146 
147     /**
148      * Makes sure that there is only one unique node registered that is
149      * equivalent to newNode.
150      * @param newNode Input node. The builder takes ownership.
151      * @param errorCode ICU in/out UErrorCode.
152                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
153      * @return newNode if it is the first of its kind, or
154      *         an equivalent node if newNode is a duplicate.
155      * @internal
156      */
157     Node *registerNode(Node *newNode, UErrorCode &errorCode);
158     /**
159      * Makes sure that there is only one unique FinalValueNode registered
160      * with this value.
161      * Avoids creating a node if the value is a duplicate.
162      * @param value A final value.
163      * @param errorCode ICU in/out UErrorCode.
164                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
165      * @return A FinalValueNode with the given value.
166      * @internal
167      */
168     Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
169 #endif  /* U_HIDE_INTERNAL_API */
170 
171     /*
172      * C++ note:
173      * registerNode() and registerFinalValue() take ownership of their input nodes,
174      * and only return owned nodes.
175      * If they see a failure UErrorCode, they will delete the input node.
176      * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
177      * If there is a failure, they return NULL.
178      *
179      * NULL Node pointers can be safely passed into other Nodes because
180      * they call the static Node::hashCode() which checks for a NULL pointer first.
181      *
182      * Therefore, as long as builder functions register a new node,
183      * they need to check for failures only before explicitly dereferencing
184      * a Node pointer, or before setting a new UErrorCode.
185      */
186 
187     // Hash set of nodes, maps from nodes to integer 1.
188     /** @internal */
189     UHashtable *nodes;
190 
191     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
192     // it is needed for layout of other objects.
193     /**
194      * @internal
195      * \cond
196      */
197     class Node : public UObject {
198     public:
Node(int32_t initialHash)199         Node(int32_t initialHash) : hash(initialHash), offset(0) {}
hashCode()200         inline int32_t hashCode() const { return hash; }
201         // Handles node==NULL.
hashCode(const Node * node)202         static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
203         // Base class operator==() compares the actual class types.
204         virtual UBool operator==(const Node &other) const;
205         inline UBool operator!=(const Node &other) const { return !operator==(other); }
206         /**
207          * Traverses the Node graph and numbers branch edges, with rightmost edges first.
208          * This is to avoid writing a duplicate node twice.
209          *
210          * Branch nodes in this trie data structure are not symmetric.
211          * Most branch edges "jump" to other nodes but the rightmost branch edges
212          * just continue without a jump.
213          * Therefore, write() must write the rightmost branch edge last
214          * (trie units are written backwards), and must write it at that point even if
215          * it is a duplicate of a node previously written elsewhere.
216          *
217          * This function visits and marks right branch edges first.
218          * Edges are numbered with increasingly negative values because we share the
219          * offset field which gets positive values when nodes are written.
220          * A branch edge also remembers the first number for any of its edges.
221          *
222          * When a further-left branch edge has a number in the range of the rightmost
223          * edge's numbers, then it will be written as part of the required right edge
224          * and we can avoid writing it first.
225          *
226          * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
227          * edge numbers.
228          *
229          * @param edgeNumber The first edge number for this node and its sub-nodes.
230          * @return An edge number that is at least the maximum-negative
231          *         of the input edge number and the numbers of this node and all of its sub-nodes.
232          */
233         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
234         // write() must set the offset to a positive value.
235         virtual void write(StringTrieBuilder &builder) = 0;
236         // See markRightEdgesFirst.
writeUnlessInsideRightEdge(int32_t firstRight,int32_t lastRight,StringTrieBuilder & builder)237         inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
238                                                StringTrieBuilder &builder) {
239             // Note: Edge numbers are negative, lastRight<=firstRight.
240             // If offset>0 then this node and its sub-nodes have been written already
241             // and we need not write them again.
242             // If this node is part of the unwritten right branch edge,
243             // then we wait until that is written.
244             if(offset<0 && (offset<lastRight || firstRight<offset)) {
245                 write(builder);
246             }
247         }
getOffset()248         inline int32_t getOffset() const { return offset; }
249     protected:
250         int32_t hash;
251         int32_t offset;
252     };
253 
254 #ifndef U_HIDE_INTERNAL_API
255     // This class should not be overridden because
256     // registerFinalValue() compares a stack-allocated FinalValueNode
257     // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
258     // with the input node, and the
259     // !Node::operator==(other) used inside FinalValueNode::operator==(other)
260     // will be false if the typeid's are different.
261     /** @internal */
262     class FinalValueNode : public Node {
263     public:
FinalValueNode(int32_t v)264         FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
265         virtual UBool operator==(const Node &other) const;
266         virtual void write(StringTrieBuilder &builder);
267     protected:
268         int32_t value;
269     };
270 #endif  /* U_HIDE_INTERNAL_API */
271 
272     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
273     // it is needed for layout of other objects.
274     /**
275      * @internal
276      */
277     class ValueNode : public Node {
278     public:
ValueNode(int32_t initialHash)279         ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
280         virtual UBool operator==(const Node &other) const;
setValue(int32_t v)281         void setValue(int32_t v) {
282             hasValue=TRUE;
283             value=v;
284             hash=hash*37u+v;
285         }
286     protected:
287         UBool hasValue;
288         int32_t value;
289     };
290 
291 #ifndef U_HIDE_INTERNAL_API
292     /**
293      * @internal
294      */
295     class IntermediateValueNode : public ValueNode {
296     public:
IntermediateValueNode(int32_t v,Node * nextNode)297         IntermediateValueNode(int32_t v, Node *nextNode)
298                 : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
299         virtual UBool operator==(const Node &other) const;
300         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
301         virtual void write(StringTrieBuilder &builder);
302     protected:
303         Node *next;
304     };
305 #endif  /* U_HIDE_INTERNAL_API */
306 
307     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
308     // it is needed for layout of other objects.
309     /**
310      * @internal
311      */
312     class LinearMatchNode : public ValueNode {
313     public:
LinearMatchNode(int32_t len,Node * nextNode)314         LinearMatchNode(int32_t len, Node *nextNode)
315                 : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
316                   length(len), next(nextNode) {}
317         virtual UBool operator==(const Node &other) const;
318         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
319     protected:
320         int32_t length;
321         Node *next;
322     };
323 
324 #ifndef U_HIDE_INTERNAL_API
325     /**
326      * @internal
327      */
328     class BranchNode : public Node {
329     public:
BranchNode(int32_t initialHash)330         BranchNode(int32_t initialHash) : Node(initialHash) {}
331     protected:
332         int32_t firstEdgeNumber;
333     };
334 
335     /**
336      * @internal
337      */
338     class ListBranchNode : public BranchNode {
339     public:
ListBranchNode()340         ListBranchNode() : BranchNode(0x444444), length(0) {}
341         virtual UBool operator==(const Node &other) const;
342         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
343         virtual void write(StringTrieBuilder &builder);
344         // Adds a unit with a final value.
add(int32_t c,int32_t value)345         void add(int32_t c, int32_t value) {
346             units[length]=(char16_t)c;
347             equal[length]=NULL;
348             values[length]=value;
349             ++length;
350             hash=(hash*37u+c)*37u+value;
351         }
352         // Adds a unit which leads to another match node.
add(int32_t c,Node * node)353         void add(int32_t c, Node *node) {
354             units[length]=(char16_t)c;
355             equal[length]=node;
356             values[length]=0;
357             ++length;
358             hash=(hash*37u+c)*37u+hashCode(node);
359         }
360     protected:
361         Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
362         int32_t length;
363         int32_t values[kMaxBranchLinearSubNodeLength];
364         char16_t units[kMaxBranchLinearSubNodeLength];
365     };
366 
367     /**
368      * @internal
369      */
370     class SplitBranchNode : public BranchNode {
371     public:
SplitBranchNode(char16_t middleUnit,Node * lessThanNode,Node * greaterOrEqualNode)372         SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
373                 : BranchNode(((0x555555u*37u+middleUnit)*37u+
374                               hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
375                   unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
376         virtual UBool operator==(const Node &other) const;
377         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
378         virtual void write(StringTrieBuilder &builder);
379     protected:
380         char16_t unit;
381         Node *lessThan;
382         Node *greaterOrEqual;
383     };
384 
385     // Branch head node, for writing the actual node lead unit.
386     /** @internal */
387     class BranchHeadNode : public ValueNode {
388     public:
BranchHeadNode(int32_t len,Node * subNode)389         BranchHeadNode(int32_t len, Node *subNode)
390                 : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
391                   length(len), next(subNode) {}
392         virtual UBool operator==(const Node &other) const;
393         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
394         virtual void write(StringTrieBuilder &builder);
395     protected:
396         int32_t length;
397         Node *next;  // A branch sub-node.
398     };
399 
400 #endif  /* U_HIDE_INTERNAL_API */
401     /// \endcond
402 
403     /** @internal */
404     virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
405                                         Node *nextNode) const = 0;
406 
407     /** @internal */
408     virtual int32_t write(int32_t unit) = 0;
409     /** @internal */
410     virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
411     /** @internal */
412     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
413     /** @internal */
414     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
415     /** @internal */
416     virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
417 };
418 
419 U_NAMESPACE_END
420 
421 #endif  // __STRINGTRIEBUILDER_H__
422