1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2012,2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: stringtriebuilder.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2010dec24 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __STRINGTRIEBUILDER_H__ 18 #define __STRINGTRIEBUILDER_H__ 19 20 #include "unicode/utypes.h" 21 #include "unicode/uobject.h" 22 23 /** 24 * \file 25 * \brief C++ API: Builder API for trie builders 26 */ 27 28 // Forward declaration. 29 struct UHashtable; 30 typedef struct UHashtable UHashtable; 31 32 /** 33 * Build options for BytesTrieBuilder and CharsTrieBuilder. 34 * @stable ICU 4.8 35 */ 36 enum UStringTrieBuildOption { 37 /** 38 * Builds a trie quickly. 39 * @stable ICU 4.8 40 */ 41 USTRINGTRIE_BUILD_FAST, 42 /** 43 * Builds a trie more slowly, attempting to generate 44 * a shorter but equivalent serialization. 45 * This build option also uses more memory. 46 * 47 * This option can be effective when many integer values are the same 48 * and string/byte sequence suffixes can be shared. 49 * Runtime speed is not expected to improve. 50 * @stable ICU 4.8 51 */ 52 USTRINGTRIE_BUILD_SMALL 53 }; 54 55 U_NAMESPACE_BEGIN 56 57 /** 58 * Base class for string trie builder classes. 59 * 60 * This class is not intended for public subclassing. 61 * @stable ICU 4.8 62 */ 63 class U_COMMON_API StringTrieBuilder : public UObject { 64 public: 65 #ifndef U_HIDE_INTERNAL_API 66 /** @internal */ 67 static UBool hashNode(const void *node); 68 /** @internal */ 69 static UBool equalNodes(const void *left, const void *right); 70 #endif /* U_HIDE_INTERNAL_API */ 71 72 protected: 73 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API 74 // or else the compiler will create a public default constructor. 75 /** @internal */ 76 StringTrieBuilder(); 77 /** @internal */ 78 virtual ~StringTrieBuilder(); 79 80 #ifndef U_HIDE_INTERNAL_API 81 /** @internal */ 82 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode); 83 /** @internal */ 84 void deleteCompactBuilder(); 85 86 /** @internal */ 87 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode); 88 89 /** @internal */ 90 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex); 91 /** @internal */ 92 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length); 93 #endif /* U_HIDE_INTERNAL_API */ 94 95 class Node; 96 97 #ifndef U_HIDE_INTERNAL_API 98 /** @internal */ 99 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode); 100 /** @internal */ 101 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, 102 int32_t length, UErrorCode &errorCode); 103 #endif /* U_HIDE_INTERNAL_API */ 104 105 /** @internal */ 106 virtual int32_t getElementStringLength(int32_t i) const = 0; 107 /** @internal */ 108 virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0; 109 /** @internal */ 110 virtual int32_t getElementValue(int32_t i) const = 0; 111 112 // Finds the first unit index after this one where 113 // the first and last element have different units again. 114 /** @internal */ 115 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0; 116 117 // Number of different units at unitIndex. 118 /** @internal */ 119 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0; 120 /** @internal */ 121 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0; 122 /** @internal */ 123 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0; 124 125 /** @internal */ 126 virtual UBool matchNodesCanHaveValues() const = 0; 127 128 /** @internal */ 129 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0; 130 /** @internal */ 131 virtual int32_t getMinLinearMatch() const = 0; 132 /** @internal */ 133 virtual int32_t getMaxLinearMatchLength() const = 0; 134 135 #ifndef U_HIDE_INTERNAL_API 136 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength). 137 /** @internal */ 138 static const int32_t kMaxBranchLinearSubNodeLength=5; 139 140 // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units. 141 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up. 142 /** @internal */ 143 static const int32_t kMaxSplitBranchLevels=14; 144 145 /** 146 * Makes sure that there is only one unique node registered that is 147 * equivalent to newNode. 148 * @param newNode Input node. The builder takes ownership. 149 * @param errorCode ICU in/out UErrorCode. 150 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. 151 * @return newNode if it is the first of its kind, or 152 * an equivalent node if newNode is a duplicate. 153 * @internal 154 */ 155 Node *registerNode(Node *newNode, UErrorCode &errorCode); 156 /** 157 * Makes sure that there is only one unique FinalValueNode registered 158 * with this value. 159 * Avoids creating a node if the value is a duplicate. 160 * @param value A final value. 161 * @param errorCode ICU in/out UErrorCode. 162 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. 163 * @return A FinalValueNode with the given value. 164 * @internal 165 */ 166 Node *registerFinalValue(int32_t value, UErrorCode &errorCode); 167 #endif /* U_HIDE_INTERNAL_API */ 168 169 /* 170 * C++ note: 171 * registerNode() and registerFinalValue() take ownership of their input nodes, 172 * and only return owned nodes. 173 * If they see a failure UErrorCode, they will delete the input node. 174 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR. 175 * If there is a failure, they return NULL. 176 * 177 * NULL Node pointers can be safely passed into other Nodes because 178 * they call the static Node::hashCode() which checks for a NULL pointer first. 179 * 180 * Therefore, as long as builder functions register a new node, 181 * they need to check for failures only before explicitly dereferencing 182 * a Node pointer, or before setting a new UErrorCode. 183 */ 184 185 // Hash set of nodes, maps from nodes to integer 1. 186 /** @internal */ 187 UHashtable *nodes; 188 189 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API, 190 // it is needed for layout of other objects. 191 /** @internal */ 192 class Node : public UObject { 193 public: Node(int32_t initialHash)194 Node(int32_t initialHash) : hash(initialHash), offset(0) {} hashCode()195 inline int32_t hashCode() const { return hash; } 196 // Handles node==NULL. hashCode(const Node * node)197 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); } 198 // Base class operator==() compares the actual class types. 199 virtual UBool operator==(const Node &other) const; 200 inline UBool operator!=(const Node &other) const { return !operator==(other); } 201 /** 202 * Traverses the Node graph and numbers branch edges, with rightmost edges first. 203 * This is to avoid writing a duplicate node twice. 204 * 205 * Branch nodes in this trie data structure are not symmetric. 206 * Most branch edges "jump" to other nodes but the rightmost branch edges 207 * just continue without a jump. 208 * Therefore, write() must write the rightmost branch edge last 209 * (trie units are written backwards), and must write it at that point even if 210 * it is a duplicate of a node previously written elsewhere. 211 * 212 * This function visits and marks right branch edges first. 213 * Edges are numbered with increasingly negative values because we share the 214 * offset field which gets positive values when nodes are written. 215 * A branch edge also remembers the first number for any of its edges. 216 * 217 * When a further-left branch edge has a number in the range of the rightmost 218 * edge's numbers, then it will be written as part of the required right edge 219 * and we can avoid writing it first. 220 * 221 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative 222 * edge numbers. 223 * 224 * @param edgeNumber The first edge number for this node and its sub-nodes. 225 * @return An edge number that is at least the maximum-negative 226 * of the input edge number and the numbers of this node and all of its sub-nodes. 227 */ 228 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 229 // write() must set the offset to a positive value. 230 virtual void write(StringTrieBuilder &builder) = 0; 231 // See markRightEdgesFirst. writeUnlessInsideRightEdge(int32_t firstRight,int32_t lastRight,StringTrieBuilder & builder)232 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight, 233 StringTrieBuilder &builder) { 234 // Note: Edge numbers are negative, lastRight<=firstRight. 235 // If offset>0 then this node and its sub-nodes have been written already 236 // and we need not write them again. 237 // If this node is part of the unwritten right branch edge, 238 // then we wait until that is written. 239 if(offset<0 && (offset<lastRight || firstRight<offset)) { 240 write(builder); 241 } 242 } getOffset()243 inline int32_t getOffset() const { return offset; } 244 protected: 245 int32_t hash; 246 int32_t offset; 247 }; 248 249 #ifndef U_HIDE_INTERNAL_API 250 // This class should not be overridden because 251 // registerFinalValue() compares a stack-allocated FinalValueNode 252 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes) 253 // with the input node, and the 254 // !Node::operator==(other) used inside FinalValueNode::operator==(other) 255 // will be false if the typeid's are different. 256 /** @internal */ 257 class FinalValueNode : public Node { 258 public: FinalValueNode(int32_t v)259 FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {} 260 virtual UBool operator==(const Node &other) const; 261 virtual void write(StringTrieBuilder &builder); 262 protected: 263 int32_t value; 264 }; 265 #endif /* U_HIDE_INTERNAL_API */ 266 267 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API, 268 // it is needed for layout of other objects. 269 /** 270 * @internal 271 */ 272 class ValueNode : public Node { 273 public: ValueNode(int32_t initialHash)274 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {} 275 virtual UBool operator==(const Node &other) const; setValue(int32_t v)276 void setValue(int32_t v) { 277 hasValue=TRUE; 278 value=v; 279 hash=hash*37+v; 280 } 281 protected: 282 UBool hasValue; 283 int32_t value; 284 }; 285 286 #ifndef U_HIDE_INTERNAL_API 287 /** 288 * @internal 289 */ 290 class IntermediateValueNode : public ValueNode { 291 public: IntermediateValueNode(int32_t v,Node * nextNode)292 IntermediateValueNode(int32_t v, Node *nextNode) 293 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); } 294 virtual UBool operator==(const Node &other) const; 295 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 296 virtual void write(StringTrieBuilder &builder); 297 protected: 298 Node *next; 299 }; 300 #endif /* U_HIDE_INTERNAL_API */ 301 302 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API, 303 // it is needed for layout of other objects. 304 /** 305 * @internal 306 */ 307 class LinearMatchNode : public ValueNode { 308 public: LinearMatchNode(int32_t len,Node * nextNode)309 LinearMatchNode(int32_t len, Node *nextNode) 310 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)), 311 length(len), next(nextNode) {} 312 virtual UBool operator==(const Node &other) const; 313 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 314 protected: 315 int32_t length; 316 Node *next; 317 }; 318 319 #ifndef U_HIDE_INTERNAL_API 320 /** 321 * @internal 322 */ 323 class BranchNode : public Node { 324 public: BranchNode(int32_t initialHash)325 BranchNode(int32_t initialHash) : Node(initialHash) {} 326 protected: 327 int32_t firstEdgeNumber; 328 }; 329 330 /** 331 * @internal 332 */ 333 class ListBranchNode : public BranchNode { 334 public: ListBranchNode()335 ListBranchNode() : BranchNode(0x444444), length(0) {} 336 virtual UBool operator==(const Node &other) const; 337 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 338 virtual void write(StringTrieBuilder &builder); 339 // Adds a unit with a final value. add(int32_t c,int32_t value)340 void add(int32_t c, int32_t value) { 341 units[length]=(UChar)c; 342 equal[length]=NULL; 343 values[length]=value; 344 ++length; 345 hash=(hash*37+c)*37+value; 346 } 347 // Adds a unit which leads to another match node. add(int32_t c,Node * node)348 void add(int32_t c, Node *node) { 349 units[length]=(UChar)c; 350 equal[length]=node; 351 values[length]=0; 352 ++length; 353 hash=(hash*37+c)*37+hashCode(node); 354 } 355 protected: 356 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value". 357 int32_t length; 358 int32_t values[kMaxBranchLinearSubNodeLength]; 359 UChar units[kMaxBranchLinearSubNodeLength]; 360 }; 361 362 /** 363 * @internal 364 */ 365 class SplitBranchNode : public BranchNode { 366 public: SplitBranchNode(UChar middleUnit,Node * lessThanNode,Node * greaterOrEqualNode)367 SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode) 368 : BranchNode(((0x555555*37+middleUnit)*37+ 369 hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)), 370 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {} 371 virtual UBool operator==(const Node &other) const; 372 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 373 virtual void write(StringTrieBuilder &builder); 374 protected: 375 UChar unit; 376 Node *lessThan; 377 Node *greaterOrEqual; 378 }; 379 380 // Branch head node, for writing the actual node lead unit. 381 /** @internal */ 382 class BranchHeadNode : public ValueNode { 383 public: BranchHeadNode(int32_t len,Node * subNode)384 BranchHeadNode(int32_t len, Node *subNode) 385 : ValueNode((0x666666*37+len)*37+hashCode(subNode)), 386 length(len), next(subNode) {} 387 virtual UBool operator==(const Node &other) const; 388 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 389 virtual void write(StringTrieBuilder &builder); 390 protected: 391 int32_t length; 392 Node *next; // A branch sub-node. 393 }; 394 #endif /* U_HIDE_INTERNAL_API */ 395 396 /** @internal */ 397 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, 398 Node *nextNode) const = 0; 399 400 /** @internal */ 401 virtual int32_t write(int32_t unit) = 0; 402 /** @internal */ 403 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0; 404 /** @internal */ 405 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0; 406 /** @internal */ 407 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0; 408 /** @internal */ 409 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0; 410 }; 411 412 U_NAMESPACE_END 413 414 #endif // __STRINGTRIEBUILDER_H__ 415