1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2010-2012, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   file name:  bytestrie.h
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2010sep25
14 *   created by: Markus W. Scherer
15 */
16 
17 #ifndef __BYTESTRIE_H__
18 #define __BYTESTRIE_H__
19 
20 /**
21  * \file
22  * \brief C++ API: Trie for mapping byte sequences to integer values.
23  */
24 
25 #include "unicode/utypes.h"
26 #include "unicode/stringpiece.h"
27 #include "unicode/uobject.h"
28 #include "unicode/ustringtrie.h"
29 
30 U_NAMESPACE_BEGIN
31 
32 class ByteSink;
33 class BytesTrieBuilder;
34 class CharString;
35 class UVector32;
36 
37 /**
38  * Light-weight, non-const reader class for a BytesTrie.
39  * Traverses a byte-serialized data structure with minimal state,
40  * for mapping byte sequences to non-negative integer values.
41  *
42  * This class owns the serialized trie data only if it was constructed by
43  * the builder's build() method.
44  * The public constructor and the copy constructor only alias the data (only copy the pointer).
45  * There is no assignment operator.
46  *
47  * This class is not intended for public subclassing.
48  * @stable ICU 4.8
49  */
50 class U_COMMON_API BytesTrie : public UMemory {
51 public:
52     /**
53      * Constructs a BytesTrie reader instance.
54      *
55      * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder,
56      * starting with the first byte of that sequence.
57      * The BytesTrie object will not read more bytes than
58      * the BytesTrieBuilder generated in the corresponding build() call.
59      *
60      * The array is not copied/cloned and must not be modified while
61      * the BytesTrie object is in use.
62      *
63      * @param trieBytes The byte array that contains the serialized trie.
64      * @stable ICU 4.8
65      */
BytesTrie(const void * trieBytes)66     BytesTrie(const void *trieBytes)
67             : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),
68               pos_(bytes_), remainingMatchLength_(-1) {}
69 
70     /**
71      * Destructor.
72      * @stable ICU 4.8
73      */
74     ~BytesTrie();
75 
76     /**
77      * Copy constructor, copies the other trie reader object and its state,
78      * but not the byte array which will be shared. (Shallow copy.)
79      * @param other Another BytesTrie object.
80      * @stable ICU 4.8
81      */
BytesTrie(const BytesTrie & other)82     BytesTrie(const BytesTrie &other)
83             : ownedArray_(NULL), bytes_(other.bytes_),
84               pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
85 
86     /**
87      * Resets this trie to its initial state.
88      * @return *this
89      * @stable ICU 4.8
90      */
reset()91     BytesTrie &reset() {
92         pos_=bytes_;
93         remainingMatchLength_=-1;
94         return *this;
95     }
96 
97     /**
98      * BytesTrie state object, for saving a trie's current state
99      * and resetting the trie back to this state later.
100      * @stable ICU 4.8
101      */
102     class State : public UMemory {
103     public:
104         /**
105          * Constructs an empty State.
106          * @stable ICU 4.8
107          */
State()108         State() { bytes=NULL; }
109     private:
110         friend class BytesTrie;
111 
112         const uint8_t *bytes;
113         const uint8_t *pos;
114         int32_t remainingMatchLength;
115     };
116 
117     /**
118      * Saves the state of this trie.
119      * @param state The State object to hold the trie's state.
120      * @return *this
121      * @see resetToState
122      * @stable ICU 4.8
123      */
saveState(State & state)124     const BytesTrie &saveState(State &state) const {
125         state.bytes=bytes_;
126         state.pos=pos_;
127         state.remainingMatchLength=remainingMatchLength_;
128         return *this;
129     }
130 
131     /**
132      * Resets this trie to the saved state.
133      * If the state object contains no state, or the state of a different trie,
134      * then this trie remains unchanged.
135      * @param state The State object which holds a saved trie state.
136      * @return *this
137      * @see saveState
138      * @see reset
139      * @stable ICU 4.8
140      */
resetToState(const State & state)141     BytesTrie &resetToState(const State &state) {
142         if(bytes_==state.bytes && bytes_!=NULL) {
143             pos_=state.pos;
144             remainingMatchLength_=state.remainingMatchLength;
145         }
146         return *this;
147     }
148 
149     /**
150      * Determines whether the byte sequence so far matches, whether it has a value,
151      * and whether another input byte can continue a matching byte sequence.
152      * @return The match/value Result.
153      * @stable ICU 4.8
154      */
155     UStringTrieResult current() const;
156 
157     /**
158      * Traverses the trie from the initial state for this input byte.
159      * Equivalent to reset().next(inByte).
160      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
161      *               Values below -0x100 and above 0xff will never match.
162      * @return The match/value Result.
163      * @stable ICU 4.8
164      */
first(int32_t inByte)165     inline UStringTrieResult first(int32_t inByte) {
166         remainingMatchLength_=-1;
167         if(inByte<0) {
168             inByte+=0x100;
169         }
170         return nextImpl(bytes_, inByte);
171     }
172 
173     /**
174      * Traverses the trie from the current state for this input byte.
175      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
176      *               Values below -0x100 and above 0xff will never match.
177      * @return The match/value Result.
178      * @stable ICU 4.8
179      */
180     UStringTrieResult next(int32_t inByte);
181 
182     /**
183      * Traverses the trie from the current state for this byte sequence.
184      * Equivalent to
185      * \code
186      * Result result=current();
187      * for(each c in s)
188      *   if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;
189      *   result=next(c);
190      * return result;
191      * \endcode
192      * @param s A string or byte sequence. Can be NULL if length is 0.
193      * @param length The length of the byte sequence. Can be -1 if NUL-terminated.
194      * @return The match/value Result.
195      * @stable ICU 4.8
196      */
197     UStringTrieResult next(const char *s, int32_t length);
198 
199     /**
200      * Returns a matching byte sequence's value if called immediately after
201      * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.
202      * getValue() can be called multiple times.
203      *
204      * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!
205      * @return The value for the byte sequence so far.
206      * @stable ICU 4.8
207      */
getValue()208     inline int32_t getValue() const {
209         const uint8_t *pos=pos_;
210         int32_t leadByte=*pos++;
211         // U_ASSERT(leadByte>=kMinValueLead);
212         return readValue(pos, leadByte>>1);
213     }
214 
215     /**
216      * Determines whether all byte sequences reachable from the current state
217      * map to the same value.
218      * @param uniqueValue Receives the unique value, if this function returns TRUE.
219      *                    (output-only)
220      * @return TRUE if all byte sequences reachable from the current state
221      *         map to the same value.
222      * @stable ICU 4.8
223      */
hasUniqueValue(int32_t & uniqueValue)224     inline UBool hasUniqueValue(int32_t &uniqueValue) const {
225         const uint8_t *pos=pos_;
226         // Skip the rest of a pending linear-match node.
227         return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
228     }
229 
230     /**
231      * Finds each byte which continues the byte sequence from the current state.
232      * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now.
233      * @param out Each next byte is appended to this object.
234      *            (Only uses the out.Append(s, length) method.)
235      * @return the number of bytes which continue the byte sequence from here
236      * @stable ICU 4.8
237      */
238     int32_t getNextBytes(ByteSink &out) const;
239 
240     /**
241      * Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
242      * @stable ICU 4.8
243      */
244     class U_COMMON_API Iterator : public UMemory {
245     public:
246         /**
247          * Iterates from the root of a byte-serialized BytesTrie.
248          * @param trieBytes The trie bytes.
249          * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
250          *                        Otherwise, the iterator returns strings with this maximum length.
251          * @param errorCode Standard ICU error code. Its input value must
252          *                  pass the U_SUCCESS() test, or else the function returns
253          *                  immediately. Check for U_FAILURE() on output or use with
254          *                  function chaining. (See User Guide for details.)
255          * @stable ICU 4.8
256          */
257         Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
258 
259         /**
260          * Iterates from the current state of the specified BytesTrie.
261          * @param trie The trie whose state will be copied for iteration.
262          * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
263          *                        Otherwise, the iterator returns strings with this maximum length.
264          * @param errorCode Standard ICU error code. Its input value must
265          *                  pass the U_SUCCESS() test, or else the function returns
266          *                  immediately. Check for U_FAILURE() on output or use with
267          *                  function chaining. (See User Guide for details.)
268          * @stable ICU 4.8
269          */
270         Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
271 
272         /**
273          * Destructor.
274          * @stable ICU 4.8
275          */
276         ~Iterator();
277 
278         /**
279          * Resets this iterator to its initial state.
280          * @return *this
281          * @stable ICU 4.8
282          */
283         Iterator &reset();
284 
285         /**
286          * @return TRUE if there are more elements.
287          * @stable ICU 4.8
288          */
289         UBool hasNext() const;
290 
291         /**
292          * Finds the next (byte sequence, value) pair if there is one.
293          *
294          * If the byte sequence is truncated to the maximum length and does not
295          * have a real value, then the value is set to -1.
296          * In this case, this "not a real value" is indistinguishable from
297          * a real value of -1.
298          * @param errorCode Standard ICU error code. Its input value must
299          *                  pass the U_SUCCESS() test, or else the function returns
300          *                  immediately. Check for U_FAILURE() on output or use with
301          *                  function chaining. (See User Guide for details.)
302          * @return TRUE if there is another element.
303          * @stable ICU 4.8
304          */
305         UBool next(UErrorCode &errorCode);
306 
307         /**
308          * @return The NUL-terminated byte sequence for the last successful next().
309          * @stable ICU 4.8
310          */
311         StringPiece getString() const;
312         /**
313          * @return The value for the last successful next().
314          * @stable ICU 4.8
315          */
getValue()316         int32_t getValue() const { return value_; }
317 
318     private:
319         UBool truncateAndStop();
320 
321         const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
322 
323         const uint8_t *bytes_;
324         const uint8_t *pos_;
325         const uint8_t *initialPos_;
326         int32_t remainingMatchLength_;
327         int32_t initialRemainingMatchLength_;
328 
329         CharString *str_;
330         int32_t maxLength_;
331         int32_t value_;
332 
333         // The stack stores pairs of integers for backtracking to another
334         // outbound edge of a branch node.
335         // The first integer is an offset from bytes_.
336         // The second integer has the str_->length() from before the node in bits 15..0,
337         // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
338         // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
339         // but the code looks more confusing that way.)
340         UVector32 *stack_;
341     };
342 
343 private:
344     friend class BytesTrieBuilder;
345 
346     /**
347      * Constructs a BytesTrie reader instance.
348      * Unlike the public constructor which just aliases an array,
349      * this constructor adopts the builder's array.
350      * This constructor is only called by the builder.
351      */
BytesTrie(void * adoptBytes,const void * trieBytes)352     BytesTrie(void *adoptBytes, const void *trieBytes)
353             : ownedArray_(static_cast<uint8_t *>(adoptBytes)),
354               bytes_(static_cast<const uint8_t *>(trieBytes)),
355               pos_(bytes_), remainingMatchLength_(-1) {}
356 
357     // No assignment operator.
358     BytesTrie &operator=(const BytesTrie &other);
359 
stop()360     inline void stop() {
361         pos_=NULL;
362     }
363 
364     // Reads a compact 32-bit integer.
365     // pos is already after the leadByte, and the lead byte is already shifted right by 1.
366     static int32_t readValue(const uint8_t *pos, int32_t leadByte);
skipValue(const uint8_t * pos,int32_t leadByte)367     static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
368         // U_ASSERT(leadByte>=kMinValueLead);
369         if(leadByte>=(kMinTwoByteValueLead<<1)) {
370             if(leadByte<(kMinThreeByteValueLead<<1)) {
371                 ++pos;
372             } else if(leadByte<(kFourByteValueLead<<1)) {
373                 pos+=2;
374             } else {
375                 pos+=3+((leadByte>>1)&1);
376             }
377         }
378         return pos;
379     }
skipValue(const uint8_t * pos)380     static inline const uint8_t *skipValue(const uint8_t *pos) {
381         int32_t leadByte=*pos++;
382         return skipValue(pos, leadByte);
383     }
384 
385     // Reads a jump delta and jumps.
386     static const uint8_t *jumpByDelta(const uint8_t *pos);
387 
skipDelta(const uint8_t * pos)388     static inline const uint8_t *skipDelta(const uint8_t *pos) {
389         int32_t delta=*pos++;
390         if(delta>=kMinTwoByteDeltaLead) {
391             if(delta<kMinThreeByteDeltaLead) {
392                 ++pos;
393             } else if(delta<kFourByteDeltaLead) {
394                 pos+=2;
395             } else {
396                 pos+=3+(delta&1);
397             }
398         }
399         return pos;
400     }
401 
valueResult(int32_t node)402     static inline UStringTrieResult valueResult(int32_t node) {
403         return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
404     }
405 
406     // Handles a branch node for both next(byte) and next(string).
407     UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
408 
409     // Requires remainingLength_<0.
410     UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
411 
412     // Helper functions for hasUniqueValue().
413     // Recursively finds a unique value (or whether there is not a unique one)
414     // from a branch.
415     static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
416                                                     UBool haveUniqueValue, int32_t &uniqueValue);
417     // Recursively finds a unique value (or whether there is not a unique one)
418     // starting from a position on a node lead byte.
419     static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
420 
421     // Helper functions for getNextBytes().
422     // getNextBytes() when pos is on a branch node.
423     static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
424     static void append(ByteSink &out, int c);
425 
426     // BytesTrie data structure
427     //
428     // The trie consists of a series of byte-serialized nodes for incremental
429     // string/byte sequence matching. The root node is at the beginning of the trie data.
430     //
431     // Types of nodes are distinguished by their node lead byte ranges.
432     // After each node, except a final-value node, another node follows to
433     // encode match values or continue matching further bytes.
434     //
435     // Node types:
436     //  - Value node: Stores a 32-bit integer in a compact, variable-length format.
437     //    The value is for the string/byte sequence so far.
438     //    One node bit indicates whether the value is final or whether
439     //    matching continues with the next node.
440     //  - Linear-match node: Matches a number of bytes.
441     //  - Branch node: Branches to other nodes according to the current input byte.
442     //    The node byte is the length of the branch (number of bytes to select from)
443     //    minus 1. It is followed by a sub-node:
444     //    - If the length is at most kMaxBranchLinearSubNodeLength, then
445     //      there are length-1 (key, value) pairs and then one more comparison byte.
446     //      If one of the key bytes matches, then the value is either a final value for
447     //      the string/byte sequence so far, or a "jump" delta to the next node.
448     //      If the last byte matches, then matching continues with the next node.
449     //      (Values have the same encoding as value nodes.)
450     //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
451     //      there is one byte and one "jump" delta.
452     //      If the input byte is less than the sub-node byte, then "jump" by delta to
453     //      the next sub-node which will have a length of length/2.
454     //      (The delta has its own compact encoding.)
455     //      Otherwise, skip the "jump" delta to the next sub-node
456     //      which will have a length of length-length/2.
457 
458     // Node lead byte values.
459 
460     // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
461     // the length is one more than the next byte.
462 
463     // For a branch sub-node with at most this many entries, we drop down
464     // to a linear search.
465     static const int32_t kMaxBranchLinearSubNodeLength=5;
466 
467     // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
468     static const int32_t kMinLinearMatch=0x10;
469     static const int32_t kMaxLinearMatchLength=0x10;
470 
471     // 20..ff: Variable-length value node.
472     // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
473     // Then shift-right by 1 bit.
474     // The remaining lead byte value indicates the number of following bytes (0..4)
475     // and contains the value's top bits.
476     static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x20
477     // It is a final value if bit 0 is set.
478     static const int32_t kValueIsFinal=1;
479 
480     // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
481     static const int32_t kMinOneByteValueLead=kMinValueLead/2;  // 0x10
482     static const int32_t kMaxOneByteValue=0x40;  // At least 6 bits in the first byte.
483 
484     static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;  // 0x51
485     static const int32_t kMaxTwoByteValue=0x1aff;
486 
487     static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;  // 0x6c
488     static const int32_t kFourByteValueLead=0x7e;
489 
490     // A little more than Unicode code points. (0x11ffff)
491     static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
492 
493     static const int32_t kFiveByteValueLead=0x7f;
494 
495     // Compact delta integers.
496     static const int32_t kMaxOneByteDelta=0xbf;
497     static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1;  // 0xc0
498     static const int32_t kMinThreeByteDeltaLead=0xf0;
499     static const int32_t kFourByteDeltaLead=0xfe;
500     static const int32_t kFiveByteDeltaLead=0xff;
501 
502     static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;  // 0x2fff
503     static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;  // 0xdffff
504 
505     uint8_t *ownedArray_;
506 
507     // Fixed value referencing the BytesTrie bytes.
508     const uint8_t *bytes_;
509 
510     // Iterator variables.
511 
512     // Pointer to next trie byte to read. NULL if no more matches.
513     const uint8_t *pos_;
514     // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
515     int32_t remainingMatchLength_;
516 };
517 
518 U_NAMESPACE_END
519 
520 #endif  // __BYTESTRIE_H__
521