1 /*
2 *******************************************************************************
3 *   Copyright (C) 2010-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   file name:  bytestrie.h
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2010sep25
12 *   created by: Markus W. Scherer
13 */
14 
15 #ifndef __BYTESTRIE_H__
16 #define __BYTESTRIE_H__
17 
18 /**
19  * \file
20  * \brief C++ API: Trie for mapping byte sequences to integer values.
21  */
22 
23 #include "unicode/utypes.h"
24 #include "unicode/stringpiece.h"
25 #include "unicode/uobject.h"
26 #include "unicode/ustringtrie.h"
27 
28 U_NAMESPACE_BEGIN
29 
30 class ByteSink;
31 class BytesTrieBuilder;
32 class CharString;
33 class UVector32;
34 
35 /**
36  * Light-weight, non-const reader class for a BytesTrie.
37  * Traverses a byte-serialized data structure with minimal state,
38  * for mapping byte sequences to non-negative integer values.
39  *
40  * This class owns the serialized trie data only if it was constructed by
41  * the builder's build() method.
42  * The public constructor and the copy constructor only alias the data (only copy the pointer).
43  * There is no assignment operator.
44  *
45  * This class is not intended for public subclassing.
46  * @stable ICU 4.8
47  */
48 class U_COMMON_API BytesTrie : public UMemory {
49 public:
50     /**
51      * Constructs a BytesTrie reader instance.
52      *
53      * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder,
54      * starting with the first byte of that sequence.
55      * The BytesTrie object will not read more bytes than
56      * the BytesTrieBuilder generated in the corresponding build() call.
57      *
58      * The array is not copied/cloned and must not be modified while
59      * the BytesTrie object is in use.
60      *
61      * @param trieBytes The byte array that contains the serialized trie.
62      * @stable ICU 4.8
63      */
BytesTrie(const void * trieBytes)64     BytesTrie(const void *trieBytes)
65             : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),
66               pos_(bytes_), remainingMatchLength_(-1) {}
67 
68     /**
69      * Destructor.
70      * @stable ICU 4.8
71      */
72     ~BytesTrie();
73 
74     /**
75      * Copy constructor, copies the other trie reader object and its state,
76      * but not the byte array which will be shared. (Shallow copy.)
77      * @param other Another BytesTrie object.
78      * @stable ICU 4.8
79      */
BytesTrie(const BytesTrie & other)80     BytesTrie(const BytesTrie &other)
81             : ownedArray_(NULL), bytes_(other.bytes_),
82               pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
83 
84     /**
85      * Resets this trie to its initial state.
86      * @return *this
87      * @stable ICU 4.8
88      */
reset()89     BytesTrie &reset() {
90         pos_=bytes_;
91         remainingMatchLength_=-1;
92         return *this;
93     }
94 
95     /**
96      * BytesTrie state object, for saving a trie's current state
97      * and resetting the trie back to this state later.
98      * @stable ICU 4.8
99      */
100     class State : public UMemory {
101     public:
102         /**
103          * Constructs an empty State.
104          * @stable ICU 4.8
105          */
State()106         State() { bytes=NULL; }
107     private:
108         friend class BytesTrie;
109 
110         const uint8_t *bytes;
111         const uint8_t *pos;
112         int32_t remainingMatchLength;
113     };
114 
115     /**
116      * Saves the state of this trie.
117      * @param state The State object to hold the trie's state.
118      * @return *this
119      * @see resetToState
120      * @stable ICU 4.8
121      */
saveState(State & state)122     const BytesTrie &saveState(State &state) const {
123         state.bytes=bytes_;
124         state.pos=pos_;
125         state.remainingMatchLength=remainingMatchLength_;
126         return *this;
127     }
128 
129     /**
130      * Resets this trie to the saved state.
131      * If the state object contains no state, or the state of a different trie,
132      * then this trie remains unchanged.
133      * @param state The State object which holds a saved trie state.
134      * @return *this
135      * @see saveState
136      * @see reset
137      * @stable ICU 4.8
138      */
resetToState(const State & state)139     BytesTrie &resetToState(const State &state) {
140         if(bytes_==state.bytes && bytes_!=NULL) {
141             pos_=state.pos;
142             remainingMatchLength_=state.remainingMatchLength;
143         }
144         return *this;
145     }
146 
147     /**
148      * Determines whether the byte sequence so far matches, whether it has a value,
149      * and whether another input byte can continue a matching byte sequence.
150      * @return The match/value Result.
151      * @stable ICU 4.8
152      */
153     UStringTrieResult current() const;
154 
155     /**
156      * Traverses the trie from the initial state for this input byte.
157      * Equivalent to reset().next(inByte).
158      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
159      *               Values below -0x100 and above 0xff will never match.
160      * @return The match/value Result.
161      * @stable ICU 4.8
162      */
first(int32_t inByte)163     inline UStringTrieResult first(int32_t inByte) {
164         remainingMatchLength_=-1;
165         if(inByte<0) {
166             inByte+=0x100;
167         }
168         return nextImpl(bytes_, inByte);
169     }
170 
171     /**
172      * Traverses the trie from the current state for this input byte.
173      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
174      *               Values below -0x100 and above 0xff will never match.
175      * @return The match/value Result.
176      * @stable ICU 4.8
177      */
178     UStringTrieResult next(int32_t inByte);
179 
180     /**
181      * Traverses the trie from the current state for this byte sequence.
182      * Equivalent to
183      * \code
184      * Result result=current();
185      * for(each c in s)
186      *   if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;
187      *   result=next(c);
188      * return result;
189      * \endcode
190      * @param s A string or byte sequence. Can be NULL if length is 0.
191      * @param length The length of the byte sequence. Can be -1 if NUL-terminated.
192      * @return The match/value Result.
193      * @stable ICU 4.8
194      */
195     UStringTrieResult next(const char *s, int32_t length);
196 
197     /**
198      * Returns a matching byte sequence's value if called immediately after
199      * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.
200      * getValue() can be called multiple times.
201      *
202      * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!
203      * @return The value for the byte sequence so far.
204      * @stable ICU 4.8
205      */
getValue()206     inline int32_t getValue() const {
207         const uint8_t *pos=pos_;
208         int32_t leadByte=*pos++;
209         // U_ASSERT(leadByte>=kMinValueLead);
210         return readValue(pos, leadByte>>1);
211     }
212 
213     /**
214      * Determines whether all byte sequences reachable from the current state
215      * map to the same value.
216      * @param uniqueValue Receives the unique value, if this function returns TRUE.
217      *                    (output-only)
218      * @return TRUE if all byte sequences reachable from the current state
219      *         map to the same value.
220      * @stable ICU 4.8
221      */
hasUniqueValue(int32_t & uniqueValue)222     inline UBool hasUniqueValue(int32_t &uniqueValue) const {
223         const uint8_t *pos=pos_;
224         // Skip the rest of a pending linear-match node.
225         return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
226     }
227 
228     /**
229      * Finds each byte which continues the byte sequence from the current state.
230      * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now.
231      * @param out Each next byte is appended to this object.
232      *            (Only uses the out.Append(s, length) method.)
233      * @return the number of bytes which continue the byte sequence from here
234      * @stable ICU 4.8
235      */
236     int32_t getNextBytes(ByteSink &out) const;
237 
238     /**
239      * Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
240      * @stable ICU 4.8
241      */
242     class U_COMMON_API Iterator : public UMemory {
243     public:
244         /**
245          * Iterates from the root of a byte-serialized BytesTrie.
246          * @param trieBytes The trie bytes.
247          * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
248          *                        Otherwise, the iterator returns strings with this maximum length.
249          * @param errorCode Standard ICU error code. Its input value must
250          *                  pass the U_SUCCESS() test, or else the function returns
251          *                  immediately. Check for U_FAILURE() on output or use with
252          *                  function chaining. (See User Guide for details.)
253          * @stable ICU 4.8
254          */
255         Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
256 
257         /**
258          * Iterates from the current state of the specified BytesTrie.
259          * @param trie The trie whose state will be copied for iteration.
260          * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
261          *                        Otherwise, the iterator returns strings with this maximum length.
262          * @param errorCode Standard ICU error code. Its input value must
263          *                  pass the U_SUCCESS() test, or else the function returns
264          *                  immediately. Check for U_FAILURE() on output or use with
265          *                  function chaining. (See User Guide for details.)
266          * @stable ICU 4.8
267          */
268         Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
269 
270         /**
271          * Destructor.
272          * @stable ICU 4.8
273          */
274         ~Iterator();
275 
276         /**
277          * Resets this iterator to its initial state.
278          * @return *this
279          * @stable ICU 4.8
280          */
281         Iterator &reset();
282 
283         /**
284          * @return TRUE if there are more elements.
285          * @stable ICU 4.8
286          */
287         UBool hasNext() const;
288 
289         /**
290          * Finds the next (byte sequence, value) pair if there is one.
291          *
292          * If the byte sequence is truncated to the maximum length and does not
293          * have a real value, then the value is set to -1.
294          * In this case, this "not a real value" is indistinguishable from
295          * a real value of -1.
296          * @param errorCode Standard ICU error code. Its input value must
297          *                  pass the U_SUCCESS() test, or else the function returns
298          *                  immediately. Check for U_FAILURE() on output or use with
299          *                  function chaining. (See User Guide for details.)
300          * @return TRUE if there is another element.
301          * @stable ICU 4.8
302          */
303         UBool next(UErrorCode &errorCode);
304 
305         /**
306          * @return The NUL-terminated byte sequence for the last successful next().
307          * @stable ICU 4.8
308          */
getString()309         const StringPiece &getString() const { return sp_; }
310         /**
311          * @return The value for the last successful next().
312          * @stable ICU 4.8
313          */
getValue()314         int32_t getValue() const { return value_; }
315 
316     private:
317         UBool truncateAndStop();
318 
319         const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
320 
321         const uint8_t *bytes_;
322         const uint8_t *pos_;
323         const uint8_t *initialPos_;
324         int32_t remainingMatchLength_;
325         int32_t initialRemainingMatchLength_;
326 
327         CharString *str_;
328         StringPiece sp_;
329         int32_t maxLength_;
330         int32_t value_;
331 
332         // The stack stores pairs of integers for backtracking to another
333         // outbound edge of a branch node.
334         // The first integer is an offset from bytes_.
335         // The second integer has the str_->length() from before the node in bits 15..0,
336         // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
337         // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
338         // but the code looks more confusing that way.)
339         UVector32 *stack_;
340     };
341 
342 private:
343     friend class BytesTrieBuilder;
344 
345     /**
346      * Constructs a BytesTrie reader instance.
347      * Unlike the public constructor which just aliases an array,
348      * this constructor adopts the builder's array.
349      * This constructor is only called by the builder.
350      */
BytesTrie(void * adoptBytes,const void * trieBytes)351     BytesTrie(void *adoptBytes, const void *trieBytes)
352             : ownedArray_(static_cast<uint8_t *>(adoptBytes)),
353               bytes_(static_cast<const uint8_t *>(trieBytes)),
354               pos_(bytes_), remainingMatchLength_(-1) {}
355 
356     // No assignment operator.
357     BytesTrie &operator=(const BytesTrie &other);
358 
stop()359     inline void stop() {
360         pos_=NULL;
361     }
362 
363     // Reads a compact 32-bit integer.
364     // pos is already after the leadByte, and the lead byte is already shifted right by 1.
365     static int32_t readValue(const uint8_t *pos, int32_t leadByte);
skipValue(const uint8_t * pos,int32_t leadByte)366     static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
367         // U_ASSERT(leadByte>=kMinValueLead);
368         if(leadByte>=(kMinTwoByteValueLead<<1)) {
369             if(leadByte<(kMinThreeByteValueLead<<1)) {
370                 ++pos;
371             } else if(leadByte<(kFourByteValueLead<<1)) {
372                 pos+=2;
373             } else {
374                 pos+=3+((leadByte>>1)&1);
375             }
376         }
377         return pos;
378     }
skipValue(const uint8_t * pos)379     static inline const uint8_t *skipValue(const uint8_t *pos) {
380         int32_t leadByte=*pos++;
381         return skipValue(pos, leadByte);
382     }
383 
384     // Reads a jump delta and jumps.
385     static const uint8_t *jumpByDelta(const uint8_t *pos);
386 
skipDelta(const uint8_t * pos)387     static inline const uint8_t *skipDelta(const uint8_t *pos) {
388         int32_t delta=*pos++;
389         if(delta>=kMinTwoByteDeltaLead) {
390             if(delta<kMinThreeByteDeltaLead) {
391                 ++pos;
392             } else if(delta<kFourByteDeltaLead) {
393                 pos+=2;
394             } else {
395                 pos+=3+(delta&1);
396             }
397         }
398         return pos;
399     }
400 
valueResult(int32_t node)401     static inline UStringTrieResult valueResult(int32_t node) {
402         return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
403     }
404 
405     // Handles a branch node for both next(byte) and next(string).
406     UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
407 
408     // Requires remainingLength_<0.
409     UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
410 
411     // Helper functions for hasUniqueValue().
412     // Recursively finds a unique value (or whether there is not a unique one)
413     // from a branch.
414     static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
415                                                     UBool haveUniqueValue, int32_t &uniqueValue);
416     // Recursively finds a unique value (or whether there is not a unique one)
417     // starting from a position on a node lead byte.
418     static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
419 
420     // Helper functions for getNextBytes().
421     // getNextBytes() when pos is on a branch node.
422     static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
423     static void append(ByteSink &out, int c);
424 
425     // BytesTrie data structure
426     //
427     // The trie consists of a series of byte-serialized nodes for incremental
428     // string/byte sequence matching. The root node is at the beginning of the trie data.
429     //
430     // Types of nodes are distinguished by their node lead byte ranges.
431     // After each node, except a final-value node, another node follows to
432     // encode match values or continue matching further bytes.
433     //
434     // Node types:
435     //  - Value node: Stores a 32-bit integer in a compact, variable-length format.
436     //    The value is for the string/byte sequence so far.
437     //    One node bit indicates whether the value is final or whether
438     //    matching continues with the next node.
439     //  - Linear-match node: Matches a number of bytes.
440     //  - Branch node: Branches to other nodes according to the current input byte.
441     //    The node byte is the length of the branch (number of bytes to select from)
442     //    minus 1. It is followed by a sub-node:
443     //    - If the length is at most kMaxBranchLinearSubNodeLength, then
444     //      there are length-1 (key, value) pairs and then one more comparison byte.
445     //      If one of the key bytes matches, then the value is either a final value for
446     //      the string/byte sequence so far, or a "jump" delta to the next node.
447     //      If the last byte matches, then matching continues with the next node.
448     //      (Values have the same encoding as value nodes.)
449     //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
450     //      there is one byte and one "jump" delta.
451     //      If the input byte is less than the sub-node byte, then "jump" by delta to
452     //      the next sub-node which will have a length of length/2.
453     //      (The delta has its own compact encoding.)
454     //      Otherwise, skip the "jump" delta to the next sub-node
455     //      which will have a length of length-length/2.
456 
457     // Node lead byte values.
458 
459     // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
460     // the length is one more than the next byte.
461 
462     // For a branch sub-node with at most this many entries, we drop down
463     // to a linear search.
464     static const int32_t kMaxBranchLinearSubNodeLength=5;
465 
466     // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
467     static const int32_t kMinLinearMatch=0x10;
468     static const int32_t kMaxLinearMatchLength=0x10;
469 
470     // 20..ff: Variable-length value node.
471     // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
472     // Then shift-right by 1 bit.
473     // The remaining lead byte value indicates the number of following bytes (0..4)
474     // and contains the value's top bits.
475     static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x20
476     // It is a final value if bit 0 is set.
477     static const int32_t kValueIsFinal=1;
478 
479     // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
480     static const int32_t kMinOneByteValueLead=kMinValueLead/2;  // 0x10
481     static const int32_t kMaxOneByteValue=0x40;  // At least 6 bits in the first byte.
482 
483     static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;  // 0x51
484     static const int32_t kMaxTwoByteValue=0x1aff;
485 
486     static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;  // 0x6c
487     static const int32_t kFourByteValueLead=0x7e;
488 
489     // A little more than Unicode code points. (0x11ffff)
490     static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
491 
492     static const int32_t kFiveByteValueLead=0x7f;
493 
494     // Compact delta integers.
495     static const int32_t kMaxOneByteDelta=0xbf;
496     static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1;  // 0xc0
497     static const int32_t kMinThreeByteDeltaLead=0xf0;
498     static const int32_t kFourByteDeltaLead=0xfe;
499     static const int32_t kFiveByteDeltaLead=0xff;
500 
501     static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;  // 0x2fff
502     static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;  // 0xdffff
503 
504     uint8_t *ownedArray_;
505 
506     // Fixed value referencing the BytesTrie bytes.
507     const uint8_t *bytes_;
508 
509     // Iterator variables.
510 
511     // Pointer to next trie byte to read. NULL if no more matches.
512     const uint8_t *pos_;
513     // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
514     int32_t remainingMatchLength_;
515 };
516 
517 U_NAMESPACE_END
518 
519 #endif  // __BYTESTRIE_H__
520