1 /* NOLINT(build/header_guard) */
2 /* Copyright 2016 Google Inc. All Rights Reserved.
3 
4    Distributed under MIT license.
5    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
6 */
7 
8 /* template parameters: FN, BUCKET_BITS, MAX_TREE_COMP_LENGTH,
9                         MAX_TREE_SEARCH_DEPTH */
10 
11 /* A (forgetful) hash table where each hash bucket contains a binary tree of
12    sequences whose first 4 bytes share the same hash code.
13    Each sequence is MAX_TREE_COMP_LENGTH long and is identified by its starting
14    position in the input data. The binary tree is sorted by the lexicographic
15    order of the sequences, and it is also a max-heap with respect to the
16    starting positions. */
17 
18 #define HashToBinaryTree HASHER()
19 
20 #define BUCKET_SIZE (1 << BUCKET_BITS)
21 
FN(HashTypeLength)22 static BROTLI_INLINE size_t FN(HashTypeLength)(void) { return 4; }
FN(StoreLookahead)23 static BROTLI_INLINE size_t FN(StoreLookahead)(void) {
24   return MAX_TREE_COMP_LENGTH;
25 }
26 
FN(HashBytes)27 static uint32_t FN(HashBytes)(const uint8_t* BROTLI_RESTRICT data) {
28   uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kHashMul32;
29   /* The higher bits contain more mixture from the multiplication,
30      so we take our results from there. */
31   return h >> (32 - BUCKET_BITS);
32 }
33 
34 typedef struct HashToBinaryTree {
35   /* The window size minus 1 */
36   size_t window_mask_;
37 
38   /* Hash table that maps the 4-byte hashes of the sequence to the last
39      position where this hash was found, which is the root of the binary
40      tree of sequences that share this hash bucket. */
41   uint32_t* buckets_;  /* uint32_t[BUCKET_SIZE]; */
42 
43   /* A position used to mark a non-existent sequence, i.e. a tree is empty if
44      its root is at invalid_pos_ and a node is a leaf if both its children
45      are at invalid_pos_. */
46   uint32_t invalid_pos_;
47 
48   /* --- Dynamic size members --- */
49 
50   /* The union of the binary trees of each hash bucket. The root of the tree
51      corresponding to a hash is a sequence starting at buckets_[hash] and
52      the left and right children of a sequence starting at pos are
53      forest_[2 * pos] and forest_[2 * pos + 1]. */
54   uint32_t* forest_;  /* uint32_t[2 * num_nodes] */
55 } HashToBinaryTree;
56 
FN(Initialize)57 static void FN(Initialize)(
58     HasherCommon* common, HashToBinaryTree* BROTLI_RESTRICT self,
59     const BrotliEncoderParams* params) {
60   self->buckets_ = (uint32_t*)common->extra;
61   self->forest_ = &self->buckets_[BUCKET_SIZE];
62 
63   self->window_mask_ = (1u << params->lgwin) - 1u;
64   self->invalid_pos_ = (uint32_t)(0 - self->window_mask_);
65 }
66 
FN(Prepare)67 static void FN(Prepare)
68     (HashToBinaryTree* BROTLI_RESTRICT self, BROTLI_BOOL one_shot,
69     size_t input_size, const uint8_t* BROTLI_RESTRICT data) {
70   uint32_t invalid_pos = self->invalid_pos_;
71   uint32_t i;
72   uint32_t* BROTLI_RESTRICT buckets = self->buckets_;
73   BROTLI_UNUSED(data);
74   BROTLI_UNUSED(one_shot);
75   BROTLI_UNUSED(input_size);
76   for (i = 0; i < BUCKET_SIZE; i++) {
77     buckets[i] = invalid_pos;
78   }
79 }
80 
FN(HashMemAllocInBytes)81 static BROTLI_INLINE size_t FN(HashMemAllocInBytes)(
82     const BrotliEncoderParams* params, BROTLI_BOOL one_shot,
83     size_t input_size) {
84   size_t num_nodes = (size_t)1 << params->lgwin;
85   if (one_shot && input_size < num_nodes) {
86     num_nodes = input_size;
87   }
88   return sizeof(uint32_t) * BUCKET_SIZE + 2 * sizeof(uint32_t) * num_nodes;
89 }
90 
FN(LeftChildIndex)91 static BROTLI_INLINE size_t FN(LeftChildIndex)(
92     HashToBinaryTree* BROTLI_RESTRICT self,
93     const size_t pos) {
94   return 2 * (pos & self->window_mask_);
95 }
96 
FN(RightChildIndex)97 static BROTLI_INLINE size_t FN(RightChildIndex)(
98     HashToBinaryTree* BROTLI_RESTRICT self,
99     const size_t pos) {
100   return 2 * (pos & self->window_mask_) + 1;
101 }
102 
103 /* Stores the hash of the next 4 bytes and in a single tree-traversal, the
104    hash bucket's binary tree is searched for matches and is re-rooted at the
105    current position.
106 
107    If less than MAX_TREE_COMP_LENGTH data is available, the hash bucket of the
108    current position is searched for matches, but the state of the hash table
109    is not changed, since we can not know the final sorting order of the
110    current (incomplete) sequence.
111 
112    This function must be called with increasing cur_ix positions. */
FN(StoreAndFindMatches)113 static BROTLI_INLINE BackwardMatch* FN(StoreAndFindMatches)(
114     HashToBinaryTree* BROTLI_RESTRICT self, const uint8_t* BROTLI_RESTRICT data,
115     const size_t cur_ix, const size_t ring_buffer_mask, const size_t max_length,
116     const size_t max_backward, size_t* const BROTLI_RESTRICT best_len,
117     BackwardMatch* BROTLI_RESTRICT matches) {
118   const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
119   const size_t max_comp_len =
120       BROTLI_MIN(size_t, max_length, MAX_TREE_COMP_LENGTH);
121   const BROTLI_BOOL should_reroot_tree =
122       TO_BROTLI_BOOL(max_length >= MAX_TREE_COMP_LENGTH);
123   const uint32_t key = FN(HashBytes)(&data[cur_ix_masked]);
124   uint32_t* BROTLI_RESTRICT buckets = self->buckets_;
125   uint32_t* BROTLI_RESTRICT forest = self->forest_;
126   size_t prev_ix = buckets[key];
127   /* The forest index of the rightmost node of the left subtree of the new
128      root, updated as we traverse and re-root the tree of the hash bucket. */
129   size_t node_left = FN(LeftChildIndex)(self, cur_ix);
130   /* The forest index of the leftmost node of the right subtree of the new
131      root, updated as we traverse and re-root the tree of the hash bucket. */
132   size_t node_right = FN(RightChildIndex)(self, cur_ix);
133   /* The match length of the rightmost node of the left subtree of the new
134      root, updated as we traverse and re-root the tree of the hash bucket. */
135   size_t best_len_left = 0;
136   /* The match length of the leftmost node of the right subtree of the new
137      root, updated as we traverse and re-root the tree of the hash bucket. */
138   size_t best_len_right = 0;
139   size_t depth_remaining;
140   if (should_reroot_tree) {
141     buckets[key] = (uint32_t)cur_ix;
142   }
143   for (depth_remaining = MAX_TREE_SEARCH_DEPTH; ; --depth_remaining) {
144     const size_t backward = cur_ix - prev_ix;
145     const size_t prev_ix_masked = prev_ix & ring_buffer_mask;
146     if (backward == 0 || backward > max_backward || depth_remaining == 0) {
147       if (should_reroot_tree) {
148         forest[node_left] = self->invalid_pos_;
149         forest[node_right] = self->invalid_pos_;
150       }
151       break;
152     }
153     {
154       const size_t cur_len = BROTLI_MIN(size_t, best_len_left, best_len_right);
155       size_t len;
156       BROTLI_DCHECK(cur_len <= MAX_TREE_COMP_LENGTH);
157       len = cur_len +
158           FindMatchLengthWithLimit(&data[cur_ix_masked + cur_len],
159                                    &data[prev_ix_masked + cur_len],
160                                    max_length - cur_len);
161       BROTLI_DCHECK(
162           0 == memcmp(&data[cur_ix_masked], &data[prev_ix_masked], len));
163       if (matches && len > *best_len) {
164         *best_len = len;
165         InitBackwardMatch(matches++, backward, len);
166       }
167       if (len >= max_comp_len) {
168         if (should_reroot_tree) {
169           forest[node_left] = forest[FN(LeftChildIndex)(self, prev_ix)];
170           forest[node_right] = forest[FN(RightChildIndex)(self, prev_ix)];
171         }
172         break;
173       }
174       if (data[cur_ix_masked + len] > data[prev_ix_masked + len]) {
175         best_len_left = len;
176         if (should_reroot_tree) {
177           forest[node_left] = (uint32_t)prev_ix;
178         }
179         node_left = FN(RightChildIndex)(self, prev_ix);
180         prev_ix = forest[node_left];
181       } else {
182         best_len_right = len;
183         if (should_reroot_tree) {
184           forest[node_right] = (uint32_t)prev_ix;
185         }
186         node_right = FN(LeftChildIndex)(self, prev_ix);
187         prev_ix = forest[node_right];
188       }
189     }
190   }
191   return matches;
192 }
193 
194 /* Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the
195    length of max_length and stores the position cur_ix in the hash table.
196 
197    Sets *num_matches to the number of matches found, and stores the found
198    matches in matches[0] to matches[*num_matches - 1]. The matches will be
199    sorted by strictly increasing length and (non-strictly) increasing
200    distance. */
FN(FindAllMatches)201 static BROTLI_INLINE size_t FN(FindAllMatches)(
202     HashToBinaryTree* BROTLI_RESTRICT self,
203     const BrotliEncoderDictionary* dictionary,
204     const uint8_t* BROTLI_RESTRICT data,
205     const size_t ring_buffer_mask, const size_t cur_ix,
206     const size_t max_length, const size_t max_backward,
207     const size_t dictionary_distance, const BrotliEncoderParams* params,
208     BackwardMatch* matches) {
209   BackwardMatch* const orig_matches = matches;
210   const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
211   size_t best_len = 1;
212   const size_t short_match_max_backward =
213       params->quality != HQ_ZOPFLIFICATION_QUALITY ? 16 : 64;
214   size_t stop = cur_ix - short_match_max_backward;
215   uint32_t dict_matches[BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN + 1];
216   size_t i;
217   if (cur_ix < short_match_max_backward) { stop = 0; }
218   for (i = cur_ix - 1; i > stop && best_len <= 2; --i) {
219     size_t prev_ix = i;
220     const size_t backward = cur_ix - prev_ix;
221     if (BROTLI_PREDICT_FALSE(backward > max_backward)) {
222       break;
223     }
224     prev_ix &= ring_buffer_mask;
225     if (data[cur_ix_masked] != data[prev_ix] ||
226         data[cur_ix_masked + 1] != data[prev_ix + 1]) {
227       continue;
228     }
229     {
230       const size_t len =
231           FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
232                                    max_length);
233       if (len > best_len) {
234         best_len = len;
235         InitBackwardMatch(matches++, backward, len);
236       }
237     }
238   }
239   if (best_len < max_length) {
240     matches = FN(StoreAndFindMatches)(self, data, cur_ix,
241         ring_buffer_mask, max_length, max_backward, &best_len, matches);
242   }
243   for (i = 0; i <= BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN; ++i) {
244     dict_matches[i] = kInvalidMatch;
245   }
246   {
247     size_t minlen = BROTLI_MAX(size_t, 4, best_len + 1);
248     if (BrotliFindAllStaticDictionaryMatches(dictionary,
249         &data[cur_ix_masked], minlen, max_length, &dict_matches[0])) {
250       size_t maxlen = BROTLI_MIN(
251           size_t, BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN, max_length);
252       size_t l;
253       for (l = minlen; l <= maxlen; ++l) {
254         uint32_t dict_id = dict_matches[l];
255         if (dict_id < kInvalidMatch) {
256           size_t distance = dictionary_distance + (dict_id >> 5) + 1;
257           if (distance <= params->dist.max_distance) {
258             InitDictionaryBackwardMatch(matches++, distance, l, dict_id & 31);
259           }
260         }
261       }
262     }
263   }
264   return (size_t)(matches - orig_matches);
265 }
266 
267 /* Stores the hash of the next 4 bytes and re-roots the binary tree at the
268    current sequence, without returning any matches.
269    REQUIRES: ix + MAX_TREE_COMP_LENGTH <= end-of-current-block */
FN(Store)270 static BROTLI_INLINE void FN(Store)(HashToBinaryTree* BROTLI_RESTRICT self,
271     const uint8_t* BROTLI_RESTRICT data,
272     const size_t mask, const size_t ix) {
273   /* Maximum distance is window size - 16, see section 9.1. of the spec. */
274   const size_t max_backward = self->window_mask_ - BROTLI_WINDOW_GAP + 1;
275   FN(StoreAndFindMatches)(self, data, ix, mask, MAX_TREE_COMP_LENGTH,
276       max_backward, NULL, NULL);
277 }
278 
FN(StoreRange)279 static BROTLI_INLINE void FN(StoreRange)(HashToBinaryTree* BROTLI_RESTRICT self,
280     const uint8_t* BROTLI_RESTRICT data, const size_t mask,
281     const size_t ix_start, const size_t ix_end) {
282   size_t i = ix_start;
283   size_t j = ix_start;
284   if (ix_start + 63 <= ix_end) {
285     i = ix_end - 63;
286   }
287   if (ix_start + 512 <= i) {
288     for (; j < i; j += 8) {
289       FN(Store)(self, data, mask, j);
290     }
291   }
292   for (; i < ix_end; ++i) {
293     FN(Store)(self, data, mask, i);
294   }
295 }
296 
FN(StitchToPreviousBlock)297 static BROTLI_INLINE void FN(StitchToPreviousBlock)(
298     HashToBinaryTree* BROTLI_RESTRICT self,
299     size_t num_bytes, size_t position, const uint8_t* ringbuffer,
300     size_t ringbuffer_mask) {
301   if (num_bytes >= FN(HashTypeLength)() - 1 &&
302       position >= MAX_TREE_COMP_LENGTH) {
303     /* Store the last `MAX_TREE_COMP_LENGTH - 1` positions in the hasher.
304        These could not be calculated before, since they require knowledge
305        of both the previous and the current block. */
306     const size_t i_start = position - MAX_TREE_COMP_LENGTH + 1;
307     const size_t i_end = BROTLI_MIN(size_t, position, i_start + num_bytes);
308     size_t i;
309     for (i = i_start; i < i_end; ++i) {
310       /* Maximum distance is window size - 16, see section 9.1. of the spec.
311          Furthermore, we have to make sure that we don't look further back
312          from the start of the next block than the window size, otherwise we
313          could access already overwritten areas of the ring-buffer. */
314       const size_t max_backward =
315           self->window_mask_ - BROTLI_MAX(size_t,
316                                           BROTLI_WINDOW_GAP - 1,
317                                           position - i);
318       /* We know that i + MAX_TREE_COMP_LENGTH <= position + num_bytes, i.e. the
319          end of the current block and that we have at least
320          MAX_TREE_COMP_LENGTH tail in the ring-buffer. */
321       FN(StoreAndFindMatches)(self, ringbuffer, i, ringbuffer_mask,
322           MAX_TREE_COMP_LENGTH, max_backward, NULL, NULL);
323     }
324   }
325 }
326 
327 #undef BUCKET_SIZE
328 
329 #undef HashToBinaryTree
330