1 /* Copyright 2010 Google Inc. All Rights Reserved.
2 
3    Distributed under MIT license.
4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6 
7 /* A (forgetful) hash table to the data seen by the compressor, to
8    help create backward references to previous data. */
9 
10 #ifndef BROTLI_ENC_HASH_H_
11 #define BROTLI_ENC_HASH_H_
12 
13 #include <string.h>  /* memcmp, memset */
14 
15 #include "../common/constants.h"
16 #include "../common/dictionary.h"
17 #include "../common/platform.h"
18 #include <brotli/types.h>
19 #include "./encoder_dict.h"
20 #include "./fast_log.h"
21 #include "./find_match_length.h"
22 #include "./memory.h"
23 #include "./quality.h"
24 #include "./static_dict.h"
25 
26 #if defined(__cplusplus) || defined(c_plusplus)
27 extern "C" {
28 #endif
29 
30 /* Pointer to hasher data.
31  *
32  * Excluding initialization and destruction, hasher can be passed as
33  * HasherHandle by value.
34  *
35  * Typically hasher data consists of 3 sections:
36  * * HasherCommon structure
37  * * private structured hasher data, depending on hasher type
38  * * private dynamic hasher data, depending on hasher type and parameters
39  *
40  * Using "define" instead of "typedef", because on MSVC __restrict does not work
41  * on typedef pointer types. */
42 #define HasherHandle uint8_t*
43 
44 typedef struct {
45   BrotliHasherParams params;
46 
47   /* False if hasher needs to be "prepared" before use. */
48   BROTLI_BOOL is_prepared_;
49 
50   size_t dict_num_lookups;
51   size_t dict_num_matches;
52 } HasherCommon;
53 
GetHasherCommon(HasherHandle handle)54 static BROTLI_INLINE HasherCommon* GetHasherCommon(HasherHandle handle) {
55   return (HasherCommon*)handle;
56 }
57 
58 #define score_t size_t
59 
60 static const uint32_t kCutoffTransformsCount = 10;
61 /*   0,  12,   27,    23,    42,    63,    56,    48,    59,    64 */
62 /* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */
63 static const uint64_t kCutoffTransforms =
64     BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200);
65 
66 typedef struct HasherSearchResult {
67   size_t len;
68   size_t distance;
69   score_t score;
70   int len_code_delta; /* == len_code - len */
71 } HasherSearchResult;
72 
73 /* kHashMul32 multiplier has these properties:
74    * The multiplier must be odd. Otherwise we may lose the highest bit.
75    * No long streaks of ones or zeros.
76    * There is no effort to ensure that it is a prime, the oddity is enough
77      for this use.
78    * The number has been tuned heuristically against compression benchmarks. */
79 static const uint32_t kHashMul32 = 0x1E35A7BD;
80 static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1E35A7BD, 0x1E35A7BD);
81 static const uint64_t kHashMul64Long =
82     BROTLI_MAKE_UINT64_T(0x1FE35A7Bu, 0xD3579BD3u);
83 
Hash14(const uint8_t * data)84 static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) {
85   uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kHashMul32;
86   /* The higher bits contain more mixture from the multiplication,
87      so we take our results from there. */
88   return h >> (32 - 14);
89 }
90 
PrepareDistanceCache(int * BROTLI_RESTRICT distance_cache,const int num_distances)91 static BROTLI_INLINE void PrepareDistanceCache(
92     int* BROTLI_RESTRICT distance_cache, const int num_distances) {
93   if (num_distances > 4) {
94     int last_distance = distance_cache[0];
95     distance_cache[4] = last_distance - 1;
96     distance_cache[5] = last_distance + 1;
97     distance_cache[6] = last_distance - 2;
98     distance_cache[7] = last_distance + 2;
99     distance_cache[8] = last_distance - 3;
100     distance_cache[9] = last_distance + 3;
101     if (num_distances > 10) {
102       int next_last_distance = distance_cache[1];
103       distance_cache[10] = next_last_distance - 1;
104       distance_cache[11] = next_last_distance + 1;
105       distance_cache[12] = next_last_distance - 2;
106       distance_cache[13] = next_last_distance + 2;
107       distance_cache[14] = next_last_distance - 3;
108       distance_cache[15] = next_last_distance + 3;
109     }
110   }
111 }
112 
113 #define BROTLI_LITERAL_BYTE_SCORE 135
114 #define BROTLI_DISTANCE_BIT_PENALTY 30
115 /* Score must be positive after applying maximal penalty. */
116 #define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t))
117 
118 /* Usually, we always choose the longest backward reference. This function
119    allows for the exception of that rule.
120 
121    If we choose a backward reference that is further away, it will
122    usually be coded with more bits. We approximate this by assuming
123    log2(distance). If the distance can be expressed in terms of the
124    last four distances, we use some heuristic constants to estimate
125    the bits cost. For the first up to four literals we use the bit
126    cost of the literals from the literal cost model, after that we
127    use the average bit cost of the cost model.
128 
129    This function is used to sometimes discard a longer backward reference
130    when it is not much longer and the bit cost for encoding it is more
131    than the saved literals.
132 
133    backward_reference_offset MUST be positive. */
BackwardReferenceScore(size_t copy_length,size_t backward_reference_offset)134 static BROTLI_INLINE score_t BackwardReferenceScore(
135     size_t copy_length, size_t backward_reference_offset) {
136   return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length -
137       BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset);
138 }
139 
BackwardReferenceScoreUsingLastDistance(size_t copy_length)140 static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance(
141     size_t copy_length) {
142   return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length +
143       BROTLI_SCORE_BASE + 15;
144 }
145 
BackwardReferencePenaltyUsingLastDistance(size_t distance_short_code)146 static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance(
147     size_t distance_short_code) {
148   return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE);
149 }
150 
TestStaticDictionaryItem(const BrotliEncoderDictionary * dictionary,size_t item,const uint8_t * data,size_t max_length,size_t max_backward,size_t max_distance,HasherSearchResult * out)151 static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
152     const BrotliEncoderDictionary* dictionary, size_t item,
153     const uint8_t* data, size_t max_length, size_t max_backward,
154     size_t max_distance, HasherSearchResult* out) {
155   size_t len;
156   size_t word_idx;
157   size_t offset;
158   size_t matchlen;
159   size_t backward;
160   score_t score;
161   len = item & 0x1F;
162   word_idx = item >> 5;
163   offset = dictionary->words->offsets_by_length[len] + len * word_idx;
164   if (len > max_length) {
165     return BROTLI_FALSE;
166   }
167 
168   matchlen =
169       FindMatchLengthWithLimit(data, &dictionary->words->data[offset], len);
170   if (matchlen + dictionary->cutoffTransformsCount <= len || matchlen == 0) {
171     return BROTLI_FALSE;
172   }
173   {
174     size_t cut = len - matchlen;
175     size_t transform_id = (cut << 2) +
176         (size_t)((dictionary->cutoffTransforms >> (cut * 6)) & 0x3F);
177     backward = max_backward + 1 + word_idx +
178         (transform_id << dictionary->words->size_bits_by_length[len]);
179   }
180   if (backward > max_distance) {
181     return BROTLI_FALSE;
182   }
183   score = BackwardReferenceScore(matchlen, backward);
184   if (score < out->score) {
185     return BROTLI_FALSE;
186   }
187   out->len = matchlen;
188   out->len_code_delta = (int)len - (int)matchlen;
189   out->distance = backward;
190   out->score = score;
191   return BROTLI_TRUE;
192 }
193 
SearchInStaticDictionary(const BrotliEncoderDictionary * dictionary,HasherHandle handle,const uint8_t * data,size_t max_length,size_t max_backward,size_t max_distance,HasherSearchResult * out,BROTLI_BOOL shallow)194 static BROTLI_INLINE void SearchInStaticDictionary(
195     const BrotliEncoderDictionary* dictionary,
196     HasherHandle handle, const uint8_t* data, size_t max_length,
197     size_t max_backward, size_t max_distance,
198     HasherSearchResult* out, BROTLI_BOOL shallow) {
199   size_t key;
200   size_t i;
201   HasherCommon* self = GetHasherCommon(handle);
202   if (self->dict_num_matches < (self->dict_num_lookups >> 7)) {
203     return;
204   }
205   key = Hash14(data) << 1;
206   for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
207     size_t item = dictionary->hash_table[key];
208     self->dict_num_lookups++;
209     if (item != 0) {
210       BROTLI_BOOL item_matches = TestStaticDictionaryItem(
211           dictionary, item, data,
212           max_length, max_backward, max_distance, out);
213       if (item_matches) {
214         self->dict_num_matches++;
215       }
216     }
217   }
218 }
219 
220 typedef struct BackwardMatch {
221   uint32_t distance;
222   uint32_t length_and_code;
223 } BackwardMatch;
224 
InitBackwardMatch(BackwardMatch * self,size_t dist,size_t len)225 static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self,
226     size_t dist, size_t len) {
227   self->distance = (uint32_t)dist;
228   self->length_and_code = (uint32_t)(len << 5);
229 }
230 
InitDictionaryBackwardMatch(BackwardMatch * self,size_t dist,size_t len,size_t len_code)231 static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self,
232     size_t dist, size_t len, size_t len_code) {
233   self->distance = (uint32_t)dist;
234   self->length_and_code =
235       (uint32_t)((len << 5) | (len == len_code ? 0 : len_code));
236 }
237 
BackwardMatchLength(const BackwardMatch * self)238 static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) {
239   return self->length_and_code >> 5;
240 }
241 
BackwardMatchLengthCode(const BackwardMatch * self)242 static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) {
243   size_t code = self->length_and_code & 31;
244   return code ? code : BackwardMatchLength(self);
245 }
246 
247 #define EXPAND_CAT(a, b) CAT(a, b)
248 #define CAT(a, b) a ## b
249 #define FN(X) EXPAND_CAT(X, HASHER())
250 
251 #define HASHER() H10
252 #define BUCKET_BITS 17
253 #define MAX_TREE_SEARCH_DEPTH 64
254 #define MAX_TREE_COMP_LENGTH 128
255 #include "./hash_to_binary_tree_inc.h"  /* NOLINT(build/include) */
256 #undef MAX_TREE_SEARCH_DEPTH
257 #undef MAX_TREE_COMP_LENGTH
258 #undef BUCKET_BITS
259 #undef HASHER
260 /* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */
261 #define MAX_NUM_MATCHES_H10 128
262 
263 /* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression
264    a little faster (0.5% - 1%) and it compresses 0.15% better on small text
265    and HTML inputs. */
266 
267 #define HASHER() H2
268 #define BUCKET_BITS 16
269 #define BUCKET_SWEEP 1
270 #define HASH_LEN 5
271 #define USE_DICTIONARY 1
272 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
273 #undef BUCKET_SWEEP
274 #undef USE_DICTIONARY
275 #undef HASHER
276 
277 #define HASHER() H3
278 #define BUCKET_SWEEP 2
279 #define USE_DICTIONARY 0
280 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
281 #undef USE_DICTIONARY
282 #undef BUCKET_SWEEP
283 #undef BUCKET_BITS
284 #undef HASHER
285 
286 #define HASHER() H4
287 #define BUCKET_BITS 17
288 #define BUCKET_SWEEP 4
289 #define USE_DICTIONARY 1
290 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
291 #undef USE_DICTIONARY
292 #undef HASH_LEN
293 #undef BUCKET_SWEEP
294 #undef BUCKET_BITS
295 #undef HASHER
296 
297 #define HASHER() H5
298 #include "./hash_longest_match_inc.h"  /* NOLINT(build/include) */
299 #undef HASHER
300 
301 #define HASHER() H6
302 #include "./hash_longest_match64_inc.h"  /* NOLINT(build/include) */
303 #undef HASHER
304 
305 #define BUCKET_BITS 15
306 
307 #define NUM_LAST_DISTANCES_TO_CHECK 4
308 #define NUM_BANKS 1
309 #define BANK_BITS 16
310 #define HASHER() H40
311 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
312 #undef HASHER
313 #undef NUM_LAST_DISTANCES_TO_CHECK
314 
315 #define NUM_LAST_DISTANCES_TO_CHECK 10
316 #define HASHER() H41
317 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
318 #undef HASHER
319 #undef NUM_LAST_DISTANCES_TO_CHECK
320 #undef NUM_BANKS
321 #undef BANK_BITS
322 
323 #define NUM_LAST_DISTANCES_TO_CHECK 16
324 #define NUM_BANKS 512
325 #define BANK_BITS 9
326 #define HASHER() H42
327 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
328 #undef HASHER
329 #undef NUM_LAST_DISTANCES_TO_CHECK
330 #undef NUM_BANKS
331 #undef BANK_BITS
332 
333 #undef BUCKET_BITS
334 
335 #define HASHER() H54
336 #define BUCKET_BITS 20
337 #define BUCKET_SWEEP 4
338 #define HASH_LEN 7
339 #define USE_DICTIONARY 0
340 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
341 #undef USE_DICTIONARY
342 #undef HASH_LEN
343 #undef BUCKET_SWEEP
344 #undef BUCKET_BITS
345 #undef HASHER
346 
347 /* fast large window hashers */
348 
349 #define HASHER() HROLLING_FAST
350 #define CHUNKLEN 32
351 #define JUMP 4
352 #define NUMBUCKETS 16777216
353 #define MASK ((NUMBUCKETS * 64) - 1)
354 #include "./hash_rolling_inc.h"  /* NOLINT(build/include) */
355 #undef JUMP
356 #undef HASHER
357 
358 
359 #define HASHER() HROLLING
360 #define JUMP 1
361 #include "./hash_rolling_inc.h"  /* NOLINT(build/include) */
362 #undef MASK
363 #undef NUMBUCKETS
364 #undef JUMP
365 #undef CHUNKLEN
366 #undef HASHER
367 
368 #define HASHER() H35
369 #define HASHER_A H3
370 #define HASHER_B HROLLING_FAST
371 #include "./hash_composite_inc.h"  /* NOLINT(build/include) */
372 #undef HASHER_A
373 #undef HASHER_B
374 #undef HASHER
375 
376 #define HASHER() H55
377 #define HASHER_A H54
378 #define HASHER_B HROLLING_FAST
379 #include "./hash_composite_inc.h"  /* NOLINT(build/include) */
380 #undef HASHER_A
381 #undef HASHER_B
382 #undef HASHER
383 
384 #define HASHER() H65
385 #define HASHER_A H6
386 #define HASHER_B HROLLING
387 #include "./hash_composite_inc.h"  /* NOLINT(build/include) */
388 #undef HASHER_A
389 #undef HASHER_B
390 #undef HASHER
391 
392 #undef FN
393 #undef CAT
394 #undef EXPAND_CAT
395 
396 #define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54)\
397                                H(35) H(55) H(65)
398 #define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10)
399 
DestroyHasher(MemoryManager * m,HasherHandle * handle)400 static BROTLI_INLINE void DestroyHasher(
401     MemoryManager* m, HasherHandle* handle) {
402   if (*handle == NULL) return;
403   BROTLI_FREE(m, *handle);
404 }
405 
HasherReset(HasherHandle handle)406 static BROTLI_INLINE void HasherReset(HasherHandle handle) {
407   if (handle == NULL) return;
408   GetHasherCommon(handle)->is_prepared_ = BROTLI_FALSE;
409 }
410 
HasherSize(const BrotliEncoderParams * params,BROTLI_BOOL one_shot,const size_t input_size)411 static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params,
412     BROTLI_BOOL one_shot, const size_t input_size) {
413   size_t result = sizeof(HasherCommon);
414   switch (params->hasher.type) {
415 #define SIZE_(N)                                                         \
416     case N:                                                              \
417       result += HashMemAllocInBytesH ## N(params, one_shot, input_size); \
418       break;
419     FOR_ALL_HASHERS(SIZE_)
420 #undef SIZE_
421     default:
422       break;
423   }
424   return result;
425 }
426 
HasherSetup(MemoryManager * m,HasherHandle * handle,BrotliEncoderParams * params,const uint8_t * data,size_t position,size_t input_size,BROTLI_BOOL is_last)427 static BROTLI_INLINE void HasherSetup(MemoryManager* m, HasherHandle* handle,
428     BrotliEncoderParams* params, const uint8_t* data, size_t position,
429     size_t input_size, BROTLI_BOOL is_last) {
430   HasherHandle self = NULL;
431   HasherCommon* common = NULL;
432   BROTLI_BOOL one_shot = (position == 0 && is_last);
433   if (*handle == NULL) {
434     size_t alloc_size;
435     ChooseHasher(params, &params->hasher);
436     alloc_size = HasherSize(params, one_shot, input_size);
437     self = BROTLI_ALLOC(m, uint8_t, alloc_size);
438     if (BROTLI_IS_OOM(m)) return;
439     *handle = self;
440     common = GetHasherCommon(self);
441     common->params = params->hasher;
442     switch (common->params.type) {
443 #define INITIALIZE_(N)                     \
444       case N:                              \
445         InitializeH ## N(*handle, params); \
446         break;
447       FOR_ALL_HASHERS(INITIALIZE_);
448 #undef INITIALIZE_
449       default:
450         break;
451     }
452     HasherReset(*handle);
453   }
454 
455   self = *handle;
456   common = GetHasherCommon(self);
457   if (!common->is_prepared_) {
458     switch (common->params.type) {
459 #define PREPARE_(N)                                      \
460       case N:                                            \
461         PrepareH ## N(self, one_shot, input_size, data); \
462         break;
463       FOR_ALL_HASHERS(PREPARE_)
464 #undef PREPARE_
465       default: break;
466     }
467     if (position == 0) {
468         common->dict_num_lookups = 0;
469         common->dict_num_matches = 0;
470     }
471     common->is_prepared_ = BROTLI_TRUE;
472   }
473 }
474 
InitOrStitchToPreviousBlock(MemoryManager * m,HasherHandle * handle,const uint8_t * data,size_t mask,BrotliEncoderParams * params,size_t position,size_t input_size,BROTLI_BOOL is_last)475 static BROTLI_INLINE void InitOrStitchToPreviousBlock(
476     MemoryManager* m, HasherHandle* handle, const uint8_t* data, size_t mask,
477     BrotliEncoderParams* params, size_t position, size_t input_size,
478     BROTLI_BOOL is_last) {
479   HasherHandle self;
480   HasherSetup(m, handle, params, data, position, input_size, is_last);
481   if (BROTLI_IS_OOM(m)) return;
482   self = *handle;
483   switch (GetHasherCommon(self)->params.type) {
484 #define INIT_(N)                                                           \
485     case N:                                                                \
486       StitchToPreviousBlockH ## N(self, input_size, position, data, mask); \
487     break;
488     FOR_ALL_HASHERS(INIT_)
489 #undef INIT_
490     default: break;
491   }
492 }
493 
494 #if defined(__cplusplus) || defined(c_plusplus)
495 }  /* extern "C" */
496 #endif
497 
498 #endif  /* BROTLI_ENC_HASH_H_ */
499