1/* Copyright 2010 Google Inc. All Rights Reserved.
2
3   Distributed under MIT license.
4   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5*/
6
7/* A (forgetful) hash table to the data seen by the compressor, to
8   help create backward references to previous data. */
9
10#ifndef BROTLI_ENC_HASH_H_
11#define BROTLI_ENC_HASH_H_
12
13#include <string.h>  /* memcmp, memset */
14
15#include "../common/constants.h"
16#include "../common/dictionary.h"
17#include <brotli/types.h>
18#include "./fast_log.h"
19#include "./find_match_length.h"
20#include "./memory.h"
21#include "./port.h"
22#include "./quality.h"
23#include "./static_dict.h"
24
25#if defined(__cplusplus) || defined(c_plusplus)
26extern "C" {
27#endif
28
29/* Pointer to hasher data.
30 *
31 * Excluding initialization and destruction, hasher can be passed as
32 * HasherHandle by value.
33 *
34 * Typically hasher data consists of 3 sections:
35 * * HasherCommon structure
36 * * private structured hasher data, depending on hasher type
37 * * private dynamic hasher data, depending on hasher type and parameters
38 */
39typedef uint8_t* HasherHandle;
40
41typedef struct {
42  BrotliHasherParams params;
43
44  /* False if hasher needs to be "prepared" before use. */
45  BROTLI_BOOL is_prepared_;
46
47  size_t dict_num_lookups;
48  size_t dict_num_matches;
49} HasherCommon;
50
51static BROTLI_INLINE HasherCommon* GetHasherCommon(HasherHandle handle) {
52  return (HasherCommon*)handle;
53}
54
55#define score_t size_t
56
57static const uint32_t kCutoffTransformsCount = 10;
58/*   0,  12,   27,    23,    42,    63,    56,    48,    59,    64 */
59/* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */
60static const uint64_t kCutoffTransforms =
61    BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200);
62
63typedef struct HasherSearchResult {
64  size_t len;
65  size_t distance;
66  score_t score;
67  int len_code_delta; /* == len_code - len */
68} HasherSearchResult;
69
70/* kHashMul32 multiplier has these properties:
71   * The multiplier must be odd. Otherwise we may lose the highest bit.
72   * No long streaks of ones or zeros.
73   * There is no effort to ensure that it is a prime, the oddity is enough
74     for this use.
75   * The number has been tuned heuristically against compression benchmarks. */
76static const uint32_t kHashMul32 = 0x1e35a7bd;
77static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1e35a7bd, 0x1e35a7bd);
78static const uint64_t kHashMul64Long =
79    BROTLI_MAKE_UINT64_T(0x1fe35a7bU, 0xd3579bd3U);
80
81static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) {
82  uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
83  /* The higher bits contain more mixture from the multiplication,
84     so we take our results from there. */
85  return h >> (32 - 14);
86}
87
88static BROTLI_INLINE void PrepareDistanceCache(
89    int* BROTLI_RESTRICT distance_cache, const int num_distances) {
90  if (num_distances > 4) {
91    int last_distance = distance_cache[0];
92    distance_cache[4] = last_distance - 1;
93    distance_cache[5] = last_distance + 1;
94    distance_cache[6] = last_distance - 2;
95    distance_cache[7] = last_distance + 2;
96    distance_cache[8] = last_distance - 3;
97    distance_cache[9] = last_distance + 3;
98    if (num_distances > 10) {
99      int next_last_distance = distance_cache[1];
100      distance_cache[10] = next_last_distance - 1;
101      distance_cache[11] = next_last_distance + 1;
102      distance_cache[12] = next_last_distance - 2;
103      distance_cache[13] = next_last_distance + 2;
104      distance_cache[14] = next_last_distance - 3;
105      distance_cache[15] = next_last_distance + 3;
106    }
107  }
108}
109
110#define BROTLI_LITERAL_BYTE_SCORE 135
111#define BROTLI_DISTANCE_BIT_PENALTY 30
112/* Score must be positive after applying maximal penalty. */
113#define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t))
114
115/* Usually, we always choose the longest backward reference. This function
116   allows for the exception of that rule.
117
118   If we choose a backward reference that is further away, it will
119   usually be coded with more bits. We approximate this by assuming
120   log2(distance). If the distance can be expressed in terms of the
121   last four distances, we use some heuristic constants to estimate
122   the bits cost. For the first up to four literals we use the bit
123   cost of the literals from the literal cost model, after that we
124   use the average bit cost of the cost model.
125
126   This function is used to sometimes discard a longer backward reference
127   when it is not much longer and the bit cost for encoding it is more
128   than the saved literals.
129
130   backward_reference_offset MUST be positive. */
131static BROTLI_INLINE score_t BackwardReferenceScore(
132    size_t copy_length, size_t backward_reference_offset) {
133  return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length -
134      BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset);
135}
136
137static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance(
138    size_t copy_length) {
139  return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length +
140      BROTLI_SCORE_BASE + 15;
141}
142
143static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance(
144    size_t distance_short_code) {
145  return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE);
146}
147
148static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
149    const BrotliDictionary* dictionary, size_t item, const uint8_t* data,
150    size_t max_length, size_t max_backward, HasherSearchResult* out) {
151  size_t len;
152  size_t dist;
153  size_t offset;
154  size_t matchlen;
155  size_t backward;
156  score_t score;
157  len = item & 0x1F;
158  dist = item >> 5;
159  offset = dictionary->offsets_by_length[len] + len * dist;
160  if (len > max_length) {
161    return BROTLI_FALSE;
162  }
163
164  matchlen =
165      FindMatchLengthWithLimit(data, &dictionary->data[offset], len);
166  if (matchlen + kCutoffTransformsCount <= len || matchlen == 0) {
167    return BROTLI_FALSE;
168  }
169  {
170    size_t cut = len - matchlen;
171    size_t transform_id =
172        (cut << 2) + (size_t)((kCutoffTransforms >> (cut * 6)) & 0x3F);
173    backward = max_backward + dist + 1 +
174        (transform_id << dictionary->size_bits_by_length[len]);
175  }
176  if (backward >= BROTLI_MAX_DISTANCE) {
177    return BROTLI_FALSE;
178  }
179  score = BackwardReferenceScore(matchlen, backward);
180  if (score < out->score) {
181    return BROTLI_FALSE;
182  }
183  out->len = matchlen;
184  out->len_code_delta = (int)len - (int)matchlen;
185  out->distance = backward;
186  out->score = score;
187  return BROTLI_TRUE;
188}
189
190static BROTLI_INLINE void SearchInStaticDictionary(
191    const BrotliDictionary* dictionary, const uint16_t* dictionary_hash,
192    HasherHandle handle, const uint8_t* data, size_t max_length,
193    size_t max_backward, HasherSearchResult* out, BROTLI_BOOL shallow) {
194  size_t key;
195  size_t i;
196  HasherCommon* self = GetHasherCommon(handle);
197  if (self->dict_num_matches < (self->dict_num_lookups >> 7)) {
198    return;
199  }
200  key = Hash14(data) << 1;
201  for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
202    size_t item = dictionary_hash[key];
203    self->dict_num_lookups++;
204    if (item != 0) {
205      BROTLI_BOOL item_matches = TestStaticDictionaryItem(
206          dictionary, item, data, max_length, max_backward, out);
207      if (item_matches) {
208        self->dict_num_matches++;
209      }
210    }
211  }
212}
213
214typedef struct BackwardMatch {
215  uint32_t distance;
216  uint32_t length_and_code;
217} BackwardMatch;
218
219static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self,
220    size_t dist, size_t len) {
221  self->distance = (uint32_t)dist;
222  self->length_and_code = (uint32_t)(len << 5);
223}
224
225static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self,
226    size_t dist, size_t len, size_t len_code) {
227  self->distance = (uint32_t)dist;
228  self->length_and_code =
229      (uint32_t)((len << 5) | (len == len_code ? 0 : len_code));
230}
231
232static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) {
233  return self->length_and_code >> 5;
234}
235
236static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) {
237  size_t code = self->length_and_code & 31;
238  return code ? code : BackwardMatchLength(self);
239}
240
241#define EXPAND_CAT(a, b) CAT(a, b)
242#define CAT(a, b) a ## b
243#define FN(X) EXPAND_CAT(X, HASHER())
244
245#define HASHER() H10
246#define BUCKET_BITS 17
247#define MAX_TREE_SEARCH_DEPTH 64
248#define MAX_TREE_COMP_LENGTH 128
249#include "./hash_to_binary_tree_inc.h"  /* NOLINT(build/include) */
250#undef MAX_TREE_SEARCH_DEPTH
251#undef MAX_TREE_COMP_LENGTH
252#undef BUCKET_BITS
253#undef HASHER
254/* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */
255#define MAX_NUM_MATCHES_H10 128
256
257/* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression
258   a little faster (0.5% - 1%) and it compresses 0.15% better on small text
259   and HTML inputs. */
260
261#define HASHER() H2
262#define BUCKET_BITS 16
263#define BUCKET_SWEEP 1
264#define HASH_LEN 5
265#define USE_DICTIONARY 1
266#include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
267#undef BUCKET_SWEEP
268#undef USE_DICTIONARY
269#undef HASHER
270
271#define HASHER() H3
272#define BUCKET_SWEEP 2
273#define USE_DICTIONARY 0
274#include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
275#undef USE_DICTIONARY
276#undef BUCKET_SWEEP
277#undef BUCKET_BITS
278#undef HASHER
279
280#define HASHER() H4
281#define BUCKET_BITS 17
282#define BUCKET_SWEEP 4
283#define USE_DICTIONARY 1
284#include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
285#undef USE_DICTIONARY
286#undef HASH_LEN
287#undef BUCKET_SWEEP
288#undef BUCKET_BITS
289#undef HASHER
290
291#define HASHER() H5
292#include "./hash_longest_match_inc.h"  /* NOLINT(build/include) */
293#undef HASHER
294
295#define HASHER() H6
296#include "./hash_longest_match64_inc.h"  /* NOLINT(build/include) */
297#undef HASHER
298
299#define BUCKET_BITS 15
300
301#define NUM_LAST_DISTANCES_TO_CHECK 4
302#define NUM_BANKS 1
303#define BANK_BITS 16
304#define HASHER() H40
305#include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
306#undef HASHER
307#undef NUM_LAST_DISTANCES_TO_CHECK
308
309#define NUM_LAST_DISTANCES_TO_CHECK 10
310#define HASHER() H41
311#include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
312#undef HASHER
313#undef NUM_LAST_DISTANCES_TO_CHECK
314#undef NUM_BANKS
315#undef BANK_BITS
316
317#define NUM_LAST_DISTANCES_TO_CHECK 16
318#define NUM_BANKS 512
319#define BANK_BITS 9
320#define HASHER() H42
321#include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
322#undef HASHER
323#undef NUM_LAST_DISTANCES_TO_CHECK
324#undef NUM_BANKS
325#undef BANK_BITS
326
327#undef BUCKET_BITS
328
329#define HASHER() H54
330#define BUCKET_BITS 20
331#define BUCKET_SWEEP 4
332#define HASH_LEN 7
333#define USE_DICTIONARY 0
334#include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
335#undef USE_DICTIONARY
336#undef HASH_LEN
337#undef BUCKET_SWEEP
338#undef BUCKET_BITS
339#undef HASHER
340
341#undef FN
342#undef CAT
343#undef EXPAND_CAT
344
345#define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54)
346#define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10)
347
348static BROTLI_INLINE void DestroyHasher(
349    MemoryManager* m, HasherHandle* handle) {
350  if (*handle == NULL) return;
351  BROTLI_FREE(m, *handle);
352}
353
354static BROTLI_INLINE void HasherReset(HasherHandle handle) {
355  if (handle == NULL) return;
356  GetHasherCommon(handle)->is_prepared_ = BROTLI_FALSE;
357}
358
359static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params,
360    BROTLI_BOOL one_shot, const size_t input_size) {
361  size_t result = sizeof(HasherCommon);
362  switch (params->hasher.type) {
363#define SIZE_(N)                                                         \
364    case N:                                                              \
365      result += HashMemAllocInBytesH ## N(params, one_shot, input_size); \
366      break;
367    FOR_ALL_HASHERS(SIZE_)
368#undef SIZE_
369    default:
370      break;
371  }
372  return result;
373}
374
375static BROTLI_INLINE void HasherSetup(MemoryManager* m, HasherHandle* handle,
376    BrotliEncoderParams* params, const uint8_t* data, size_t position,
377    size_t input_size, BROTLI_BOOL is_last) {
378  HasherHandle self = NULL;
379  HasherCommon* common = NULL;
380  BROTLI_BOOL one_shot = (position == 0 && is_last);
381  if (*handle == NULL) {
382    size_t alloc_size;
383    ChooseHasher(params, &params->hasher);
384    alloc_size = HasherSize(params, one_shot, input_size);
385    self = BROTLI_ALLOC(m, uint8_t, alloc_size);
386    if (BROTLI_IS_OOM(m)) return;
387    *handle = self;
388    common = GetHasherCommon(self);
389    common->params = params->hasher;
390    switch (common->params.type) {
391#define INITIALIZE_(N)                     \
392      case N:                              \
393        InitializeH ## N(*handle, params); \
394        break;
395      FOR_ALL_HASHERS(INITIALIZE_);
396#undef INITIALIZE_
397      default:
398        break;
399    }
400    HasherReset(*handle);
401  }
402
403  self = *handle;
404  common = GetHasherCommon(self);
405  if (!common->is_prepared_) {
406    switch (common->params.type) {
407#define PREPARE_(N)                                      \
408      case N:                                            \
409        PrepareH ## N(self, one_shot, input_size, data); \
410        break;
411      FOR_ALL_HASHERS(PREPARE_)
412#undef PREPARE_
413      default: break;
414    }
415    if (position == 0) {
416        common->dict_num_lookups = 0;
417        common->dict_num_matches = 0;
418    }
419    common->is_prepared_ = BROTLI_TRUE;
420  }
421}
422
423static BROTLI_INLINE void InitOrStitchToPreviousBlock(
424    MemoryManager* m, HasherHandle* handle, const uint8_t* data, size_t mask,
425    BrotliEncoderParams* params, size_t position, size_t input_size,
426    BROTLI_BOOL is_last) {
427  HasherHandle self;
428  HasherSetup(m, handle, params, data, position, input_size, is_last);
429  if (BROTLI_IS_OOM(m)) return;
430  self = *handle;
431  switch (GetHasherCommon(self)->params.type) {
432#define INIT_(N)                                                           \
433    case N:                                                                \
434      StitchToPreviousBlockH ## N(self, input_size, position, data, mask); \
435    break;
436    FOR_ALL_HASHERS(INIT_)
437#undef INIT_
438    default: break;
439  }
440}
441
442#if defined(__cplusplus) || defined(c_plusplus)
443}  /* extern "C" */
444#endif
445
446#endif  /* BROTLI_ENC_HASH_H_ */
447