1/* Copyright 2013 Google Inc. All Rights Reserved.
2
3   Distributed under MIT license.
4   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5*/
6
7#include "./static_dict.h"
8
9#include "../common/dictionary.h"
10#include "./find_match_length.h"
11#include "./port.h"
12#include "./static_dict_lut.h"
13
14#if defined(__cplusplus) || defined(c_plusplus)
15extern "C" {
16#endif
17
18static const uint8_t kUppercaseFirst = 10;
19static const uint8_t kOmitLastNTransforms[10] = {
20  0, 12, 27, 23, 42, 63, 56, 48, 59, 64,
21};
22
23static BROTLI_INLINE uint32_t Hash(const uint8_t *data) {
24  uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kDictHashMul32;
25  /* The higher bits contain more mixture from the multiplication,
26     so we take our results from there. */
27  return h >> (32 - kDictNumBits);
28}
29
30static BROTLI_INLINE void AddMatch(size_t distance, size_t len, size_t len_code,
31                                   uint32_t* matches) {
32  uint32_t match = (uint32_t)((distance << 5) + len_code);
33  matches[len] = BROTLI_MIN(uint32_t, matches[len], match);
34}
35
36static BROTLI_INLINE size_t DictMatchLength(const BrotliDictionary* dictionary,
37                                            const uint8_t* data,
38                                            size_t id,
39                                            size_t len,
40                                            size_t maxlen) {
41  const size_t offset = dictionary->offsets_by_length[len] + len * id;
42  return FindMatchLengthWithLimit(&dictionary->data[offset], data,
43                                  BROTLI_MIN(size_t, len, maxlen));
44}
45
46static BROTLI_INLINE BROTLI_BOOL IsMatch(const BrotliDictionary* dictionary,
47    DictWord w, const uint8_t* data, size_t max_length) {
48  if (w.len > max_length) {
49    return BROTLI_FALSE;
50  } else {
51    const size_t offset = dictionary->offsets_by_length[w.len] +
52        (size_t)w.len * (size_t)w.idx;
53    const uint8_t* dict = &dictionary->data[offset];
54    if (w.transform == 0) {
55      /* Match against base dictionary word. */
56      return
57          TO_BROTLI_BOOL(FindMatchLengthWithLimit(dict, data, w.len) == w.len);
58    } else if (w.transform == 10) {
59      /* Match against uppercase first transform.
60         Note that there are only ASCII uppercase words in the lookup table. */
61      return TO_BROTLI_BOOL(dict[0] >= 'a' && dict[0] <= 'z' &&
62              (dict[0] ^ 32) == data[0] &&
63              FindMatchLengthWithLimit(&dict[1], &data[1], w.len - 1u) ==
64              w.len - 1u);
65    } else {
66      /* Match against uppercase all transform.
67         Note that there are only ASCII uppercase words in the lookup table. */
68      size_t i;
69      for (i = 0; i < w.len; ++i) {
70        if (dict[i] >= 'a' && dict[i] <= 'z') {
71          if ((dict[i] ^ 32) != data[i]) return BROTLI_FALSE;
72        } else {
73          if (dict[i] != data[i]) return BROTLI_FALSE;
74        }
75      }
76      return BROTLI_TRUE;
77    }
78  }
79}
80
81BROTLI_BOOL BrotliFindAllStaticDictionaryMatches(
82    const BrotliDictionary* dictionary, const uint8_t* data, size_t min_length,
83    size_t max_length, uint32_t* matches) {
84  BROTLI_BOOL has_found_match = BROTLI_FALSE;
85  {
86    size_t offset = kStaticDictionaryBuckets[Hash(data)];
87    BROTLI_BOOL end = !offset;
88    while (!end) {
89      DictWord w = kStaticDictionaryWords[offset++];
90      const size_t l = w.len & 0x1F;
91      const size_t n = (size_t)1 << dictionary->size_bits_by_length[l];
92      const size_t id = w.idx;
93      end = !!(w.len & 0x80);
94      w.len = (uint8_t)l;
95      if (w.transform == 0) {
96        const size_t matchlen =
97            DictMatchLength(dictionary, data, id, l, max_length);
98        const uint8_t* s;
99        size_t minlen;
100        size_t maxlen;
101        size_t len;
102        /* Transform "" + kIdentity + "" */
103        if (matchlen == l) {
104          AddMatch(id, l, l, matches);
105          has_found_match = BROTLI_TRUE;
106        }
107        /* Transforms "" + kOmitLast1 + "" and "" + kOmitLast1 + "ing " */
108        if (matchlen >= l - 1) {
109          AddMatch(id + 12 * n, l - 1, l, matches);
110          if (l + 2 < max_length &&
111              data[l - 1] == 'i' && data[l] == 'n' && data[l + 1] == 'g' &&
112              data[l + 2] == ' ') {
113            AddMatch(id + 49 * n, l + 3, l, matches);
114          }
115          has_found_match = BROTLI_TRUE;
116        }
117        /* Transform "" + kOmitLastN + "" (N = 2 .. 9) */
118        minlen = min_length;
119        if (l > 9) minlen = BROTLI_MAX(size_t, minlen, l - 9);
120        maxlen = BROTLI_MIN(size_t, matchlen, l - 2);
121        for (len = minlen; len <= maxlen; ++len) {
122          AddMatch(id + kOmitLastNTransforms[l - len] * n, len, l, matches);
123          has_found_match = BROTLI_TRUE;
124        }
125        if (matchlen < l || l + 6 >= max_length) {
126          continue;
127        }
128        s = &data[l];
129        /* Transforms "" + kIdentity + <suffix> */
130        if (s[0] == ' ') {
131          AddMatch(id + n, l + 1, l, matches);
132          if (s[1] == 'a') {
133            if (s[2] == ' ') {
134              AddMatch(id + 28 * n, l + 3, l, matches);
135            } else if (s[2] == 's') {
136              if (s[3] == ' ') AddMatch(id + 46 * n, l + 4, l, matches);
137            } else if (s[2] == 't') {
138              if (s[3] == ' ') AddMatch(id + 60 * n, l + 4, l, matches);
139            } else if (s[2] == 'n') {
140              if (s[3] == 'd' && s[4] == ' ') {
141                AddMatch(id + 10 * n, l + 5, l, matches);
142              }
143            }
144          } else if (s[1] == 'b') {
145            if (s[2] == 'y' && s[3] == ' ') {
146              AddMatch(id + 38 * n, l + 4, l, matches);
147            }
148          } else if (s[1] == 'i') {
149            if (s[2] == 'n') {
150              if (s[3] == ' ') AddMatch(id + 16 * n, l + 4, l, matches);
151            } else if (s[2] == 's') {
152              if (s[3] == ' ') AddMatch(id + 47 * n, l + 4, l, matches);
153            }
154          } else if (s[1] == 'f') {
155            if (s[2] == 'o') {
156              if (s[3] == 'r' && s[4] == ' ') {
157                AddMatch(id + 25 * n, l + 5, l, matches);
158              }
159            } else if (s[2] == 'r') {
160              if (s[3] == 'o' && s[4] == 'm' && s[5] == ' ') {
161                AddMatch(id + 37 * n, l + 6, l, matches);
162              }
163            }
164          } else if (s[1] == 'o') {
165            if (s[2] == 'f') {
166              if (s[3] == ' ') AddMatch(id + 8 * n, l + 4, l, matches);
167            } else if (s[2] == 'n') {
168              if (s[3] == ' ') AddMatch(id + 45 * n, l + 4, l, matches);
169            }
170          } else if (s[1] == 'n') {
171            if (s[2] == 'o' && s[3] == 't' && s[4] == ' ') {
172              AddMatch(id + 80 * n, l + 5, l, matches);
173            }
174          } else if (s[1] == 't') {
175            if (s[2] == 'h') {
176              if (s[3] == 'e') {
177                if (s[4] == ' ') AddMatch(id + 5 * n, l + 5, l, matches);
178              } else if (s[3] == 'a') {
179                if (s[4] == 't' && s[5] == ' ') {
180                  AddMatch(id + 29 * n, l + 6, l, matches);
181                }
182              }
183            } else if (s[2] == 'o') {
184              if (s[3] == ' ') AddMatch(id + 17 * n, l + 4, l, matches);
185            }
186          } else if (s[1] == 'w') {
187            if (s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ') {
188              AddMatch(id + 35 * n, l + 6, l, matches);
189            }
190          }
191        } else if (s[0] == '"') {
192          AddMatch(id + 19 * n, l + 1, l, matches);
193          if (s[1] == '>') {
194            AddMatch(id + 21 * n, l + 2, l, matches);
195          }
196        } else if (s[0] == '.') {
197          AddMatch(id + 20 * n, l + 1, l, matches);
198          if (s[1] == ' ') {
199            AddMatch(id + 31 * n, l + 2, l, matches);
200            if (s[2] == 'T' && s[3] == 'h') {
201              if (s[4] == 'e') {
202                if (s[5] == ' ') AddMatch(id + 43 * n, l + 6, l, matches);
203              } else if (s[4] == 'i') {
204                if (s[5] == 's' && s[6] == ' ') {
205                  AddMatch(id + 75 * n, l + 7, l, matches);
206                }
207              }
208            }
209          }
210        } else if (s[0] == ',') {
211          AddMatch(id + 76 * n, l + 1, l, matches);
212          if (s[1] == ' ') {
213            AddMatch(id + 14 * n, l + 2, l, matches);
214          }
215        } else if (s[0] == '\n') {
216          AddMatch(id + 22 * n, l + 1, l, matches);
217          if (s[1] == '\t') {
218            AddMatch(id + 50 * n, l + 2, l, matches);
219          }
220        } else if (s[0] == ']') {
221          AddMatch(id + 24 * n, l + 1, l, matches);
222        } else if (s[0] == '\'') {
223          AddMatch(id + 36 * n, l + 1, l, matches);
224        } else if (s[0] == ':') {
225          AddMatch(id + 51 * n, l + 1, l, matches);
226        } else if (s[0] == '(') {
227          AddMatch(id + 57 * n, l + 1, l, matches);
228        } else if (s[0] == '=') {
229          if (s[1] == '"') {
230            AddMatch(id + 70 * n, l + 2, l, matches);
231          } else if (s[1] == '\'') {
232            AddMatch(id + 86 * n, l + 2, l, matches);
233          }
234        } else if (s[0] == 'a') {
235          if (s[1] == 'l' && s[2] == ' ') {
236            AddMatch(id + 84 * n, l + 3, l, matches);
237          }
238        } else if (s[0] == 'e') {
239          if (s[1] == 'd') {
240            if (s[2] == ' ') AddMatch(id + 53 * n, l + 3, l, matches);
241          } else if (s[1] == 'r') {
242            if (s[2] == ' ') AddMatch(id + 82 * n, l + 3, l, matches);
243          } else if (s[1] == 's') {
244            if (s[2] == 't' && s[3] == ' ') {
245              AddMatch(id + 95 * n, l + 4, l, matches);
246            }
247          }
248        } else if (s[0] == 'f') {
249          if (s[1] == 'u' && s[2] == 'l' && s[3] == ' ') {
250            AddMatch(id + 90 * n, l + 4, l, matches);
251          }
252        } else if (s[0] == 'i') {
253          if (s[1] == 'v') {
254            if (s[2] == 'e' && s[3] == ' ') {
255              AddMatch(id + 92 * n, l + 4, l, matches);
256            }
257          } else if (s[1] == 'z') {
258            if (s[2] == 'e' && s[3] == ' ') {
259              AddMatch(id + 100 * n, l + 4, l, matches);
260            }
261          }
262        } else if (s[0] == 'l') {
263          if (s[1] == 'e') {
264            if (s[2] == 's' && s[3] == 's' && s[4] == ' ') {
265              AddMatch(id + 93 * n, l + 5, l, matches);
266            }
267          } else if (s[1] == 'y') {
268            if (s[2] == ' ') AddMatch(id + 61 * n, l + 3, l, matches);
269          }
270        } else if (s[0] == 'o') {
271          if (s[1] == 'u' && s[2] == 's' && s[3] == ' ') {
272            AddMatch(id + 106 * n, l + 4, l, matches);
273          }
274        }
275      } else {
276        /* Set is_all_caps=0 for kUppercaseFirst and
277               is_all_caps=1 otherwise (kUppercaseAll) transform. */
278        const BROTLI_BOOL is_all_caps =
279            TO_BROTLI_BOOL(w.transform != kUppercaseFirst);
280        const uint8_t* s;
281        if (!IsMatch(dictionary, w, data, max_length)) {
282          continue;
283        }
284        /* Transform "" + kUppercase{First,All} + "" */
285        AddMatch(id + (is_all_caps ? 44 : 9) * n, l, l, matches);
286        has_found_match = BROTLI_TRUE;
287        if (l + 1 >= max_length) {
288          continue;
289        }
290        /* Transforms "" + kUppercase{First,All} + <suffix> */
291        s = &data[l];
292        if (s[0] == ' ') {
293          AddMatch(id + (is_all_caps ? 68 : 4) * n, l + 1, l, matches);
294        } else if (s[0] == '"') {
295          AddMatch(id + (is_all_caps ? 87 : 66) * n, l + 1, l, matches);
296          if (s[1] == '>') {
297            AddMatch(id + (is_all_caps ? 97 : 69) * n, l + 2, l, matches);
298          }
299        } else if (s[0] == '.') {
300          AddMatch(id + (is_all_caps ? 101 : 79) * n, l + 1, l, matches);
301          if (s[1] == ' ') {
302            AddMatch(id + (is_all_caps ? 114 : 88) * n, l + 2, l, matches);
303          }
304        } else if (s[0] == ',') {
305          AddMatch(id + (is_all_caps ? 112 : 99) * n, l + 1, l, matches);
306          if (s[1] == ' ') {
307            AddMatch(id + (is_all_caps ? 107 : 58) * n, l + 2, l, matches);
308          }
309        } else if (s[0] == '\'') {
310          AddMatch(id + (is_all_caps ? 94 : 74) * n, l + 1, l, matches);
311        } else if (s[0] == '(') {
312          AddMatch(id + (is_all_caps ? 113 : 78) * n, l + 1, l, matches);
313        } else if (s[0] == '=') {
314          if (s[1] == '"') {
315            AddMatch(id + (is_all_caps ? 105 : 104) * n, l + 2, l, matches);
316          } else if (s[1] == '\'') {
317            AddMatch(id + (is_all_caps ? 116 : 108) * n, l + 2, l, matches);
318          }
319        }
320      }
321    }
322  }
323  /* Transforms with prefixes " " and "." */
324  if (max_length >= 5 && (data[0] == ' ' || data[0] == '.')) {
325    BROTLI_BOOL is_space = TO_BROTLI_BOOL(data[0] == ' ');
326    size_t offset = kStaticDictionaryBuckets[Hash(&data[1])];
327    BROTLI_BOOL end = !offset;
328    while (!end) {
329      DictWord w = kStaticDictionaryWords[offset++];
330      const size_t l = w.len & 0x1F;
331      const size_t n = (size_t)1 << dictionary->size_bits_by_length[l];
332      const size_t id = w.idx;
333      end = !!(w.len & 0x80);
334      w.len = (uint8_t)l;
335      if (w.transform == 0) {
336        const uint8_t* s;
337        if (!IsMatch(dictionary, w, &data[1], max_length - 1)) {
338          continue;
339        }
340        /* Transforms " " + kIdentity + "" and "." + kIdentity + "" */
341        AddMatch(id + (is_space ? 6 : 32) * n, l + 1, l, matches);
342        has_found_match = BROTLI_TRUE;
343        if (l + 2 >= max_length) {
344          continue;
345        }
346        /* Transforms " " + kIdentity + <suffix> and "." + kIdentity + <suffix>
347        */
348        s = &data[l + 1];
349        if (s[0] == ' ') {
350          AddMatch(id + (is_space ? 2 : 77) * n, l + 2, l, matches);
351        } else if (s[0] == '(') {
352          AddMatch(id + (is_space ? 89 : 67) * n, l + 2, l, matches);
353        } else if (is_space) {
354          if (s[0] == ',') {
355            AddMatch(id + 103 * n, l + 2, l, matches);
356            if (s[1] == ' ') {
357              AddMatch(id + 33 * n, l + 3, l, matches);
358            }
359          } else if (s[0] == '.') {
360            AddMatch(id + 71 * n, l + 2, l, matches);
361            if (s[1] == ' ') {
362              AddMatch(id + 52 * n, l + 3, l, matches);
363            }
364          } else if (s[0] == '=') {
365            if (s[1] == '"') {
366              AddMatch(id + 81 * n, l + 3, l, matches);
367            } else if (s[1] == '\'') {
368              AddMatch(id + 98 * n, l + 3, l, matches);
369            }
370          }
371        }
372      } else if (is_space) {
373        /* Set is_all_caps=0 for kUppercaseFirst and
374               is_all_caps=1 otherwise (kUppercaseAll) transform. */
375        const BROTLI_BOOL is_all_caps =
376            TO_BROTLI_BOOL(w.transform != kUppercaseFirst);
377        const uint8_t* s;
378        if (!IsMatch(dictionary, w, &data[1], max_length - 1)) {
379          continue;
380        }
381        /* Transforms " " + kUppercase{First,All} + "" */
382        AddMatch(id + (is_all_caps ? 85 : 30) * n, l + 1, l, matches);
383        has_found_match = BROTLI_TRUE;
384        if (l + 2 >= max_length) {
385          continue;
386        }
387        /* Transforms " " + kUppercase{First,All} + <suffix> */
388        s = &data[l + 1];
389        if (s[0] == ' ') {
390          AddMatch(id + (is_all_caps ? 83 : 15) * n, l + 2, l, matches);
391        } else if (s[0] == ',') {
392          if (!is_all_caps) {
393            AddMatch(id + 109 * n, l + 2, l, matches);
394          }
395          if (s[1] == ' ') {
396            AddMatch(id + (is_all_caps ? 111 : 65) * n, l + 3, l, matches);
397          }
398        } else if (s[0] == '.') {
399          AddMatch(id + (is_all_caps ? 115 : 96) * n, l + 2, l, matches);
400          if (s[1] == ' ') {
401            AddMatch(id + (is_all_caps ? 117 : 91) * n, l + 3, l, matches);
402          }
403        } else if (s[0] == '=') {
404          if (s[1] == '"') {
405            AddMatch(id + (is_all_caps ? 110 : 118) * n, l + 3, l, matches);
406          } else if (s[1] == '\'') {
407            AddMatch(id + (is_all_caps ? 119 : 120) * n, l + 3, l, matches);
408          }
409        }
410      }
411    }
412  }
413  if (max_length >= 6) {
414    /* Transforms with prefixes "e ", "s ", ", " and "\xc2\xa0" */
415    if ((data[1] == ' ' &&
416         (data[0] == 'e' || data[0] == 's' || data[0] == ',')) ||
417        (data[0] == 0xc2 && data[1] == 0xa0)) {
418      size_t offset = kStaticDictionaryBuckets[Hash(&data[2])];
419      BROTLI_BOOL end = !offset;
420      while (!end) {
421        DictWord w = kStaticDictionaryWords[offset++];
422        const size_t l = w.len & 0x1F;
423        const size_t n = (size_t)1 << dictionary->size_bits_by_length[l];
424        const size_t id = w.idx;
425        end = !!(w.len & 0x80);
426        w.len = (uint8_t)l;
427        if (w.transform == 0 &&
428            IsMatch(dictionary, w, &data[2], max_length - 2)) {
429          if (data[0] == 0xc2) {
430            AddMatch(id + 102 * n, l + 2, l, matches);
431            has_found_match = BROTLI_TRUE;
432          } else if (l + 2 < max_length && data[l + 2] == ' ') {
433            size_t t = data[0] == 'e' ? 18 : (data[0] == 's' ? 7 : 13);
434            AddMatch(id + t * n, l + 3, l, matches);
435            has_found_match = BROTLI_TRUE;
436          }
437        }
438      }
439    }
440  }
441  if (max_length >= 9) {
442    /* Transforms with prefixes " the " and ".com/" */
443    if ((data[0] == ' ' && data[1] == 't' && data[2] == 'h' &&
444         data[3] == 'e' && data[4] == ' ') ||
445        (data[0] == '.' && data[1] == 'c' && data[2] == 'o' &&
446         data[3] == 'm' && data[4] == '/')) {
447      size_t offset = kStaticDictionaryBuckets[Hash(&data[5])];
448      BROTLI_BOOL end = !offset;
449      while (!end) {
450        DictWord w = kStaticDictionaryWords[offset++];
451        const size_t l = w.len & 0x1F;
452        const size_t n = (size_t)1 << dictionary->size_bits_by_length[l];
453        const size_t id = w.idx;
454        end = !!(w.len & 0x80);
455        w.len = (uint8_t)l;
456        if (w.transform == 0 &&
457            IsMatch(dictionary, w, &data[5], max_length - 5)) {
458          AddMatch(id + (data[0] == ' ' ? 41 : 72) * n, l + 5, l, matches);
459          has_found_match = BROTLI_TRUE;
460          if (l + 5 < max_length) {
461            const uint8_t* s = &data[l + 5];
462            if (data[0] == ' ') {
463              if (l + 8 < max_length &&
464                  s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ') {
465                AddMatch(id + 62 * n, l + 9, l, matches);
466                if (l + 12 < max_length &&
467                    s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ') {
468                  AddMatch(id + 73 * n, l + 13, l, matches);
469                }
470              }
471            }
472          }
473        }
474      }
475    }
476  }
477  return has_found_match;
478}
479
480#if defined(__cplusplus) || defined(c_plusplus)
481}  /* extern "C" */
482#endif
483