1/* Copyright 2013 Google Inc. All Rights Reserved.
2
3   Distributed under MIT license.
4   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5*/
6
7/* Heuristics for deciding about the UTF8-ness of strings. */
8
9#include "./utf8_util.h"
10
11#include <brotli/types.h>
12
13#if defined(__cplusplus) || defined(c_plusplus)
14extern "C" {
15#endif
16
/* Decodes one symbol from the front of |input|, looking at no more than
   |size| bytes beyond the lead byte (input[0] itself is always read).

   On a successful decode, |*symbol| receives the code point and the length
   of the sequence (1..4) is returned. A sequence is only accepted when:
     - the lead byte announces a 1- to 4-byte form,
     - the whole sequence fits within |size|,
     - every trailing byte has the form 10xxxxxx,
     - the value is not representable by a shorter form, and
     - the value does not exceed 0x10ffff.
   A zero byte is not accepted as ASCII. No check excludes the range
   0xd800..0xdfff.

   Otherwise |*symbol| is set to 0x110000 | input[0], a value above the
   UTF8 code space, and 1 is returned. */
static size_t BrotliParseAsUTF8(
    int* symbol, const uint8_t* input, size_t size) {
  const int lead = input[0];
  size_t count = 0;   /* Announced sequence length; 0 means "none". */
  int min_code = 0;   /* Smallest value that needs |count| bytes. */
  int code = 0;

  if ((lead & 0x80) == 0) {
    /* ASCII; the zero byte falls through to the "not UTF8" path. */
    if (lead > 0) {
      *symbol = lead;
      return 1;
    }
  } else if ((lead & 0xe0) == 0xc0) {
    /* 2-byte UTF8 */
    count = 2;
    min_code = 0x80;
    code = lead & 0x1f;
  } else if ((lead & 0xf0) == 0xe0) {
    /* 3-byte UTF8 */
    count = 3;
    min_code = 0x800;
    code = lead & 0x0f;
  } else if ((lead & 0xf8) == 0xf0) {
    /* 4-byte UTF8 */
    count = 4;
    min_code = 0x10000;
    code = lead & 0x07;
  }

  if (count != 0 && size >= count) {
    int well_formed = 1;
    size_t k;
    /* Fold in six payload bits per continuation byte, stopping at the first
       byte that is not of the form 10xxxxxx. */
    for (k = 1; k < count; ++k) {
      if ((input[k] & 0xc0) != 0x80) {
        well_formed = 0;
        break;
      }
      code = (code << 6) | (input[k] & 0x3f);
    }
    if (well_formed && code >= min_code && code <= 0x10ffff) {
      *symbol = code;
      return count;
    }
  }

  /* Not UTF8, emit a special symbol above the UTF8-code space */
  *symbol = 0x110000 | lead;
  return 1;
}
66
67/* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
68BROTLI_BOOL BrotliIsMostlyUTF8(
69    const uint8_t* data, const size_t pos, const size_t mask,
70    const size_t length, const double min_fraction) {
71  size_t size_utf8 = 0;
72  size_t i = 0;
73  while (i < length) {
74    int symbol;
75    size_t bytes_read =
76        BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
77    i += bytes_read;
78    if (symbol < 0x110000) size_utf8 += bytes_read;
79  }
80  return TO_BROTLI_BOOL(size_utf8 > min_fraction * (double)length);
81}
82
83#if defined(__cplusplus) || defined(c_plusplus)
84}  /* extern "C" */
85#endif
86