179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Copyright 2013 Google Inc. All Rights Reserved.
279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka//
379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Licensed under the Apache License, Version 2.0 (the "License");
479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// you may not use this file except in compliance with the License.
579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// You may obtain a copy of the License at
679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka//
779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// http://www.apache.org/licenses/LICENSE-2.0
879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka//
979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Unless required by applicable law or agreed to in writing, software
1079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// distributed under the License is distributed on an "AS IS" BASIS,
1179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// See the License for the specific language governing permissions and
1379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// limitations under the License.
1479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka//
1579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Literal cost model to allow backward reference replacement to be efficient.
1679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
1779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include "./literal_cost.h"
1879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
1979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <math.h>
2079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <stdint.h>
2179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <algorithm>
2279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
2379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkanamespace brotli {
2479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
25cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadkastatic int UTF8Position(int last, int c, int clamp) {
26cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  if (c < 128) {
27cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    return 0;  // Next one is the 'Byte 1' again.
28cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  } else if (c >= 192) {
29cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    return std::min(1, clamp);  // Next one is the 'Byte 2' of utf-8 encoding.
30cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  } else {
31cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    // Let's decide over the last byte if this ends the sequence.
32cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    if (last < 0xe0) {
33cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      return 0;  // Completed two or three byte coding.
34cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    } else {
35cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      return std::min(2, clamp);  // Next one is the 'Byte 3' of utf-8 encoding.
36cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    }
37cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  }
38cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka}
39cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka
40cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadkastatic int DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
41cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka                                     const uint8_t *data) {
42cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  int counts[3] = { 0 };
43cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  int max_utf8 = 1;  // should be 2, but 1 compresses better.
44cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  int last_c = 0;
45cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  int utf8_pos = 0;
46cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  for (int i = 0; i < len; ++i) {
47cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    int c = data[(pos + i) & mask];
48cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    utf8_pos = UTF8Position(last_c, c, 2);
49cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    ++counts[utf8_pos];
50cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    last_c = c;
51cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  }
52cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  if (counts[2] < 500) {
53cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    max_utf8 = 1;
54cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  }
55cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  if (counts[1] + counts[2] < 25) {
56cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    max_utf8 = 0;
57cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  }
58cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  return max_utf8;
59cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka}
60cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka
61cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadkavoid EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
62278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka                                     size_t cost_mask, const uint8_t *data,
63278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka                                     float *cost) {
64cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka
65cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  // max_utf8 is 0 (normal ascii single byte modeling),
66cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  // 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling).
67cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  const int max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data);
68cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  int histogram[3][256] = { { 0 } };
69cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  int window_half = 495;
70cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  int in_window = std::min(static_cast<size_t>(window_half), len);
71cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  int in_window_utf8[3] = { 0 };
72cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka
73cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  // Bootstrap histograms.
74cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  int last_c = 0;
75cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  int utf8_pos = 0;
76cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  for (int i = 0; i < in_window; ++i) {
77cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    int c = data[(pos + i) & mask];
78cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    ++histogram[utf8_pos][c];
79cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    ++in_window_utf8[utf8_pos];
80cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    utf8_pos = UTF8Position(last_c, c, max_utf8);
81cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    last_c = c;
82cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  }
83cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka
84cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  // Compute bit costs with sliding window.
85cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  for (int i = 0; i < len; ++i) {
86cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    if (i - window_half >= 0) {
87cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      // Remove a byte in the past.
88cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      int c = (i - window_half - 1) < 0 ?
89cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka          0 : data[(pos + i - window_half - 1) & mask];
90cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      int last_c = (i - window_half - 2) < 0 ?
91cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka          0 : data[(pos + i - window_half - 2) & mask];
92cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      int utf8_pos2 = UTF8Position(last_c, c, max_utf8);
93cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      --histogram[utf8_pos2][data[(pos + i - window_half) & mask]];
94cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      --in_window_utf8[utf8_pos2];
95cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    }
96cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    if (i + window_half < len) {
97cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      // Add a byte in the future.
98cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      int c = (i + window_half - 1) < 0 ?
99cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka          0 : data[(pos + i + window_half - 1) & mask];
100cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      int last_c = (i + window_half - 2) < 0 ?
101cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka          0 : data[(pos + i + window_half - 2) & mask];
102cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      int utf8_pos2 = UTF8Position(last_c, c, max_utf8);
103cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      ++histogram[utf8_pos2][data[(pos + i + window_half) & mask]];
104cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      ++in_window_utf8[utf8_pos2];
105cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    }
106cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    int c = i < 1 ? 0 : data[(pos + i - 1) & mask];
107cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    int last_c = i < 2 ? 0 : data[(pos + i - 2) & mask];
108cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    int utf8_pos = UTF8Position(last_c, c, max_utf8);
109cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    int masked_pos = (pos + i) & mask;
110cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    int histo = histogram[utf8_pos][data[masked_pos]];
111cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    if (histo == 0) {
112cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka      histo = 1;
113cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    }
114278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka    float lit_cost = log2(static_cast<double>(in_window_utf8[utf8_pos])
115278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka                          / histo);
116278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka    lit_cost += 0.02905;
117278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka    if (lit_cost < 1.0) {
118278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka      lit_cost *= 0.5;
119278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka      lit_cost += 0.5;
120cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka    }
1210829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka    // Make the first bytes more expensive -- seems to help, not sure why.
1220829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka    // Perhaps because the entropy source is changing its properties
1230829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka    // rapidly in the beginning of the file, perhaps because the beginning
1240829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka    // of the data is a statistical "anomaly".
1250829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka    if (i < 2000) {
1260829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka      lit_cost += 0.7 - ((2000 - i) / 2000.0 * 0.35);
1270829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka    }
128278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka    cost[(pos + i) & cost_mask] = lit_cost;
129cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka  }
130cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka}
131cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka
1321571db36a9b00e895882ee236e9f84d62f8ea226Zoltan Szabadkavoid EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
133278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka                                 size_t cost_mask, const uint8_t *data,
134278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka                                 float *cost) {
13579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int histogram[256] = { 0 };
13679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int window_half = 2000;
13779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int in_window = std::min(static_cast<size_t>(window_half), len);
13879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
13979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Bootstrap histogram.
14079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < in_window; ++i) {
1411571db36a9b00e895882ee236e9f84d62f8ea226Zoltan Szabadka    ++histogram[data[(pos + i) & mask]];
14279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
14379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
14479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Compute bit costs with sliding window.
14579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < len; ++i) {
14679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (i - window_half >= 0) {
14779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      // Remove a byte in the past.
1481571db36a9b00e895882ee236e9f84d62f8ea226Zoltan Szabadka      --histogram[data[(pos + i - window_half) & mask]];
14979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      --in_window;
15079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
15179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (i + window_half < len) {
15279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      // Add a byte in the future.
1531571db36a9b00e895882ee236e9f84d62f8ea226Zoltan Szabadka      ++histogram[data[(pos + i + window_half) & mask]];
15479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      ++in_window;
15579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
156278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka    int histo = histogram[data[(pos + i) & mask]];
15779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (histo == 0) {
15879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      histo = 1;
15979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
160278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka    float lit_cost = log2(static_cast<double>(in_window) / histo);
161278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka    lit_cost += 0.029;
162278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka    if (lit_cost < 1.0) {
163278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka      lit_cost *= 0.5;
164278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka      lit_cost += 0.5;
16579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
166278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka    cost[(pos + i) & cost_mask] = lit_cost;
16779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
16879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
16979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
170cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka
17179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}  // namespace brotli
172