179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Copyright 2013 Google Inc. All Rights Reserved. 279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// 379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Licensed under the Apache License, Version 2.0 (the "License"); 479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// you may not use this file except in compliance with the License. 579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// You may obtain a copy of the License at 679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// 779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// http://www.apache.org/licenses/LICENSE-2.0 879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// 979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Unless required by applicable law or agreed to in writing, software 1079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// distributed under the License is distributed on an "AS IS" BASIS, 1179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// See the License for the specific language governing permissions and 1379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// limitations under the License. 1479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// 1579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Literal cost model to allow backward reference replacement to be efficient. 1679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka 1779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include "./literal_cost.h" 1879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka 1979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <math.h> 2079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <stdint.h> 2179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <algorithm> 2279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka 2379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkanamespace brotli { 2479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka 25cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadkastatic int UTF8Position(int last, int c, int clamp) { 26cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka if (c < 128) { 27cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka return 0; // Next one is the 'Byte 1' again. 28cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } else if (c >= 192) { 29cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka return std::min(1, clamp); // Next one is the 'Byte 2' of utf-8 encoding. 30cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } else { 31cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka // Let's decide over the last byte if this ends the sequence. 32cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka if (last < 0xe0) { 33cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka return 0; // Completed two or three byte coding. 34cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } else { 35cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka return std::min(2, clamp); // Next one is the 'Byte 3' of utf-8 encoding. 36cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } 37cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } 38cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka} 39cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka 40cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadkastatic int DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask, 41cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka const uint8_t *data) { 42cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int counts[3] = { 0 }; 43cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int max_utf8 = 1; // should be 2, but 1 compresses better. 44cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int last_c = 0; 45cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int utf8_pos = 0; 46cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka for (int i = 0; i < len; ++i) { 47cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int c = data[(pos + i) & mask]; 48cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka utf8_pos = UTF8Position(last_c, c, 2); 49cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka ++counts[utf8_pos]; 50cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka last_c = c; 51cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } 52cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka if (counts[2] < 500) { 53cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka max_utf8 = 1; 54cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } 55cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka if (counts[1] + counts[2] < 25) { 56cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka max_utf8 = 0; 57cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } 58cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka return max_utf8; 59cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka} 60cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka 61cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadkavoid EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask, 62278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka size_t cost_mask, const uint8_t *data, 63278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka float *cost) { 64cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka 65cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka // max_utf8 is 0 (normal ascii single byte modeling), 66cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka // 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling). 67cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka const int max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data); 68cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int histogram[3][256] = { { 0 } }; 69cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int window_half = 495; 70cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int in_window = std::min(static_cast<size_t>(window_half), len); 71cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int in_window_utf8[3] = { 0 }; 72cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka 73cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka // Bootstrap histograms. 74cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int last_c = 0; 75cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int utf8_pos = 0; 76cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka for (int i = 0; i < in_window; ++i) { 77cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int c = data[(pos + i) & mask]; 78cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka ++histogram[utf8_pos][c]; 79cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka ++in_window_utf8[utf8_pos]; 80cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka utf8_pos = UTF8Position(last_c, c, max_utf8); 81cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka last_c = c; 82cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } 83cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka 84cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka // Compute bit costs with sliding window. 85cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka for (int i = 0; i < len; ++i) { 86cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka if (i - window_half >= 0) { 87cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka // Remove a byte in the past. 88cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int c = (i - window_half - 1) < 0 ? 89cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka 0 : data[(pos + i - window_half - 1) & mask]; 90cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int last_c = (i - window_half - 2) < 0 ? 91cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka 0 : data[(pos + i - window_half - 2) & mask]; 92cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int utf8_pos2 = UTF8Position(last_c, c, max_utf8); 93cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka --histogram[utf8_pos2][data[(pos + i - window_half) & mask]]; 94cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka --in_window_utf8[utf8_pos2]; 95cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } 96cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka if (i + window_half < len) { 97cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka // Add a byte in the future. 98cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int c = (i + window_half - 1) < 0 ? 99cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka 0 : data[(pos + i + window_half - 1) & mask]; 100cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int last_c = (i + window_half - 2) < 0 ? 101cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka 0 : data[(pos + i + window_half - 2) & mask]; 102cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int utf8_pos2 = UTF8Position(last_c, c, max_utf8); 103cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka ++histogram[utf8_pos2][data[(pos + i + window_half) & mask]]; 104cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka ++in_window_utf8[utf8_pos2]; 105cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } 106cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int c = i < 1 ? 0 : data[(pos + i - 1) & mask]; 107cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int last_c = i < 2 ? 0 : data[(pos + i - 2) & mask]; 108cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int utf8_pos = UTF8Position(last_c, c, max_utf8); 109cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int masked_pos = (pos + i) & mask; 110cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka int histo = histogram[utf8_pos][data[masked_pos]]; 111cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka if (histo == 0) { 112cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka histo = 1; 113cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } 114278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka float lit_cost = log2(static_cast<double>(in_window_utf8[utf8_pos]) 115278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka / histo); 116278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka lit_cost += 0.02905; 117278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka if (lit_cost < 1.0) { 118278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka lit_cost *= 0.5; 119278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka lit_cost += 0.5; 120cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } 1210829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka // Make the first bytes more expensive -- seems to help, not sure why. 1220829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka // Perhaps because the entropy source is changing its properties 1230829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka // rapidly in the beginning of the file, perhaps because the beginning 1240829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka // of the data is a statistical "anomaly". 1250829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka if (i < 2000) { 1260829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka lit_cost += 0.7 - ((2000 - i) / 2000.0 * 0.35); 1270829e37293abc2523a1d2b0f4d68ff7b5fcd8e01Zoltan Szabadka } 128278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka cost[(pos + i) & cost_mask] = lit_cost; 129cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka } 130cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka} 131cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka 1321571db36a9b00e895882ee236e9f84d62f8ea226Zoltan Szabadkavoid EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask, 133278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka size_t cost_mask, const uint8_t *data, 134278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka float *cost) { 13579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka int histogram[256] = { 0 }; 13679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka int window_half = 2000; 13779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka int in_window = std::min(static_cast<size_t>(window_half), len); 13879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka 13979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka // Bootstrap histogram. 14079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka for (int i = 0; i < in_window; ++i) { 1411571db36a9b00e895882ee236e9f84d62f8ea226Zoltan Szabadka ++histogram[data[(pos + i) & mask]]; 14279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka } 14379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka 14479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka // Compute bit costs with sliding window. 14579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka for (int i = 0; i < len; ++i) { 14679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka if (i - window_half >= 0) { 14779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka // Remove a byte in the past. 1481571db36a9b00e895882ee236e9f84d62f8ea226Zoltan Szabadka --histogram[data[(pos + i - window_half) & mask]]; 14979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka --in_window; 15079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka } 15179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka if (i + window_half < len) { 15279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka // Add a byte in the future. 1531571db36a9b00e895882ee236e9f84d62f8ea226Zoltan Szabadka ++histogram[data[(pos + i + window_half) & mask]]; 15479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka ++in_window; 15579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka } 156278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka int histo = histogram[data[(pos + i) & mask]]; 15779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka if (histo == 0) { 15879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka histo = 1; 15979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka } 160278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka float lit_cost = log2(static_cast<double>(in_window) / histo); 161278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka lit_cost += 0.029; 162278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka if (lit_cost < 1.0) { 163278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka lit_cost *= 0.5; 164278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka lit_cost += 0.5; 16579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka } 166278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka cost[(pos + i) & cost_mask] = lit_cost; 16779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka } 16879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka} 16979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka 170cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadka 17179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka} // namespace brotli 172