179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Copyright 2013 Google Inc. All Rights Reserved.
279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka//
379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Licensed under the Apache License, Version 2.0 (the "License");
479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// you may not use this file except in compliance with the License.
579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// You may obtain a copy of the License at
679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka//
779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// http://www.apache.org/licenses/LICENSE-2.0
879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka//
979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Unless required by applicable law or agreed to in writing, software
1079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// distributed under the License is distributed on an "AS IS" BASIS,
1179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// See the License for the specific language governing permissions and
1379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// limitations under the License.
1479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka//
1579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Block split point selection utilities.
1679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
1779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include "./block_splitter.h"
1879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
1979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <math.h>
2079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <stdio.h>
2179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <stdlib.h>
2279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <string.h>
2379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
2479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <algorithm>
2579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <map>
2679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
2779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include "./cluster.h"
2879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include "./command.h"
2979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include "./fast_log.h"
3079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include "./histogram.h"
3179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
3279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkanamespace brotli {
3379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
34cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadkastatic const int kMaxLiteralHistograms = 100;
3579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkastatic const int kMaxCommandHistograms = 50;
3679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkastatic const double kLiteralBlockSwitchCost = 26;
3779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkastatic const double kCommandBlockSwitchCost = 13.5;
3879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkastatic const double kDistanceBlockSwitchCost = 14.6;
3979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkastatic const int kLiteralStrideLength = 70;
4079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkastatic const int kCommandStrideLength = 40;
41cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadkastatic const int kSymbolsPerLiteralHistogram = 544;
4279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkastatic const int kSymbolsPerCommandHistogram = 530;
43cbd5cb55f487eda746b0d6f8b5742b5a8e5c846aZoltan Szabadkastatic const int kSymbolsPerDistanceHistogram = 544;
4479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkastatic const int kMinLengthForBlockSplitting = 128;
4579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkastatic const int kIterMulForRefining = 2;
4679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkastatic const int kMinItersForRefining = 100;
4779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
4879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid CopyLiteralsToByteArray(const std::vector<Command>& cmds,
4979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                             const uint8_t* data,
5079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                             std::vector<uint8_t>* literals) {
5179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Count how many we have.
5279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  size_t total_length = 0;
5379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < cmds.size(); ++i) {
5479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    total_length += cmds[i].insert_length_;
5579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
5679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  if (total_length == 0) {
5779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    return;
5879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
5979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
6079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Allocate.
6179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  literals->resize(total_length);
6279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
6379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Loop again, and copy this time.
6479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  size_t pos = 0;
6579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  size_t from_pos = 0;
6679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < cmds.size() && pos < total_length; ++i) {
6779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    memcpy(&(*literals)[pos], data + from_pos, cmds[i].insert_length_);
6879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    pos += cmds[i].insert_length_;
6979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    from_pos += cmds[i].insert_length_ + cmds[i].copy_length_;
7079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
7179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
7279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
7379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid CopyCommandsToByteArray(const std::vector<Command>& cmds,
7479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                             std::vector<uint16_t>* insert_and_copy_codes,
7579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                             std::vector<uint8_t>* distance_prefixes) {
7679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < cmds.size(); ++i) {
7779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    const Command& cmd = cmds[i];
7879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    insert_and_copy_codes->push_back(cmd.command_prefix_);
7979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (cmd.copy_length_ > 0 && cmd.distance_prefix_ != 0xffff) {
8079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      distance_prefixes->push_back(cmd.distance_prefix_);
8179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
8279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
8379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
8479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
8579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkatemplate<typename HistogramType, typename DataType>
8679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid InitialEntropyCodes(const DataType* data, size_t length,
8779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                         int literals_per_histogram,
8879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                         int max_histograms,
8979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                         size_t stride,
9079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                         std::vector<HistogramType>* vec) {
9179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int total_histograms = length / literals_per_histogram + 1;
9279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  if (total_histograms > max_histograms) {
9379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    total_histograms = max_histograms;
9479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
9579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  unsigned int seed = 7;
9679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int block_length = length / total_histograms;
9779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < total_histograms; ++i) {
9879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    int pos = length * i / total_histograms;
9979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (i != 0) {
10079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      pos += rand_r(&seed) % block_length;
10179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
10279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (pos + stride >= length) {
10379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      pos = length - stride - 1;
10479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
10579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    HistogramType histo;
10679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    histo.Add(data + pos, stride);
10779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    vec->push_back(histo);
10879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
10979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
11079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
11179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkatemplate<typename HistogramType, typename DataType>
11279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid RandomSample(unsigned int* seed,
11379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                  const DataType* data,
11479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                  size_t length,
11579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                  size_t stride,
11679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                  HistogramType* sample) {
11740955ce409e55573646af2b0d0ece2e2404f2e7aZoltan Szabadka  size_t pos = 0;
11840955ce409e55573646af2b0d0ece2e2404f2e7aZoltan Szabadka  if (stride >= length) {
11940955ce409e55573646af2b0d0ece2e2404f2e7aZoltan Szabadka    pos = 0;
12040955ce409e55573646af2b0d0ece2e2404f2e7aZoltan Szabadka    stride = length;
12140955ce409e55573646af2b0d0ece2e2404f2e7aZoltan Szabadka  } else {
12240955ce409e55573646af2b0d0ece2e2404f2e7aZoltan Szabadka    pos = rand_r(seed) % (length - stride + 1);
12340955ce409e55573646af2b0d0ece2e2404f2e7aZoltan Szabadka  }
12479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  sample->Add(data + pos, stride);
12579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
12679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
12779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkatemplate<typename HistogramType, typename DataType>
12879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid RefineEntropyCodes(const DataType* data, size_t length,
12979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                        size_t stride,
13079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                        std::vector<HistogramType>* vec) {
13140955ce409e55573646af2b0d0ece2e2404f2e7aZoltan Szabadka  int iters =
13279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      kIterMulForRefining * length / stride + kMinItersForRefining;
13379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  unsigned int seed = 7;
13440955ce409e55573646af2b0d0ece2e2404f2e7aZoltan Szabadka  iters = ((iters + vec->size() - 1) / vec->size()) * vec->size();
13579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int iter = 0; iter < iters; ++iter) {
13679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    HistogramType sample;
13779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    RandomSample(&seed, data, length, stride, &sample);
13840955ce409e55573646af2b0d0ece2e2404f2e7aZoltan Szabadka    int ix = iter % vec->size();
13979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    (*vec)[ix].AddHistogram(sample);
14079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
14179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
14279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
14379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkainline static float BitCost(int total, int count) {
14479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  return count == 0 ? FastLog2(total) + 2 : FastLog2(total) - FastLog2(count);
14579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
14679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
14779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkatemplate<typename DataType, int kSize>
14879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid FindBlocks(const DataType* data, const size_t length,
14979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                const double block_switch_bitcost,
15079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                const std::vector<Histogram<kSize> > &vec,
15179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                uint8_t *block_id) {
15279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  if (vec.size() <= 1) {
15379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    for (int i = 0; i < length; ++i) {
15479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      block_id[i] = 0;
15579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
15679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    return;
15779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
15879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int vecsize = vec.size();
15979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  double* insert_cost = new double[kSize * vecsize];
16079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * vecsize);
16179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < kSize; ++i) {
16279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    for (int j = 0; j < vecsize; ++j) {
16379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      insert_cost[i * vecsize + j] =
16479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka          BitCost(vec[j].total_count_, vec[j].data_[i]);
16579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
16679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
16779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  double *cost = new double[vecsize];
16879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  memset(cost, 0, sizeof(cost[0]) * vecsize);
16979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  bool* switch_signal = new bool[length * vecsize];
17079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  memset(switch_signal, 0, sizeof(switch_signal[0]) * length * vecsize);
17179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // After each iteration of this loop, cost[k] will contain the difference
17279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // between the minimum cost of arriving at the current byte position using
17379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // entropy code k, and the minimum cost of arriving at the current byte
17479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // position. This difference is capped at the block switch cost, and if it
17579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // reaches block switch cost, it means that when we trace back from the last
17679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // position, we need to switch here.
17779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (size_t byte_ix = 0; byte_ix < length; ++byte_ix) {
17879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    int ix = byte_ix * vecsize;
17979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    int insert_cost_ix = data[byte_ix] * vecsize;
18079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    double min_cost = 1e99;
18179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    for (int k = 0; k < vecsize; ++k) {
18279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      // We are coding the symbol in data[byte_ix] with entropy code k.
18379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      cost[k] += insert_cost[insert_cost_ix + k];
18479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      if (cost[k] < min_cost) {
18579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka        min_cost = cost[k];
18679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka        block_id[byte_ix] = k;
18779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      }
18879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
18979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    double block_switch_cost = block_switch_bitcost;
19079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    // More blocks for the beginning.
19179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (byte_ix < 2000) {
19279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      block_switch_cost *= 0.77 + 0.07 * byte_ix / 2000;
19379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
19479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    for (int k = 0; k < vecsize; ++k) {
19579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      cost[k] -= min_cost;
19679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      if (cost[k] >= block_switch_cost) {
19779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka        cost[k] = block_switch_cost;
19879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka        switch_signal[ix + k] = true;
19979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      }
20079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
20179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
20279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Now trace back from the last position and switch at the marked places.
20379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int byte_ix = length - 1;
20479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int ix = byte_ix * vecsize;
20579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int cur_id = block_id[byte_ix];
20679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  while (byte_ix > 0) {
20779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    --byte_ix;
20879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    ix -= vecsize;
20979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (switch_signal[ix + cur_id]) {
21079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      cur_id = block_id[byte_ix];
21179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
21279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    block_id[byte_ix] = cur_id;
21379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
21479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  delete[] insert_cost;
21579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  delete[] cost;
21679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  delete[] switch_signal;
21779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
21879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
21979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkaint RemapBlockIds(uint8_t* block_ids, const size_t length) {
22079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::map<uint8_t, uint8_t> new_id;
22179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int next_id = 0;
22279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < length; ++i) {
22379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (new_id.find(block_ids[i]) == new_id.end()) {
22479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      new_id[block_ids[i]] = next_id;
22579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      ++next_id;
22679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
22779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
22879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < length; ++i) {
22979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    block_ids[i] = new_id[block_ids[i]];
23079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
23179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  return next_id;
23279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
23379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
23479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkatemplate<typename HistogramType, typename DataType>
23579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid BuildBlockHistograms(const DataType* data, const size_t length,
23679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                          uint8_t* block_ids,
23779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                          std::vector<HistogramType>* histograms) {
23879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int num_types = RemapBlockIds(block_ids, length);
23979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  histograms->clear();
24079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  histograms->resize(num_types);
24179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < length; ++i) {
24279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    (*histograms)[block_ids[i]].Add(data[i]);
24379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
24479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
24579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
24679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkatemplate<typename HistogramType, typename DataType>
24779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid ClusterBlocks(const DataType* data, const size_t length,
24879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                   uint8_t* block_ids) {
24979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<HistogramType> histograms;
25079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<int> block_index(length);
25179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int cur_idx = 0;
25279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  HistogramType cur_histogram;
25379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < length; ++i) {
25479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    bool block_boundary = (i + 1 == length || block_ids[i] != block_ids[i + 1]);
25579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    block_index[i] = cur_idx;
25679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    cur_histogram.Add(data[i]);
25779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (block_boundary) {
25879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      histograms.push_back(cur_histogram);
25979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      cur_histogram.Clear();
26079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      ++cur_idx;
26179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
26279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
26379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<HistogramType> clustered_histograms;
26479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<int> histogram_symbols;
265e70949119a5540e62ef3b9aae797f797a8d8b44bZoltan Szabadka  // Block ids need to fit in one byte.
266e70949119a5540e62ef3b9aae797f797a8d8b44bZoltan Szabadka  static const int kMaxNumberOfBlockTypes = 256;
26779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  ClusterHistograms(histograms, 1, histograms.size(),
26879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                    kMaxNumberOfBlockTypes,
26979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                    &clustered_histograms,
27079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                    &histogram_symbols);
27179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < length; ++i) {
27279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    block_ids[i] = histogram_symbols[block_index[i]];
27379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
27479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
27579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
27679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid BuildBlockSplit(const std::vector<uint8_t>& block_ids, BlockSplit* split) {
27779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int cur_id = block_ids[0];
27879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int cur_length = 1;
27979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  split->num_types_ = -1;
28079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 1; i < block_ids.size(); ++i) {
28179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (block_ids[i] != cur_id) {
28279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      split->types_.push_back(cur_id);
28379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      split->lengths_.push_back(cur_length);
28479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      split->num_types_ = std::max(split->num_types_, cur_id);
28579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      cur_id = block_ids[i];
28679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      cur_length = 0;
28779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
28879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    ++cur_length;
28979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
29079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  split->types_.push_back(cur_id);
29179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  split->lengths_.push_back(cur_length);
29279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  split->num_types_ = std::max(split->num_types_, cur_id);
29379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  ++split->num_types_;
29479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
29579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
29679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkatemplate<typename HistogramType, typename DataType>
29779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid SplitByteVector(const std::vector<DataType>& data,
29879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                     const int literals_per_histogram,
29979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                     const int max_histograms,
30079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                     const int sampling_stride_length,
30179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                     const double block_switch_cost,
30279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                     BlockSplit* split) {
30379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  if (data.empty()) {
30479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    split->num_types_ = 0;
30579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    return;
30679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  } else if (data.size() < kMinLengthForBlockSplitting) {
30779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    split->num_types_ = 1;
30879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    split->types_.push_back(0);
30979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    split->lengths_.push_back(data.size());
31079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    return;
31179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
31279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<HistogramType> histograms;
31379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Find good entropy codes.
31479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  InitialEntropyCodes(data.data(), data.size(),
31579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                      literals_per_histogram,
31679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                      max_histograms,
31779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                      sampling_stride_length,
31879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                      &histograms);
31979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  RefineEntropyCodes(data.data(), data.size(),
32079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                     sampling_stride_length,
32179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                     &histograms);
32279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Find a good path through literals with the good entropy codes.
32379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<uint8_t> block_ids(data.size());
32479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < 10; ++i) {
32579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    FindBlocks(data.data(), data.size(),
32679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka               block_switch_cost,
32779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka               histograms,
32879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka               &block_ids[0]);
32979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    BuildBlockHistograms(data.data(), data.size(), &block_ids[0], &histograms);
33079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
33179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  ClusterBlocks<HistogramType>(data.data(), data.size(), &block_ids[0]);
33279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  BuildBlockSplit(block_ids, split);
33379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
33479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
33579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid SplitBlock(const std::vector<Command>& cmds,
33679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                const uint8_t* data,
33779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                BlockSplit* literal_split,
33879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                BlockSplit* insert_and_copy_split,
33979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                BlockSplit* dist_split) {
34079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Create a continuous array of literals.
34179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<uint8_t> literals;
34279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  CopyLiteralsToByteArray(cmds, data, &literals);
34379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
34479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Compute prefix codes for commands.
34579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<uint16_t> insert_and_copy_codes;
34679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<uint8_t> distance_prefixes;
34779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  CopyCommandsToByteArray(cmds,
34879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                          &insert_and_copy_codes,
34979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                          &distance_prefixes);
35079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
351278b89484fa947fad7fbf8753aadff0c9ce18a8cZoltan Szabadka
35279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  SplitByteVector<HistogramLiteral>(
35379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      literals,
35479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
35579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      kLiteralStrideLength, kLiteralBlockSwitchCost,
35679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      literal_split);
35779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  SplitByteVector<HistogramCommand>(
35879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      insert_and_copy_codes,
35979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      kSymbolsPerCommandHistogram, kMaxCommandHistograms,
36079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      kCommandStrideLength, kCommandBlockSwitchCost,
36179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      insert_and_copy_split);
36279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  SplitByteVector<HistogramDistance>(
36379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      distance_prefixes,
36479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
36579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      kCommandStrideLength, kDistanceBlockSwitchCost,
36679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      dist_split);
36779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
36879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
36979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid SplitBlockByTotalLength(const std::vector<Command>& all_commands,
37079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                             int input_size,
37179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                             int target_length,
37279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                             std::vector<std::vector<Command> >* blocks) {
37379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int num_blocks = input_size / target_length + 1;
37479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int length_limit = input_size / num_blocks + 1;
37579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int total_length = 0;
37679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<Command> cur_block;
37779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < all_commands.size(); ++i) {
37879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    const Command& cmd = all_commands[i];
37979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    int cmd_length = cmd.insert_length_ + cmd.copy_length_;
38079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (total_length > length_limit) {
38179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      blocks->push_back(cur_block);
38279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      cur_block.clear();
38379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      total_length = 0;
38479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
38579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    cur_block.push_back(cmd);
38679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    total_length += cmd_length;
38779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
38879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  blocks->push_back(cur_block);
38979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
39079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
39179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}  // namespace brotli
392