179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Copyright 2013 Google Inc. All Rights Reserved.
279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka//
379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Licensed under the Apache License, Version 2.0 (the "License");
479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// you may not use this file except in compliance with the License.
579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// You may obtain a copy of the License at
679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka//
779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// http://www.apache.org/licenses/LICENSE-2.0
879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka//
979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Unless required by applicable law or agreed to in writing, software
1079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// distributed under the License is distributed on an "AS IS" BASIS,
1179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// See the License for the specific language governing permissions and
1379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// limitations under the License.
1479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka//
1579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Functions for clustering similar histograms together.
1679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
1779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#ifndef BROTLI_ENC_CLUSTER_H_
1879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#define BROTLI_ENC_CLUSTER_H_
1979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
2079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <math.h>
2179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <stdint.h>
2279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <stdio.h>
2379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <complex>
2479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <map>
2579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <set>
2679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <utility>
2779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include <vector>
2879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
2979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include "./bit_cost.h"
3079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include "./entropy_encode.h"
3179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include "./fast_log.h"
3279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#include "./histogram.h"
3379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
3479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkanamespace brotli {
3579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
3679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkastruct HistogramPair {
3779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int idx1;
3879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int idx2;
3979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  bool valid;
4079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  double cost_combo;
4179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  double cost_diff;
4279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka};
4379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
4479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkastruct HistogramPairComparator {
4579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  bool operator()(const HistogramPair& p1, const HistogramPair& p2) {
4679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (p1.cost_diff != p2.cost_diff) {
4779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      return p1.cost_diff > p2.cost_diff;
4879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
4979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    return abs(p1.idx1 - p1.idx2) > abs(p2.idx1 - p2.idx2);
5079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
5179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka};
5279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
5379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Returns entropy reduction of the context map when we combine two clusters.
5479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkainline double ClusterCostDiff(int size_a, int size_b) {
5579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int size_c = size_a + size_b;
5679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  return size_a * FastLog2(size_a) + size_b * FastLog2(size_b) -
5779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      size_c * FastLog2(size_c);
5879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
5979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
6079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
6179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// it is below a threshold, stores the pair (idx1, idx2) in the *pairs heap.
6279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkatemplate<int kSize>
6379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid CompareAndPushToHeap(const Histogram<kSize>* out,
6479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                          const int* cluster_size,
6579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                          int idx1, int idx2,
6679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                          std::vector<HistogramPair>* pairs) {
6779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  if (idx1 == idx2) {
6879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    return;
6979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
7079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  if (idx2 < idx1) {
7179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    int t = idx2;
7279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    idx2 = idx1;
7379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    idx1 = t;
7479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
7579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  bool store_pair = false;
7679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  HistogramPair p;
7779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  p.idx1 = idx1;
7879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  p.idx2 = idx2;
7979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  p.valid = true;
8079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
8179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  p.cost_diff -= out[idx1].bit_cost_;
8279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  p.cost_diff -= out[idx2].bit_cost_;
8379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
8479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  if (out[idx1].total_count_ == 0) {
8579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    p.cost_combo = out[idx2].bit_cost_;
8679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    store_pair = true;
8779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  } else if (out[idx2].total_count_ == 0) {
8879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    p.cost_combo = out[idx1].bit_cost_;
8979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    store_pair = true;
9079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  } else {
9179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    double threshold = pairs->empty() ? 1e99 :
9279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka        std::max(0.0, (*pairs)[0].cost_diff);
9379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    Histogram<kSize> combo = out[idx1];
9479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    combo.AddHistogram(out[idx2]);
9579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    double cost_combo = PopulationCost(combo);
9679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (cost_combo < threshold - p.cost_diff) {
9779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      p.cost_combo = cost_combo;
9879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      store_pair = true;
9979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
10079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
10179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  if (store_pair) {
10279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    p.cost_diff += p.cost_combo;
10379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    pairs->push_back(p);
10479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    push_heap(pairs->begin(), pairs->end(), HistogramPairComparator());
10579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
10679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
10779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
10879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkatemplate<int kSize>
10979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid HistogramCombine(Histogram<kSize>* out,
11079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                      int* cluster_size,
11179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                      int* symbols,
11279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                      int symbols_size,
11379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                      int max_clusters) {
11479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  double cost_diff_threshold = 0.0;
11579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int min_cluster_size = 1;
11679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::set<int> all_symbols;
11779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<int> clusters;
11879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < symbols_size; ++i) {
11979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (all_symbols.find(symbols[i]) == all_symbols.end()) {
12079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      all_symbols.insert(symbols[i]);
12179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      clusters.push_back(symbols[i]);
12279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
12379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
12479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
12579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // We maintain a heap of histogram pairs, ordered by the bit cost reduction.
12679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<HistogramPair> pairs;
12779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int idx1 = 0; idx1 < clusters.size(); ++idx1) {
12879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    for (int idx2 = idx1 + 1; idx2 < clusters.size(); ++idx2) {
12979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      CompareAndPushToHeap(out, cluster_size, clusters[idx1], clusters[idx2],
13079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                           &pairs);
13179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
13279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
13379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
13479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  while (clusters.size() > min_cluster_size) {
13579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (pairs[0].cost_diff >= cost_diff_threshold) {
13679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      cost_diff_threshold = 1e99;
13779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      min_cluster_size = max_clusters;
13879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      continue;
13979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
14079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    // Take the best pair from the top of heap.
14179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    int best_idx1 = pairs[0].idx1;
14279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    int best_idx2 = pairs[0].idx2;
14379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    out[best_idx1].AddHistogram(out[best_idx2]);
14479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    out[best_idx1].bit_cost_ = pairs[0].cost_combo;
14579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    cluster_size[best_idx1] += cluster_size[best_idx2];
14679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    for (int i = 0; i < symbols_size; ++i) {
14779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      if (symbols[i] == best_idx2) {
14879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka        symbols[i] = best_idx1;
14979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      }
15079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
15179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    for (int i = 0; i + 1 < clusters.size(); ++i) {
15279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      if (clusters[i] >= best_idx2) {
15379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka        clusters[i] = clusters[i + 1];
15479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      }
15579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
15679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    clusters.pop_back();
15779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    // Invalidate pairs intersecting the just combined best pair.
15879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    for (int i = 0; i < pairs.size(); ++i) {
15979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      HistogramPair& p = pairs[i];
16079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      if (p.idx1 == best_idx1 || p.idx2 == best_idx1 ||
16179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka          p.idx1 == best_idx2 || p.idx2 == best_idx2) {
16279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka        p.valid = false;
16379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      }
16479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
16579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    // Pop invalid pairs from the top of the heap.
16679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    while (!pairs.empty() && !pairs[0].valid) {
16779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      pop_heap(pairs.begin(), pairs.end(), HistogramPairComparator());
16879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      pairs.pop_back();
16979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
17079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    // Push new pairs formed with the combined histogram to the heap.
17179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    for (int i = 0; i < clusters.size(); ++i) {
17279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      CompareAndPushToHeap(out, cluster_size, best_idx1, clusters[i], &pairs);
17379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
17479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
17579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
17679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
17779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// -----------------------------------------------------------------------------
17879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Histogram refinement
17979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
18079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// What is the bit cost of moving histogram from cur_symbol to candidate.
18179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkatemplate<int kSize>
18279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkadouble HistogramBitCostDistance(const Histogram<kSize>& histogram,
18379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                                const Histogram<kSize>& candidate) {
18479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  if (histogram.total_count_ == 0) {
18579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    return 0.0;
18679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
18779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  Histogram<kSize> tmp = histogram;
18879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  tmp.AddHistogram(candidate);
18979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  return PopulationCost(tmp) - candidate.bit_cost_;
19079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
19179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
19279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Find the best 'out' histogram for each of the 'in' histograms.
19379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Note: we assume that out[]->bit_cost_ is already up-to-date.
19479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkatemplate<int kSize>
19579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid HistogramRemap(const Histogram<kSize>* in, int in_size,
19679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                    Histogram<kSize>* out, int* symbols) {
19779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::set<int> all_symbols;
19879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < in_size; ++i) {
19979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    all_symbols.insert(symbols[i]);
20079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
20179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < in_size; ++i) {
20279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    int best_out = i == 0 ? symbols[0] : symbols[i - 1];
20379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    double best_bits = HistogramBitCostDistance(in[i], out[best_out]);
20479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    for (std::set<int>::const_iterator k = all_symbols.begin();
20579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka         k != all_symbols.end(); ++k) {
20679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      const double cur_bits = HistogramBitCostDistance(in[i], out[*k]);
20779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      if (cur_bits < best_bits) {
20879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka        best_bits = cur_bits;
20979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka        best_out = *k;
21079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      }
21179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
21279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    symbols[i] = best_out;
21379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
21479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
21579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Recompute each out based on raw and symbols.
21679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (std::set<int>::const_iterator k = all_symbols.begin();
21779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka       k != all_symbols.end(); ++k) {
21879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    out[*k].Clear();
21979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
22079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < in_size; ++i) {
22179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    out[symbols[i]].AddHistogram(in[i]);
22279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
22379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
22479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
22579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Reorder histograms in *out so that the new symbols in *symbols come in
22679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// increasing order.
22779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkatemplate<int kSize>
22879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid HistogramReindex(std::vector<Histogram<kSize> >* out,
22979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                      std::vector<int>* symbols) {
23079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<Histogram<kSize> > tmp(*out);
23179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::map<int, int> new_index;
23279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  int next_index = 0;
23379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < symbols->size(); ++i) {
23479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    if (new_index.find((*symbols)[i]) == new_index.end()) {
23579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      new_index[(*symbols)[i]] = next_index;
23679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      (*out)[next_index] = tmp[(*symbols)[i]];
23779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      ++next_index;
23879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
23979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
24079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  out->resize(next_index);
24179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < symbols->size(); ++i) {
24279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    (*symbols)[i] = new_index[(*symbols)[i]];
24379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
24479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
24579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
24679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// Clusters similar histograms in 'in' together, the selected histograms are
24779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// placed in 'out', and for each index in 'in', *histogram_symbols will
24879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka// indicate which of the 'out' histograms is the best approximation.
24979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkatemplate<int kSize>
25079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadkavoid ClusterHistograms(const std::vector<Histogram<kSize> >& in,
25179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                       int num_contexts, int num_blocks,
25279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                       int max_histograms,
25379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                       std::vector<Histogram<kSize> >* out,
25479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                       std::vector<int>* histogram_symbols) {
25579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  const int in_size = num_contexts * num_blocks;
25679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  std::vector<int> cluster_size(in_size, 1);
25779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  out->resize(in_size);
25879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  histogram_symbols->resize(in_size);
25979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  for (int i = 0; i < in_size; ++i) {
26079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    (*out)[i] = in[i];
26179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    (*out)[i].bit_cost_ = PopulationCost(in[i]);
26279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    (*histogram_symbols)[i] = i;
26379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
26479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
26579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Collapse similar histograms within a block type.
26679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  if (num_contexts > 1) {
26779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    for (int i = 0; i < num_blocks; ++i) {
26879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka      HistogramCombine(&(*out)[0], &cluster_size[0],
26979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                       &(*histogram_symbols)[i * num_contexts], num_contexts,
27079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                       max_histograms);
27179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka    }
27279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  }
27379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
27479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Collapse similar histograms.
27579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  HistogramCombine(&(*out)[0], &cluster_size[0],
27679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                   &(*histogram_symbols)[0], in_size,
27779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka                   max_histograms);
27879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
27979e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Find the optimal map from original histograms to the final ones.
28079e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  HistogramRemap(&in[0], in_size, &(*out)[0], &(*histogram_symbols)[0]);
28179e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
28279e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  // Convert the context map to a canonical form.
28379e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka  HistogramReindex(out, histogram_symbols);
28479e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}
28579e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
28679e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka}  // namespace brotli
28779e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka
28879e99afe468407e9ff9f0820df7190cb069eabebZoltan Szabadka#endif  // BROTLI_ENC_CLUSTER_H_
289