1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15// Functions for clustering similar histograms together.
16
17#ifndef BROTLI_ENC_CLUSTER_H_
18#define BROTLI_ENC_CLUSTER_H_
19
20#include <math.h>
21#include <stdint.h>
22#include <stdio.h>
23#include <complex>
24#include <map>
25#include <set>
26#include <utility>
27#include <vector>
28
29#include "./bit_cost.h"
30#include "./entropy_encode.h"
31#include "./fast_log.h"
32#include "./histogram.h"
33
34namespace brotli {
35
36struct HistogramPair {
37  int idx1;
38  int idx2;
39  bool valid;
40  double cost_combo;
41  double cost_diff;
42};
43
44struct HistogramPairComparator {
45  bool operator()(const HistogramPair& p1, const HistogramPair& p2) {
46    if (p1.cost_diff != p2.cost_diff) {
47      return p1.cost_diff > p2.cost_diff;
48    }
49    return abs(p1.idx1 - p1.idx2) > abs(p2.idx1 - p2.idx2);
50  }
51};
52
53// Returns entropy reduction of the context map when we combine two clusters.
54inline double ClusterCostDiff(int size_a, int size_b) {
55  int size_c = size_a + size_b;
56  return size_a * FastLog2(size_a) + size_b * FastLog2(size_b) -
57      size_c * FastLog2(size_c);
58}
59
60// Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
61// it is below a threshold, stores the pair (idx1, idx2) in the *pairs heap.
62template<int kSize>
63void CompareAndPushToHeap(const Histogram<kSize>* out,
64                          const int* cluster_size,
65                          int idx1, int idx2,
66                          std::vector<HistogramPair>* pairs) {
67  if (idx1 == idx2) {
68    return;
69  }
70  if (idx2 < idx1) {
71    int t = idx2;
72    idx2 = idx1;
73    idx1 = t;
74  }
75  bool store_pair = false;
76  HistogramPair p;
77  p.idx1 = idx1;
78  p.idx2 = idx2;
79  p.valid = true;
80  p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
81  p.cost_diff -= out[idx1].bit_cost_;
82  p.cost_diff -= out[idx2].bit_cost_;
83
84  if (out[idx1].total_count_ == 0) {
85    p.cost_combo = out[idx2].bit_cost_;
86    store_pair = true;
87  } else if (out[idx2].total_count_ == 0) {
88    p.cost_combo = out[idx1].bit_cost_;
89    store_pair = true;
90  } else {
91    double threshold = pairs->empty() ? 1e99 :
92        std::max(0.0, (*pairs)[0].cost_diff);
93    Histogram<kSize> combo = out[idx1];
94    combo.AddHistogram(out[idx2]);
95    double cost_combo = PopulationCost(combo);
96    if (cost_combo < threshold - p.cost_diff) {
97      p.cost_combo = cost_combo;
98      store_pair = true;
99    }
100  }
101  if (store_pair) {
102    p.cost_diff += p.cost_combo;
103    pairs->push_back(p);
104    push_heap(pairs->begin(), pairs->end(), HistogramPairComparator());
105  }
106}
107
108template<int kSize>
109void HistogramCombine(Histogram<kSize>* out,
110                      int* cluster_size,
111                      int* symbols,
112                      int symbols_size,
113                      int max_clusters) {
114  double cost_diff_threshold = 0.0;
115  int min_cluster_size = 1;
116  std::set<int> all_symbols;
117  std::vector<int> clusters;
118  for (int i = 0; i < symbols_size; ++i) {
119    if (all_symbols.find(symbols[i]) == all_symbols.end()) {
120      all_symbols.insert(symbols[i]);
121      clusters.push_back(symbols[i]);
122    }
123  }
124
125  // We maintain a heap of histogram pairs, ordered by the bit cost reduction.
126  std::vector<HistogramPair> pairs;
127  for (int idx1 = 0; idx1 < clusters.size(); ++idx1) {
128    for (int idx2 = idx1 + 1; idx2 < clusters.size(); ++idx2) {
129      CompareAndPushToHeap(out, cluster_size, clusters[idx1], clusters[idx2],
130                           &pairs);
131    }
132  }
133
134  while (clusters.size() > min_cluster_size) {
135    if (pairs[0].cost_diff >= cost_diff_threshold) {
136      cost_diff_threshold = 1e99;
137      min_cluster_size = max_clusters;
138      continue;
139    }
140    // Take the best pair from the top of heap.
141    int best_idx1 = pairs[0].idx1;
142    int best_idx2 = pairs[0].idx2;
143    out[best_idx1].AddHistogram(out[best_idx2]);
144    out[best_idx1].bit_cost_ = pairs[0].cost_combo;
145    cluster_size[best_idx1] += cluster_size[best_idx2];
146    for (int i = 0; i < symbols_size; ++i) {
147      if (symbols[i] == best_idx2) {
148        symbols[i] = best_idx1;
149      }
150    }
151    for (int i = 0; i + 1 < clusters.size(); ++i) {
152      if (clusters[i] >= best_idx2) {
153        clusters[i] = clusters[i + 1];
154      }
155    }
156    clusters.pop_back();
157    // Invalidate pairs intersecting the just combined best pair.
158    for (int i = 0; i < pairs.size(); ++i) {
159      HistogramPair& p = pairs[i];
160      if (p.idx1 == best_idx1 || p.idx2 == best_idx1 ||
161          p.idx1 == best_idx2 || p.idx2 == best_idx2) {
162        p.valid = false;
163      }
164    }
165    // Pop invalid pairs from the top of the heap.
166    while (!pairs.empty() && !pairs[0].valid) {
167      pop_heap(pairs.begin(), pairs.end(), HistogramPairComparator());
168      pairs.pop_back();
169    }
170    // Push new pairs formed with the combined histogram to the heap.
171    for (int i = 0; i < clusters.size(); ++i) {
172      CompareAndPushToHeap(out, cluster_size, best_idx1, clusters[i], &pairs);
173    }
174  }
175}
176
177// -----------------------------------------------------------------------------
178// Histogram refinement
179
180// What is the bit cost of moving histogram from cur_symbol to candidate.
181template<int kSize>
182double HistogramBitCostDistance(const Histogram<kSize>& histogram,
183                                const Histogram<kSize>& candidate) {
184  if (histogram.total_count_ == 0) {
185    return 0.0;
186  }
187  Histogram<kSize> tmp = histogram;
188  tmp.AddHistogram(candidate);
189  return PopulationCost(tmp) - candidate.bit_cost_;
190}
191
192// Find the best 'out' histogram for each of the 'in' histograms.
193// Note: we assume that out[]->bit_cost_ is already up-to-date.
194template<int kSize>
195void HistogramRemap(const Histogram<kSize>* in, int in_size,
196                    Histogram<kSize>* out, int* symbols) {
197  std::set<int> all_symbols;
198  for (int i = 0; i < in_size; ++i) {
199    all_symbols.insert(symbols[i]);
200  }
201  for (int i = 0; i < in_size; ++i) {
202    int best_out = i == 0 ? symbols[0] : symbols[i - 1];
203    double best_bits = HistogramBitCostDistance(in[i], out[best_out]);
204    for (std::set<int>::const_iterator k = all_symbols.begin();
205         k != all_symbols.end(); ++k) {
206      const double cur_bits = HistogramBitCostDistance(in[i], out[*k]);
207      if (cur_bits < best_bits) {
208        best_bits = cur_bits;
209        best_out = *k;
210      }
211    }
212    symbols[i] = best_out;
213  }
214
215  // Recompute each out based on raw and symbols.
216  for (std::set<int>::const_iterator k = all_symbols.begin();
217       k != all_symbols.end(); ++k) {
218    out[*k].Clear();
219  }
220  for (int i = 0; i < in_size; ++i) {
221    out[symbols[i]].AddHistogram(in[i]);
222  }
223}
224
225// Reorder histograms in *out so that the new symbols in *symbols come in
226// increasing order.
227template<int kSize>
228void HistogramReindex(std::vector<Histogram<kSize> >* out,
229                      std::vector<int>* symbols) {
230  std::vector<Histogram<kSize> > tmp(*out);
231  std::map<int, int> new_index;
232  int next_index = 0;
233  for (int i = 0; i < symbols->size(); ++i) {
234    if (new_index.find((*symbols)[i]) == new_index.end()) {
235      new_index[(*symbols)[i]] = next_index;
236      (*out)[next_index] = tmp[(*symbols)[i]];
237      ++next_index;
238    }
239  }
240  out->resize(next_index);
241  for (int i = 0; i < symbols->size(); ++i) {
242    (*symbols)[i] = new_index[(*symbols)[i]];
243  }
244}
245
246// Clusters similar histograms in 'in' together, the selected histograms are
247// placed in 'out', and for each index in 'in', *histogram_symbols will
248// indicate which of the 'out' histograms is the best approximation.
249template<int kSize>
250void ClusterHistograms(const std::vector<Histogram<kSize> >& in,
251                       int num_contexts, int num_blocks,
252                       int max_histograms,
253                       std::vector<Histogram<kSize> >* out,
254                       std::vector<int>* histogram_symbols) {
255  const int in_size = num_contexts * num_blocks;
256  std::vector<int> cluster_size(in_size, 1);
257  out->resize(in_size);
258  histogram_symbols->resize(in_size);
259  for (int i = 0; i < in_size; ++i) {
260    (*out)[i] = in[i];
261    (*out)[i].bit_cost_ = PopulationCost(in[i]);
262    (*histogram_symbols)[i] = i;
263  }
264
265  // Collapse similar histograms within a block type.
266  if (num_contexts > 1) {
267    for (int i = 0; i < num_blocks; ++i) {
268      HistogramCombine(&(*out)[0], &cluster_size[0],
269                       &(*histogram_symbols)[i * num_contexts], num_contexts,
270                       max_histograms);
271    }
272  }
273
274  // Collapse similar histograms.
275  HistogramCombine(&(*out)[0], &cluster_size[0],
276                   &(*histogram_symbols)[0], in_size,
277                   max_histograms);
278
279  // Find the optimal map from original histograms to the final ones.
280  HistogramRemap(&in[0], in_size, &(*out)[0], &(*histogram_symbols)[0]);
281
282  // Convert the context map to a canonical form.
283  HistogramReindex(out, histogram_symbols);
284}
285
286}  // namespace brotli
287
288#endif  // BROTLI_ENC_CLUSTER_H_
289