```/* <![CDATA[ */
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Remember a subset of a sequence of values, using a modest amount of memory
6
7/***
8  Design:
9  Accumulate in powers of three, using 3-way median to collapse entries.
10  At any given time, there is one most-dense (highest power of 3) range of
11  entries and a series of less-dense ranges that hold 0..2 entries each. There
12  is a bounded-size storage array of S cells for all the entries.
13
14  The overflow detect is set up so that a new higher power of 3, K+1, is
15  triggered precisely when range K has 3n entries and all ranges < K have
16  zero entries.
17
18  In general, think of the range sizes as a multi-digit base 3 number, except
19  the highest digit may exceed 2:
20
21  3**6  3**5  3**4  3**3  3**2  3**1  3**0  K=2
22    0     0     0     0   3n-1    2     2   unused:1
23
24  There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at
25  one more than that, and we add a new 3**0 entry and "carry" by performing
26  medians on any group of 3 elements:
27
28  3**6  3**5  3**4  3**3  3**2  3**1  3**0       K=2
29    0     0     0     0   3n-1    2     3        unused:0
30    0     0     0     0   3n-1    3     0 carry  unused:2
31    0     0     0     0    3n     0     0 carry  unused:4
32
33  To accumulate 2 entries at all levels < K and 3 just before the first carry at
34  level 0, we need 2*K + 1 unused cells after doing all carries, or five cells
35  in this case. Since we only have 4 cells in the example above, we need to
36  make room by starting a new power of three:
37
38  3**6  3**5  3**4  3**3  3**2  3**1  3**0
39    0     0     0     0    3n     0     0        K=2 unused:4
40    0     0     0     n     0     0     0        K=3 unused:2n+4
41
42  In the code below, we don't worry about overflow from the topmost place.
43
44
45***/
46
47#include "encodings/compact_lang_det/subsetsequence.h"
48#include <stdio.h>
49
50#include "encodings/compact_lang_det/win/cld_logging.h"
51
52void DumpInts(const char* label, const int* v, int n) {
53  printf("%s ", label);
54  for (int i = 0; i < n; ++i) {
55    printf("%d ", v[i]);
56  }
57  printf("\n");
58}
59
60void DumpUint8s(const char* label, const uint8* v, int n) {
61  printf("%s ", label);
62  for (int i = 0; i < n; ++i) {
63    printf("%d ", v[i]);
64  }
65  printf("\n");
66}
67
68// Return median of seq_[sub] .. seq_[sub+2], favoring middle element
69uint8 SubsetSequence::Median3(int sub) {
70  if (seq_[sub] == seq_[sub + 1]) {
71    return seq_[sub];
72  }
73  if (seq_[sub] == seq_[sub + 2]) {
74    return seq_[sub];
75  }
76  return seq_[sub + 1];
77}
78
79void SubsetSequence::Init() {
80  // printf("Init\n");
81
82  k_ = 0;
83  count_ = 0;
84  next_e_ = 0;
85  seq_ = 0;    // Default value if no calls to Add
86
87  // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
88  int reserve = (2 * k_ + 1);
89  level_limit_e_ = kMaxSeq_ - reserve;
90  level_limit_e_ = (level_limit_e_ / 3) * 3;  // Round down to multiple of 3
91  limit_e_ = level_limit_e_;
92}
93
94// Compress level k by 3x, creating level k+1
95void SubsetSequence::NewLevel() {
96  // printf("NewLevel 3 ** %d\n", k_ + 1);
97  //DumpUint8s("count[k]", count_, k_ + 1);
98  //DumpUint8s("seq[next]", seq_, next_e_);
99
100  // Incoming level must be an exact multiple of three in size
101  CHECK((count_[k_] % 3) == 0);
102  int k_size = count_[k_];
103  int new_size = k_size / 3;
104
105  // Compress down by 3x, via median
106  for (int j = 0; j < new_size; ++j) {
107    seq_[j] = Median3(j * 3);
108  }
109
110  // Update counts
111  count_[k_] = 0;
112  // Else Overflow -- just continue with 3x dense Level K
113  if (k_ < (kMaxLevel_ - 1)) {++k_;}
114  count_[k_] = new_size;
115
116  // Update limits
117  next_e_ = new_size;
118  limit_e_ = next_e_ + 3;
119
120  // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
121  int reserve = (2 * k_ + 1);
122  level_limit_e_ = kMaxSeq_ - reserve;
123  level_limit_e_ = (level_limit_e_ / 3) * 3;  // Round down to multiple of 3
124                                              //
125  //DumpUint8s("after: count[k]", count_, k_ + 1);
126  //DumpUint8s("after: seq[next]", seq_, next_e_);
127}
128
129void SubsetSequence::DoCarries() {
130  CHECK(count_[k_] > 3);    // We depend on count_[k_] being > 3 to stop while
131  // Make room by carrying
132
133  //DumpUint8s("DoCarries count[k]", count_, k_ + 1);
134  //DumpUint8s("DoCarries seq[next]", seq_, next_e_);
135
136  int i = 0;
137  while (count_[i] == 3) {
138    next_e_ -= 3;
139    seq_[next_e_] = Median3(next_e_);
140    ++next_e_;
141    count_[i] = 0;
142    ++count_[i + 1];
143    ++i;
144  }
145  limit_e_ = next_e_ + 3;
146
147  //DumpUint8s("after: DoCarries count[k]", count_, k_ + 1);
148  //DumpUint8s("after: DoCarries seq[next]", seq_, next_e_);
149
150  // If we just fully carried into level K,
151  // Make sure there is now enough room, else start level K + 1
152  if (i >= k_) {
153    CHECK(count_[k_] == next_e_);
154    if (next_e_ >= level_limit_e_) {
155      NewLevel();
156    }
157  }
158}
159
161  // Add an entry then carry as needed
162  seq_[next_e_] = e;
163  ++next_e_;
164  ++count_;
165
166  if (next_e_ >= limit_e_) {
167    DoCarries();
168  }
169}
170
171
172// Collapse tail end by simple median across disparate-weight values,
173// dropping or duplicating last value if need be.
174// This routine is idempotent.
175void SubsetSequence::Flush() {
176  // printf("Flush %d\n", count_[k_]);
177  int start_tail = count_[k_];
178  int size_tail = next_e_ - start_tail;
179  if ((size_tail % 3) == 2) {
180    seq_[next_e_] = seq_[next_e_ - 1];      // Duplicate last value
181    ++size_tail;
182  }
183
184  // Compress tail down by 3x, via median
185  int new_size = size_tail / 3;             // May delete last value
186  for (int j = 0; j < new_size; ++j) {
187    seq_[start_tail + j] = Median3(start_tail + j * 3);
188  }
189
190  next_e_ = start_tail + new_size;
191  count_[k_] = next_e_;
192}
193
194
195// Extract representative pattern of exactly N values into dst[0..n-1]
196// This routine may be called multiple times, but it may downsample as a
197// side effect, causing subsequent calls with larger N to get poor answers.
198void SubsetSequence::Extract(int to_n, uint8* dst) {
199  // Collapse partial-carries in tail
200  Flush();
201
202  // Just use Bresenham to resample
203  int from_n = next_e_;
204  if (to_n >= from_n) {
205    // Up-sample from_n => to_n
206    int err = to_n - 1;          // bias toward no overshoot
207    int j = 0;
208    for (int i = 0; i < to_n; ++i) {
209      dst[i] = seq_[j];
210      err -= from_n;
211      if (err < 0) {
212        ++j;
213        err += to_n;
214      }
215    }
216  } else {
217    // Get to the point that the number of samples is <= 3 * to_n
218    while (next_e_ > (to_n * 3)) {
219      // Compress down by 3x, via median
220      // printf("Extract, median %d / 3\n", next_e_);
221      if ((next_e_ % 3) == 2) {
222        seq_[next_e_] = seq_[next_e_ - 1];    // Duplicate last value
223        ++next_e_;
224      }
225      int new_size = next_e_ / 3;             // May delete last value
226      for (int j = 0; j < new_size; ++j) {
227        seq_[j] = Median3(j * 3);
228      }
229      next_e_ = new_size;
230      count_[k_] = next_e_;
231    }
232    from_n = next_e_;
233
234    if (to_n == from_n) {
235      // Copy verbatim
236      for (int i = 0; i < to_n; ++i) {
237        dst[i] = seq_[i];
238      }
239      return;
240    }
241
242    // Down-sample from_n => to_n, using medians
243    int err = 0;          // Bias to immediate median sample
244    int j = 0;
245    for (int i = 0; i < from_n; ++i) {
246      err -= to_n;
247      if (err < 0) {
248        if (i <= (next_e_ - 2)) {
249          dst[j] = Median3(i);
250        } else {
251          dst[j] = seq_[i];
252        }
253        ++j;
254        err += from_n;
255      }
256    }
257  }
258
259}
260```