1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "encodings/compact_lang_det/tote.h"
6#include <string.h>     // memset
7
8#include "encodings/compact_lang_det/win/cld_logging.h"
9
10
11// Take a set of <key, value> pairs and tote them up.
12// After explicitly sorting, retrieve top key, value pairs
13Tote::Tote() {
14  gram_count_ = 0;
15  incr_count_ = 0;
16  byte_count_ = 0;
17  memset(key_, 0, sizeof(key_));
18  // No need to initialize values
19}
20
21Tote::~Tote() {
22}
23
24void Tote::Reinit() {
25  gram_count_ = 0;
26  incr_count_ = 0;
27  byte_count_ = 0;
28  memset(key_, 0, sizeof(key_));
29  // No need to initialize values
30}
31
32// Increment count of quadgrams/trigrams/unigrams scored
33void Tote::AddGram() {
34  ++gram_count_;
35}
36
37// Three-way associative, guaranteeing that the largest two counts are always
38// in the data structure. kMaxSize must be a multiple of 3, and is tied to the
39// subscript calculations here, which are for 8 sets of 3-way associative
40// buckets. The subscripts for set N are [N], [N+8], and [N+16] used in a
41// slightly-weird way: The initial probe point is [N] or [N+8], whichever
42// is specified by key mod 16. In most cases (nearly *all* cases except Latin
43// script), this entry matches and we update/return. The second probe is
44// the other of [N] and [N+8]. The third probe is only used as a fallback to
45// these two, and is there only for the rare case that there are three or more
46// languages with Language enum values equal mod 8, contending within the same
47// bucket. This can only happen in Latin and (rarely) Cyrillic scripts, because
48// the other scripts have fewer than 17 languages total.
49// If you change kMaxSize, change the constants 7/8/15/16 below
50void Tote::Add(uint8 ikey, int idelta) {
51  DCHECK(ikey != 0);
52  ++incr_count_;
53
54  // Look for existing entry
55  int sub0 = ikey & 15;
56  if (key_[sub0] == ikey) {
57    value_[sub0] += idelta;
58    return;
59  }
60  int sub1 = sub0 ^ 8;
61  if (key_[sub1] == ikey) {
62    value_[sub1] += idelta;
63    return;
64  }
65  int sub2 = (ikey & 7) + 16;
66  if (key_[sub2] == ikey) {
67    value_[sub2] += idelta;
68    return;
69  }
70
71  // Allocate new entry
72  int alloc = -1;
73  if (key_[sub0] == 0) {
74    alloc = sub0;
75  } else if (key_[sub1] == 0) {
76    alloc = sub1;
77  } else if (key_[sub2] == 0) {
78    alloc = sub2;
79  } else {
80    // All choices allocated, need to replace smallest one
81    alloc = sub0;
82    if (value_[sub1] < value_[alloc]) {alloc = sub1;}
83    if (value_[sub2] < value_[alloc]) {alloc = sub2;}
84  }
85  key_[alloc] = ikey;
86  value_[alloc] = idelta;
87  return;
88}
89
90// Return current top key
91int Tote::CurrentTopKey() {
92  int top_key = 0;
93  int top_value = -1;
94  for (int sub = 0; sub < kMaxSize_; ++sub) {
95    if (key_[sub] == 0) {continue;}
96    if (top_value < value_[sub]) {
97      top_value = value_[sub];
98      top_key = key_[sub];
99    }
100  }
101  return top_key;
102}
103
104
105// Sort first n entries by decreasing order of value
106// If key==0 other fields are not valid, treat value as -1
107void Tote::Sort(int n) {
108  // This is n**2, but n is small
109  for (int sub = 0; sub < n; ++sub) {
110    if (key_[sub] == 0) {value_[sub] = -1;}
111
112    // Bubble sort key[sub] and entry[sub]
113    for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
114      if (key_[sub2] == 0) {value_[sub2] = -1;}
115      if (value_[sub] < value_[sub2]) {
116        // swap
117        uint8 tmpk = key_[sub];
118        key_[sub] = key_[sub2];
119        key_[sub2] = tmpk;
120        int tmpv = value_[sub];
121        value_[sub] = value_[sub2];
122        value_[sub2] = tmpv;
123      }
124    }
125  }
126}
127
128void Tote::Dump(FILE* f) {
129  for (int sub = 0; sub < kMaxSize_; ++sub) {
130    if (key_[sub] > 0) {
131      fprintf(f, "[%2d] %3d %8d\n", sub, key_[sub], value_[sub]);
132    }
133  }
134  fprintf(f, "%d %d %d\n", gram_count_, incr_count_, byte_count_);
135}
136
137
138
139
140// Take a set of <key, value> pairs and tote them up.
141// After explicitly sorting, retrieve top key, value pairs
142ToteWithReliability::ToteWithReliability() {
143  // No need to initialize score_ or value_
144  incr_count_ = 0;
145  sorted_ = 0;
146  memset(closepair_, 0, sizeof(closepair_));
147  memset(key_, 0, sizeof(key_));
148}
149
150ToteWithReliability::~ToteWithReliability() {
151}
152
153void ToteWithReliability::Reinit() {
154  // No need to initialize score_ or value_
155  incr_count_ = 0;
156  sorted_ = 0;
157  memset(closepair_, 0, sizeof(closepair_));
158  memset(key_, 0, sizeof(key_));
159  ////ss_.Init();
160}
161
162// Weight reliability by ibytes
163// Also see three-way associative comments above for Tote
164void ToteWithReliability::Add(uint8 ikey, int ibytes,
165                              int score, int ireliability) {
166  DCHECK(ikey != 0);
167  CHECK(sorted_ == 0);
168  ++incr_count_;
169
170  // Look for existing entry
171  int sub0 = ikey & 15;
172  if (key_[sub0] == ikey) {
173    value_[sub0] += ibytes;
174    score_[sub0] += score;
175    reliability_[sub0] += ireliability * ibytes;
176    return;
177  }
178  int sub1 = sub0 ^ 8;
179  if (key_[sub1] == ikey) {
180    value_[sub1] += ibytes;
181    score_[sub1] += score;
182    reliability_[sub1] += ireliability * ibytes;
183    return;
184  }
185  int sub2 = (ikey & 7) + 16;
186  if (key_[sub2] == ikey) {
187    value_[sub2] += ibytes;
188    score_[sub2] += score;
189    reliability_[sub2] += ireliability * ibytes;
190    return;
191  }
192
193  // Allocate new entry
194  int alloc = -1;
195  if (key_[sub0] == 0) {
196    alloc = sub0;
197  } else if (key_[sub1] == 0) {
198    alloc = sub1;
199  } else if (key_[sub2] == 0) {
200    alloc = sub2;
201  } else {
202    // All choices allocated, need to replace smallest one
203    alloc = sub0;
204    if (value_[sub1] < value_[alloc]) {alloc = sub1;}
205    if (value_[sub2] < value_[alloc]) {alloc = sub2;}
206  }
207  key_[alloc] = ikey;
208  value_[alloc] = ibytes;
209  score_[alloc] = score;
210  reliability_[alloc] = ireliability * ibytes;
211  return;
212}
213
214// Find subscript of a given packed language, or -1
215int ToteWithReliability::Find(uint8 ikey) {
216  DCHECK(ikey != 0);
217
218  if (sorted_) {
219    // Linear search if sorted
220    for (int sub = 0; sub < kMaxSize_; ++sub) {
221      if (key_[sub] == ikey) {return sub;}
222    }
223    return -1;
224  }
225
226  // Look for existing entry
227  int sub0 = ikey & 15;
228  if (key_[sub0] == ikey) {
229    return sub0;
230  }
231  int sub1 = sub0 ^ 8;
232  if (key_[sub1] == ikey) {
233    return sub1;
234  }
235  int sub2 = (ikey & 7) + 16;
236  if (key_[sub2] == ikey) {
237    return sub2;
238  }
239
240  return -1;
241}
242
243// Return current top key
244int ToteWithReliability::CurrentTopKey() {
245  int top_key = 0;
246  int top_value = -1;
247  for (int sub = 0; sub < kMaxSize_; ++sub) {
248    if (key_[sub] == 0) {continue;}
249    if (top_value < value_[sub]) {
250      top_value = value_[sub];
251      top_key = key_[sub];
252    }
253  }
254  return top_key;
255}
256
257
258// Sort first n entries by decreasing order of value
259// If key==0 other fields are not valid, treat value as -1
260void ToteWithReliability::Sort(int n) {
261  // This is n**2, but n is small
262  for (int sub = 0; sub < n; ++sub) {
263    if (key_[sub] == 0) {value_[sub] = -1;}
264
265    // Bubble sort key[sub] and entry[sub]
266    for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
267      if (key_[sub2] == 0) {value_[sub2] = -1;}
268      if (value_[sub] < value_[sub2]) {
269        // swap
270        uint8 tmpk = key_[sub];
271        key_[sub] = key_[sub2];
272        key_[sub2] = tmpk;
273
274        int tmpv = value_[sub];
275        value_[sub] = value_[sub2];
276        value_[sub2] = tmpv;
277
278        int tmps = score_[sub];
279        score_[sub] = score_[sub2];
280        score_[sub2] = tmps;
281
282        int tmpr = reliability_[sub];
283        reliability_[sub] = reliability_[sub2];
284        reliability_[sub2] = tmpr;
285      }
286    }
287  }
288  sorted_ = 1;
289}
290
291void ToteWithReliability::Dump(FILE* f) {
292  for (int sub = 0; sub < kMaxSize_; ++sub) {
293    if (key_[sub] > 0) {
294      fprintf(f, "[%2d] %3d %6d %5d %4d\n",
295              sub, key_[sub], value_[sub], score_[sub], reliability_[sub]);
296    }
297  }
298  fprintf(f, "  %d#\n", incr_count_);
299}
300