1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/safe_browsing/prefix_set.h"
6
7#include <algorithm>
8
9#include "base/files/file_util.h"
10#include "base/files/scoped_file.h"
11#include "base/logging.h"
12#include "base/md5.h"
13#include "base/metrics/histogram.h"
14#include "base/metrics/sparse_histogram.h"
15
16namespace {
17
18// |kMagic| should be reasonably unique, and not match itself across
19// endianness changes.  I generated this value with:
20// md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
21static uint32 kMagic = 0x864088dd;
22
23// Version history:
24// Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10
25// Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27
26// Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05
27
28// Version 2 layout is identical to version 1.  The sort order of |index_|
29// changed from |int32| to |uint32| to match the change of |SBPrefix|.
30// Version 3 adds storage for full hashes.
31static uint32 kVersion = 3;
32static uint32 kDeprecatedVersion = 2;  // And lower.
33
34typedef struct {
35  uint32 magic;
36  uint32 version;
37  uint32 index_size;
38  uint32 deltas_size;
39  uint32 full_hashes_size;
40} FileHeader;
41
42// Common std::vector<> implementations add capacity by multiplying from the
43// current size (usually either by 2 or 1.5) to satisfy push_back() running in
44// amortized constant time.  This is not necessary for insert() at end(), but
45// AFAICT it seems true for some implementations.  SBPrefix values should
46// uniformly cover the 32-bit space, so the final size can be estimated given a
47// subset of the input.
48//
49// |kEstimateThreshold| is when estimates start converging.  Results are strong
50// starting around 1<<27.  1<<30 is chosen to prevent the degenerate case of
51// resizing capacity from >50% to 100%.
52//
53// TODO(shess): I'm sure there is math in the world to describe good settings
54// for estimating the size of a uniformly-distributed set of integers from a
55// sorted subset.  I do not have such math in me, so I assumed that my current
56// organic database of prefixes was scale-free, and wrote a script to see how
57// often given slop values would always suffice for given strides.  At 1<<30,
58// .5% slop was sufficient to cover all cases (though the code below uses 1%).
59//
60// TODO(shess): A smaller threshold uses less transient space in reallocation.
61// 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc.  The cost
62// is that a smaller threshold needs more slop (locked down for the long term).
63// 1<<29 worked well with 1%, 1<<27 worked well with 2%.
64const SBPrefix kEstimateThreshold = 1 << 30;
65size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) {
66  // estimated_count / current_count == estimated_max / current_prefix
67  // For large input sets, estimated_max of 2^32 is close enough.
68  const size_t estimated_prefix_count = static_cast<size_t>(
69      (static_cast<uint64>(current_count) << 32) / current_prefix);
70
71  // The estimate has an error bar, if the final total is below the estimate, no
72  // harm, but if it is above a capacity resize will happen at nearly 100%.  Add
73  // some slop to make sure all cases are covered.
74  return estimated_prefix_count + estimated_prefix_count / 100;
75}
76
77}  // namespace
78
79namespace safe_browsing {
80
81// For |std::upper_bound()| to find a prefix w/in a vector of pairs.
82// static
83bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {
84  return a.first < b.first;
85}
86
87PrefixSet::PrefixSet() {
88}
89
90PrefixSet::PrefixSet(IndexVector* index,
91                     std::vector<uint16>* deltas,
92                     std::vector<SBFullHash>* full_hashes) {
93  DCHECK(index && deltas && full_hashes);
94  index_.swap(*index);
95  deltas_.swap(*deltas);
96  full_hashes_.swap(*full_hashes);
97}
98
99PrefixSet::~PrefixSet() {}
100
101bool PrefixSet::PrefixExists(SBPrefix prefix) const {
102  if (index_.empty())
103    return false;
104
105  // Find the first position after |prefix| in |index_|.
106  IndexVector::const_iterator iter =
107      std::upper_bound(index_.begin(), index_.end(),
108                       IndexPair(prefix, 0), PrefixLess);
109
110  // |prefix| comes before anything that's in the set.
111  if (iter == index_.begin())
112    return false;
113
114  // Capture the upper bound of our target entry's deltas.
115  const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);
116
117  // Back up to the entry our target is in.
118  --iter;
119
120  // All prefixes in |index_| are in the set.
121  SBPrefix current = iter->first;
122  if (current == prefix)
123    return true;
124
125  // Scan forward accumulating deltas while a match is possible.
126  for (size_t di = iter->second; di < bound && current < prefix; ++di) {
127    current += deltas_[di];
128  }
129
130  return current == prefix;
131}
132
133bool PrefixSet::Exists(const SBFullHash& hash) const {
134  if (std::binary_search(full_hashes_.begin(), full_hashes_.end(),
135                         hash, SBFullHashLess)) {
136    return true;
137  }
138  return PrefixExists(hash.prefix);
139}
140
141void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
142  prefixes->reserve(index_.size() + deltas_.size());
143
144  for (size_t ii = 0; ii < index_.size(); ++ii) {
145    // The deltas for this |index_| entry run to the next index entry,
146    // or the end of the deltas.
147    const size_t deltas_end =
148        (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();
149
150    SBPrefix current = index_[ii].first;
151    prefixes->push_back(current);
152    for (size_t di = index_[ii].second; di < deltas_end; ++di) {
153      current += deltas_[di];
154      prefixes->push_back(current);
155    }
156  }
157}
158
159// static
160scoped_ptr<PrefixSet> PrefixSet::LoadFile(const base::FilePath& filter_name) {
161  int64 size_64;
162  if (!base::GetFileSize(filter_name, &size_64))
163    return scoped_ptr<PrefixSet>();
164  using base::MD5Digest;
165  if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))
166    return scoped_ptr<PrefixSet>();
167
168  base::ScopedFILE file(base::OpenFile(filter_name, "rb"));
169  if (!file.get())
170    return scoped_ptr<PrefixSet>();
171
172  FileHeader header;
173  size_t read = fread(&header, sizeof(header), 1, file.get());
174  if (read != 1)
175    return scoped_ptr<PrefixSet>();
176
177  // The file looks valid, start building the digest.
178  base::MD5Context context;
179  base::MD5Init(&context);
180  base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
181                                              sizeof(header)));
182
183  if (header.magic != kMagic)
184    return scoped_ptr<PrefixSet>();
185
186  // Track version read to inform removal of support for older versions.
187  UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version);
188
189  if (header.version <= kDeprecatedVersion) {
190    return scoped_ptr<PrefixSet>();
191  } else if (header.version != kVersion) {
192    return scoped_ptr<PrefixSet>();
193  }
194
195  IndexVector index;
196  const size_t index_bytes = sizeof(index[0]) * header.index_size;
197
198  std::vector<uint16> deltas;
199  const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;
200
201  std::vector<SBFullHash> full_hashes;
202  const size_t full_hashes_bytes =
203      sizeof(full_hashes[0]) * header.full_hashes_size;
204
205  // Check for bogus sizes before allocating any space.
206  const size_t expected_bytes = sizeof(header) +
207      index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);
208  if (static_cast<int64>(expected_bytes) != size_64)
209    return scoped_ptr<PrefixSet>();
210
211  // Read the index vector.  Herb Sutter indicates that vectors are
212  // guaranteed to be contiuguous, so reading to where element 0 lives
213  // is valid.
214  if (header.index_size) {
215    index.resize(header.index_size);
216    read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
217    if (read != index.size())
218      return scoped_ptr<PrefixSet>();
219    base::MD5Update(&context,
220                    base::StringPiece(reinterpret_cast<char*>(&(index[0])),
221                                      index_bytes));
222  }
223
224  // Read vector of deltas.
225  if (header.deltas_size) {
226    deltas.resize(header.deltas_size);
227    read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
228    if (read != deltas.size())
229      return scoped_ptr<PrefixSet>();
230    base::MD5Update(&context,
231                    base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),
232                                      deltas_bytes));
233  }
234
235  // Read vector of full hashes.
236  if (header.full_hashes_size) {
237    full_hashes.resize(header.full_hashes_size);
238    read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(),
239                 file.get());
240    if (read != full_hashes.size())
241      return scoped_ptr<PrefixSet>();
242    base::MD5Update(&context,
243                    base::StringPiece(
244                        reinterpret_cast<char*>(&(full_hashes[0])),
245                        full_hashes_bytes));
246  }
247
248  base::MD5Digest calculated_digest;
249  base::MD5Final(&calculated_digest, &context);
250
251  base::MD5Digest file_digest;
252  read = fread(&file_digest, sizeof(file_digest), 1, file.get());
253  if (read != 1)
254    return scoped_ptr<PrefixSet>();
255
256  if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
257    return scoped_ptr<PrefixSet>();
258
259  // Steals vector contents using swap().
260  return scoped_ptr<PrefixSet>(new PrefixSet(&index, &deltas, &full_hashes));
261}
262
263bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {
264  FileHeader header;
265  header.magic = kMagic;
266  header.version = kVersion;
267  header.index_size = static_cast<uint32>(index_.size());
268  header.deltas_size = static_cast<uint32>(deltas_.size());
269  header.full_hashes_size = static_cast<uint32>(full_hashes_.size());
270
271  // Sanity check that the 32-bit values never mess things up.
272  if (static_cast<size_t>(header.index_size) != index_.size() ||
273      static_cast<size_t>(header.deltas_size) != deltas_.size() ||
274      static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {
275    NOTREACHED();
276    return false;
277  }
278
279  base::ScopedFILE file(base::OpenFile(filter_name, "wb"));
280  if (!file.get())
281    return false;
282
283  base::MD5Context context;
284  base::MD5Init(&context);
285
286  // TODO(shess): The I/O code in safe_browsing_store_file.cc would
287  // sure be useful about now.
288  size_t written = fwrite(&header, sizeof(header), 1, file.get());
289  if (written != 1)
290    return false;
291  base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
292                                              sizeof(header)));
293
294  // As for reads, the standard guarantees the ability to access the
295  // contents of the vector by a pointer to an element.
296  if (index_.size()) {
297    const size_t index_bytes = sizeof(index_[0]) * index_.size();
298    written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(),
299                     file.get());
300    if (written != index_.size())
301      return false;
302    base::MD5Update(&context,
303                    base::StringPiece(
304                        reinterpret_cast<const char*>(&(index_[0])),
305                        index_bytes));
306  }
307
308  if (deltas_.size()) {
309    const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
310    written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
311                     file.get());
312    if (written != deltas_.size())
313      return false;
314    base::MD5Update(&context,
315                    base::StringPiece(
316                        reinterpret_cast<const char*>(&(deltas_[0])),
317                        deltas_bytes));
318  }
319
320  if (full_hashes_.size()) {
321    const size_t elt_size = sizeof(full_hashes_[0]);
322    const size_t elts = full_hashes_.size();
323    const size_t full_hashes_bytes = elt_size * elts;
324    written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get());
325    if (written != elts)
326      return false;
327    base::MD5Update(&context,
328                    base::StringPiece(
329                        reinterpret_cast<const char*>(&(full_hashes_[0])),
330                        full_hashes_bytes));
331  }
332
333  base::MD5Digest digest;
334  base::MD5Final(&digest, &context);
335  written = fwrite(&digest, sizeof(digest), 1, file.get());
336  if (written != 1)
337    return false;
338
339  // TODO(shess): Can this code check that the close was successful?
340  file.reset();
341
342  return true;
343}
344
345void PrefixSet::AddRun(SBPrefix index_prefix,
346                       const uint16* run_begin, const uint16* run_end) {
347  // Preempt organic capacity decisions for |delta_| once strong estimates can
348  // be made.
349  if (index_prefix > kEstimateThreshold &&
350      deltas_.capacity() < deltas_.size() + (run_end - run_begin)) {
351    deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size()));
352  }
353
354  index_.push_back(std::make_pair(index_prefix, deltas_.size()));
355  deltas_.insert(deltas_.end(), run_begin, run_end);
356}
357
358PrefixSetBuilder::PrefixSetBuilder()
359    : prefix_set_(new PrefixSet()) {
360}
361
362PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes)
363    : prefix_set_(new PrefixSet()) {
364  for (size_t i = 0; i < prefixes.size(); ++i) {
365    AddPrefix(prefixes[i]);
366  }
367}
368
369PrefixSetBuilder::~PrefixSetBuilder() {
370}
371
372scoped_ptr<PrefixSet> PrefixSetBuilder::GetPrefixSet(
373    const std::vector<SBFullHash>& hashes) {
374  DCHECK(prefix_set_.get());
375
376  // Flush runs until buffered data is gone.
377  while (!buffer_.empty()) {
378    EmitRun();
379  }
380
381  // Precisely size |index_| for read-only.  It's 50k-60k, so minor savings, but
382  // they're almost free.
383  PrefixSet::IndexVector(prefix_set_->index_).swap(prefix_set_->index_);
384
385  prefix_set_->full_hashes_ = hashes;
386  std::sort(prefix_set_->full_hashes_.begin(), prefix_set_->full_hashes_.end(),
387            SBFullHashLess);
388
389  return prefix_set_.Pass();
390}
391
392scoped_ptr<PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() {
393  return GetPrefixSet(std::vector<SBFullHash>()).Pass();
394}
395
396void PrefixSetBuilder::EmitRun() {
397  DCHECK(prefix_set_.get());
398
399  SBPrefix prev_prefix = buffer_[0];
400  uint16 run[PrefixSet::kMaxRun];
401  size_t run_pos = 0;
402
403  size_t i;
404  for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) {
405    // Calculate the delta.  |unsigned| is mandatory, because the
406    // sorted_prefixes could be more than INT_MAX apart.
407    DCHECK_GT(buffer_[i], prev_prefix);
408    const unsigned delta = buffer_[i] - prev_prefix;
409    const uint16 delta16 = static_cast<uint16>(delta);
410
411    // Break the run if the delta doesn't fit.
412    if (delta != static_cast<unsigned>(delta16))
413      break;
414
415    // Continue the run of deltas.
416    run[run_pos++] = delta16;
417    DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta);
418
419    prev_prefix = buffer_[i];
420  }
421  prefix_set_->AddRun(buffer_[0], run, run + run_pos);
422  buffer_.erase(buffer_.begin(), buffer_.begin() + i);
423}
424
425void PrefixSetBuilder::AddPrefix(SBPrefix prefix) {
426  DCHECK(prefix_set_.get());
427
428  if (buffer_.empty()) {
429    DCHECK(prefix_set_->index_.empty());
430    DCHECK(prefix_set_->deltas_.empty());
431  } else {
432    // Drop duplicates.
433    if (buffer_.back() == prefix)
434      return;
435
436    DCHECK_LT(buffer_.back(), prefix);
437  }
438  buffer_.push_back(prefix);
439
440  // Flush buffer when a run can be constructed.  +1 for the index item, and +1
441  // to leave at least one item in the buffer for dropping duplicates.
442  if (buffer_.size() > PrefixSet::kMaxRun + 2)
443    EmitRun();
444}
445
446}  // namespace safe_browsing
447