1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/safe_browsing/prefix_set.h"
6
7#include <algorithm>
8#include <math.h>
9
10#include "base/file_util.h"
11#include "base/logging.h"
12#include "base/md5.h"
13#include "base/metrics/histogram.h"
14
15namespace {
16
17// |kMagic| should be reasonably unique, and not match itself across
18// endianness changes.  I generated this value with:
19// md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
20static uint32 kMagic = 0x864088dd;
21
22// Current version the code writes out.
23static uint32 kVersion = 0x1;
24
25typedef struct {
26  uint32 magic;
27  uint32 version;
28  uint32 index_size;
29  uint32 deltas_size;
30} FileHeader;
31
32// For |std::upper_bound()| to find a prefix w/in a vector of pairs.
33bool PrefixLess(const std::pair<SBPrefix,size_t>& a,
34                const std::pair<SBPrefix,size_t>& b) {
35  return a.first < b.first;
36}
37
38}  // namespace
39
40namespace safe_browsing {
41
42PrefixSet::PrefixSet(const std::vector<SBPrefix>& sorted_prefixes) {
43  if (sorted_prefixes.size()) {
44    // Estimate the resulting vector sizes.  There will be strictly
45    // more than |min_runs| entries in |index_|, but there generally
46    // aren't many forced breaks.
47    const size_t min_runs = sorted_prefixes.size() / kMaxRun;
48    index_.reserve(min_runs);
49    deltas_.reserve(sorted_prefixes.size() - min_runs);
50
51    // Lead with the first prefix.
52    SBPrefix prev_prefix = sorted_prefixes[0];
53    size_t run_length = 0;
54    index_.push_back(std::make_pair(prev_prefix, deltas_.size()));
55
56    for (size_t i = 1; i < sorted_prefixes.size(); ++i) {
57      // Skip duplicates.
58      if (sorted_prefixes[i] == prev_prefix)
59        continue;
60
61      // Calculate the delta.  |unsigned| is mandatory, because the
62      // sorted_prefixes could be more than INT_MAX apart.
63      DCHECK_GT(sorted_prefixes[i], prev_prefix);
64      const unsigned delta = sorted_prefixes[i] - prev_prefix;
65      const uint16 delta16 = static_cast<uint16>(delta);
66
67      // New index ref if the delta doesn't fit, or if too many
68      // consecutive deltas have been encoded.
69      if (delta != static_cast<unsigned>(delta16) || run_length >= kMaxRun) {
70        index_.push_back(std::make_pair(sorted_prefixes[i], deltas_.size()));
71        run_length = 0;
72      } else {
73        // Continue the run of deltas.
74        deltas_.push_back(delta16);
75        DCHECK_EQ(static_cast<unsigned>(deltas_.back()), delta);
76        ++run_length;
77      }
78
79      prev_prefix = sorted_prefixes[i];
80    }
81
82    // Send up some memory-usage stats.  Bits because fractional bytes
83    // are weird.
84    const size_t bits_used = index_.size() * sizeof(index_[0]) * CHAR_BIT +
85        deltas_.size() * sizeof(deltas_[0]) * CHAR_BIT;
86    const size_t unique_prefixes = index_.size() + deltas_.size();
87    static const size_t kMaxBitsPerPrefix = sizeof(SBPrefix) * CHAR_BIT;
88    UMA_HISTOGRAM_ENUMERATION("SB2.PrefixSetBitsPerPrefix",
89                              bits_used / unique_prefixes,
90                              kMaxBitsPerPrefix);
91  }
92}
93
94PrefixSet::PrefixSet(std::vector<std::pair<SBPrefix,size_t> > *index,
95                     std::vector<uint16> *deltas) {
96  DCHECK(index && deltas);
97  index_.swap(*index);
98  deltas_.swap(*deltas);
99}
100
101PrefixSet::~PrefixSet() {}
102
103bool PrefixSet::Exists(SBPrefix prefix) const {
104  if (index_.empty())
105    return false;
106
107  // Find the first position after |prefix| in |index_|.
108  std::vector<std::pair<SBPrefix,size_t> >::const_iterator
109      iter = std::upper_bound(index_.begin(), index_.end(),
110                              std::pair<SBPrefix,size_t>(prefix, 0),
111                              PrefixLess);
112
113  // |prefix| comes before anything that's in the set.
114  if (iter == index_.begin())
115    return false;
116
117  // Capture the upper bound of our target entry's deltas.
118  const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);
119
120  // Back up to the entry our target is in.
121  --iter;
122
123  // All prefixes in |index_| are in the set.
124  SBPrefix current = iter->first;
125  if (current == prefix)
126    return true;
127
128  // Scan forward accumulating deltas while a match is possible.
129  for (size_t di = iter->second; di < bound && current < prefix; ++di) {
130    current += deltas_[di];
131  }
132
133  return current == prefix;
134}
135
136void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
137  prefixes->reserve(index_.size() + deltas_.size());
138
139  for (size_t ii = 0; ii < index_.size(); ++ii) {
140    // The deltas for this |index_| entry run to the next index entry,
141    // or the end of the deltas.
142    const size_t deltas_end =
143        (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();
144
145    SBPrefix current = index_[ii].first;
146    prefixes->push_back(current);
147    for (size_t di = index_[ii].second; di < deltas_end; ++di) {
148      current += deltas_[di];
149      prefixes->push_back(current);
150    }
151  }
152}
153
154// static
155PrefixSet* PrefixSet::LoadFile(const base::FilePath& filter_name) {
156  int64 size_64;
157  if (!file_util::GetFileSize(filter_name, &size_64))
158    return NULL;
159  using base::MD5Digest;
160  if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))
161    return NULL;
162
163  file_util::ScopedFILE file(file_util::OpenFile(filter_name, "rb"));
164  if (!file.get())
165    return NULL;
166
167  FileHeader header;
168  size_t read = fread(&header, sizeof(header), 1, file.get());
169  if (read != 1)
170    return NULL;
171
172  if (header.magic != kMagic || header.version != kVersion)
173    return NULL;
174
175  std::vector<std::pair<SBPrefix,size_t> > index;
176  const size_t index_bytes = sizeof(index[0]) * header.index_size;
177
178  std::vector<uint16> deltas;
179  const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;
180
181  // Check for bogus sizes before allocating any space.
182  const size_t expected_bytes =
183      sizeof(header) + index_bytes + deltas_bytes + sizeof(MD5Digest);
184  if (static_cast<int64>(expected_bytes) != size_64)
185    return NULL;
186
187  // The file looks valid, start building the digest.
188  base::MD5Context context;
189  base::MD5Init(&context);
190  base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
191                                              sizeof(header)));
192
193  // Read the index vector.  Herb Sutter indicates that vectors are
194  // guaranteed to be contiuguous, so reading to where element 0 lives
195  // is valid.
196  if (header.index_size) {
197    index.resize(header.index_size);
198    read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
199    if (read != index.size())
200      return NULL;
201    base::MD5Update(&context,
202                    base::StringPiece(reinterpret_cast<char*>(&(index[0])),
203                                      index_bytes));
204  }
205
206  // Read vector of deltas.
207  if (header.deltas_size) {
208    deltas.resize(header.deltas_size);
209    read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
210    if (read != deltas.size())
211      return NULL;
212    base::MD5Update(&context,
213                    base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),
214                                      deltas_bytes));
215  }
216
217  base::MD5Digest calculated_digest;
218  base::MD5Final(&calculated_digest, &context);
219
220  base::MD5Digest file_digest;
221  read = fread(&file_digest, sizeof(file_digest), 1, file.get());
222  if (read != 1)
223    return NULL;
224
225  if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
226    return NULL;
227
228  // Steals contents of |index| and |deltas| via swap().
229  return new PrefixSet(&index, &deltas);
230}
231
232bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {
233  FileHeader header;
234  header.magic = kMagic;
235  header.version = kVersion;
236  header.index_size = static_cast<uint32>(index_.size());
237  header.deltas_size = static_cast<uint32>(deltas_.size());
238
239  // Sanity check that the 32-bit values never mess things up.
240  if (static_cast<size_t>(header.index_size) != index_.size() ||
241      static_cast<size_t>(header.deltas_size) != deltas_.size()) {
242    NOTREACHED();
243    return false;
244  }
245
246  file_util::ScopedFILE file(file_util::OpenFile(filter_name, "wb"));
247  if (!file.get())
248    return false;
249
250  base::MD5Context context;
251  base::MD5Init(&context);
252
253  // TODO(shess): The I/O code in safe_browsing_store_file.cc would
254  // sure be useful about now.
255  size_t written = fwrite(&header, sizeof(header), 1, file.get());
256  if (written != 1)
257    return false;
258  base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
259                                              sizeof(header)));
260
261  // As for reads, the standard guarantees the ability to access the
262  // contents of the vector by a pointer to an element.
263  if (index_.size()) {
264    const size_t index_bytes = sizeof(index_[0]) * index_.size();
265    written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(),
266                     file.get());
267    if (written != index_.size())
268      return false;
269    base::MD5Update(&context,
270                    base::StringPiece(
271                        reinterpret_cast<const char*>(&(index_[0])),
272                        index_bytes));
273  }
274
275  if (deltas_.size()) {
276    const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
277    written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
278                     file.get());
279    if (written != deltas_.size())
280      return false;
281    base::MD5Update(&context,
282                    base::StringPiece(
283                        reinterpret_cast<const char*>(&(deltas_[0])),
284                        deltas_bytes));
285  }
286
287  base::MD5Digest digest;
288  base::MD5Final(&digest, &context);
289  written = fwrite(&digest, sizeof(digest), 1, file.get());
290  if (written != 1)
291    return false;
292
293  // TODO(shess): Can this code check that the close was successful?
294  file.reset();
295
296  return true;
297}
298
299}  // namespace safe_browsing
300