172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen// Copyright (c) 2011 The Chromium Authors. All rights reserved.
272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen// Use of this source code is governed by a BSD-style license that can be
372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen// found in the LICENSE file.
472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen#include "chrome/browser/safe_browsing/prefix_set.h"
672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen#include <algorithm>
872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen#include <math.h>
972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
10ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "base/file_util.h"
1172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen#include "base/logging.h"
12ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "base/md5.h"
1372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen#include "base/metrics/histogram.h"
1472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
1572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsennamespace {
1672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
17ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// |kMagic| should be reasonably unique, and not match itself across
18ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// endianness changes.  I generated this value with:
19ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
20ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsenstatic uint32 kMagic = 0x864088dd;
21ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
22ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// Current version the code writes out.
23ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsenstatic uint32 kVersion = 0x1;
24ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
25ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsentypedef struct {
26ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  uint32 magic;
27ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  uint32 version;
28ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  uint32 index_size;
29ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  uint32 deltas_size;
30ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen} FileHeader;
31ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
3272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen// For |std::upper_bound()| to find a prefix w/in a vector of pairs.
3372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenbool PrefixLess(const std::pair<SBPrefix,size_t>& a,
3472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen                const std::pair<SBPrefix,size_t>& b) {
3572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  return a.first < b.first;
3672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen}
3772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
3872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen}  // namespace
3972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
4072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsennamespace safe_browsing {
4172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
42dc0f95d653279beabeb9817299e2902918ba123eKristian MonsenPrefixSet::PrefixSet(const std::vector<SBPrefix>& sorted_prefixes) {
43dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen  if (sorted_prefixes.size()) {
4472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    // Lead with the first prefix.
45dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen    SBPrefix prev_prefix = sorted_prefixes[0];
4672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    size_t run_length = 0;
4772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    index_.push_back(std::make_pair(prev_prefix, deltas_.size()));
4872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
49ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    // Used to build a checksum from the data used to construct the
50ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    // structures.  Since the data is a bunch of uniform hashes, it
51ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    // seems reasonable to just xor most of it in, rather than trying
52ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    // to use a more complicated algorithm.
53ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    uint32 checksum = static_cast<uint32>(sorted_prefixes[0]);
54ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    checksum ^= static_cast<uint32>(deltas_.size());
55ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
56dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen    for (size_t i = 1; i < sorted_prefixes.size(); ++i) {
57dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen      // Skip duplicates.
58dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen      if (sorted_prefixes[i] == prev_prefix)
5972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen        continue;
6072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
6172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen      // Calculate the delta.  |unsigned| is mandatory, because the
62dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen      // sorted_prefixes could be more than INT_MAX apart.
63dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen      DCHECK_GT(sorted_prefixes[i], prev_prefix);
64ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen      const unsigned delta = sorted_prefixes[i] - prev_prefix;
65ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen      const uint16 delta16 = static_cast<uint16>(delta);
6672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
67ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen      // New index ref if the delta doesn't fit, or if too many
68ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen      // consecutive deltas have been encoded.
69ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen      if (delta != static_cast<unsigned>(delta16) || run_length >= kMaxRun) {
70ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen        checksum ^= static_cast<uint32>(sorted_prefixes[i]);
71ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen        checksum ^= static_cast<uint32>(deltas_.size());
72dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen        index_.push_back(std::make_pair(sorted_prefixes[i], deltas_.size()));
7372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen        run_length = 0;
7472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen      } else {
75ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen        checksum ^= static_cast<uint32>(delta16);
7672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen        // Continue the run of deltas.
77ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen        deltas_.push_back(delta16);
78ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen        DCHECK_EQ(static_cast<unsigned>(deltas_.back()), delta);
7972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen        ++run_length;
8072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen      }
8172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
82dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen      prev_prefix = sorted_prefixes[i];
8372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    }
84ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    checksum_ = checksum;
85ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    DCHECK(CheckChecksum());
8672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
8772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    // Send up some memory-usage stats.  Bits because fractional bytes
8872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    // are weird.
8972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    const size_t bits_used = index_.size() * sizeof(index_[0]) * CHAR_BIT +
9072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen        deltas_.size() * sizeof(deltas_[0]) * CHAR_BIT;
9172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    const size_t unique_prefixes = index_.size() + deltas_.size();
9272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    static const size_t kMaxBitsPerPrefix = sizeof(SBPrefix) * CHAR_BIT;
9372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    UMA_HISTOGRAM_ENUMERATION("SB2.PrefixSetBitsPerPrefix",
9472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen                              bits_used / unique_prefixes,
9572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen                              kMaxBitsPerPrefix);
9672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  }
9772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen}
9872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
99ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian MonsenPrefixSet::PrefixSet(std::vector<std::pair<SBPrefix,size_t> > *index,
100ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen                     std::vector<uint16> *deltas) {
101ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  DCHECK(index && deltas);
102ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  index_.swap(*index);
103ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  deltas_.swap(*deltas);
104ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen}
105ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
10672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian MonsenPrefixSet::~PrefixSet() {}
10772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
10872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenbool PrefixSet::Exists(SBPrefix prefix) const {
10972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  if (index_.empty())
11072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    return false;
11172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
11272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  // Find the first position after |prefix| in |index_|.
11372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  std::vector<std::pair<SBPrefix,size_t> >::const_iterator
11472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen      iter = std::upper_bound(index_.begin(), index_.end(),
11572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen                              std::pair<SBPrefix,size_t>(prefix, 0),
11672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen                              PrefixLess);
11772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
11872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  // |prefix| comes before anything that's in the set.
11972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  if (iter == index_.begin())
12072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    return false;
12172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
12272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  // Capture the upper bound of our target entry's deltas.
12372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);
12472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
12572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  // Back up to the entry our target is in.
12672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  --iter;
12772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
12872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  // All prefixes in |index_| are in the set.
12972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  SBPrefix current = iter->first;
13072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  if (current == prefix)
13172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    return true;
13272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
13372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  // Scan forward accumulating deltas while a match is possible.
13472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  for (size_t di = iter->second; di < bound && current < prefix; ++di) {
13572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    current += deltas_[di];
13672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  }
13772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
13872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  return current == prefix;
13972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen}
14072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
141ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsenvoid PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
142dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen  for (size_t ii = 0; ii < index_.size(); ++ii) {
143dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen    // The deltas for this |index_| entry run to the next index entry,
144dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen    // or the end of the deltas.
145dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen    const size_t deltas_end =
146dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen        (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();
147dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen
148dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen    SBPrefix current = index_[ii].first;
149dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen    prefixes->push_back(current);
150dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen    for (size_t di = index_[ii].second; di < deltas_end; ++di) {
151dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen      current += deltas_[di];
152dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen      prefixes->push_back(current);
153dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen    }
154dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen  }
155dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen}
156dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen
157ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// static
158ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian MonsenPrefixSet* PrefixSet::LoadFile(const FilePath& filter_name) {
159ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  int64 size_64;
160ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (!file_util::GetFileSize(filter_name, &size_64))
161ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return NULL;
162ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))
163ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return NULL;
164ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
165ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  file_util::ScopedFILE file(file_util::OpenFile(filter_name, "rb"));
166ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (!file.get())
167ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return NULL;
168ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
169ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  FileHeader header;
170ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  size_t read = fread(&header, sizeof(header), 1, file.get());
171ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (read != 1)
172ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return NULL;
173ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
174ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (header.magic != kMagic || header.version != kVersion)
175ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return NULL;
176ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
177ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  std::vector<std::pair<SBPrefix,size_t> > index;
178ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  const size_t index_bytes = sizeof(index[0]) * header.index_size;
179ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
180ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  std::vector<uint16> deltas;
181ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;
182ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
183ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // Check for bogus sizes before allocating any space.
184ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  const size_t expected_bytes =
185ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen      sizeof(header) + index_bytes + deltas_bytes + sizeof(MD5Digest);
186ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (static_cast<int64>(expected_bytes) != size_64)
187ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return NULL;
188ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
189ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // The file looks valid, start building the digest.
190ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Context context;
191ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Init(&context);
192ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Update(&context, &header, sizeof(header));
193ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
194ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // Read the index vector.  Herb Sutter indicates that vectors are
195ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // guaranteed to be contiuguous, so reading to where element 0 lives
196ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // is valid.
197ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  index.resize(header.index_size);
198ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
199ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (read != index.size())
200ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return NULL;
201ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Update(&context, &(index[0]), index_bytes);
202ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
203ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // Read vector of deltas.
204ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  deltas.resize(header.deltas_size);
205ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
206ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (read != deltas.size())
207ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return NULL;
208ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Update(&context, &(deltas[0]), deltas_bytes);
209ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
210ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Digest calculated_digest;
211ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Final(&calculated_digest, &context);
212ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
213ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Digest file_digest;
214ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  read = fread(&file_digest, sizeof(file_digest), 1, file.get());
215ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (read != 1)
216ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return NULL;
217ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
218ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
219ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return NULL;
220ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
221ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // Steals contents of |index| and |deltas| via swap().
222ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  return new PrefixSet(&index, &deltas);
223ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen}
224ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
225ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsenbool PrefixSet::WriteFile(const FilePath& filter_name) const {
226ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  FileHeader header;
227ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  header.magic = kMagic;
228ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  header.version = kVersion;
229ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  header.index_size = static_cast<uint32>(index_.size());
230ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  header.deltas_size = static_cast<uint32>(deltas_.size());
231ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
232ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // Sanity check that the 32-bit values never mess things up.
233ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (static_cast<size_t>(header.index_size) != index_.size() ||
234ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen      static_cast<size_t>(header.deltas_size) != deltas_.size()) {
235ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    NOTREACHED();
236ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return false;
237ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  }
238ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
239ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  file_util::ScopedFILE file(file_util::OpenFile(filter_name, "wb"));
240ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (!file.get())
241ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return false;
242ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
243ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Context context;
244ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Init(&context);
245ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
246ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // TODO(shess): The I/O code in safe_browsing_store_file.cc would
247ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // sure be useful about now.
248ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  size_t written = fwrite(&header, sizeof(header), 1, file.get());
249ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (written != 1)
250ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return false;
251ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Update(&context, &header, sizeof(header));
252ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
253ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // As for reads, the standard guarantees the ability to access the
254ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // contents of the vector by a pointer to an element.
255ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  const size_t index_bytes = sizeof(index_[0]) * index_.size();
256ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(), file.get());
257ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (written != index_.size())
258ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return false;
259ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Update(&context, &(index_[0]), index_bytes);
260ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
261ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
262ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
263ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen                   file.get());
264ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (written != deltas_.size())
265ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return false;
266ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Update(&context, &(deltas_[0]), deltas_bytes);
267ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
268ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Digest digest;
269ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  MD5Final(&digest, &context);
270ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  written = fwrite(&digest, sizeof(digest), 1, file.get());
271ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  if (written != 1)
272ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    return false;
273ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
274ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // TODO(shess): Can this code check that the close was successful?
275ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  file.reset();
276ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
277ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  return true;
278ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen}
279ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
280ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsensize_t PrefixSet::IndexBinFor(size_t target_index) const {
281ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // The items in |index_| have the logical index of each previous
282ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // item in |index_| plus the count of deltas between the items.
283ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // Since the indices into |deltas_| are absolute, the logical index
284ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // is then the sum of the two indices.
285ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  size_t lo = 0;
286ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  size_t hi = index_.size();
287ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
288ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // Binary search because linear search was too slow (really, the
289ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // unit test sucked).  Inline because the elements can't be compared
290ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // in isolation (their position matters).
291ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  while (hi - lo > 1) {
292ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    const size_t i = (lo + hi) / 2;
293ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
294ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    if (target_index < i + index_[i].second) {
295ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen      DCHECK_LT(i, hi);  // Always making progress.
296ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen      hi = i;
297ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    } else {
298ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen      DCHECK_GT(i, lo);  // Always making progress.
299ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen      lo = i;
300ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    }
301ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  }
302ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  return lo;
303ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen}
304ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
305ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsensize_t PrefixSet::GetSize() const {
306ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  return index_.size() + deltas_.size();
307ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen}
308ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
309ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsenbool PrefixSet::IsDeltaAt(size_t target_index) const {
310ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  CHECK_LT(target_index, GetSize());
311ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
312ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  const size_t i = IndexBinFor(target_index);
313ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  return target_index > i + index_[i].second;
314ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen}
315ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
316ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsenuint16 PrefixSet::DeltaAt(size_t target_index) const {
317ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  CHECK_LT(target_index, GetSize());
318ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
319ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // Find the |index_| entry which contains |target_index|.
320ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  const size_t i = IndexBinFor(target_index);
321ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
322ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // Exactly on the |index_| entry means no delta.
323ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  CHECK_GT(target_index, i + index_[i].second);
324ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
325ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // -i backs out the |index_| entries, -1 gets the delta that lead to
326ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  // the value at |target_index|.
327ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  CHECK_LT(target_index - i - 1, deltas_.size());
328ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  return deltas_[target_index - i - 1];
329ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen}
330ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
331ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsenbool PrefixSet::CheckChecksum() const {
332ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  uint32 checksum = 0;
333ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
334ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  for (size_t ii = 0; ii < index_.size(); ++ii) {
335ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    checksum ^= static_cast<uint32>(index_[ii].first);
336ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    checksum ^= static_cast<uint32>(index_[ii].second);
337ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  }
338ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
339ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  for (size_t di = 0; di < deltas_.size(); ++di) {
340ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    checksum ^= static_cast<uint32>(deltas_[di]);
341ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  }
342ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
343ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen  return checksum == checksum_;
344ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen}
345ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
34672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen}  // namespace safe_browsing
347