// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/safe_browsing/safe_browsing_store_file.h"

#include "base/files/file_util.h"
#include "base/files/scoped_file.h"
#include "base/md5.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sparse_histogram.h"

namespace {

// NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
// that byte-order changes force corruption.
const int32 kFileMagic = 0x600D71FE;

// Version history:
// Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
// Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
// Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
const int32 kFileVersion = 8;

// ReadAndVerifyHeader() returns this in case of error.
const int32 kInvalidVersion = -1;

// Starting with version 8, the storage is sorted and can be sharded to allow
// updates to be done with lower memory requirements.  Newly written files will
// be sharded to need less than this amount of memory during update.  Larger
// values are preferred to minimize looping overhead during processing.
const int64 kUpdateStorageBytes = 100 * 1024;
// Prevent excessive sharding by setting a lower limit on the shard stride.
// Smaller values should work fine, but very small values will probably lead
// to poor performance.  Shard stride is indirectly related to
// |kUpdateStorageBytes|; setting that very small will bump against this.
const uint32 kMinShardStride = 1 << 24;

// Strides over the entire SBPrefix space.
const uint64 kMaxShardStride = 1ULL << 32;

// Maximum SBPrefix value.
const SBPrefix kMaxSBPrefix = 0xFFFFFFFF;

// Header at the front of the main database file.
struct FileHeader {
  int32 magic, version;
  uint32 add_chunk_count, sub_chunk_count;
  uint32 shard_stride;
  // TODO(shess): Is this where 64-bit will bite me?  Perhaps write a
  // specialized read/write?
};
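
// On-disk layout of a version-8 store, as written by WriteHeader() and
// DoUpdate() below (a sketch inferred from the code; the code is
// authoritative):
//   FileHeader
//   add chunk ids  (int32 * add_chunk_count)
//   sub chunk ids  (int32 * sub_chunk_count)
//   MD5 digest of all preceding bytes (header checksum)
//   repeated shards, each:
//     ShardHeader
//     add prefixes, sub prefixes, add full hashes, sub full hashes
//   MD5 digest of all preceding bytes (whole-file checksum)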

// Header for each chunk in the chunk-accumulation file.
struct ChunkHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Header for each shard of data in the main database file.
struct ShardHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Enumerate different format-change events for histogramming
// purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
enum FormatEventType {
  // Corruption detected, broken down by file format.
  FORMAT_EVENT_FILE_CORRUPT,
  FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete

  // The type of format found in the file.  The expected case (new
  // file format) is intentionally not covered.
  FORMAT_EVENT_FOUND_SQLITE,  // Obsolete
  FORMAT_EVENT_FOUND_UNKNOWN,  // magic does not match.

  // The number of SQLite-format files deleted should be the same as
  // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
  // or if a failure prevents the update from succeeding.
  FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
  FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete

  // Found and deleted (or failed to delete) the ancient "Safe
  // Browsing" file.
  FORMAT_EVENT_DELETED_ORIGINAL,  // Obsolete
  FORMAT_EVENT_DELETED_ORIGINAL_FAILED,  // Obsolete

  // The checksum did not check out in CheckValidity() or in
  // FinishUpdate().  This most likely indicates that the machine
  // crashed before the file was fully sync'ed to disk.
  FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
  FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,

  // The header checksum was incorrect in ReadAndVerifyHeader().  Likely
  // indicates that the system crashed while writing an update.
  FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,

  FORMAT_EVENT_FOUND_DEPRECATED,  // version too old.

  // Memory space for histograms is determined by the max.  ALWAYS
  // ADD NEW VALUES BEFORE THIS ONE.
  FORMAT_EVENT_MAX
};

void RecordFormatEvent(FormatEventType event_type) {
  UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
}

// Rewind the file.  Using fseek(2) because rewind(3) errors are
// weird.
bool FileRewind(FILE* fp) {
  int rv = fseek(fp, 0, SEEK_SET);
  DCHECK_EQ(rv, 0);
  return rv == 0;
}

// Read from |fp| into |item|, and fold the input data into the
// checksum in |context|, if non-NULL.  Return true on success.
template <class T>
bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fread(item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<char*>(item),
                                      sizeof(T)));
  }
  return true;
}

// Write |item| to |fp|, and fold the output data into the checksum in
// |context|, if non-NULL.  Return true on success.
template <class T>
bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fwrite(&item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<const char*>(&item),
                                      sizeof(T)));
  }

  return true;
}
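
// Example (illustrative only): write a value while folding it into a
// running checksum; ReadItem() mirrors this on the read side.
//   base::MD5Context context;
//   base::MD5Init(&context);
//   const uint32 value = 42;
//   if (!WriteItem(value, fp, &context))
//     return false;  // Short write.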

// Read |count| items into |values| from |fp|, and fold them into the
// checksum in |context|.  Returns true on success.
template <typename CT>
bool ReadToContainer(CT* values, size_t count, FILE* fp,
                     base::MD5Context* context) {
  if (!count)
    return true;

  for (size_t i = 0; i < count; ++i) {
    typename CT::value_type value;
    if (!ReadItem(&value, fp, context))
      return false;

    // push_back() is more obvious, but coded this way std::set can
    // also be read.
    values->insert(values->end(), value);
  }

  return true;
}
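
// Example (illustrative only): the insert-at-end form above lets the same
// helper fill either a std::vector or a std::set.
//   std::set<int32> add_chunks;
//   if (!ReadToContainer(&add_chunks, header.add_chunk_count, fp, &context))
//     return false;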

// Write values between |beg| and |end| to |fp|, and fold the data into the
// checksum in |context|, if non-NULL.  Returns true if all items were
// written successfully.
template <typename CTI>
bool WriteRange(const CTI& beg, const CTI& end,
                FILE* fp, base::MD5Context* context) {
  for (CTI iter = beg; iter != end; ++iter) {
    if (!WriteItem(*iter, fp, context))
      return false;
  }
  return true;
}

// Write all of |values| to |fp|, and fold the data into the checksum
// in |context|, if non-NULL.  Returns true if all items were written
// successfully.
template <typename CT>
bool WriteContainer(const CT& values, FILE* fp,
                    base::MD5Context* context) {
  return WriteRange(values.begin(), values.end(), fp, context);
}

// Delete the chunks in |deleted| from |chunks|.
void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
                         std::set<int32>* chunks) {
  for (std::set<int32>::iterator iter = chunks->begin();
       iter != chunks->end();) {
    std::set<int32>::iterator prev = iter++;
    if (deleted.count(*prev) > 0)
      chunks->erase(prev);
  }
}
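
// Example (illustrative only): if |chunks| holds {1, 2, 3} and |deleted|
// contains 2, DeleteChunksFromSet(deleted, &chunks) leaves {1, 3}.  The
// post-increment above keeps the iterator valid across std::set::erase().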

bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
  base::MD5Digest calculated_digest;
  base::MD5IntermediateFinal(&calculated_digest, context);

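  // Note that the digest read below is itself folded into |context|, so a
  // later whole-file checksum also covers the stored header checksum.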
  base::MD5Digest file_digest;
  if (!ReadItem(&file_digest, fp, context))
    return false;

  return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
}

// Helper function to read the file header and chunk TOC.  Rewinds |fp| and
// initializes |context|.  The header is left in |header|.  Returns the
// version read, or kInvalidVersion on a sanity-check or checksum failure.
int ReadAndVerifyHeader(const base::FilePath& filename,
                        FileHeader* header,
                        std::set<int32>* add_chunks,
                        std::set<int32>* sub_chunks,
                        FILE* fp,
                        base::MD5Context* context) {
  DCHECK(header);
  DCHECK(add_chunks);
  DCHECK(sub_chunks);
  DCHECK(fp);
  DCHECK(context);

  base::MD5Init(context);
  if (!FileRewind(fp))
    return kInvalidVersion;
  if (!ReadItem(header, fp, context))
    return kInvalidVersion;
  if (header->magic != kFileMagic)
    return kInvalidVersion;

  // Track version read to inform removal of support for older versions.
  UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->version);

  if (header->version != kFileVersion)
    return kInvalidVersion;

  if (!ReadToContainer(add_chunks, header->add_chunk_count, fp, context) ||
      !ReadToContainer(sub_chunks, header->sub_chunk_count, fp, context)) {
    return kInvalidVersion;
  }

  // Verify that the data read thus far is valid.
  if (!ReadAndVerifyChecksum(fp, context)) {
    RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
    return kInvalidVersion;
  }

  return kFileVersion;
}

// Helper function to write out the initial header and chunks-contained data.
// Rewinds |fp|, initializes |context|, then writes a file header and
// |add_chunks| and |sub_chunks|.
bool WriteHeader(uint32 out_stride,
                 const std::set<int32>& add_chunks,
                 const std::set<int32>& sub_chunks,
                 FILE* fp,
                 base::MD5Context* context) {
  if (!FileRewind(fp))
    return false;

  base::MD5Init(context);
  FileHeader header;
  header.magic = kFileMagic;
  header.version = kFileVersion;
  header.add_chunk_count = add_chunks.size();
  header.sub_chunk_count = sub_chunks.size();
  header.shard_stride = out_stride;
  if (!WriteItem(header, fp, context))
    return false;

  if (!WriteContainer(add_chunks, fp, context) ||
      !WriteContainer(sub_chunks, fp, context))
    return false;

  // Write out the header digest.
  base::MD5Digest header_digest;
  base::MD5IntermediateFinal(&header_digest, context);
  if (!WriteItem(header_digest, fp, context))
    return false;

  return true;
}

// Return |true| if the range is sorted by the given comparator.
template <typename CTI, typename LESS>
bool sorted(CTI beg, CTI end, LESS less) {
  while ((end - beg) > 1) {
    CTI n = beg++;
    DCHECK(!less(*beg, *n));
    if (less(*beg, *n))
      return false;
  }
  return true;
}

// Merge |beg|..|end| into |container|.  Both should be sorted by the given
// comparator, and the range iterators should not be derived from |container|.
// Differs from std::inplace_merge() in that additional memory is not required
// for linear performance.
template <typename CT, typename CTI, typename COMP>
void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
  DCHECK(sorted(container->begin(), container->end(), less));
  DCHECK(sorted(beg, end, less));

  // Size the container to fit the results.
  const size_t c_size = container->size();
  container->resize(c_size + (end - beg));

  // |c_end| points to the original endpoint, while |c_out| points to the
  // endpoint that will scan from end to beginning while merging.
  typename CT::iterator c_end = container->begin() + c_size;
  typename CT::iterator c_out = container->end();

  // While both inputs have data, move the greater to |c_out|.
  while (c_end != container->begin() && end != beg) {
    if (less(*(c_end - 1), *(end - 1))) {
      *(--c_out) = *(--end);
    } else {
      *(--c_out) = *(--c_end);
    }
  }

  // Copy any data remaining in the new range.
  if (end != beg) {
    // The original container data has been fully shifted.
    DCHECK(c_end == container->begin());

    // There is exactly the correct amount of space left.
    DCHECK_EQ(c_out - c_end, end - beg);

    std::copy(beg, end, container->begin());
  }

  DCHECK(sorted(container->begin(), container->end(), less));
}
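
// Example (illustrative only): merging two sorted vectors of int.
//   std::vector<int> dst;  // Holds {1, 3, 5}, sorted.
//   std::vector<int> src;  // Holds {2, 4}, sorted.
//   container_merge(&dst, src.begin(), src.end(), std::less<int>());
//   // dst is now {1, 2, 3, 4, 5}, merged in place from the back.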

// Collection of iterators used while stepping through StateInternal (see
// below).
class StateInternalPos {
 public:
  StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
                   SBSubPrefixes::iterator sub_prefixes_iter,
                   std::vector<SBAddFullHash>::iterator add_hashes_iter,
                   std::vector<SBSubFullHash>::iterator sub_hashes_iter)
      : add_prefixes_iter_(add_prefixes_iter),
        sub_prefixes_iter_(sub_prefixes_iter),
        add_hashes_iter_(add_hashes_iter),
        sub_hashes_iter_(sub_hashes_iter) {
  }

  SBAddPrefixes::iterator add_prefixes_iter_;
  SBSubPrefixes::iterator sub_prefixes_iter_;
  std::vector<SBAddFullHash>::iterator add_hashes_iter_;
  std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
};

// Helper to find the next shard boundary.
template <class T>
bool prefix_bounder(SBPrefix val, const T& elt) {
  return val < elt.GetAddPrefix();
}
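
// Example (illustrative only): find the first element past the shard whose
// maximum prefix is |shard_max| in a sorted SBAddPrefixes container.
//   SBAddPrefixes::iterator shard_end =
//       std::upper_bound(prefixes.begin(), prefixes.end(),
//                        shard_max, prefix_bounder<SBAddPrefix>);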

// Container for partial database state.  Includes add/sub prefixes/hashes, plus
// aggregate operations on same.
class StateInternal {
 public:
  // Append indicated amount of data from |fp|.
  bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
                  size_t add_hash_count, size_t sub_hash_count,
                  FILE* fp, base::MD5Context* context) {
    return
        ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
        ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
        ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
        ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
  }

  void ClearData() {
    add_prefixes_.clear();
    sub_prefixes_.clear();
    add_full_hashes_.clear();
    sub_full_hashes_.clear();
  }

  // Merge data from |beg|..|end| into receiver's state, then process the state.
  // The current state and the range given should correspond to the same sorted
  // shard of data from different sources.  |add_del_cache| and |sub_del_cache|
  // indicate the chunk ids which should be deleted during processing (see
  // SBProcessSubs).
  void MergeDataAndProcess(const StateInternalPos& beg,
                           const StateInternalPos& end,
                           const base::hash_set<int32>& add_del_cache,
                           const base::hash_set<int32>& sub_del_cache) {
    container_merge(&add_prefixes_,
                    beg.add_prefixes_iter_,
                    end.add_prefixes_iter_,
                    SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);

    container_merge(&sub_prefixes_,
                    beg.sub_prefixes_iter_,
                    end.sub_prefixes_iter_,
                    SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);

    container_merge(&add_full_hashes_,
                    beg.add_hashes_iter_,
                    end.add_hashes_iter_,
                    SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);

    container_merge(&sub_full_hashes_,
                    beg.sub_hashes_iter_,
                    end.sub_hashes_iter_,
                    SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);

    SBProcessSubs(&add_prefixes_, &sub_prefixes_,
                  &add_full_hashes_, &sub_full_hashes_,
                  add_del_cache, sub_del_cache);
  }

  // Sort the data appropriately for the sharding, merging, and processing
  // operations.
  void SortData() {
    std::sort(add_prefixes_.begin(), add_prefixes_.end(),
              SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
    std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
              SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
    std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
              SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
    std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
              SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
  }

  // Iterator from the beginning of the state's data.
  StateInternalPos StateBegin() {
    return StateInternalPos(add_prefixes_.begin(),
                            sub_prefixes_.begin(),
                            add_full_hashes_.begin(),
                            sub_full_hashes_.begin());
  }

  // An iterator pointing just after the last possible element of the shard
  // indicated by |shard_max|.  Used to step through the state by shard.
  // TODO(shess): Verify whether binary search really improves over linear.
  // Merging or writing will immediately touch all of these elements.
  StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
    return StateInternalPos(
        std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
                         shard_max, prefix_bounder<SBAddPrefix>),
        std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
                         shard_max, prefix_bounder<SBSubPrefix>),
        std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
                         shard_max, prefix_bounder<SBAddFullHash>),
        std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
                         shard_max, prefix_bounder<SBSubFullHash>));
  }

  // Write a shard header and data for the shard starting at |beg| and ending at
  // the element before |end|.
  bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
                  FILE* fp, base::MD5Context* context) {
    ShardHeader shard_header;
    shard_header.add_prefix_count =
        end.add_prefixes_iter_ - beg.add_prefixes_iter_;
    shard_header.sub_prefix_count =
        end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
    shard_header.add_hash_count =
        end.add_hashes_iter_ - beg.add_hashes_iter_;
    shard_header.sub_hash_count =
        end.sub_hashes_iter_ - beg.sub_hashes_iter_;

    return
        WriteItem(shard_header, fp, context) &&
        WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
                   fp, context);
  }

  SBAddPrefixes add_prefixes_;
  SBSubPrefixes sub_prefixes_;
  std::vector<SBAddFullHash> add_full_hashes_;
  std::vector<SBSubFullHash> sub_full_hashes_;
};

// True if |val| is an even power of two.
template <typename T>
bool IsPowerOfTwo(const T& val) {
  return val && (val & (val - 1)) == 0;
}
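
// Example: IsPowerOfTwo(1u << 24) is true; IsPowerOfTwo(0u) and
// IsPowerOfTwo(12u) are false.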

// Helper to read the entire database state, used by GetAddPrefixes() and
// GetAddFullHashes().  Those functions are generally used only for smaller
// files.  Returns false in case of errors reading the data.
bool ReadDbStateHelper(const base::FilePath& filename,
                       StateInternal* db_state) {
  base::ScopedFILE file(base::OpenFile(filename, "rb"));
  if (file.get() == NULL)
    return false;

  std::set<int32> add_chunks;
  std::set<int32> sub_chunks;

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
                          file.get(), &context);
  if (version == kInvalidVersion)
    return false;

  uint64 in_min = 0;
  uint64 in_stride = header.shard_stride;
  if (!in_stride)
    in_stride = kMaxShardStride;
  if (!IsPowerOfTwo(in_stride))
    return false;

  do {
    ShardHeader shard_header;
    if (!ReadItem(&shard_header, file.get(), &context))
      return false;

    if (!db_state->AppendData(shard_header.add_prefix_count,
                              shard_header.sub_prefix_count,
                              shard_header.add_hash_count,
                              shard_header.sub_hash_count,
                              file.get(), &context)) {
      return false;
    }

    in_min += in_stride;
  } while (in_min <= kMaxSBPrefix);

  if (!ReadAndVerifyChecksum(file.get(), &context))
    return false;

  int64 size = 0;
  if (!base::GetFileSize(filename, &size))
    return false;

  return static_cast<int64>(ftell(file.get())) == size;
}

}  // namespace

SafeBrowsingStoreFile::SafeBrowsingStoreFile()
    : chunks_written_(0), empty_(false), corruption_seen_(false) {}

SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
  Close();
}

bool SafeBrowsingStoreFile::Delete() {
  // The database should not be open at this point.  But, just in
  // case, close everything before deleting.
  if (!Close()) {
    NOTREACHED();
    return false;
  }

  return DeleteStore(filename_);
}

bool SafeBrowsingStoreFile::CheckValidity() {
  // The file was either empty or never opened.  The empty case is
  // presumed not to be invalid.  The never-opened case can happen if
  // BeginUpdate() fails for any databases, and should already have
  // caused the corruption callback to fire.
  if (!file_.get())
    return true;

  if (!FileRewind(file_.get()))
    return OnCorruptDatabase();

  int64 size = 0;
  if (!base::GetFileSize(filename_, &size))
    return OnCorruptDatabase();

  base::MD5Context context;
  base::MD5Init(&context);

  // Read everything except the final digest.
  size_t bytes_left = static_cast<size_t>(size);
  CHECK(size == static_cast<int64>(bytes_left));
  if (bytes_left < sizeof(base::MD5Digest))
    return OnCorruptDatabase();
  bytes_left -= sizeof(base::MD5Digest);

  // Fold the contents of the file into the checksum.
  while (bytes_left > 0) {
    char buf[4096];
    const size_t c = std::min(sizeof(buf), bytes_left);
    const size_t ret = fread(buf, 1, c, file_.get());

    // The file's size changed while reading; give up.
    if (ret != c)
      return OnCorruptDatabase();
    base::MD5Update(&context, base::StringPiece(buf, c));
    bytes_left -= c;
  }

  if (!ReadAndVerifyChecksum(file_.get(), &context)) {
    RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
    return OnCorruptDatabase();
  }

  return true;
}

void SafeBrowsingStoreFile::Init(const base::FilePath& filename,
                                 const base::Closure& corruption_callback) {
  filename_ = filename;
  corruption_callback_ = corruption_callback;
}

bool SafeBrowsingStoreFile::BeginChunk() {
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
  add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
  add_prefixes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_prefixes->swap(db_state.add_prefixes_);
  return true;
}

bool SafeBrowsingStoreFile::GetAddFullHashes(
    std::vector<SBAddFullHash>* add_full_hashes) {
  add_full_hashes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_full_hashes->swap(db_state.add_full_hashes_);
  return true;
}

bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
                                         const SBFullHash& full_hash) {
  add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
                                           int32 add_chunk_id,
                                           SBPrefix prefix) {
  sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
                                         const SBFullHash& full_hash) {
  sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::OnCorruptDatabase() {
  if (!corruption_seen_)
    RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
  corruption_seen_ = true;

  corruption_callback_.Run();

  // Return false as a convenience to callers.
  return false;
}

bool SafeBrowsingStoreFile::Close() {
  ClearUpdateBuffers();

  // Make sure the files are closed.
  file_.reset();
  new_file_.reset();
  return true;
}

bool SafeBrowsingStoreFile::BeginUpdate() {
  DCHECK(!file_.get() && !new_file_.get());

  // Structures should all be clear unless something bad happened.
  DCHECK(add_chunks_cache_.empty());
  DCHECK(sub_chunks_cache_.empty());
  DCHECK(add_del_cache_.empty());
  DCHECK(sub_del_cache_.empty());
  DCHECK(add_prefixes_.empty());
  DCHECK(sub_prefixes_.empty());
  DCHECK(add_hashes_.empty());
  DCHECK(sub_hashes_.empty());
  DCHECK_EQ(chunks_written_, 0);

  corruption_seen_ = false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
  if (new_file.get() == NULL)
    return false;

  base::ScopedFILE file(base::OpenFile(filename_, "rb"));
  empty_ = (file.get() == NULL);
  if (empty_) {
    // If the file exists but cannot be opened, report corruption rather
    // than deleting it directly, so the bloom filter is deleted with it.
    if (base::PathExists(filename_))
      return OnCorruptDatabase();

    new_file_.swap(new_file);
    return true;
  }

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename_, &header,
                          &add_chunks_cache_, &sub_chunks_cache_,
                          file.get(), &context);
  if (version == kInvalidVersion) {
    FileHeader retry_header;
    if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL)) {
      if (retry_header.magic == kFileMagic &&
          retry_header.version < kFileVersion) {
        RecordFormatEvent(FORMAT_EVENT_FOUND_DEPRECATED);
      } else {
        RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
      }
    }

    // Close the file so that it can be deleted.
    file.reset();

    return OnCorruptDatabase();
  }

  file_.swap(file);
  new_file_.swap(new_file);
  return true;
}

bool SafeBrowsingStoreFile::FinishChunk() {
  if (!add_prefixes_.size() && !sub_prefixes_.size() &&
      !add_hashes_.size() && !sub_hashes_.size())
    return true;

  ChunkHeader header;
  header.add_prefix_count = add_prefixes_.size();
  header.sub_prefix_count = sub_prefixes_.size();
  header.add_hash_count = add_hashes_.size();
  header.sub_hash_count = sub_hashes_.size();
  if (!WriteItem(header, new_file_.get(), NULL))
    return false;

  if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_hashes_, new_file_.get(), NULL))
    return false;

  ++chunks_written_;

  // Clear everything to save memory.
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::DoUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(file_.get() || empty_);
  DCHECK(new_file_.get());
  CHECK(builder);
  CHECK(add_full_hashes_result);

  // Rewind the temporary storage.
  if (!FileRewind(new_file_.get()))
    return false;

  // Get chunk file's size for validating counts.
  int64 update_size = 0;
  if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
    return OnCorruptDatabase();

  // Track update size to answer questions at http://crbug.com/72216.
  // Log small updates as 1k so that the 0 (underflow) bucket can be
  // used for "empty" in SafeBrowsingDatabase.
  UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
                       std::max(static_cast<int>(update_size / 1024), 1));

  // Chunk updates to integrate.
  StateInternal new_state;

  // Read update chunks.
  for (int i = 0; i < chunks_written_; ++i) {
    ChunkHeader header;

    int64 ofs = ftell(new_file_.get());
    if (ofs == -1)
      return false;

    if (!ReadItem(&header, new_file_.get(), NULL))
      return false;

    // As a safety measure, make sure that the header describes a sane
    // chunk, given the remaining file size.
    int64 expected_size = ofs + sizeof(ChunkHeader);
    expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
    expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
    expected_size += header.add_hash_count * sizeof(SBAddFullHash);
    expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
    if (expected_size > update_size)
      return false;

    if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
                              header.add_hash_count, header.sub_hash_count,
                              new_file_.get(), NULL)) {
      return false;
    }
  }

  // The state was accumulated by chunk; sort it by prefix.
  new_state.SortData();

  // These strides control how much data is loaded into memory per pass.
  // Strides must be an even power of two.  |in_stride| will be derived from the
  // input file.  |out_stride| will be derived from an estimate of the resulting
  // file's size.  |process_stride| will be the max of both.
  uint64 in_stride = kMaxShardStride;
  uint64 out_stride = kMaxShardStride;
  uint64 process_stride = 0;

  // Used to verify the input's checksum if |!empty_|.
  base::MD5Context in_context;

  if (!empty_) {
    DCHECK(file_.get());

    FileHeader header = {0};
    int version = ReadAndVerifyHeader(filename_, &header,
                                      &add_chunks_cache_, &sub_chunks_cache_,
                                      file_.get(), &in_context);
    if (version == kInvalidVersion)
      return OnCorruptDatabase();

    if (header.shard_stride)
      in_stride = header.shard_stride;

    // The header checksum should have prevented this case, but the code will be
    // broken if this is not correct.
    if (!IsPowerOfTwo(in_stride))
      return OnCorruptDatabase();
  }

  // We no longer need to track deleted chunks.
  DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
  DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);

  // Calculate |out_stride| to break the file down into reasonable shards.
  {
    int64 original_size = 0;
    if (!empty_ && !base::GetFileSize(filename_, &original_size))
      return OnCorruptDatabase();

    // Approximate the final size as everything.  Subs and deletes will reduce
    // the size, but modest over-sharding won't hurt much.
    int64 shard_size = original_size + update_size;

    // Keep splitting until a single stride of data fits the target.
    size_t shifts = 0;
    while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
      out_stride >>= 1;
      shard_size >>= 1;
      ++shifts;
    }
    UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);

    DCHECK(IsPowerOfTwo(out_stride));
  }
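
  // Example (illustrative only): with |original_size| + |update_size| around
  // 800KB, the loop above shifts three times (800KB -> 400KB -> 200KB ->
  // 100KB, which is not above kUpdateStorageBytes), so out_stride = 1 << 29.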

  // Outer loop strides by the max of the input stride (to read integral shards)
  // and the output stride (to write integral shards).
  process_stride = std::max(in_stride, out_stride);
  DCHECK(IsPowerOfTwo(process_stride));
  DCHECK_EQ(0u, process_stride % in_stride);
  DCHECK_EQ(0u, process_stride % out_stride);

  // Start writing the new data to |new_file_|.
  base::MD5Context out_context;
  if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
                   new_file_.get(), &out_context)) {
    return false;
  }

  // Start at the beginning of the SBPrefix space.
  uint64 in_min = 0;
  uint64 out_min = 0;
  uint64 process_min = 0;

  // Start at the beginning of the updates.
  StateInternalPos new_pos = new_state.StateBegin();

  // Re-usable container for shard processing.
  StateInternal db_state;

  // Track aggregate counts for histograms.
  size_t add_prefix_count = 0;
  size_t sub_prefix_count = 0;

  do {
    // Maximum element in the current shard.
    SBPrefix process_max =
        static_cast<SBPrefix>(process_min + process_stride - 1);
    DCHECK_GT(process_max, process_min);

    // Drop the data from previous pass.
    db_state.ClearData();

    // Fill the processing shard with one or more input shards.
    if (!empty_) {
      do {
        ShardHeader shard_header;
        if (!ReadItem(&shard_header, file_.get(), &in_context))
          return OnCorruptDatabase();

        if (!db_state.AppendData(shard_header.add_prefix_count,
                                 shard_header.sub_prefix_count,
                                 shard_header.add_hash_count,
                                 shard_header.sub_hash_count,
                                 file_.get(), &in_context))
          return OnCorruptDatabase();

        in_min += in_stride;
      } while (in_min <= kMaxSBPrefix && in_min < process_max);
    }

    // Shard the update data to match the database data, then merge the update
    // data and process the results.
    {
      StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
      db_state.MergeDataAndProcess(new_pos, new_end,
                                   add_del_cache_, sub_del_cache_);
      new_pos = new_end;
    }

    // Collect the processed data for return to caller.
    for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
      builder->AddPrefix(db_state.add_prefixes_[i].prefix);
    }
    add_full_hashes_result->insert(add_full_hashes_result->end(),
                                   db_state.add_full_hashes_.begin(),
                                   db_state.add_full_hashes_.end());
    add_prefix_count += db_state.add_prefixes_.size();
    sub_prefix_count += db_state.sub_prefixes_.size();

    // Write one or more shards of processed output.
    StateInternalPos out_pos = db_state.StateBegin();
    do {
      SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
      DCHECK_GT(out_max, out_min);

      StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
      if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context))
        return false;
      out_pos = out_end;

      out_min += out_stride;
    } while (out_min == static_cast<SBPrefix>(out_min) &&
             out_min < process_max);

    process_min += process_stride;
  } while (process_min <= kMaxSBPrefix);

  // Verify the overall checksum.
  if (!empty_) {
    if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
      RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
      return OnCorruptDatabase();
    }

    // TODO(shess): Verify EOF?

    // Close the input file so the new file can be renamed over it.
    file_.reset();
  }
  DCHECK(!file_.get());

  // Write the overall checksum.
  base::MD5Digest out_digest;
  base::MD5Final(&out_digest, &out_context);
  if (!WriteItem(out_digest, new_file_.get(), NULL))
    return false;

  // Trim any excess left over from the temporary chunk data.
  if (!base::TruncateFile(new_file_.get()))
    return false;

  // Close the file handle and swizzle the file into place.
  new_file_.reset();
  if (!base::DeleteFile(filename_, false) &&
      base::PathExists(filename_))
    return false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  if (!base::Move(new_filename, filename_))
    return false;

  // Record counts before returning to the caller.
  UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
  UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);

  return true;
}

bool SafeBrowsingStoreFile::FinishUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(builder);
  DCHECK(add_full_hashes_result);

  if (!DoUpdate(builder, add_full_hashes_result)) {
    CancelUpdate();
    return false;
  }

  DCHECK(!new_file_.get());
  DCHECK(!file_.get());

  return Close();
}

bool SafeBrowsingStoreFile::CancelUpdate() {
  bool ret = Close();

  // Delete stale staging file.
  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::DeleteFile(new_filename, false);

  return ret;
}

void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
  add_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
  return add_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
}

void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
  sub_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
  return sub_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
}

void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
  add_del_cache_.insert(chunk_id);
}

void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
  sub_del_cache_.insert(chunk_id);
}

// static
bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
  if (!base::DeleteFile(basename, false) &&
      base::PathExists(basename)) {
    NOTREACHED();
    return false;
  }

  const base::FilePath new_filename = TemporaryFileForFilename(basename);
  if (!base::DeleteFile(new_filename, false) &&
      base::PathExists(new_filename)) {
    NOTREACHED();
    return false;
  }

  // With SQLite support gone, one way to get to this code is if the
  // existing file is a SQLite file.  Make sure the journal file is
  // also removed.
  const base::FilePath journal_filename(
      basename.value() + FILE_PATH_LITERAL("-journal"));
  if (base::PathExists(journal_filename))
    base::DeleteFile(journal_filename, false);

  return true;
}