// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/safe_browsing/safe_browsing_store_file.h"

#include "base/file_util.h"
#include "base/files/scoped_file.h"
#include "base/md5.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sparse_histogram.h"

namespace {

// NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
// that byte-order changes force corruption.
const int32 kFileMagic = 0x600D71FE;

// Version history:
// Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
// Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
// Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
const int32 kFileVersion = 8;

// ReadAndVerifyHeader() returns this in case of error.
const int32 kInvalidVersion = -1;

// Header at the front of the main database file.
struct FileHeaderV7 {
  int32 magic, version;
  uint32 add_chunk_count, sub_chunk_count;
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Starting with version 8, the storage is sorted and can be sharded to allow
// updates to be done with lower memory requirements.  Newly written files will
// be sharded to need less than this amount of memory during update.  Larger
// values are preferred to minimize looping overhead during processing.
const int64 kUpdateStorageBytes = 100 * 1024;
// Prevent excessive sharding by setting a lower limit on the shard stride.
// Smaller values should work fine, but very small values will probably lead to
// poor performance.  Shard stride is indirectly related to
// |kUpdateStorageBytes|; setting that very small will bump against this.
const uint32 kMinShardStride = 1 << 24;

// Strides over the entire SBPrefix space.
const uint64 kMaxShardStride = 1ULL << 32;

// Maximum SBPrefix value.
const SBPrefix kMaxSBPrefix = ~0;
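
// Illustrative arithmetic: the strides partition the 2^32 possible SBPrefix
// values into kMaxShardStride / stride shards.  At the minimum stride of
// 1 << 24 that is (1ULL << 32) / (1 << 24) == 256 shards of 2^24 prefixes
// each, while kMaxShardStride covers the entire space in a single shard.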

// Header at the front of the main database file.
struct FileHeaderV8 {
  int32 magic, version;
  uint32 add_chunk_count, sub_chunk_count;
  uint32 shard_stride;
  // TODO(shess): Is this where 64-bit will bite me?  Perhaps write a
  // specialized read/write?
};

union FileHeader {
  struct FileHeaderV7 v7;
  struct FileHeaderV8 v8;
};
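
// NOTE: FileHeaderV8 is smaller than FileHeaderV7, so ReadAndVerifyHeader()
// below can read the v8 layout first, inspect |version|, and rewind to
// re-read the v7 layout when needed.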

// Header for each chunk in the chunk-accumulation file.
struct ChunkHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Header for each shard of data in the main database file.
struct ShardHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};
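
// Rough sketch of the v8 file layout, as implied by WriteHeader() and
// WriteShard() below (for orientation; the code is authoritative):
//   FileHeaderV8
//   add chunk ids (int32 * add_chunk_count)
//   sub chunk ids (int32 * sub_chunk_count)
//   MD5 digest of the data so far (the header checksum)
//   for each shard:
//     ShardHeader
//     add prefixes, sub prefixes, add full hashes, sub full hashes
//   MD5 digest of the entire file so far (the overall checksum)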

// Enumerate different format-change events for histogramming
// purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
enum FormatEventType {
  // Corruption detected, broken down by file format.
  FORMAT_EVENT_FILE_CORRUPT,
  FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete

  // The type of format found in the file.  The expected case (new
  // file format) is intentionally not covered.
  FORMAT_EVENT_FOUND_SQLITE,
  FORMAT_EVENT_FOUND_UNKNOWN,

  // The number of SQLite-format files deleted should be the same as
  // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
  // or if a failure prevents the update from succeeding.
  FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
  FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete

  // Found and deleted (or failed to delete) the ancient "Safe
  // Browsing" file.
  FORMAT_EVENT_DELETED_ORIGINAL,
  FORMAT_EVENT_DELETED_ORIGINAL_FAILED,

  // The checksum did not check out in CheckValidity() or in
  // FinishUpdate().  This most likely indicates that the machine
  // crashed before the file was fully sync'ed to disk.
  FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
  FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,

  // The header checksum was incorrect in ReadAndVerifyHeader().  Likely
  // indicates that the system crashed while writing an update.
  FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,

  // Memory space for histograms is determined by the max.  ALWAYS
  // ADD NEW VALUES BEFORE THIS ONE.
  FORMAT_EVENT_MAX
};

void RecordFormatEvent(FormatEventType event_type) {
  UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
}

// Rewind the file.  Using fseek(2) because rewind(3) errors are
// weird.
bool FileRewind(FILE* fp) {
  int rv = fseek(fp, 0, SEEK_SET);
  DCHECK_EQ(rv, 0);
  return rv == 0;
}

// Read from |fp| into |item|, and fold the input data into the
// checksum in |context|, if non-NULL.  Return true on success.
template <class T>
bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fread(item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<char*>(item),
                                      sizeof(T)));
  }
  return true;
}

// Write |item| to |fp|, and fold the output data into the checksum in
// |context|, if non-NULL.  Return true on success.
template <class T>
bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fwrite(&item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<const char*>(&item),
                                      sizeof(T)));
  }

  return true;
}
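
// NOTE: ReadItem() and WriteItem() move structs byte-for-byte, so the on-disk
// format depends on the compiler's struct layout and the host byte order.
// That is why kFileMagic (above) is not a byte-wise palindrome: a file written
// with the opposite byte order fails the magic check instead of decoding
// garbage.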

// Read |count| items into |values| from |fp|, and fold them into the
// checksum in |context|.  Returns true on success.
template <typename CT>
bool ReadToContainer(CT* values, size_t count, FILE* fp,
                     base::MD5Context* context) {
  if (!count)
    return true;

  for (size_t i = 0; i < count; ++i) {
    typename CT::value_type value;
    if (!ReadItem(&value, fp, context))
      return false;

    // push_back() is more obvious, but coded this way std::set can
    // also be read.
    values->insert(values->end(), value);
  }

  return true;
}

// Write values between |beg| and |end| to |fp|, and fold the data into the
// checksum in |context|, if non-NULL.  Returns true if all items successful.
template <typename CTI>
bool WriteRange(const CTI& beg, const CTI& end,
                FILE* fp, base::MD5Context* context) {
  for (CTI iter = beg; iter != end; ++iter) {
    if (!WriteItem(*iter, fp, context))
      return false;
  }
  return true;
}

// Write all of |values| to |fp|, and fold the data into the checksum
// in |context|, if non-NULL.  Returns true if all items successful.
template <typename CT>
bool WriteContainer(const CT& values, FILE* fp,
                    base::MD5Context* context) {
  return WriteRange(values.begin(), values.end(), fp, context);
}

// Delete the chunks in |deleted| from |chunks|.
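// The post-increment idiom below keeps a valid iterator across erase():
// std::set::erase() invalidates only iterators to the erased element.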
void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
                         std::set<int32>* chunks) {
  for (std::set<int32>::iterator iter = chunks->begin();
       iter != chunks->end();) {
    std::set<int32>::iterator prev = iter++;
    if (deleted.count(*prev) > 0)
      chunks->erase(prev);
  }
}

bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
  base::MD5Digest calculated_digest;
  base::MD5IntermediateFinal(&calculated_digest, context);

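  // The stored digest is deliberately read with |context| non-NULL, folding
  // the digest bytes themselves into the running checksum.  A later
  // whole-file checksum therefore covers this intermediate digest as well.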
  base::MD5Digest file_digest;
  if (!ReadItem(&file_digest, fp, context))
    return false;

  return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
}

// Sanity-check the header against the file's size to make sure our
// vectors aren't gigantic.  This doubles as a cheap way to detect
// corruption without having to checksum the entire file.
bool FileHeaderV7SanityCheck(const base::FilePath& filename,
                             const FileHeaderV7& header) {
  DCHECK_EQ(header.version, 7);

  int64 size = 0;
  if (!base::GetFileSize(filename, &size))
    return false;

  int64 expected_size = sizeof(FileHeaderV7);
  expected_size += header.add_chunk_count * sizeof(int32);
  expected_size += header.sub_chunk_count * sizeof(int32);
  expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
  expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
  expected_size += header.add_hash_count * sizeof(SBAddFullHash);
  expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
  expected_size += sizeof(base::MD5Digest);
  if (size != expected_size)
    return false;

  return true;
}

// Helper function to read the file header and chunk TOC.  Rewinds |fp| and
// initializes |context|.  The header is left in |header|, and the version is
// returned.  kInvalidVersion is returned on sanity-check or checksum failure.
int ReadAndVerifyHeader(const base::FilePath& filename,
                        FileHeader* header,
                        std::set<int32>* add_chunks,
                        std::set<int32>* sub_chunks,
                        FILE* fp,
                        base::MD5Context* context) {
  DCHECK(header);
  DCHECK(add_chunks);
  DCHECK(sub_chunks);
  DCHECK(fp);
  DCHECK(context);

  int version = kInvalidVersion;

  base::MD5Init(context);
  if (!FileRewind(fp))
    return kInvalidVersion;
  if (!ReadItem(&header->v8, fp, context))
    return kInvalidVersion;
  if (header->v8.magic != kFileMagic)
    return kInvalidVersion;

  size_t add_chunks_count = 0;
  size_t sub_chunks_count = 0;

  // Track version read to inform removal of support for older versions.
  UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->v8.version);

  if (header->v8.version == 7) {
    version = 7;

    // Reset the context and re-read the v7 header.
    base::MD5Init(context);
    if (!FileRewind(fp))
      return kInvalidVersion;
    if (!ReadItem(&header->v7, fp, context))
      return kInvalidVersion;
    if (header->v7.magic != kFileMagic || header->v7.version != 7)
      return kInvalidVersion;
    if (!FileHeaderV7SanityCheck(filename, header->v7))
      return kInvalidVersion;

    add_chunks_count = header->v7.add_chunk_count;
    sub_chunks_count = header->v7.sub_chunk_count;
  } else if (header->v8.version == kFileVersion) {
    version = 8;
    add_chunks_count = header->v8.add_chunk_count;
    sub_chunks_count = header->v8.sub_chunk_count;
  } else {
    return kInvalidVersion;
  }

  if (!ReadToContainer(add_chunks, add_chunks_count, fp, context) ||
      !ReadToContainer(sub_chunks, sub_chunks_count, fp, context)) {
    return kInvalidVersion;
  }

  // v8 includes a checksum to validate the header.
  if (version > 7 && !ReadAndVerifyChecksum(fp, context)) {
    RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
    return kInvalidVersion;
  }

  return version;
}

// Helper function to write out the initial header and chunks-contained data.
// Rewinds |fp|, initializes |context|, then writes a file header and
// |add_chunks| and |sub_chunks|.
bool WriteHeader(uint32 out_stride,
                 const std::set<int32>& add_chunks,
                 const std::set<int32>& sub_chunks,
                 FILE* fp,
                 base::MD5Context* context) {
  if (!FileRewind(fp))
    return false;

  base::MD5Init(context);
  FileHeaderV8 header;
  header.magic = kFileMagic;
  header.version = kFileVersion;
  header.add_chunk_count = add_chunks.size();
  header.sub_chunk_count = sub_chunks.size();
  header.shard_stride = out_stride;
  if (!WriteItem(header, fp, context))
    return false;

  if (!WriteContainer(add_chunks, fp, context) ||
      !WriteContainer(sub_chunks, fp, context))
    return false;

  // Write out the header digest.
  base::MD5Digest header_digest;
  base::MD5IntermediateFinal(&header_digest, context);
  if (!WriteItem(header_digest, fp, context))
    return false;

  return true;
}

// Return |true| if the range is sorted by the given comparator.
template <typename CTI, typename LESS>
bool sorted(CTI beg, CTI end, LESS less) {
  while ((end - beg) > 1) {
    CTI n = beg++;
    DCHECK(!less(*beg, *n));
    if (less(*beg, *n))
      return false;
  }
  return true;
}

// Merge |beg|..|end| into |container|.  Both should be sorted by the given
// comparator, and the range iterators should not be derived from |container|.
// Differs from std::inplace_merge() in that additional memory is not required
// for linear performance.
template <typename CT, typename CTI, typename COMP>
void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
  DCHECK(sorted(container->begin(), container->end(), less));
  DCHECK(sorted(beg, end, less));

  // Size the container to fit the results.
  const size_t c_size = container->size();
  container->resize(c_size + (end - beg));

  // |c_end| points to the end of the original data, while |c_out| scans from
  // the end of the resized container back toward the beginning while merging.
  typename CT::iterator c_end = container->begin() + c_size;
  typename CT::iterator c_out = container->end();

  // While both inputs have data, move the greater to |c_out|.
  while (c_end != container->begin() && end != beg) {
    if (less(*(c_end - 1), *(end - 1))) {
      *(--c_out) = *(--end);
    } else {
      *(--c_out) = *(--c_end);
    }
  }

  // Copy any data remaining in the new range.
  if (end != beg) {
    // The original container data has been fully shifted.
    DCHECK(c_end == container->begin());

    // There is exactly the correct amount of space left.
    DCHECK_EQ(c_out - c_end, end - beg);

    std::copy(beg, end, container->begin());
  }

  DCHECK(sorted(container->begin(), container->end(), less));
}
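
// Worked example (illustrative): merging the range [2, 3] into a container
// holding [1, 4, 9].  resize() makes room: [1, 4, 9, _, _].  The back-to-front
// scan then places 9 and 4 (from the container), then 3 and 2 (from the
// range); the leading 1 never moves, yielding [1, 2, 3, 4, 9] with no
// auxiliary buffer.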

// Collection of iterators used while stepping through StateInternal (see
// below).
class StateInternalPos {
 public:
  StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
                   SBSubPrefixes::iterator sub_prefixes_iter,
                   std::vector<SBAddFullHash>::iterator add_hashes_iter,
                   std::vector<SBSubFullHash>::iterator sub_hashes_iter)
      : add_prefixes_iter_(add_prefixes_iter),
        sub_prefixes_iter_(sub_prefixes_iter),
        add_hashes_iter_(add_hashes_iter),
        sub_hashes_iter_(sub_hashes_iter) {
  }

  SBAddPrefixes::iterator add_prefixes_iter_;
  SBSubPrefixes::iterator sub_prefixes_iter_;
  std::vector<SBAddFullHash>::iterator add_hashes_iter_;
  std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
};

// Helper to find the next shard boundary.
template <class T>
bool prefix_bounder(SBPrefix val, const T& elt) {
  return val < elt.GetAddPrefix();
}

// Container for partial database state.  Includes add/sub prefixes/hashes, plus
// aggregate operations on same.
class StateInternal {
 public:
  // Append indicated amount of data from |fp|.
  bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
                  size_t add_hash_count, size_t sub_hash_count,
                  FILE* fp, base::MD5Context* context) {
    return
        ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
        ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
        ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
        ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
  }

  void ClearData() {
    add_prefixes_.clear();
    sub_prefixes_.clear();
    add_full_hashes_.clear();
    sub_full_hashes_.clear();
  }

  // Merge data from |beg|..|end| into receiver's state, then process the state.
  // The current state and the range given should correspond to the same sorted
  // shard of data from different sources.  |add_del_cache| and |sub_del_cache|
  // indicate the chunk ids which should be deleted during processing (see
  // SBProcessSubs).
  void MergeDataAndProcess(const StateInternalPos& beg,
                           const StateInternalPos& end,
                           const base::hash_set<int32>& add_del_cache,
                           const base::hash_set<int32>& sub_del_cache) {
    container_merge(&add_prefixes_,
                    beg.add_prefixes_iter_,
                    end.add_prefixes_iter_,
                    SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);

    container_merge(&sub_prefixes_,
                    beg.sub_prefixes_iter_,
                    end.sub_prefixes_iter_,
                    SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);

    container_merge(&add_full_hashes_,
                    beg.add_hashes_iter_,
                    end.add_hashes_iter_,
                    SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);

    container_merge(&sub_full_hashes_,
                    beg.sub_hashes_iter_,
                    end.sub_hashes_iter_,
                    SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);

    SBProcessSubs(&add_prefixes_, &sub_prefixes_,
                  &add_full_hashes_, &sub_full_hashes_,
                  add_del_cache, sub_del_cache);
  }

  // Sort the data appropriately for the sharding, merging, and processing
  // operations.
  void SortData() {
    std::sort(add_prefixes_.begin(), add_prefixes_.end(),
              SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
    std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
              SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
    std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
              SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
    std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
              SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
  }

  // Iterator from the beginning of the state's data.
  StateInternalPos StateBegin() {
    return StateInternalPos(add_prefixes_.begin(),
                            sub_prefixes_.begin(),
                            add_full_hashes_.begin(),
                            sub_full_hashes_.begin());
  }

  // An iterator pointing just after the last possible element of the shard
  // indicated by |shard_max|.  Used to step through the state by shard.
  // TODO(shess): Verify whether binary search really improves over linear.
  // Merging or writing will immediately touch all of these elements.
  StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
    return StateInternalPos(
        std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
                         shard_max, prefix_bounder<SBAddPrefix>),
        std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
                         shard_max, prefix_bounder<SBSubPrefix>),
        std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
                         shard_max, prefix_bounder<SBAddFullHash>),
        std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
                         shard_max, prefix_bounder<SBSubFullHash>));
  }

  // Write a shard header and data for the shard starting at |beg| and ending at
  // the element before |end|.
  bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
                  FILE* fp, base::MD5Context* context) {
    ShardHeader shard_header;
    shard_header.add_prefix_count =
        end.add_prefixes_iter_ - beg.add_prefixes_iter_;
    shard_header.sub_prefix_count =
        end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
    shard_header.add_hash_count =
        end.add_hashes_iter_ - beg.add_hashes_iter_;
    shard_header.sub_hash_count =
        end.sub_hashes_iter_ - beg.sub_hashes_iter_;

    return
        WriteItem(shard_header, fp, context) &&
        WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
                   fp, context);
  }

  SBAddPrefixes add_prefixes_;
  SBSubPrefixes sub_prefixes_;
  std::vector<SBAddFullHash> add_full_hashes_;
  std::vector<SBSubFullHash> sub_full_hashes_;
};

// True if |val| is an even power of two.
template <typename T>
bool IsPowerOfTwo(const T& val) {
  return val && (val & (val - 1)) == 0;
}
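
// For example, 8 is 1000 in binary and 7 is 0111, so 8 & 7 == 0, while
// 6 & 5 == 0110 & 0101 == 0100 != 0.  The leading |val &&| keeps zero from
// counting as a power of two.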

// Helper to read the entire database state, used by GetAddPrefixes() and
// GetAddFullHashes().  Those functions are generally used only for smaller
// files.  Returns false in case of errors reading the data.
bool ReadDbStateHelper(const base::FilePath& filename,
                       StateInternal* db_state) {
  base::ScopedFILE file(base::OpenFile(filename, "rb"));
  if (file.get() == NULL)
    return false;

  std::set<int32> add_chunks;
  std::set<int32> sub_chunks;

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
                          file.get(), &context);
  if (version == kInvalidVersion)
    return false;

  if (version == 7) {
    if (!db_state->AppendData(header.v7.add_prefix_count,
                              header.v7.sub_prefix_count,
                              header.v7.add_hash_count,
                              header.v7.sub_hash_count,
                              file.get(), &context)) {
      return false;
    }

    // v7 data was not stored sorted.
    db_state->SortData();
  } else {
    // Read until the shard start overflows, always at least one pass.
    uint64 in_min = 0;
    uint64 in_stride = header.v8.shard_stride;
    if (!in_stride)
      in_stride = kMaxShardStride;
    if (!IsPowerOfTwo(in_stride))
      return false;

    do {
      ShardHeader shard_header;
      if (!ReadItem(&shard_header, file.get(), &context))
        return false;

      if (!db_state->AppendData(shard_header.add_prefix_count,
                                shard_header.sub_prefix_count,
                                shard_header.add_hash_count,
                                shard_header.sub_hash_count,
                                file.get(), &context)) {
        return false;
      }

      in_min += in_stride;
    } while (in_min <= kMaxSBPrefix);
  }

  if (!ReadAndVerifyChecksum(file.get(), &context))
    return false;

  int64 size = 0;
  if (!base::GetFileSize(filename, &size))
    return false;

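  // The checksum should be the last thing in the file; trailing bytes would
  // indicate corruption.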
  return static_cast<int64>(ftell(file.get())) == size;
}

}  // namespace

// static
void SafeBrowsingStoreFile::CheckForOriginalAndDelete(
    const base::FilePath& current_filename) {
  const base::FilePath original_filename(
      current_filename.DirName().AppendASCII("Safe Browsing"));
  if (base::PathExists(original_filename)) {
    int64 size = 0;
    if (base::GetFileSize(original_filename, &size)) {
      UMA_HISTOGRAM_COUNTS("SB2.OldDatabaseKilobytes",
                           static_cast<int>(size / 1024));
    }

    if (base::DeleteFile(original_filename, false)) {
      RecordFormatEvent(FORMAT_EVENT_DELETED_ORIGINAL);
    } else {
      RecordFormatEvent(FORMAT_EVENT_DELETED_ORIGINAL_FAILED);
    }

    // Just best-effort on the journal file, don't want to get lost in
    // the weeds.
    const base::FilePath journal_filename(
        current_filename.DirName().AppendASCII("Safe Browsing-journal"));
    base::DeleteFile(journal_filename, false);
  }
}

SafeBrowsingStoreFile::SafeBrowsingStoreFile()
    : chunks_written_(0), empty_(false), corruption_seen_(false) {}

SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
  Close();
}

bool SafeBrowsingStoreFile::Delete() {
  // The database should not be open at this point.  But, just in
  // case, close everything before deleting.
  if (!Close()) {
    NOTREACHED();
    return false;
  }

  return DeleteStore(filename_);
}

bool SafeBrowsingStoreFile::CheckValidity() {
  // The file was either empty or never opened.  The empty case is
  // presumed not to be invalid.  The never-opened case can happen if
  // BeginUpdate() fails for any databases, and should already have
  // caused the corruption callback to fire.
  if (!file_.get())
    return true;

  if (!FileRewind(file_.get()))
    return OnCorruptDatabase();

  int64 size = 0;
  if (!base::GetFileSize(filename_, &size))
    return OnCorruptDatabase();

  base::MD5Context context;
  base::MD5Init(&context);

  // Read everything except the final digest.
  size_t bytes_left = static_cast<size_t>(size);
  CHECK(size == static_cast<int64>(bytes_left));
  if (bytes_left < sizeof(base::MD5Digest))
    return OnCorruptDatabase();
  bytes_left -= sizeof(base::MD5Digest);

  // Fold the contents of the file into the checksum.
  while (bytes_left > 0) {
    char buf[4096];
    const size_t c = std::min(sizeof(buf), bytes_left);
    const size_t ret = fread(buf, 1, c, file_.get());

    // The file's size changed while reading, give up.
    if (ret != c)
      return OnCorruptDatabase();
    base::MD5Update(&context, base::StringPiece(buf, c));
    bytes_left -= c;
  }

  if (!ReadAndVerifyChecksum(file_.get(), &context)) {
    RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
    return OnCorruptDatabase();
  }

  return true;
}

void SafeBrowsingStoreFile::Init(const base::FilePath& filename,
                                 const base::Closure& corruption_callback) {
  filename_ = filename;
  corruption_callback_ = corruption_callback;
}

bool SafeBrowsingStoreFile::BeginChunk() {
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
  add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
  add_prefixes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_prefixes->swap(db_state.add_prefixes_);
  return true;
}

bool SafeBrowsingStoreFile::GetAddFullHashes(
    std::vector<SBAddFullHash>* add_full_hashes) {
  add_full_hashes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_full_hashes->swap(db_state.add_full_hashes_);
  return true;
}

bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
                                         const SBFullHash& full_hash) {
  add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
                                           int32 add_chunk_id,
                                           SBPrefix prefix) {
  sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
                                         const SBFullHash& full_hash) {
  sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::OnCorruptDatabase() {
  if (!corruption_seen_)
    RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
  corruption_seen_ = true;

  corruption_callback_.Run();

  // Return false as a convenience to callers.
  return false;
}

bool SafeBrowsingStoreFile::Close() {
  ClearUpdateBuffers();

  // Make sure the files are closed.
  file_.reset();
  new_file_.reset();
  return true;
}

bool SafeBrowsingStoreFile::BeginUpdate() {
  DCHECK(!file_.get() && !new_file_.get());

  // Structures should all be clear unless something bad happened.
  DCHECK(add_chunks_cache_.empty());
  DCHECK(sub_chunks_cache_.empty());
  DCHECK(add_del_cache_.empty());
  DCHECK(sub_del_cache_.empty());
  DCHECK(add_prefixes_.empty());
  DCHECK(sub_prefixes_.empty());
  DCHECK(add_hashes_.empty());
  DCHECK(sub_hashes_.empty());
  DCHECK_EQ(chunks_written_, 0);

  // Since the following code will already hit the profile looking for
  // database files, this is a reasonable time to delete any old files.
  CheckForOriginalAndDelete(filename_);

  corruption_seen_ = false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
  if (new_file.get() == NULL)
    return false;

  base::ScopedFILE file(base::OpenFile(filename_, "rb"));
  empty_ = (file.get() == NULL);
  if (empty_) {
    // If the file exists but cannot be opened, report corruption rather than
    // deleting it directly (the bloom filter needs to be deleted, too).
    if (base::PathExists(filename_))
      return OnCorruptDatabase();

    new_file_.swap(new_file);
    return true;
  }

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename_, &header,
                          &add_chunks_cache_, &sub_chunks_cache_,
                          file.get(), &context);
  if (version == kInvalidVersion) {
    FileHeaderV8 retry_header;
    if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL) &&
        (retry_header.magic != kFileMagic ||
         (retry_header.version != 8 && retry_header.version != 7))) {
      // TODO(shess): Think on whether these histograms are generating any
      // actionable data.  I kid you not, SQLITE happens many thousands of times
      // per day, UNKNOWN about 3x higher than that.
      if (!strcmp(reinterpret_cast<char*>(&retry_header.magic),
                  "SQLite format 3")) {
        RecordFormatEvent(FORMAT_EVENT_FOUND_SQLITE);
      } else {
        RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
      }
    }

    // Close the file so that it can be deleted.
    file.reset();

    return OnCorruptDatabase();
  }

  file_.swap(file);
  new_file_.swap(new_file);
  return true;
}

bool SafeBrowsingStoreFile::FinishChunk() {
  if (!add_prefixes_.size() && !sub_prefixes_.size() &&
      !add_hashes_.size() && !sub_hashes_.size())
    return true;

  ChunkHeader header;
  header.add_prefix_count = add_prefixes_.size();
  header.sub_prefix_count = sub_prefixes_.size();
  header.add_hash_count = add_hashes_.size();
  header.sub_hash_count = sub_hashes_.size();
  if (!WriteItem(header, new_file_.get(), NULL))
    return false;

  if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_hashes_, new_file_.get(), NULL))
    return false;

  ++chunks_written_;

  // Clear everything to save memory.
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::DoUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(file_.get() || empty_);
  DCHECK(new_file_.get());
  CHECK(builder);
  CHECK(add_full_hashes_result);

  // Rewind the temporary storage.
  if (!FileRewind(new_file_.get()))
    return false;

  // Get chunk file's size for validating counts.
  int64 update_size = 0;
  if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
    return OnCorruptDatabase();

  // Track update size to answer questions at http://crbug.com/72216 .
  // Log small updates as 1k so that the 0 (underflow) bucket can be
  // used for "empty" in SafeBrowsingDatabase.
  UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
                       std::max(static_cast<int>(update_size / 1024), 1));

  // Chunk updates to integrate.
  StateInternal new_state;

  // Read update chunks.
  for (int i = 0; i < chunks_written_; ++i) {
    ChunkHeader header;

    int64 ofs = ftell(new_file_.get());
    if (ofs == -1)
      return false;

    if (!ReadItem(&header, new_file_.get(), NULL))
      return false;

    // As a safety measure, make sure that the header describes a sane
    // chunk, given the remaining file size.
    int64 expected_size = ofs + sizeof(ChunkHeader);
    expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
    expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
    expected_size += header.add_hash_count * sizeof(SBAddFullHash);
    expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
    if (expected_size > update_size)
      return false;

    if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
                              header.add_hash_count, header.sub_hash_count,
                              new_file_.get(), NULL)) {
      return false;
    }
  }

  // The state was accumulated by chunk, sort by prefix.
  new_state.SortData();

  // These strides control how much data is loaded into memory per pass.
  // Strides must be an even power of two.  |in_stride| will be derived from the
  // input file.  |out_stride| will be derived from an estimate of the resulting
  // file's size.  |process_stride| will be the max of both.
  uint64 in_stride = kMaxShardStride;
  uint64 out_stride = kMaxShardStride;
  uint64 process_stride = 0;

  // The header info is only used later if |!empty_|.  The v8 read loop only
  // needs |in_stride|, while v7 needs to refer to header information.
  base::MD5Context in_context;
  int version = kInvalidVersion;
  FileHeader header;

  if (!empty_) {
    DCHECK(file_.get());

    version = ReadAndVerifyHeader(filename_, &header,
                                  &add_chunks_cache_, &sub_chunks_cache_,
                                  file_.get(), &in_context);
    if (version == kInvalidVersion)
      return OnCorruptDatabase();

    if (version == 8 && header.v8.shard_stride)
      in_stride = header.v8.shard_stride;

    // The header checksum should have prevented this case, but the code will be
    // broken if this is not correct.
    if (!IsPowerOfTwo(in_stride))
      return OnCorruptDatabase();
  }

  // We no longer need to track deleted chunks.
  DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
  DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);

  // Calculate |out_stride| to break the file down into reasonable shards.
  {
    int64 original_size = 0;
    if (!empty_ && !base::GetFileSize(filename_, &original_size))
      return OnCorruptDatabase();

    // Approximate the final size as everything.  Subs and deletes will reduce
    // the size, but modest over-sharding won't hurt much.
    int64 shard_size = original_size + update_size;

    // Keep splitting until a single stride of data fits the target.
    size_t shifts = 0;
    while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
      out_stride >>= 1;
      shard_size >>= 1;
      ++shifts;
    }
    UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);

    DCHECK(IsPowerOfTwo(out_stride));
  }
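
  // Worked example (illustrative, using the constants above): with 700KB of
  // original data plus a 100KB update, |shard_size| starts at 800KB.  Three
  // halvings bring it to 100KB <= kUpdateStorageBytes, so |out_stride| ends
  // at kMaxShardStride >> 3 == 1 << 29, i.e. eight output shards.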

  // Outer loop strides by the max of the input stride (to read integral shards)
  // and the output stride (to write integral shards).
  process_stride = std::max(in_stride, out_stride);
  DCHECK(IsPowerOfTwo(process_stride));
  DCHECK_EQ(0u, process_stride % in_stride);
  DCHECK_EQ(0u, process_stride % out_stride);

  // Start writing the new data to |new_file_|.
  base::MD5Context out_context;
  if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
                   new_file_.get(), &out_context)) {
    return false;
  }

  // Start at the beginning of the SBPrefix space.
  uint64 in_min = 0;
  uint64 out_min = 0;
  uint64 process_min = 0;

  // Start at the beginning of the updates.
  StateInternalPos new_pos = new_state.StateBegin();

  // Re-usable container for shard processing.
  StateInternal db_state;

  // Track aggregate counts for histograms.
  size_t add_prefix_count = 0;
  size_t sub_prefix_count = 0;

  do {
    // Maximum element in the current shard.
    SBPrefix process_max =
        static_cast<SBPrefix>(process_min + process_stride - 1);
    DCHECK_GT(process_max, process_min);

    // Drop the data from previous pass.
    db_state.ClearData();

    // Fill the processing shard with one or more input shards.
    if (!empty_) {
      if (version == 7) {
        // Treat v7 as a single-shard file.
        DCHECK_EQ(in_min, 0u);
        DCHECK_EQ(in_stride, kMaxShardStride);
        DCHECK_EQ(process_stride, kMaxShardStride);
        if (!db_state.AppendData(header.v7.add_prefix_count,
                                 header.v7.sub_prefix_count,
                                 header.v7.add_hash_count,
                                 header.v7.sub_hash_count,
                                 file_.get(), &in_context))
          return OnCorruptDatabase();

        // v7 data is not sorted correctly.
        db_state.SortData();
      } else {
        do {
          ShardHeader shard_header;
          if (!ReadItem(&shard_header, file_.get(), &in_context))
            return OnCorruptDatabase();

          if (!db_state.AppendData(shard_header.add_prefix_count,
                                   shard_header.sub_prefix_count,
                                   shard_header.add_hash_count,
                                   shard_header.sub_hash_count,
                                   file_.get(), &in_context))
            return OnCorruptDatabase();

          in_min += in_stride;
        } while (in_min <= kMaxSBPrefix && in_min < process_max);
      }
    }

    // Shard the update data to match the database data, then merge the update
    // data and process the results.
    {
      StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
      db_state.MergeDataAndProcess(new_pos, new_end,
                                   add_del_cache_, sub_del_cache_);
      new_pos = new_end;
    }

    // Collect the processed data for return to caller.
    for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
      builder->AddPrefix(db_state.add_prefixes_[i].prefix);
    }
    add_full_hashes_result->insert(add_full_hashes_result->end(),
                                   db_state.add_full_hashes_.begin(),
                                   db_state.add_full_hashes_.end());
    add_prefix_count += db_state.add_prefixes_.size();
    sub_prefix_count += db_state.sub_prefixes_.size();

    // Write one or more shards of processed output.
    StateInternalPos out_pos = db_state.StateBegin();
    do {
      SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
      DCHECK_GT(out_max, out_min);

      StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
      if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context))
        return false;
      out_pos = out_end;

      out_min += out_stride;
    } while (out_min <= kMaxSBPrefix && out_min < process_max);

    process_min += process_stride;
  } while (process_min <= kMaxSBPrefix);

  // Verify the overall checksum.
  if (!empty_) {
    if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
      RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
      return OnCorruptDatabase();
    }

    // TODO(shess): Verify EOF?

    // Close the input file so the new file can be renamed over it.
    file_.reset();
  }
  DCHECK(!file_.get());

  // Write the overall checksum.
  base::MD5Digest out_digest;
  base::MD5Final(&out_digest, &out_context);
  if (!WriteItem(out_digest, new_file_.get(), NULL))
    return false;

  // Trim any excess left over from the temporary chunk data.
  if (!base::TruncateFile(new_file_.get()))
    return false;

  // Close the file handle and swizzle the file into place.
  new_file_.reset();
  if (!base::DeleteFile(filename_, false) &&
      base::PathExists(filename_))
    return false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  if (!base::Move(new_filename, filename_))
    return false;

  // Record counts before swapping to caller.
  UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
  UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);

  return true;
}

bool SafeBrowsingStoreFile::FinishUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(builder);
  DCHECK(add_full_hashes_result);

  if (!DoUpdate(builder, add_full_hashes_result)) {
    CancelUpdate();
    return false;
  }

  DCHECK(!new_file_.get());
  DCHECK(!file_.get());

  return Close();
}

bool SafeBrowsingStoreFile::CancelUpdate() {
  return Close();
}

void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
  add_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
  return add_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
}

void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
  sub_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
  return sub_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
}

void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
  add_del_cache_.insert(chunk_id);
}

void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
  sub_del_cache_.insert(chunk_id);
}

// static
bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
  if (!base::DeleteFile(basename, false) &&
      base::PathExists(basename)) {
    NOTREACHED();
    return false;
  }

  const base::FilePath new_filename = TemporaryFileForFilename(basename);
  if (!base::DeleteFile(new_filename, false) &&
      base::PathExists(new_filename)) {
    NOTREACHED();
    return false;
  }

  // With SQLite support gone, one way to get to this code is if the
  // existing file is a SQLite file.  Make sure the journal file is
  // also removed.
  const base::FilePath journal_filename(
      basename.value() + FILE_PATH_LITERAL("-journal"));
  if (base::PathExists(journal_filename))
    base::DeleteFile(journal_filename, false);

  return true;
}