1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
6#define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
7#pragma once
8
9#include <set>
10#include <vector>
11
12#include "chrome/browser/safe_browsing/safe_browsing_store.h"
13
14#include "base/callback.h"
15#include "base/file_util.h"
16
17// Implement SafeBrowsingStore in terms of a flat file.  The file
18// format is pretty literal:
19//
20// int32 magic;             // magic number "validating" file
21// int32 version;           // format version
22//
23// // Counts for the various data which follows the header.
24// uint32 add_chunk_count;   // Chunks seen, including empties.
25// uint32 sub_chunk_count;   // Ditto.
26// uint32 add_prefix_count;
27// uint32 sub_prefix_count;
28// uint32 add_hash_count;
29// uint32 sub_hash_count;
30//
31// array[add_chunk_count] {
32//   int32 chunk_id;
33// }
34// array[sub_chunk_count] {
35//   int32 chunk_id;
36// }
37// array[add_prefix_count] {
38//   int32 chunk_id;
39//   int32 prefix;
40// }
41// array[sub_prefix_count] {
42//   int32 chunk_id;
43//   int32 add_chunk_id;
44//   int32 add_prefix;
45// }
// array[add_hash_count] {
//   int32 chunk_id;
//   int32 received_time;     // From base::Time::ToTimeT().
//   char[32] full_hash;
// }
50// array[sub_hash_count] {
51//   int32 chunk_id;
52//   int32 add_chunk_id;
53//   char[32] add_full_hash;
54// }
// MD5Digest checksum;      // Checksum over preceding data.
56//
57// During the course of an update, uncommitted data is stored in a
58// temporary file (which is later re-used to commit).  This is an
59// array of chunks, with the count kept in memory until the end of the
60// transaction.  The format of this file is like the main file, with
61// the list of chunks seen omitted, as that data is tracked in-memory:
62//
63// array[] {
64//   uint32 add_prefix_count;
65//   uint32 sub_prefix_count;
66//   uint32 add_hash_count;
67//   uint32 sub_hash_count;
68//   array[add_prefix_count] {
69//     int32 chunk_id;
70//     int32 prefix;
71//   }
72//   array[sub_prefix_count] {
73//     int32 chunk_id;
74//     int32 add_chunk_id;
75//     int32 add_prefix;
76//   }
77//   array[add_hash_count] {
78//     int32 chunk_id;
79//     int32 received_time;     // From base::Time::ToTimeT().
80//     char[32] full_hash;
81//   }
82//   array[sub_hash_count] {
83//     int32 chunk_id;
84//     int32 add_chunk_id;
85//     char[32] add_full_hash;
86//   }
87// }
88//
89// The overall transaction works like this:
90// - Open the original file to get the chunks-seen data.
91// - Open a temp file for storing new chunk info.
92// - Write new chunks to the temp file.
93// - When the transaction is finished:
94//   - Read the rest of the original file's data into buffers.
95//   - Rewind the temp file and merge the new data into buffers.
96//   - Process buffers for deletions and apply subs.
97//   - Rewind and write the buffers out to temp file.
98//   - Delete original file.
99//   - Rename temp file to original filename.
100
101// TODO(shess): By using a checksum, this code can avoid doing an
102// fsync(), at the possible cost of more frequently retrieving the
103// full dataset.  Measure how often this occurs, and if it occurs too
104// often, consider retaining the last known-good file for recovery
105// purposes, rather than deleting it.
106
107class SafeBrowsingStoreFile : public SafeBrowsingStore {
108 public:
109  SafeBrowsingStoreFile();
110  virtual ~SafeBrowsingStoreFile();
111
112  virtual void Init(const FilePath& filename,
113                    Callback0::Type* corruption_callback);
114
115  // Delete any on-disk files, including the permanent storage.
116  virtual bool Delete();
117
118  // Get all add hash prefixes and full-length hashes, respectively, from
119  // the store.
120  virtual bool GetAddPrefixes(std::vector<SBAddPrefix>* add_prefixes);
121  virtual bool GetAddFullHashes(std::vector<SBAddFullHash>* add_full_hashes);
122
123  virtual bool BeginChunk();
124
125  virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix);
126  virtual bool WriteAddHash(int32 chunk_id,
127                            base::Time receive_time,
128                            const SBFullHash& full_hash);
129  virtual bool WriteSubPrefix(int32 chunk_id,
130                              int32 add_chunk_id, SBPrefix prefix);
131  virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id,
132                            const SBFullHash& full_hash);
133  virtual bool FinishChunk();
134
135  virtual bool BeginUpdate();
136  // Store updates with pending add full hashes in file store and
137  // return |add_prefixes_result| and |add_full_hashes_result|.
138  virtual bool FinishUpdate(const std::vector<SBAddFullHash>& pending_adds,
139                            const std::set<SBPrefix>& prefix_misses,
140                            std::vector<SBAddPrefix>* add_prefixes_result,
141                            std::vector<SBAddFullHash>* add_full_hashes_result);
142  virtual bool CancelUpdate();
143
144  virtual void SetAddChunk(int32 chunk_id);
145  virtual bool CheckAddChunk(int32 chunk_id);
146  virtual void GetAddChunks(std::vector<int32>* out);
147  virtual void SetSubChunk(int32 chunk_id);
148  virtual bool CheckSubChunk(int32 chunk_id);
149  virtual void GetSubChunks(std::vector<int32>* out);
150
151  virtual void DeleteAddChunk(int32 chunk_id);
152  virtual void DeleteSubChunk(int32 chunk_id);
153
154  // Returns the name of the temporary file used to buffer data for
155  // |filename|.  Exported for unit tests.
156  static const FilePath TemporaryFileForFilename(const FilePath& filename) {
157    return FilePath(filename.value() + FILE_PATH_LITERAL("_new"));
158  }
159
160 private:
161  // Update store file with pending full hashes.
162  virtual bool DoUpdate(const std::vector<SBAddFullHash>& pending_adds,
163                        const std::set<SBPrefix>& prefix_misses,
164                        std::vector<SBAddPrefix>* add_prefixes_result,
165                        std::vector<SBAddFullHash>* add_full_hashes_result);
166
167  // Enumerate different format-change events for histogramming
168  // purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
169  // TODO(shess): Remove this once the format change is complete.
170  enum FormatEventType {
171    // Corruption detected, broken down by file format.
172    FORMAT_EVENT_FILE_CORRUPT,
173    FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete
174
175    // The type of format found in the file.  The expected case (new
176    // file format) is intentionally not covered.
177    FORMAT_EVENT_FOUND_SQLITE,
178    FORMAT_EVENT_FOUND_UNKNOWN,
179
180    // The number of SQLite-format files deleted should be the same as
181    // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
182    // or if a failure prevents the update from succeeding.
183    FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
184    FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete
185
186    // Found and deleted (or failed to delete) the ancient "Safe
187    // Browsing" file.
188    FORMAT_EVENT_DELETED_ORIGINAL,
189    FORMAT_EVENT_DELETED_ORIGINAL_FAILED,
190
191    // Memory space for histograms is determined by the max.  ALWAYS
192    // ADD NEW VALUES BEFORE THIS ONE.
193    FORMAT_EVENT_MAX
194  };
195
196  // Helper to record an event related to format conversion from
197  // SQLite to file.
198  static void RecordFormatEvent(FormatEventType event_type);
199
200  // Some very lucky users have an original-format file still in their
201  // profile.  Check for it and delete, recording a histogram for the
202  // result (no histogram for not-found).  Logically this
203  // would make more sense at the SafeBrowsingDatabase level, but
204  // practically speaking that code doesn't touch files directly.
205  static void CheckForOriginalAndDelete(const FilePath& filename);
206
207  // Close all files and clear all buffers.
208  bool Close();
209
210  // Calls |corruption_callback_| if non-NULL, always returns false as
211  // a convenience to the caller.
212  bool OnCorruptDatabase();
213
214  // Helper for creating a corruption callback for |old_store_|.
215  // TODO(shess): Remove after migration.
216  void HandleCorruptDatabase();
217
218  // Clear temporary buffers used to accumulate chunk data.
219  bool ClearChunkBuffers() {
220    // NOTE: .clear() doesn't release memory.
221    // TODO(shess): Figure out if this is overkill.  Some amount of
222    // pre-reserved space is probably reasonable between each chunk
223    // collected.
224    std::vector<SBAddPrefix>().swap(add_prefixes_);
225    std::vector<SBSubPrefix>().swap(sub_prefixes_);
226    std::vector<SBAddFullHash>().swap(add_hashes_);
227    std::vector<SBSubFullHash>().swap(sub_hashes_);
228    return true;
229  }
230
231  // Clear all buffers used during update.
232  void ClearUpdateBuffers() {
233    ClearChunkBuffers();
234    chunks_written_ = 0;
235    std::set<int32>().swap(add_chunks_cache_);
236    std::set<int32>().swap(sub_chunks_cache_);
237    base::hash_set<int32>().swap(add_del_cache_);
238    base::hash_set<int32>().swap(sub_del_cache_);
239  }
240
241  // Buffers for collecting data between BeginChunk() and
242  // FinishChunk().
243  std::vector<SBAddPrefix> add_prefixes_;
244  std::vector<SBSubPrefix> sub_prefixes_;
245  std::vector<SBAddFullHash> add_hashes_;
246  std::vector<SBSubFullHash> sub_hashes_;
247
248  // Count of chunks collected in |new_file_|.
249  int chunks_written_;
250
251  // Name of the main database file.
252  FilePath filename_;
253
254  // Handles to the main and scratch files.  |empty_| is true if the
255  // main file didn't exist when the update was started.
256  file_util::ScopedFILE file_;
257  file_util::ScopedFILE new_file_;
258  bool empty_;
259
260  // Cache of chunks which have been seen.  Loaded from the database
261  // on BeginUpdate() so that it can be queried during the
262  // transaction.
263  std::set<int32> add_chunks_cache_;
264  std::set<int32> sub_chunks_cache_;
265
266  // Cache the set of deleted chunks during a transaction, applied on
267  // FinishUpdate().
268  // TODO(shess): If the set is small enough, hash_set<> might be
269  // slower than plain set<>.
270  base::hash_set<int32> add_del_cache_;
271  base::hash_set<int32> sub_del_cache_;
272
273  scoped_ptr<Callback0::Type> corruption_callback_;
274
275  // Tracks whether corruption has already been seen in the current
276  // update, so that only one instance is recorded in the stats.
277  // TODO(shess): Remove with format-migration support.
278  bool corruption_seen_;
279
280  DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStoreFile);
281};
282
283#endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
284