safe_browsing_store.h revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_H_
6#define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_H_
7
8#include <set>
9#include <vector>
10
11#include "base/basictypes.h"
12#include "base/callback_forward.h"
13#include "base/hash_tables.h"
14#include "base/time.h"
15#include "chrome/browser/safe_browsing/safe_browsing_util.h"
16
17class FilePath;
18
19// SafeBrowsingStore provides a storage abstraction for the
20// safe-browsing data used to build the bloom filter.  The items
21// stored are:
22//   The set of add and sub chunks seen.
23//   List of SBAddPrefix (chunk_id and SBPrefix).
24//   List of SBSubPrefix (chunk_id and the target SBAddPrefix).
25//   List of SBAddFullHash (SBAddPrefix, time received and an SBFullHash).
26//   List of SBSubFullHash (chunk_id, target SBAddPrefix, and an SBFullHash).
27//
28// The store is geared towards updating the data, not runtime access
29// to the data (that is handled by SafeBrowsingDatabase).  Updates are
30// handled similar to a SQL transaction cycle, with the new data being
31// returned from FinishUpdate() (the COMMIT).  Data is not persistent
32// until FinishUpdate() returns successfully.
33//
34// FinishUpdate() also handles dropping items who's chunk has been
35// deleted, and netting out the add/sub lists (when a sub matches an
36// add, both are dropped).
37
38// GetAddChunkId(), GetAddPrefix() and GetFullHash() are exposed so
39// that these items can be generically compared with each other by
40// SBAddPrefixLess() and SBAddPrefixHashLess().
41
42struct SBAddPrefix {
43  int32 chunk_id;
44  SBPrefix prefix;
45
46  SBAddPrefix(int32 id, SBPrefix p) : chunk_id(id), prefix(p) {}
47  SBAddPrefix() : chunk_id(), prefix() {}
48
49  int32 GetAddChunkId() const { return chunk_id; }
50  SBPrefix GetAddPrefix() const { return prefix; }
51};
52
53typedef std::deque<SBAddPrefix> SBAddPrefixes;
54
55struct SBSubPrefix {
56  int32 chunk_id;
57  int32 add_chunk_id;
58  SBPrefix add_prefix;
59
60  SBSubPrefix(int32 id, int32 add_id, int prefix)
61      : chunk_id(id), add_chunk_id(add_id), add_prefix(prefix) {}
62  SBSubPrefix() : chunk_id(), add_chunk_id(), add_prefix() {}
63
64  int32 GetAddChunkId() const { return add_chunk_id; }
65  SBPrefix GetAddPrefix() const { return add_prefix; }
66};
67
68struct SBAddFullHash {
69  int32 chunk_id;
70  int32 received;
71  SBFullHash full_hash;
72
73  SBAddFullHash(int32 id, base::Time r, const SBFullHash& h)
74      : chunk_id(id),
75        received(static_cast<int32>(r.ToTimeT())),
76        full_hash(h) {
77  }
78
79  // Provided for ReadAddHashes() implementations, which already have
80  // an int32 for the time.
81  SBAddFullHash(int32 id, int32 r, const SBFullHash& h)
82      : chunk_id(id), received(r), full_hash(h) {}
83
84  SBAddFullHash() : chunk_id(), received(), full_hash() {}
85
86  int32 GetAddChunkId() const { return chunk_id; }
87  SBPrefix GetAddPrefix() const { return full_hash.prefix; }
88};
89
90struct SBSubFullHash {
91  int32 chunk_id;
92  int32 add_chunk_id;
93  SBFullHash full_hash;
94
95  SBSubFullHash(int32 id, int32 add_id, const SBFullHash& h)
96      : chunk_id(id), add_chunk_id(add_id), full_hash(h) {}
97  SBSubFullHash() : chunk_id(), add_chunk_id(), full_hash() {}
98
99  int32 GetAddChunkId() const { return add_chunk_id; }
100  SBPrefix GetAddPrefix() const { return full_hash.prefix; }
101};
102
103// Determine less-than based on add chunk and prefix.
104template <class T, class U>
105bool SBAddPrefixLess(const T& a, const U& b) {
106  if (a.GetAddChunkId() != b.GetAddChunkId())
107    return a.GetAddChunkId() < b.GetAddChunkId();
108
109  return a.GetAddPrefix() < b.GetAddPrefix();
110}
111
112// Determine less-than based on add chunk, prefix, and full hash.
113// Prefix can compare differently than hash due to byte ordering,
114// so it must take precedence.
115template <class T, class U>
116bool SBAddPrefixHashLess(const T& a, const U& b) {
117  if (SBAddPrefixLess(a, b))
118    return true;
119
120  if (SBAddPrefixLess(b, a))
121    return false;
122
123  return memcmp(a.full_hash.full_hash, b.full_hash.full_hash,
124                sizeof(a.full_hash.full_hash)) < 0;
125}
126
127// Process the lists for subs which knock out adds.  For any item in
128// |sub_prefixes| which has a match in |add_prefixes|, knock out the
129// matched items from all vectors.  Additionally remove items from
130// deleted chunks.
131//
132// TODO(shess): Since the prefixes are uniformly-distributed hashes,
133// there aren't many ways to organize the inputs for efficient
134// processing.  For this reason, the vectors are sorted and processed
135// in parallel.  At this time this code does the sorting internally,
136// but it might make sense to make sorting an API requirement so that
137// the storage can optimize for it.
138void SBProcessSubs(SBAddPrefixes* add_prefixes,
139                   std::vector<SBSubPrefix>* sub_prefixes,
140                   std::vector<SBAddFullHash>* add_full_hashes,
141                   std::vector<SBSubFullHash>* sub_full_hashes,
142                   const base::hash_set<int32>& add_chunks_deleted,
143                   const base::hash_set<int32>& sub_chunks_deleted);
144
145// Records a histogram of the number of items in |prefix_misses| which
146// are not in |add_prefixes|.
147void SBCheckPrefixMisses(const SBAddPrefixes& add_prefixes,
148                         const std::set<SBPrefix>& prefix_misses);
149
150// TODO(shess): This uses int32 rather than int because it's writing
151// specifically-sized items to files.  SBPrefix should likewise be
152// explicitly sized.
153
154// Abstract interface for storing data.
155class SafeBrowsingStore {
156 public:
157  SafeBrowsingStore() {}
158  virtual ~SafeBrowsingStore() {}
159
160  // Sets up the information for later use, but does not necessarily
161  // check whether the underlying file exists, or is valid.  If
162  // |curruption_callback| is non-NULL it will be called if corruption
163  // is detected, which could happen as part of any call other than
164  // Delete().  The appropriate action is to use Delete() to clear the
165  // store.
166  virtual void Init(const FilePath& filename,
167                    const base::Closure& corruption_callback) = 0;
168
169  // Deletes the files which back the store, returning true if
170  // successful.
171  virtual bool Delete() = 0;
172
173  // Get all Add prefixes out from the store.
174  virtual bool GetAddPrefixes(SBAddPrefixes* add_prefixes) = 0;
175
176  // Get all add full-length hashes.
177  virtual bool GetAddFullHashes(
178      std::vector<SBAddFullHash>* add_full_hashes) = 0;
179
180  // Start an update.  None of the following methods should be called
181  // unless this returns true.  If this returns true, the update
182  // should be terminated by FinishUpdate() or CancelUpdate().
183  virtual bool BeginUpdate() = 0;
184
185  // Start a chunk of data.  None of the methods through FinishChunk()
186  // should be called unless this returns true.
187  // TODO(shess): Would it make sense for this to accept |chunk_id|?
188  // Possibly not, because of possible confusion between sub_chunk_id
189  // and add_chunk_id.
190  virtual bool BeginChunk() = 0;
191
192  virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix) = 0;
193  virtual bool WriteAddHash(int32 chunk_id,
194                            base::Time receive_time,
195                            const SBFullHash& full_hash) = 0;
196  virtual bool WriteSubPrefix(int32 chunk_id,
197                              int32 add_chunk_id, SBPrefix prefix) = 0;
198  virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id,
199                            const SBFullHash& full_hash) = 0;
200
201  // Collect the chunk data and preferrably store it on disk to
202  // release memory.  Shoul not modify the data in-place.
203  virtual bool FinishChunk() = 0;
204
205  // Track the chunks which have been seen.
206  virtual void SetAddChunk(int32 chunk_id) = 0;
207  virtual bool CheckAddChunk(int32 chunk_id) = 0;
208  virtual void GetAddChunks(std::vector<int32>* out) = 0;
209  virtual void SetSubChunk(int32 chunk_id) = 0;
210  virtual bool CheckSubChunk(int32 chunk_id) = 0;
211  virtual void GetSubChunks(std::vector<int32>* out) = 0;
212
213  // Delete the indicated chunk_id.  The chunk will continue to be
214  // visible until the end of the transaction.
215  virtual void DeleteAddChunk(int32 chunk_id) = 0;
216  virtual void DeleteSubChunk(int32 chunk_id) = 0;
217
218  // May be called during update to verify that the storage is valid.
219  // Return true if the store seems valid.  If corruption is detected,
220  // calls the corruption callback and return false.
221  // NOTE(shess): When storage was SQLite, there was no guarantee that
222  // a structurally sound database actually contained valid data,
223  // whereas SafeBrowsingStoreFile checksums the data.  For now, this
224  // distinction doesn't matter.
225  virtual bool CheckValidity() = 0;
226
227  // Pass the collected chunks through SBPRocessSubs() and commit to
228  // permanent storage.  The resulting add prefixes and hashes will be
229  // stored in |add_prefixes_result| and |add_full_hashes_result|.
230  // |pending_adds| is the set of full hashes which have been received
231  // since the previous update, and is provided as a convenience
232  // (could be written via WriteAddHash(), but that would flush the
233  // chunk to disk).  |prefix_misses| is the set of prefixes where the
234  // |GetHash()| request returned no full hashes, used for diagnostic
235  // purposes.
236  virtual bool FinishUpdate(
237      const std::vector<SBAddFullHash>& pending_adds,
238      const std::set<SBPrefix>& prefix_misses,
239      SBAddPrefixes* add_prefixes_result,
240      std::vector<SBAddFullHash>* add_full_hashes_result) = 0;
241
242  // Cancel the update in process and remove any temporary disk
243  // storage, leaving the original data unmodified.
244  virtual bool CancelUpdate() = 0;
245
246 private:
247  DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStore);
248};
249
250#endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_H_
251