safe_browsing_util.h revision 0f1bc08d4cfcc34181b0b5cbf065c40f687bf740
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
5// Utilities for the SafeBrowsing code.
6
7#ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_UTIL_H_
8#define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_UTIL_H_
9
10#include <cstring>
11#include <deque>
12#include <set>
13#include <string>
14#include <vector>
15
16#include "base/basictypes.h"
17#include "chrome/browser/safe_browsing/chunk_range.h"
18
19class GURL;
20
21class SBEntry;
22
23// A truncated hash's type.
24typedef int32 SBPrefix;
25
// Pairs a chunk URL with the name of the list that chunk belongs to.
struct ChunkUrl {
  std::string url;        // The chunk URL.
  std::string list_name;  // Name of the list the chunk belongs to.
};
31
32// A full hash.
33union SBFullHash {
34  char full_hash[32];
35  SBPrefix prefix;
36};
37
38inline bool operator==(const SBFullHash& lhash, const SBFullHash& rhash) {
39  return memcmp(lhash.full_hash, rhash.full_hash, sizeof(SBFullHash)) == 0;
40}
41
42inline bool operator<(const SBFullHash& lhash, const SBFullHash& rhash) {
43  return memcmp(lhash.full_hash, rhash.full_hash, sizeof(SBFullHash)) < 0;
44}
45
46// Container for information about a specific host in an add/sub chunk.
47struct SBChunkHost {
48  SBPrefix host;
49  SBEntry* entry;
50};
51
52// Container for an add/sub chunk.
53struct SBChunk {
54  SBChunk();
55  ~SBChunk();
56
57  int chunk_number;
58  int list_id;
59  bool is_add;
60  std::deque<SBChunkHost> hosts;
61};
62
63// Container for a set of chunks.  Interim wrapper to replace use of
64// |std::deque<SBChunk>| with something having safer memory semantics.
65// management.
66// TODO(shess): |SBEntry| is currently a very roundabout way to hold
67// things pending storage.  It could be replaced with the structures
68// used in SafeBrowsingStore, then lots of bridging code could
69// dissappear.
70class SBChunkList {
71 public:
72  SBChunkList();
73  ~SBChunkList();
74
75  // Implement that subset of the |std::deque<>| interface which
76  // callers expect.
77  bool empty() const { return chunks_.empty(); }
78  size_t size() { return chunks_.size(); }
79
80  void push_back(const SBChunk& chunk) { chunks_.push_back(chunk); }
81  SBChunk& back() { return chunks_.back(); }
82  SBChunk& front() { return chunks_.front(); }
83  const SBChunk& front() const { return chunks_.front(); }
84
85  typedef std::vector<SBChunk>::const_iterator const_iterator;
86  const_iterator begin() const { return chunks_.begin(); }
87  const_iterator end() const { return chunks_.end(); }
88
89  typedef std::vector<SBChunk>::iterator iterator;
90  iterator begin() { return chunks_.begin(); }
91  iterator end() { return chunks_.end(); }
92
93  SBChunk& operator[](size_t n) { return chunks_[n]; }
94  const SBChunk& operator[](size_t n) const { return chunks_[n]; }
95
96  // Calls |SBEvent::Destroy()| before clearing |chunks_|.
97  void clear();
98
99 private:
100  std::vector<SBChunk> chunks_;
101
102  DISALLOW_COPY_AND_ASSIGN(SBChunkList);
103};
104
105// Used when we get a gethash response.
106struct SBFullHashResult {
107  SBFullHash hash;
108  std::string list_name;
109  int add_chunk_id;
110};
111
// Describes one list held in the database: its name plus the add and sub
// chunk ranges, each kept as a string.
struct SBListChunkRanges {
  // NOTE(review): |n| is presumably stored as |name|; the definition is not
  // in this header.
  explicit SBListChunkRanges(const std::string& n);

  // The list name.
  std::string name;
  // The ranges for add chunks.
  std::string adds;
  // The ranges for sub chunks.
  std::string subs;
};
120
121// Container for deleting chunks from the database.
122struct SBChunkDelete {
123  SBChunkDelete();
124  ~SBChunkDelete();
125
126  std::string list_name;
127  bool is_sub_del;
128  std::vector<ChunkRange> chunk_del;
129};
130
// The kinds of threat SafeBrowsing protects against.  Values are spelled out
// explicitly; they match the implicit numbering (0, 1, 2, ...).
enum SBThreatType {
  // No threat at all.
  SB_THREAT_TYPE_SAFE = 0,

  // The URL is being used for phishing.
  SB_THREAT_TYPE_URL_PHISHING = 1,

  // The URL hosts malware.
  SB_THREAT_TYPE_URL_MALWARE = 2,

  // The download URL is malware.
  SB_THREAT_TYPE_BINARY_MALWARE_URL = 3,

  // The hash of the download contents is malware.
  SB_THREAT_TYPE_BINARY_MALWARE_HASH = 4,

  // Url detected by the client-side phishing model.  Unlike the values
  // above, this one does not correspond to a downloaded list.
  SB_THREAT_TYPE_CLIENT_SIDE_PHISHING_URL = 5,

  // The Chrome extension or app (given by its ID) is malware.
  SB_THREAT_TYPE_EXTENSION = 6,

  // Url detected by the client-side malware IP list, which is part of the
  // client side detection model.
  SB_THREAT_TYPE_CLIENT_SIDE_MALWARE_URL = 7,
};
159
160// SBEntry ---------------------------------------------------------------------
161
162// Holds information about the prefixes for a hostkey.  prefixes can either be
163// 4 bytes (truncated hash) or 32 bytes (full hash).
164// For adds:
165//   [list id ][chunk id][prefix count (0..n)][prefix1][prefix2]
166// For subs:
167//   [list id ][chunk id (only used if prefix count is 0][prefix count (0..n)]
168//       [add chunk][prefix][add chunk][prefix]
169class SBEntry {
170 public:
171  enum Type {
172    ADD_PREFIX,     // 4 byte add entry.
173    SUB_PREFIX,     // 4 byte sub entry.
174    ADD_FULL_HASH,  // 32 byte add entry.
175    SUB_FULL_HASH,  // 32 byte sub entry.
176  };
177
178  // Creates a SBEntry with the necessary size for the given number of prefixes.
179  // Caller ownes the object and needs to free it by calling Destroy.
180  static SBEntry* Create(Type type, int prefix_count);
181
182  // Frees the entry's memory.
183  void Destroy();
184
185  void set_list_id(int list_id) { data_.list_id = list_id; }
186  int list_id() const { return data_.list_id; }
187  void set_chunk_id(int chunk_id) { data_.chunk_id = chunk_id; }
188  int chunk_id() const { return data_.chunk_id; }
189  int prefix_count() const { return data_.prefix_count; }
190
191  // Returns true if this is a prefix as opposed to a full hash.
192  bool IsPrefix() const {
193    return type() == ADD_PREFIX || type() == SUB_PREFIX;
194  }
195
196  // Returns true if this is an add entry.
197  bool IsAdd() const {
198    return type() == ADD_PREFIX || type() == ADD_FULL_HASH;
199  }
200
201  // Returns true if this is a sub entry.
202  bool IsSub() const {
203    return type() == SUB_PREFIX || type() == SUB_FULL_HASH;
204  }
205
206  // Helper to return the size of the prefixes.
207  int HashLen() const {
208    return IsPrefix() ? sizeof(SBPrefix) : sizeof(SBFullHash);
209  }
210
211  // For add entries, returns the add chunk id.  For sub entries, returns the
212  // add_chunk id for the prefix at the given index.
213  int ChunkIdAtPrefix(int index) const;
214
215  // Used for sub chunks to set the chunk id at a given index.
216  void SetChunkIdAtPrefix(int index, int chunk_id);
217
218  // Return the prefix/full hash at the given index.  Caller is expected to
219  // call the right function based on the hash length.
220  const SBPrefix& PrefixAt(int index) const;
221  const SBFullHash& FullHashAt(int index) const;
222
223  // Return the prefix/full hash at the given index.  Caller is expected to
224  // call the right function based on the hash length.
225  void SetPrefixAt(int index, const SBPrefix& prefix);
226  void SetFullHashAt(int index, const SBFullHash& full_hash);
227
228 private:
229  // Container for a sub prefix.
230  struct SBSubPrefix {
231    int add_chunk;
232    SBPrefix prefix;
233  };
234
235  // Container for a sub full hash.
236  struct SBSubFullHash {
237    int add_chunk;
238    SBFullHash prefix;
239  };
240
241  // Keep the fixed data together in one struct so that we can get its size
242  // easily.  If any of this is modified, the database will have to be cleared.
243  struct Data {
244    int list_id;
245    // For adds, this is the add chunk number.
246    // For subs: if prefix_count is 0 then this is the add chunk that this sub
247    //     refers to.  Otherwise it's ignored, and the add_chunk in sub_prefixes
248    //     or sub_full_hashes is used for each corresponding prefix.
249    int chunk_id;
250    Type type;
251    int prefix_count;
252  };
253
254  SBEntry();
255  ~SBEntry();
256
257  // Helper to return the size of each prefix entry (i.e. for subs this
258  // includes an add chunk id).
259  static int PrefixSize(Type type);
260
261  // Helper to return how much memory a given Entry would require.
262  static int Size(Type type, int prefix_count);
263
264  // Returns how many bytes this entry is.
265  int Size() const;
266
267  Type type() const { return data_.type; }
268
269  void set_prefix_count(int count) { data_.prefix_count = count; }
270  void set_type(Type type) { data_.type = type; }
271
272  // The prefixes union must follow the fixed data so that they're contiguous
273  // in memory.
274  Data data_;
275  union {
276    SBPrefix add_prefixes_[1];
277    SBSubPrefix sub_prefixes_[1];
278    SBFullHash add_full_hashes_[1];
279    SBSubFullHash sub_full_hashes_[1];
280  };
281};
282
283
284// Utility functions -----------------------------------------------------------
285
286namespace safe_browsing_util {
287
288// SafeBrowsing list names.
289extern const char kMalwareList[];
290extern const char kPhishingList[];
291// Binary Download list names.
292extern const char kBinUrlList[];
293extern const char kBinHashList[];
294// SafeBrowsing client-side detection whitelist list name.
295extern const char kCsdWhiteList[];
296// SafeBrowsing download whitelist list name.
297extern const char kDownloadWhiteList[];
298// SafeBrowsing extension list name.
299extern const char kExtensionBlacklist[];
300// SafeBrowsing side-effect free whitelist name.
301extern const char kSideEffectFreeWhitelist[];
302// SafeBrowsing csd malware IP blacklist name.
303extern const char kIPBlacklist[];
304
305enum ListType {
306  INVALID = -1,
307  MALWARE = 0,
308  PHISH = 1,
309  BINURL = 2,
310  BINHASH = 3,
311  CSDWHITELIST = 4,
312  // SafeBrowsing lists are stored in pairs.  Keep ListType 5
313  // available for a potential second list that we would store in the
314  // csd-whitelist store file.
315  DOWNLOADWHITELIST = 6,
316  // See above comment. Leave 7 available.
317  EXTENSIONBLACKLIST = 8,
318  // See above comment. Leave 9 available.
319  SIDEEFFECTFREEWHITELIST = 10,
320  // See above comment. Leave 11 available.
321  IPBLACKLIST = 12,
322  // See above comment.  Leave 13 available.
323};
324
325// Maps a list name to ListType.
326ListType GetListId(const std::string& name);
327
328// Maps a ListId to list name. Return false if fails.
329bool GetListName(ListType list_id, std::string* list);
330
331// Canonicalizes url as per Google Safe Browsing Specification.
332// See section 6.1 in
333// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
334void CanonicalizeUrl(const GURL& url, std::string* canonicalized_hostname,
335                     std::string* canonicalized_path,
336                     std::string* canonicalized_query);
337
338// Given a URL, returns all the hosts we need to check.  They are returned
339// in order of size (i.e. b.c is first, then a.b.c).
340void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts);
341
342// Given a URL, returns all the paths we need to check.
343void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths);
344
345// Given a URL, returns all the patterns we need to check.
346void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls);
347
348int GetHashIndex(const SBFullHash& hash,
349                 const std::vector<SBFullHashResult>& full_hashes);
350
351// Given a URL, compare all the possible host + path full hashes to the set of
352// provided full hashes.  Returns the index of the match if one is found, or -1
353// otherwise.
354int GetUrlHashIndex(const GURL& url,
355                    const std::vector<SBFullHashResult>& full_hashes);
356
357bool IsPhishingList(const std::string& list_name);
358bool IsMalwareList(const std::string& list_name);
359bool IsBadbinurlList(const std::string& list_name);
360bool IsBadbinhashList(const std::string& list_name);
361bool IsExtensionList(const std::string& list_name);
362
363GURL GeneratePhishingReportUrl(const std::string& report_page,
364                               const std::string& url_to_report,
365                               bool is_client_side_detection);
366
367SBFullHash StringToSBFullHash(const std::string& hash_in);
368std::string SBFullHashToString(const SBFullHash& hash_out);
369
370}  // namespace safe_browsing_util
371
372#endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_UTIL_H_
373