// safe_browsing_util.h revision a1401311d1ab56c4ed0a474bd38c108f75cb0cd9
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// Utilities for the SafeBrowsing code.
6
7#ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_UTIL_H_
8#define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_UTIL_H_
9
10#include <cstring>
11#include <deque>
12#include <set>
13#include <string>
14#include <vector>
15
16#include "base/basictypes.h"
17#include "base/strings/string_piece.h"
18#include "chrome/browser/safe_browsing/chunk_range.h"
19
// Forward declaration: this header only names GURL in function declarations,
// so the full class definition is not required here.
class GURL;

// Forward declaration: SBEntry is defined further down in this header, but
// SBChunkHost needs to hold a pointer to it before that point.
class SBEntry;
23
24// A truncated hash's type.
25typedef uint32 SBPrefix;
26
// Container for holding a chunk URL and the list it belongs to.
struct ChunkUrl {
  std::string url;        // The chunk URL.
  std::string list_name;  // Name of the list the chunk belongs to.
};
32
33// A full hash.
34union SBFullHash {
35  char full_hash[32];
36  SBPrefix prefix;
37};
38
39inline bool SBFullHashEqual(const SBFullHash& a, const SBFullHash& b) {
40  return !memcmp(a.full_hash, b.full_hash, sizeof(a.full_hash));
41}
42
43// Generate full hash for the given string.
44SBFullHash SBFullHashForString(const base::StringPiece& str);
45
46// Container for information about a specific host in an add/sub chunk.
47struct SBChunkHost {
48  SBPrefix host;
49  SBEntry* entry;
50};
51
52// Container for an add/sub chunk.
53struct SBChunk {
54  SBChunk();
55  ~SBChunk();
56
57  int chunk_number;
58  int list_id;
59  bool is_add;
60  std::deque<SBChunkHost> hosts;
61};
62
63// Container for a set of chunks.  Interim wrapper to replace use of
64// |std::deque<SBChunk>| with something having safer memory semantics.
65// management.
66// TODO(shess): |SBEntry| is currently a very roundabout way to hold
67// things pending storage.  It could be replaced with the structures
68// used in SafeBrowsingStore, then lots of bridging code could
69// dissappear.
70class SBChunkList {
71 public:
72  SBChunkList();
73  ~SBChunkList();
74
75  // Implement that subset of the |std::deque<>| interface which
76  // callers expect.
77  bool empty() const { return chunks_.empty(); }
78  size_t size() { return chunks_.size(); }
79
80  void push_back(const SBChunk& chunk) { chunks_.push_back(chunk); }
81  SBChunk& back() { return chunks_.back(); }
82  SBChunk& front() { return chunks_.front(); }
83  const SBChunk& front() const { return chunks_.front(); }
84
85  typedef std::vector<SBChunk>::const_iterator const_iterator;
86  const_iterator begin() const { return chunks_.begin(); }
87  const_iterator end() const { return chunks_.end(); }
88
89  typedef std::vector<SBChunk>::iterator iterator;
90  iterator begin() { return chunks_.begin(); }
91  iterator end() { return chunks_.end(); }
92
93  SBChunk& operator[](size_t n) { return chunks_[n]; }
94  const SBChunk& operator[](size_t n) const { return chunks_[n]; }
95
96  // Calls |SBEvent::Destroy()| before clearing |chunks_|.
97  void clear();
98
99 private:
100  std::vector<SBChunk> chunks_;
101
102  DISALLOW_COPY_AND_ASSIGN(SBChunkList);
103};
104
105// Used when we get a gethash response.
106struct SBFullHashResult {
107  SBFullHash hash;
108  std::string list_name;
109  int add_chunk_id;
110};
111
// Contains information about a list in the database.
struct SBListChunkRanges {
  // Explicit so that a bare std::string is never silently converted into a
  // SBListChunkRanges.
  // NOTE(review): |n| is presumably stored as |name| -- confirm in the .cc.
  explicit SBListChunkRanges(const std::string& n);

  std::string name;  // The list name.
  std::string adds;  // The ranges for add chunks.
  std::string subs;  // The ranges for sub chunks.
};
120
121// Container for deleting chunks from the database.
122struct SBChunkDelete {
123  SBChunkDelete();
124  ~SBChunkDelete();
125
126  std::string list_name;
127  bool is_sub_del;
128  std::vector<ChunkRange> chunk_del;
129};
130
// Different types of threats that SafeBrowsing protects against.  The
// enumerators take the default values 0..6 in declaration order; new entries
// should be appended so that existing values do not shift.
enum SBThreatType {
  // No threat at all.
  SB_THREAT_TYPE_SAFE,

  // The URL is being used for phishing.
  SB_THREAT_TYPE_URL_PHISHING,

  // The URL hosts malware.
  SB_THREAT_TYPE_URL_MALWARE,

  // The download URL is malware.
  SB_THREAT_TYPE_BINARY_MALWARE_URL,

  // Url detected by the client-side phishing model.  Note that unlike the
  // above values, this does not correspond to a downloaded list.
  SB_THREAT_TYPE_CLIENT_SIDE_PHISHING_URL,

  // The Chrome extension or app (given by its ID) is malware.
  SB_THREAT_TYPE_EXTENSION,

  // Url detected by the client-side malware IP list. This IP list is part
  // of the client side detection model.
  SB_THREAT_TYPE_CLIENT_SIDE_MALWARE_URL,
};
156
157// SBEntry ---------------------------------------------------------------------
158
159// Holds information about the prefixes for a hostkey.  prefixes can either be
160// 4 bytes (truncated hash) or 32 bytes (full hash).
161// For adds:
162//   [list id ][chunk id][prefix count (0..n)][prefix1][prefix2]
163// For subs:
164//   [list id ][chunk id (only used if prefix count is 0][prefix count (0..n)]
165//       [add chunk][prefix][add chunk][prefix]
166class SBEntry {
167 public:
168  enum Type {
169    ADD_PREFIX,     // 4 byte add entry.
170    SUB_PREFIX,     // 4 byte sub entry.
171    ADD_FULL_HASH,  // 32 byte add entry.
172    SUB_FULL_HASH,  // 32 byte sub entry.
173  };
174
175  // Creates a SBEntry with the necessary size for the given number of prefixes.
176  // Caller ownes the object and needs to free it by calling Destroy.
177  static SBEntry* Create(Type type, int prefix_count);
178
179  // Frees the entry's memory.
180  void Destroy();
181
182  void set_list_id(int list_id) { data_.list_id = list_id; }
183  int list_id() const { return data_.list_id; }
184  void set_chunk_id(int chunk_id) { data_.chunk_id = chunk_id; }
185  int chunk_id() const { return data_.chunk_id; }
186  int prefix_count() const { return data_.prefix_count; }
187
188  // Returns true if this is a prefix as opposed to a full hash.
189  bool IsPrefix() const {
190    return type() == ADD_PREFIX || type() == SUB_PREFIX;
191  }
192
193  // Returns true if this is an add entry.
194  bool IsAdd() const {
195    return type() == ADD_PREFIX || type() == ADD_FULL_HASH;
196  }
197
198  // Returns true if this is a sub entry.
199  bool IsSub() const {
200    return type() == SUB_PREFIX || type() == SUB_FULL_HASH;
201  }
202
203  // Helper to return the size of the prefixes.
204  int HashLen() const {
205    return IsPrefix() ? sizeof(SBPrefix) : sizeof(SBFullHash);
206  }
207
208  // For add entries, returns the add chunk id.  For sub entries, returns the
209  // add_chunk id for the prefix at the given index.
210  int ChunkIdAtPrefix(int index) const;
211
212  // Used for sub chunks to set the chunk id at a given index.
213  void SetChunkIdAtPrefix(int index, int chunk_id);
214
215  // Return the prefix/full hash at the given index.  Caller is expected to
216  // call the right function based on the hash length.
217  const SBPrefix& PrefixAt(int index) const;
218  const SBFullHash& FullHashAt(int index) const;
219
220  // Return the prefix/full hash at the given index.  Caller is expected to
221  // call the right function based on the hash length.
222  void SetPrefixAt(int index, const SBPrefix& prefix);
223  void SetFullHashAt(int index, const SBFullHash& full_hash);
224
225 private:
226  // Container for a sub prefix.
227  struct SBSubPrefix {
228    int add_chunk;
229    SBPrefix prefix;
230  };
231
232  // Container for a sub full hash.
233  struct SBSubFullHash {
234    int add_chunk;
235    SBFullHash prefix;
236  };
237
238  // Keep the fixed data together in one struct so that we can get its size
239  // easily.  If any of this is modified, the database will have to be cleared.
240  struct Data {
241    int list_id;
242    // For adds, this is the add chunk number.
243    // For subs: if prefix_count is 0 then this is the add chunk that this sub
244    //     refers to.  Otherwise it's ignored, and the add_chunk in sub_prefixes
245    //     or sub_full_hashes is used for each corresponding prefix.
246    int chunk_id;
247    Type type;
248    int prefix_count;
249  };
250
251  SBEntry();
252  ~SBEntry();
253
254  // Helper to return the size of each prefix entry (i.e. for subs this
255  // includes an add chunk id).
256  static int PrefixSize(Type type);
257
258  // Helper to return how much memory a given Entry would require.
259  static int Size(Type type, int prefix_count);
260
261  // Returns how many bytes this entry is.
262  int Size() const;
263
264  Type type() const { return data_.type; }
265
266  void set_prefix_count(int count) { data_.prefix_count = count; }
267  void set_type(Type type) { data_.type = type; }
268
269  // The prefixes union must follow the fixed data so that they're contiguous
270  // in memory.
271  Data data_;
272  union {
273    SBPrefix add_prefixes_[1];
274    SBSubPrefix sub_prefixes_[1];
275    SBFullHash add_full_hashes_[1];
276    SBSubFullHash sub_full_hashes_[1];
277  };
278};
279
280
281// Utility functions -----------------------------------------------------------
282
283namespace safe_browsing_util {
284
285// SafeBrowsing list names.
286extern const char kMalwareList[];
287extern const char kPhishingList[];
288// Binary Download list name.
289extern const char kBinUrlList[];
290// SafeBrowsing client-side detection whitelist list name.
291extern const char kCsdWhiteList[];
292// SafeBrowsing download whitelist list name.
293extern const char kDownloadWhiteList[];
294// SafeBrowsing extension list name.
295extern const char kExtensionBlacklist[];
296// SafeBrowsing side-effect free whitelist name.
297extern const char kSideEffectFreeWhitelist[];
298// SafeBrowsing csd malware IP blacklist name.
299extern const char kIPBlacklist[];
300
301// This array must contain all Safe Browsing lists.
302extern const char* kAllLists[8];
303
304enum ListType {
305  INVALID = -1,
306  MALWARE = 0,
307  PHISH = 1,
308  BINURL = 2,
309  // Obsolete BINHASH = 3,
310  CSDWHITELIST = 4,
311  // SafeBrowsing lists are stored in pairs.  Keep ListType 5
312  // available for a potential second list that we would store in the
313  // csd-whitelist store file.
314  DOWNLOADWHITELIST = 6,
315  // See above comment. Leave 7 available.
316  EXTENSIONBLACKLIST = 8,
317  // See above comment. Leave 9 available.
318  SIDEEFFECTFREEWHITELIST = 10,
319  // See above comment. Leave 11 available.
320  IPBLACKLIST = 12,
321  // See above comment.  Leave 13 available.
322};
323
324// Maps a list name to ListType.
325ListType GetListId(const std::string& name);
326
327// Maps a ListId to list name. Return false if fails.
328bool GetListName(ListType list_id, std::string* list);
329
330// Canonicalizes url as per Google Safe Browsing Specification.
331// See section 6.1 in
332// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
333void CanonicalizeUrl(const GURL& url, std::string* canonicalized_hostname,
334                     std::string* canonicalized_path,
335                     std::string* canonicalized_query);
336
337// Given a URL, returns all the hosts we need to check.  They are returned
338// in order of size (i.e. b.c is first, then a.b.c).
339void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts);
340
341// Given a URL, returns all the paths we need to check.
342void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths);
343
344// Given a URL, returns all the patterns we need to check.
345void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls);
346
347int GetHashIndex(const SBFullHash& hash,
348                 const std::vector<SBFullHashResult>& full_hashes);
349
350// Given a URL, compare all the possible host + path full hashes to the set of
351// provided full hashes.  Returns the index of the match if one is found, or -1
352// otherwise.
353int GetUrlHashIndex(const GURL& url,
354                    const std::vector<SBFullHashResult>& full_hashes);
355
356bool IsPhishingList(const std::string& list_name);
357bool IsMalwareList(const std::string& list_name);
358bool IsBadbinurlList(const std::string& list_name);
359bool IsExtensionList(const std::string& list_name);
360
361GURL GeneratePhishingReportUrl(const std::string& report_page,
362                               const std::string& url_to_report,
363                               bool is_client_side_detection);
364
365SBFullHash StringToSBFullHash(const std::string& hash_in);
366std::string SBFullHashToString(const SBFullHash& hash_out);
367
368}  // namespace safe_browsing_util
369
370#endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_UTIL_H_
371