// supervised_user_url_filter.cc revision 1320f92c476a1ad9d19dba2a48c72b75566198e9
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5#include "chrome/browser/supervised_user/supervised_user_url_filter.h"
6
7#include "base/containers/hash_tables.h"
8#include "base/files/file_path.h"
9#include "base/json/json_file_value_serializer.h"
10#include "base/metrics/histogram.h"
11#include "base/sha1.h"
12#include "base/strings/string_number_conversions.h"
13#include "base/strings/string_util.h"
14#include "base/task_runner_util.h"
15#include "base/threading/sequenced_worker_pool.h"
16#include "chrome/browser/supervised_user/experimental/supervised_user_blacklist.h"
17#include "components/policy/core/browser/url_blacklist_manager.h"
18#include "components/url_fixer/url_fixer.h"
19#include "components/url_matcher/url_matcher.h"
20#include "content/public/browser/browser_thread.h"
21#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
22#include "url/gurl.h"
23
24using content::BrowserThread;
25using net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES;
26using net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES;
27using net::registry_controlled_domains::GetRegistryLength;
28using policy::URLBlacklist;
29using url_matcher::URLMatcher;
30using url_matcher::URLMatcherConditionSet;
31
32struct SupervisedUserURLFilter::Contents {
33  URLMatcher url_matcher;
34  std::map<URLMatcherConditionSet::ID, int> matcher_site_map;
35  base::hash_multimap<std::string, int> hash_site_map;
36  std::vector<SupervisedUserSiteList::Site> sites;
37};
38
39namespace {
40
// URL schemes not in this list (e.g., file:// and chrome://) will always be
// allowed. Both the table and the strings it points to are immutable, so the
// set of filtered schemes cannot be changed at runtime.
const char* const kFilteredSchemes[] = {
  "http",
  "https",
  "ftp",
  "gopher",
  "ws",
  "wss"
};
51
52
53// This class encapsulates all the state that is required during construction of
54// a new SupervisedUserURLFilter::Contents.
55class FilterBuilder {
56 public:
57  FilterBuilder();
58  ~FilterBuilder();
59
60  // Adds a single URL pattern for the site identified by |site_id|.
61  bool AddPattern(const std::string& pattern, int site_id);
62
63  // Adds a single hostname SHA1 hash for the site identified by |site_id|.
64  void AddHostnameHash(const std::string& hash, int site_id);
65
66  // Adds all the sites in |site_list|, with URL patterns and hostname hashes.
67  void AddSiteList(SupervisedUserSiteList* site_list);
68
69  // Finalizes construction of the SupervisedUserURLFilter::Contents and returns
70  // them. This method should be called before this object is destroyed.
71  scoped_ptr<SupervisedUserURLFilter::Contents> Build();
72
73 private:
74  scoped_ptr<SupervisedUserURLFilter::Contents> contents_;
75  URLMatcherConditionSet::Vector all_conditions_;
76  URLMatcherConditionSet::ID matcher_id_;
77};
78
79FilterBuilder::FilterBuilder()
80    : contents_(new SupervisedUserURLFilter::Contents()),
81      matcher_id_(0) {}
82
83FilterBuilder::~FilterBuilder() {
84  DCHECK(!contents_.get());
85}
86
87bool FilterBuilder::AddPattern(const std::string& pattern, int site_id) {
88  DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
89  std::string scheme;
90  std::string host;
91  uint16 port;
92  std::string path;
93  std::string query;
94  bool match_subdomains = true;
95  URLBlacklist::SegmentURLCallback callback =
96      static_cast<URLBlacklist::SegmentURLCallback>(url_fixer::SegmentURL);
97  if (!URLBlacklist::FilterToComponents(
98          callback, pattern,
99          &scheme, &host, &match_subdomains, &port, &path, &query)) {
100    LOG(ERROR) << "Invalid pattern " << pattern;
101    return false;
102  }
103
104  scoped_refptr<URLMatcherConditionSet> condition_set =
105      URLBlacklist::CreateConditionSet(
106          &contents_->url_matcher, ++matcher_id_,
107          scheme, host, match_subdomains, port, path, query, true);
108  all_conditions_.push_back(condition_set);
109  contents_->matcher_site_map[matcher_id_] = site_id;
110  return true;
111}
112
113void FilterBuilder::AddHostnameHash(const std::string& hash, int site_id) {
114  contents_->hash_site_map.insert(std::make_pair(StringToUpperASCII(hash),
115                                                 site_id));
116}
117
118void FilterBuilder::AddSiteList(SupervisedUserSiteList* site_list) {
119  std::vector<SupervisedUserSiteList::Site> sites;
120  site_list->GetSites(&sites);
121  int site_id = contents_->sites.size();
122  for (std::vector<SupervisedUserSiteList::Site>::const_iterator it =
123           sites.begin(); it != sites.end(); ++it) {
124    const SupervisedUserSiteList::Site& site = *it;
125    contents_->sites.push_back(site);
126
127    for (std::vector<std::string>::const_iterator pattern_it =
128             site.patterns.begin();
129         pattern_it != site.patterns.end(); ++pattern_it) {
130      AddPattern(*pattern_it, site_id);
131    }
132
133    for (std::vector<std::string>::const_iterator hash_it =
134             site.hostname_hashes.begin();
135         hash_it != site.hostname_hashes.end(); ++hash_it) {
136      AddHostnameHash(*hash_it, site_id);
137    }
138
139    site_id++;
140  }
141}
142
143scoped_ptr<SupervisedUserURLFilter::Contents> FilterBuilder::Build() {
144  DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
145  contents_->url_matcher.AddConditionSets(all_conditions_);
146  return contents_.Pass();
147}
148
149scoped_ptr<SupervisedUserURLFilter::Contents> CreateWhitelistFromPatterns(
150    const std::vector<std::string>& patterns) {
151  DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
152
153  FilterBuilder builder;
154  for (std::vector<std::string>::const_iterator it = patterns.begin();
155       it != patterns.end(); ++it) {
156    // TODO(bauerb): We should create a fake site for the whitelist.
157    builder.AddPattern(*it, -1);
158  }
159
160  return builder.Build();
161}
162
163scoped_ptr<SupervisedUserURLFilter::Contents>
164LoadWhitelistsOnBlockingPoolThread(
165    ScopedVector<SupervisedUserSiteList> site_lists) {
166  DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
167
168  FilterBuilder builder;
169  for (ScopedVector<SupervisedUserSiteList>::iterator it = site_lists.begin();
170       it != site_lists.end(); ++it) {
171    builder.AddSiteList(*it);
172  }
173
174  return builder.Build();
175}
176
177}  // namespace
178
179SupervisedUserURLFilter::SupervisedUserURLFilter()
180    : default_behavior_(ALLOW),
181      contents_(new Contents()),
182      blacklist_(NULL) {
183  // Detach from the current thread so we can be constructed on a different
184  // thread than the one where we're used.
185  DetachFromThread();
186}
187
188SupervisedUserURLFilter::~SupervisedUserURLFilter() {
189  DCHECK(CalledOnValidThread());
190}
191
192// static
193SupervisedUserURLFilter::FilteringBehavior
194SupervisedUserURLFilter::BehaviorFromInt(int behavior_value) {
195  DCHECK_GE(behavior_value, ALLOW);
196  DCHECK_LE(behavior_value, BLOCK);
197  return static_cast<FilteringBehavior>(behavior_value);
198}
199
200// static
201GURL SupervisedUserURLFilter::Normalize(const GURL& url) {
202  GURL normalized_url = url;
203  GURL::Replacements replacements;
204  // Strip username, password, query, and ref.
205  replacements.ClearUsername();
206  replacements.ClearPassword();
207  replacements.ClearQuery();
208  replacements.ClearRef();
209  return url.ReplaceComponents(replacements);
210}
211
212// static
213bool SupervisedUserURLFilter::HasFilteredScheme(const GURL& url) {
214  for (size_t i = 0; i < arraysize(kFilteredSchemes); ++i) {
215    if (url.scheme() == kFilteredSchemes[i])
216      return true;
217  }
218  return false;
219}
220
221std::string GetHostnameHash(const GURL& url) {
222  std::string hash = base::SHA1HashString(url.host());
223  return base::HexEncode(hash.data(), hash.length());
224}
225
226// static
227bool SupervisedUserURLFilter::HostMatchesPattern(const std::string& host,
228                                                 const std::string& pattern) {
229  std::string trimmed_pattern = pattern;
230  std::string trimmed_host = host;
231  if (EndsWith(pattern, ".*", true)) {
232    size_t registry_length = GetRegistryLength(
233        trimmed_host, EXCLUDE_UNKNOWN_REGISTRIES, EXCLUDE_PRIVATE_REGISTRIES);
234    // A host without a known registry part does not match.
235    if (registry_length == 0)
236      return false;
237
238    trimmed_pattern.erase(trimmed_pattern.length() - 2);
239    trimmed_host.erase(trimmed_host.length() - (registry_length + 1));
240  }
241
242  if (StartsWithASCII(trimmed_pattern, "*.", true)) {
243    trimmed_pattern.erase(0, 2);
244
245    // The remaining pattern should be non-empty, and it should not contain
246    // further stars. Also the trimmed host needs to end with the trimmed
247    // pattern.
248    if (trimmed_pattern.empty() ||
249        trimmed_pattern.find('*') != std::string::npos ||
250        !EndsWith(trimmed_host, trimmed_pattern, true)) {
251      return false;
252    }
253
254    // The trimmed host needs to have a dot separating the subdomain from the
255    // matched pattern piece, unless there is no subdomain.
256    int pos = trimmed_host.length() - trimmed_pattern.length();
257    DCHECK_GE(pos, 0);
258    return (pos == 0) || (trimmed_host[pos - 1] == '.');
259  }
260
261  return trimmed_host == trimmed_pattern;
262}
263
264SupervisedUserURLFilter::FilteringBehavior
265SupervisedUserURLFilter::GetFilteringBehaviorForURL(const GURL& url) const {
266  DCHECK(CalledOnValidThread());
267
268  // URLs with a non-standard scheme (e.g. chrome://) are always allowed.
269  if (!HasFilteredScheme(url))
270    return ALLOW;
271
272  // Check manual overrides for the exact URL.
273  std::map<GURL, bool>::const_iterator url_it = url_map_.find(Normalize(url));
274  if (url_it != url_map_.end())
275    return url_it->second ? ALLOW : BLOCK;
276
277  // Check manual overrides for the hostname.
278  std::string host = url.host();
279  std::map<std::string, bool>::const_iterator host_it = host_map_.find(host);
280  if (host_it != host_map_.end())
281    return host_it->second ? ALLOW : BLOCK;
282
283  // Look for patterns matching the hostname, with a value that is different
284  // from the default (a value of true in the map meaning allowed).
285  for (std::map<std::string, bool>::const_iterator host_it =
286      host_map_.begin(); host_it != host_map_.end(); ++host_it) {
287    if ((host_it->second == (default_behavior_ == BLOCK)) &&
288        HostMatchesPattern(host, host_it->first)) {
289      return host_it->second ? ALLOW : BLOCK;
290    }
291  }
292
293  // If there's no blacklist and the default behavior is to allow, we don't need
294  // to check anything else.
295  if (!blacklist_ && default_behavior_ == ALLOW)
296    return ALLOW;
297
298  // Check the list of URL patterns.
299  std::set<URLMatcherConditionSet::ID> matching_ids =
300      contents_->url_matcher.MatchURL(url);
301  if (!matching_ids.empty())
302    return ALLOW;
303
304  // Check the list of hostname hashes.
305  if (contents_->hash_site_map.count(GetHostnameHash(url)))
306    return ALLOW;
307
308  // Check the static blacklist.
309  if (blacklist_ && blacklist_->HasURL(url))
310    return BLOCK;
311
312  // Fall back to the default behavior.
313  return default_behavior_;
314}
315
316void SupervisedUserURLFilter::GetSites(
317    const GURL& url,
318    std::vector<SupervisedUserSiteList::Site*>* sites) const {
319  std::set<URLMatcherConditionSet::ID> matching_ids =
320      contents_->url_matcher.MatchURL(url);
321  for (std::set<URLMatcherConditionSet::ID>::const_iterator it =
322           matching_ids.begin(); it != matching_ids.end(); ++it) {
323    std::map<URLMatcherConditionSet::ID, int>::const_iterator entry =
324        contents_->matcher_site_map.find(*it);
325    if (entry == contents_->matcher_site_map.end()) {
326      NOTREACHED();
327      continue;
328    }
329    sites->push_back(&contents_->sites[entry->second]);
330  }
331
332  typedef base::hash_multimap<std::string, int>::const_iterator
333      hash_site_map_iterator;
334  std::pair<hash_site_map_iterator, hash_site_map_iterator> bounds =
335      contents_->hash_site_map.equal_range(GetHostnameHash(url));
336  for (hash_site_map_iterator hash_it = bounds.first;
337       hash_it != bounds.second; hash_it++) {
338    sites->push_back(&contents_->sites[hash_it->second]);
339  }
340}
341
342void SupervisedUserURLFilter::SetDefaultFilteringBehavior(
343    FilteringBehavior behavior) {
344  DCHECK(CalledOnValidThread());
345  default_behavior_ = behavior;
346}
347
348void SupervisedUserURLFilter::LoadWhitelists(
349    ScopedVector<SupervisedUserSiteList> site_lists) {
350  DCHECK(CalledOnValidThread());
351
352  base::PostTaskAndReplyWithResult(
353      BrowserThread::GetBlockingPool(),
354      FROM_HERE,
355      base::Bind(&LoadWhitelistsOnBlockingPoolThread,
356                 base::Passed(&site_lists)),
357      base::Bind(&SupervisedUserURLFilter::SetContents, this));
358}
359
360void SupervisedUserURLFilter::SetBlacklist(SupervisedUserBlacklist* blacklist) {
361  blacklist_ = blacklist;
362}
363
364void SupervisedUserURLFilter::SetFromPatterns(
365    const std::vector<std::string>& patterns) {
366  DCHECK(CalledOnValidThread());
367
368  base::PostTaskAndReplyWithResult(
369      BrowserThread::GetBlockingPool(),
370      FROM_HERE,
371      base::Bind(&CreateWhitelistFromPatterns, patterns),
372      base::Bind(&SupervisedUserURLFilter::SetContents, this));
373}
374
375void SupervisedUserURLFilter::SetManualHosts(
376    const std::map<std::string, bool>* host_map) {
377  DCHECK(CalledOnValidThread());
378  host_map_ = *host_map;
379  UMA_HISTOGRAM_CUSTOM_COUNTS("ManagedMode.ManualHostsEntries",
380                              host_map->size(), 1, 1000, 50);
381}
382
383void SupervisedUserURLFilter::SetManualURLs(
384    const std::map<GURL, bool>* url_map) {
385  DCHECK(CalledOnValidThread());
386  url_map_ = *url_map;
387  UMA_HISTOGRAM_CUSTOM_COUNTS("ManagedMode.ManualURLsEntries",
388                              url_map->size(), 1, 1000, 50);
389}
390
391void SupervisedUserURLFilter::AddObserver(Observer* observer) {
392  observers_.AddObserver(observer);
393}
394
395void SupervisedUserURLFilter::RemoveObserver(Observer* observer) {
396  observers_.RemoveObserver(observer);
397}
398
399void SupervisedUserURLFilter::SetContents(scoped_ptr<Contents> contents) {
400  DCHECK(CalledOnValidThread());
401  contents_ = contents.Pass();
402  FOR_EACH_OBSERVER(Observer, observers_, OnSiteListUpdated());
403}
404