supervised_user_url_filter.cc revision f8ee788a64d60abd8f2d742a5fdedde054ecd910
1// Copyright 2014 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/supervised_user/supervised_user_url_filter.h"
6
7#include "base/containers/hash_tables.h"
8#include "base/files/file_path.h"
9#include "base/json/json_file_value_serializer.h"
10#include "base/metrics/histogram.h"
11#include "base/sha1.h"
12#include "base/strings/string_number_conversions.h"
13#include "base/strings/string_util.h"
14#include "base/task_runner_util.h"
15#include "base/threading/sequenced_worker_pool.h"
16#include "components/policy/core/browser/url_blacklist_manager.h"
17#include "components/url_fixer/url_fixer.h"
18#include "components/url_matcher/url_matcher.h"
19#include "content/public/browser/browser_thread.h"
20#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
21#include "url/gurl.h"
22
23using content::BrowserThread;
24using net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES;
25using net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES;
26using net::registry_controlled_domains::GetRegistryLength;
27using policy::URLBlacklist;
28using url_matcher::URLMatcher;
29using url_matcher::URLMatcherConditionSet;
30
31struct SupervisedUserURLFilter::Contents {
32  URLMatcher url_matcher;
33  std::map<URLMatcherConditionSet::ID, int> matcher_site_map;
34  base::hash_multimap<std::string, int> hash_site_map;
35  std::vector<SupervisedUserSiteList::Site> sites;
36};
37
38namespace {
39
// Schemes that are subject to filtering. URL schemes not in this list (e.g.,
// file:// and chrome://) will always be allowed.
// Both the strings and the array slots are const, so the table cannot be
// modified at runtime.
const char* const kFilteredSchemes[] = {
  "http",
  "https",
  "ftp",
  "gopher",
  "ws",
  "wss"
};
50
51
52// This class encapsulates all the state that is required during construction of
53// a new SupervisedUserURLFilter::Contents.
54class FilterBuilder {
55 public:
56  FilterBuilder();
57  ~FilterBuilder();
58
59  // Adds a single URL pattern for the site identified by |site_id|.
60  bool AddPattern(const std::string& pattern, int site_id);
61
62  // Adds a single hostname SHA1 hash for the site identified by |site_id|.
63  void AddHostnameHash(const std::string& hash, int site_id);
64
65  // Adds all the sites in |site_list|, with URL patterns and hostname hashes.
66  void AddSiteList(SupervisedUserSiteList* site_list);
67
68  // Finalizes construction of the SupervisedUserURLFilter::Contents and returns
69  // them. This method should be called before this object is destroyed.
70  scoped_ptr<SupervisedUserURLFilter::Contents> Build();
71
72 private:
73  scoped_ptr<SupervisedUserURLFilter::Contents> contents_;
74  URLMatcherConditionSet::Vector all_conditions_;
75  URLMatcherConditionSet::ID matcher_id_;
76};
77
78FilterBuilder::FilterBuilder()
79    : contents_(new SupervisedUserURLFilter::Contents()),
80      matcher_id_(0) {}
81
82FilterBuilder::~FilterBuilder() {
83  DCHECK(!contents_.get());
84}
85
86bool FilterBuilder::AddPattern(const std::string& pattern, int site_id) {
87  DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
88  std::string scheme;
89  std::string host;
90  uint16 port;
91  std::string path;
92  std::string query;
93  bool match_subdomains = true;
94  URLBlacklist::SegmentURLCallback callback =
95      static_cast<URLBlacklist::SegmentURLCallback>(url_fixer::SegmentURL);
96  if (!URLBlacklist::FilterToComponents(
97          callback, pattern,
98          &scheme, &host, &match_subdomains, &port, &path, &query)) {
99    LOG(ERROR) << "Invalid pattern " << pattern;
100    return false;
101  }
102
103  scoped_refptr<URLMatcherConditionSet> condition_set =
104      URLBlacklist::CreateConditionSet(
105          &contents_->url_matcher, ++matcher_id_,
106          scheme, host, match_subdomains, port, path, query, true);
107  all_conditions_.push_back(condition_set);
108  contents_->matcher_site_map[matcher_id_] = site_id;
109  return true;
110}
111
112void FilterBuilder::AddHostnameHash(const std::string& hash, int site_id) {
113  contents_->hash_site_map.insert(std::make_pair(StringToUpperASCII(hash),
114                                                 site_id));
115}
116
117void FilterBuilder::AddSiteList(SupervisedUserSiteList* site_list) {
118  std::vector<SupervisedUserSiteList::Site> sites;
119  site_list->GetSites(&sites);
120  int site_id = contents_->sites.size();
121  for (std::vector<SupervisedUserSiteList::Site>::const_iterator it =
122           sites.begin(); it != sites.end(); ++it) {
123    const SupervisedUserSiteList::Site& site = *it;
124    contents_->sites.push_back(site);
125
126    for (std::vector<std::string>::const_iterator pattern_it =
127             site.patterns.begin();
128         pattern_it != site.patterns.end(); ++pattern_it) {
129      AddPattern(*pattern_it, site_id);
130    }
131
132    for (std::vector<std::string>::const_iterator hash_it =
133             site.hostname_hashes.begin();
134         hash_it != site.hostname_hashes.end(); ++hash_it) {
135      AddHostnameHash(*hash_it, site_id);
136    }
137
138    site_id++;
139  }
140}
141
142scoped_ptr<SupervisedUserURLFilter::Contents> FilterBuilder::Build() {
143  DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
144  contents_->url_matcher.AddConditionSets(all_conditions_);
145  return contents_.Pass();
146}
147
148scoped_ptr<SupervisedUserURLFilter::Contents> CreateWhitelistFromPatterns(
149    const std::vector<std::string>& patterns) {
150  DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
151
152  FilterBuilder builder;
153  for (std::vector<std::string>::const_iterator it = patterns.begin();
154       it != patterns.end(); ++it) {
155    // TODO(bauerb): We should create a fake site for the whitelist.
156    builder.AddPattern(*it, -1);
157  }
158
159  return builder.Build();
160}
161
162scoped_ptr<SupervisedUserURLFilter::Contents>
163LoadWhitelistsOnBlockingPoolThread(
164    ScopedVector<SupervisedUserSiteList> site_lists) {
165  DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
166
167  FilterBuilder builder;
168  for (ScopedVector<SupervisedUserSiteList>::iterator it = site_lists.begin();
169       it != site_lists.end(); ++it) {
170    builder.AddSiteList(*it);
171  }
172
173  return builder.Build();
174}
175
176}  // namespace
177
178SupervisedUserURLFilter::SupervisedUserURLFilter()
179    : default_behavior_(ALLOW),
180      contents_(new Contents()) {
181  // Detach from the current thread so we can be constructed on a different
182  // thread than the one where we're used.
183  DetachFromThread();
184}
185
186SupervisedUserURLFilter::~SupervisedUserURLFilter() {
187  DCHECK(CalledOnValidThread());
188}
189
190// static
191SupervisedUserURLFilter::FilteringBehavior
192SupervisedUserURLFilter::BehaviorFromInt(int behavior_value) {
193  DCHECK_GE(behavior_value, ALLOW);
194  DCHECK_LE(behavior_value, BLOCK);
195  return static_cast<FilteringBehavior>(behavior_value);
196}
197
198// static
199GURL SupervisedUserURLFilter::Normalize(const GURL& url) {
200  GURL normalized_url = url;
201  GURL::Replacements replacements;
202  // Strip username, password, query, and ref.
203  replacements.ClearUsername();
204  replacements.ClearPassword();
205  replacements.ClearQuery();
206  replacements.ClearRef();
207  return url.ReplaceComponents(replacements);
208}
209
210// static
211bool SupervisedUserURLFilter::HasFilteredScheme(const GURL& url) {
212  for (size_t i = 0; i < arraysize(kFilteredSchemes); ++i) {
213      if (url.scheme() == kFilteredSchemes[i])
214        return true;
215    }
216  return false;
217}
218
219std::string GetHostnameHash(const GURL& url) {
220  std::string hash = base::SHA1HashString(url.host());
221  return base::HexEncode(hash.data(), hash.length());
222}
223
224// static
225bool SupervisedUserURLFilter::HostMatchesPattern(const std::string& host,
226                                                 const std::string& pattern) {
227  std::string trimmed_pattern = pattern;
228  std::string trimmed_host = host;
229  if (EndsWith(pattern, ".*", true)) {
230    size_t registry_length = GetRegistryLength(
231        trimmed_host, EXCLUDE_UNKNOWN_REGISTRIES, EXCLUDE_PRIVATE_REGISTRIES);
232    // A host without a known registry part does not match.
233    if (registry_length == 0)
234      return false;
235
236    trimmed_pattern.erase(trimmed_pattern.length() - 2);
237    trimmed_host.erase(trimmed_host.length() - (registry_length + 1));
238  }
239
240  if (StartsWithASCII(trimmed_pattern, "*.", true)) {
241    trimmed_pattern.erase(0, 2);
242
243    // The remaining pattern should be non-empty, and it should not contain
244    // further stars. Also the trimmed host needs to end with the trimmed
245    // pattern.
246    if (trimmed_pattern.empty() ||
247        trimmed_pattern.find('*') != std::string::npos ||
248        !EndsWith(trimmed_host, trimmed_pattern, true)) {
249      return false;
250    }
251
252    // The trimmed host needs to have a dot separating the subdomain from the
253    // matched pattern piece, unless there is no subdomain.
254    int pos = trimmed_host.length() - trimmed_pattern.length();
255    DCHECK_GE(pos, 0);
256    return (pos == 0) || (trimmed_host[pos - 1] == '.');
257  }
258
259  return trimmed_host == trimmed_pattern;
260}
261
262SupervisedUserURLFilter::FilteringBehavior
263SupervisedUserURLFilter::GetFilteringBehaviorForURL(const GURL& url) const {
264  DCHECK(CalledOnValidThread());
265
266  // URLs with a non-standard scheme (e.g. chrome://) are always allowed.
267  if (!HasFilteredScheme(url))
268    return ALLOW;
269
270  // Check manual overrides for the exact URL.
271  std::map<GURL, bool>::const_iterator url_it = url_map_.find(Normalize(url));
272  if (url_it != url_map_.end())
273    return url_it->second ? ALLOW : BLOCK;
274
275  // Check manual overrides for the hostname.
276  std::string host = url.host();
277  std::map<std::string, bool>::const_iterator host_it = host_map_.find(host);
278  if (host_it != host_map_.end())
279    return host_it->second ? ALLOW : BLOCK;
280
281  // Look for patterns matching the hostname, with a value that is different
282  // from the default (a value of true in the map meaning allowed).
283  for (std::map<std::string, bool>::const_iterator host_it =
284      host_map_.begin(); host_it != host_map_.end(); ++host_it) {
285    if ((host_it->second == (default_behavior_ == BLOCK)) &&
286        HostMatchesPattern(host, host_it->first)) {
287      return host_it->second ? ALLOW : BLOCK;
288    }
289  }
290
291  // If the default behavior is to allow, we don't need to check anything else.
292  if (default_behavior_ == ALLOW)
293    return ALLOW;
294
295  // Check the list of URL patterns.
296  std::set<URLMatcherConditionSet::ID> matching_ids =
297      contents_->url_matcher.MatchURL(url);
298  if (!matching_ids.empty())
299    return ALLOW;
300
301  // Check the list of hostname hashes.
302  if (contents_->hash_site_map.count(GetHostnameHash(url)))
303    return ALLOW;
304
305  // Fall back to the default behavior.
306  return default_behavior_;
307}
308
309void SupervisedUserURLFilter::GetSites(
310    const GURL& url,
311    std::vector<SupervisedUserSiteList::Site*>* sites) const {
312  std::set<URLMatcherConditionSet::ID> matching_ids =
313      contents_->url_matcher.MatchURL(url);
314  for (std::set<URLMatcherConditionSet::ID>::const_iterator it =
315           matching_ids.begin(); it != matching_ids.end(); ++it) {
316    std::map<URLMatcherConditionSet::ID, int>::const_iterator entry =
317        contents_->matcher_site_map.find(*it);
318    if (entry == contents_->matcher_site_map.end()) {
319      NOTREACHED();
320      continue;
321    }
322    sites->push_back(&contents_->sites[entry->second]);
323  }
324
325  typedef base::hash_multimap<std::string, int>::const_iterator
326      hash_site_map_iterator;
327  std::pair<hash_site_map_iterator, hash_site_map_iterator> bounds =
328      contents_->hash_site_map.equal_range(GetHostnameHash(url));
329  for (hash_site_map_iterator hash_it = bounds.first;
330       hash_it != bounds.second; hash_it++) {
331    sites->push_back(&contents_->sites[hash_it->second]);
332  }
333}
334
335void SupervisedUserURLFilter::SetDefaultFilteringBehavior(
336    FilteringBehavior behavior) {
337  DCHECK(CalledOnValidThread());
338  default_behavior_ = behavior;
339}
340
341void SupervisedUserURLFilter::LoadWhitelists(
342    ScopedVector<SupervisedUserSiteList> site_lists) {
343  DCHECK(CalledOnValidThread());
344
345  base::PostTaskAndReplyWithResult(
346      BrowserThread::GetBlockingPool(),
347      FROM_HERE,
348      base::Bind(&LoadWhitelistsOnBlockingPoolThread,
349                 base::Passed(&site_lists)),
350      base::Bind(&SupervisedUserURLFilter::SetContents, this));
351}
352
353void SupervisedUserURLFilter::SetFromPatterns(
354    const std::vector<std::string>& patterns) {
355  DCHECK(CalledOnValidThread());
356
357  base::PostTaskAndReplyWithResult(
358      BrowserThread::GetBlockingPool(),
359      FROM_HERE,
360      base::Bind(&CreateWhitelistFromPatterns, patterns),
361      base::Bind(&SupervisedUserURLFilter::SetContents, this));
362}
363
364void SupervisedUserURLFilter::SetManualHosts(
365    const std::map<std::string, bool>* host_map) {
366  DCHECK(CalledOnValidThread());
367  host_map_ = *host_map;
368  UMA_HISTOGRAM_CUSTOM_COUNTS("ManagedMode.ManualHostsEntries",
369                              host_map->size(), 1, 1000, 50);
370}
371
372void SupervisedUserURLFilter::SetManualURLs(
373    const std::map<GURL, bool>* url_map) {
374  DCHECK(CalledOnValidThread());
375  url_map_ = *url_map;
376  UMA_HISTOGRAM_CUSTOM_COUNTS("ManagedMode.ManualURLsEntries",
377                              url_map->size(), 1, 1000, 50);
378}
379
380void SupervisedUserURLFilter::AddObserver(Observer* observer) {
381  observers_.AddObserver(observer);
382}
383
384void SupervisedUserURLFilter::RemoveObserver(Observer* observer) {
385  observers_.RemoveObserver(observer);
386}
387
388void SupervisedUserURLFilter::SetContents(scoped_ptr<Contents> contents) {
389  DCHECK(CalledOnValidThread());
390  contents_ = contents.Pass();
391  FOR_EACH_OBSERVER(Observer, observers_, OnSiteListUpdated());
392}
393