url_blacklist_manager.cc revision 010d83a9304c5a91596085d917d248abff47903a
1// Copyright 2014 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "components/policy/core/browser/url_blacklist_manager.h"
6
7#include "base/bind.h"
8#include "base/files/file_path.h"
9#include "base/location.h"
10#include "base/message_loop/message_loop_proxy.h"
11#include "base/prefs/pref_service.h"
12#include "base/sequenced_task_runner.h"
13#include "base/stl_util.h"
14#include "base/strings/string_number_conversions.h"
15#include "base/task_runner_util.h"
16#include "base/values.h"
17#include "components/policy/core/common/policy_pref_names.h"
18#include "components/user_prefs/pref_registry_syncable.h"
19#include "net/base/filename_util.h"
20#include "net/base/load_flags.h"
21#include "net/base/net_errors.h"
22#include "net/url_request/url_request.h"
23#include "url/url_parse.h"
24
25using url_matcher::URLMatcher;
26using url_matcher::URLMatcherCondition;
27using url_matcher::URLMatcherConditionFactory;
28using url_matcher::URLMatcherConditionSet;
29using url_matcher::URLMatcherPortFilter;
30using url_matcher::URLMatcherSchemeFilter;
31using url_matcher::URLQueryElementMatcherCondition;
32
33namespace policy {
34
35namespace {
36
// URL scheme that FilterToComponents() handles on a dedicated code path.
const char kFileScheme[] = "file";

// Maximum filters per policy. Filters over this index are ignored.
const size_t kMaxFiltersPerPolicy = 1000;
41
42// A task that builds the blacklist on a background thread.
43scoped_ptr<URLBlacklist> BuildBlacklist(
44    scoped_ptr<base::ListValue> block,
45    scoped_ptr<base::ListValue> allow,
46    URLBlacklist::SegmentURLCallback segment_url) {
47  scoped_ptr<URLBlacklist> blacklist(new URLBlacklist(segment_url));
48  blacklist->Block(block.get());
49  blacklist->Allow(allow.get());
50  return blacklist.Pass();
51}
52
53// Tokenise the parameter |query| and add appropriate query element matcher
54// conditions to the |query_conditions|.
55void ProcessQueryToConditions(
56    url_matcher::URLMatcherConditionFactory* condition_factory,
57    const std::string& query,
58    bool allow,
59    std::set<URLQueryElementMatcherCondition>* query_conditions) {
60  url::Component query_left = url::MakeRange(0, query.length());
61  url::Component key;
62  url::Component value;
63  // Depending on the filter type being black-list or white-list, the matcher
64  // choose any or every match. The idea is a URL should be black-listed if
65  // there is any occurrence of the key value pair. It should be white-listed
66  // only if every occurrence of the key is followed by the value. This avoids
67  // situations such as a user appending a white-listed video parameter in the
68  // end of the query and watching a video of his choice (the last parameter is
69  // ignored by some web servers like youtube's).
70  URLQueryElementMatcherCondition::Type match_type =
71      allow ? URLQueryElementMatcherCondition::MATCH_ALL
72            : URLQueryElementMatcherCondition::MATCH_ANY;
73
74  while (ExtractQueryKeyValue(query.data(), &query_left, &key, &value)) {
75    URLQueryElementMatcherCondition::QueryElementType query_element_type =
76        value.len ? URLQueryElementMatcherCondition::ELEMENT_TYPE_KEY_VALUE
77                  : URLQueryElementMatcherCondition::ELEMENT_TYPE_KEY;
78    URLQueryElementMatcherCondition::QueryValueMatchType query_value_match_type;
79    if (!value.len && key.len && query[key.end() - 1] == '*') {
80      --key.len;
81      query_value_match_type =
82          URLQueryElementMatcherCondition::QUERY_VALUE_MATCH_PREFIX;
83    } else if (value.len && query[value.end() - 1] == '*') {
84      --value.len;
85      query_value_match_type =
86          URLQueryElementMatcherCondition::QUERY_VALUE_MATCH_PREFIX;
87    } else {
88      query_value_match_type =
89          URLQueryElementMatcherCondition::QUERY_VALUE_MATCH_EXACT;
90    }
91    query_conditions->insert(
92        URLQueryElementMatcherCondition(query.substr(key.begin, key.len),
93                                        query.substr(value.begin, value.len),
94                                        query_value_match_type,
95                                        query_element_type,
96                                        match_type,
97                                        condition_factory));
98  }
99}
100
101}  // namespace
102
103struct URLBlacklist::FilterComponents {
104  FilterComponents() : port(0), match_subdomains(true), allow(true) {}
105  ~FilterComponents() {}
106
107  std::string scheme;
108  std::string host;
109  uint16 port;
110  std::string path;
111  std::string query;
112  int number_of_key_value_pairs;
113  bool match_subdomains;
114  bool allow;
115};
116
// Creates an empty blacklist with its own URLMatcher. |segment_url| is stored
// and later used by AddFilters() to parse each filter pattern.
URLBlacklist::URLBlacklist(SegmentURLCallback segment_url)
    : segment_url_(segment_url), id_(0), url_matcher_(new URLMatcher) {}
119
// No explicit teardown is performed here.
URLBlacklist::~URLBlacklist() {}
121
122void URLBlacklist::AddFilters(bool allow,
123                              const base::ListValue* list) {
124  URLMatcherConditionSet::Vector all_conditions;
125  size_t size = std::min(kMaxFiltersPerPolicy, list->GetSize());
126  for (size_t i = 0; i < size; ++i) {
127    std::string pattern;
128    bool success = list->GetString(i, &pattern);
129    DCHECK(success);
130    FilterComponents components;
131    components.allow = allow;
132    if (!FilterToComponents(segment_url_,
133                            pattern,
134                            &components.scheme,
135                            &components.host,
136                            &components.match_subdomains,
137                            &components.port,
138                            &components.path,
139                            &components.query)) {
140      LOG(ERROR) << "Invalid pattern " << pattern;
141      continue;
142    }
143
144    scoped_refptr<URLMatcherConditionSet> condition_set =
145        CreateConditionSet(url_matcher_.get(),
146                           ++id_,
147                           components.scheme,
148                           components.host,
149                           components.match_subdomains,
150                           components.port,
151                           components.path,
152                           components.query,
153                           allow);
154    components.number_of_key_value_pairs =
155        condition_set->query_conditions().size();
156    all_conditions.push_back(condition_set);
157    filters_[id_] = components;
158  }
159  url_matcher_->AddConditionSets(all_conditions);
160}
161
162void URLBlacklist::Block(const base::ListValue* filters) {
163  AddFilters(false, filters);
164}
165
166void URLBlacklist::Allow(const base::ListValue* filters) {
167  AddFilters(true, filters);
168}
169
170bool URLBlacklist::IsURLBlocked(const GURL& url) const {
171  std::set<URLMatcherConditionSet::ID> matching_ids =
172      url_matcher_->MatchURL(url);
173
174  const FilterComponents* max = NULL;
175  for (std::set<URLMatcherConditionSet::ID>::iterator id = matching_ids.begin();
176       id != matching_ids.end(); ++id) {
177    std::map<int, FilterComponents>::const_iterator it = filters_.find(*id);
178    DCHECK(it != filters_.end());
179    const FilterComponents& filter = it->second;
180    if (!max || FilterTakesPrecedence(filter, *max))
181      max = &filter;
182  }
183
184  // Default to allow.
185  if (!max)
186    return false;
187
188  return !max->allow;
189}
190
// Returns the number of filters currently stored in |filters_|; this counts
// both block and allow entries, since AddFilters() stores both there.
size_t URLBlacklist::Size() const {
  return filters_.size();
}
194
195// static
196bool URLBlacklist::FilterToComponents(SegmentURLCallback segment_url,
197                                      const std::string& filter,
198                                      std::string* scheme,
199                                      std::string* host,
200                                      bool* match_subdomains,
201                                      uint16* port,
202                                      std::string* path,
203                                      std::string* query) {
204  url::Parsed parsed;
205
206  if (segment_url(filter, &parsed) == kFileScheme) {
207    base::FilePath file_path;
208    if (!net::FileURLToFilePath(GURL(filter), &file_path))
209      return false;
210
211    *scheme = kFileScheme;
212    host->clear();
213    *match_subdomains = true;
214    *port = 0;
215    // Special path when the |filter| is 'file://*'.
216    *path = (filter == "file://*") ? "" : file_path.AsUTF8Unsafe();
217#if defined(FILE_PATH_USES_WIN_SEPARATORS)
218    // Separators have to be canonicalized on Windows.
219    std::replace(path->begin(), path->end(), '\\', '/');
220    *path = "/" + *path;
221#endif
222    return true;
223  }
224
225  if (!parsed.host.is_nonempty())
226    return false;
227
228  if (parsed.scheme.is_nonempty())
229    scheme->assign(filter, parsed.scheme.begin, parsed.scheme.len);
230  else
231    scheme->clear();
232
233  host->assign(filter, parsed.host.begin, parsed.host.len);
234  // Special '*' host, matches all hosts.
235  if (*host == "*") {
236    host->clear();
237    *match_subdomains = true;
238  } else if ((*host)[0] == '.') {
239    // A leading dot in the pattern syntax means that we don't want to match
240    // subdomains.
241    host->erase(0, 1);
242    *match_subdomains = false;
243  } else {
244    url::RawCanonOutputT<char> output;
245    url::CanonHostInfo host_info;
246    url::CanonicalizeHostVerbose(filter.c_str(), parsed.host, &output,
247                                 &host_info);
248    if (host_info.family == url::CanonHostInfo::NEUTRAL) {
249      // We want to match subdomains. Add a dot in front to make sure we only
250      // match at domain component boundaries.
251      *host = "." + *host;
252      *match_subdomains = true;
253    } else {
254      *match_subdomains = false;
255    }
256  }
257
258  if (parsed.port.is_nonempty()) {
259    int int_port;
260    if (!base::StringToInt(filter.substr(parsed.port.begin, parsed.port.len),
261                           &int_port)) {
262      return false;
263    }
264    if (int_port <= 0 || int_port > kuint16max)
265      return false;
266    *port = int_port;
267  } else {
268    // Match any port.
269    *port = 0;
270  }
271
272  if (parsed.path.is_nonempty())
273    path->assign(filter, parsed.path.begin, parsed.path.len);
274  else
275    path->clear();
276
277  if (query) {
278    if (parsed.query.is_nonempty())
279      query->assign(filter, parsed.query.begin, parsed.query.len);
280    else
281      query->clear();
282  }
283
284  return true;
285}
286
287// static
288scoped_refptr<URLMatcherConditionSet> URLBlacklist::CreateConditionSet(
289    URLMatcher* url_matcher,
290    int id,
291    const std::string& scheme,
292    const std::string& host,
293    bool match_subdomains,
294    uint16 port,
295    const std::string& path,
296    const std::string& query,
297    bool allow) {
298  URLMatcherConditionFactory* condition_factory =
299      url_matcher->condition_factory();
300  std::set<URLMatcherCondition> conditions;
301  conditions.insert(match_subdomains ?
302      condition_factory->CreateHostSuffixPathPrefixCondition(host, path) :
303      condition_factory->CreateHostEqualsPathPrefixCondition(host, path));
304
305  std::set<URLQueryElementMatcherCondition> query_conditions;
306  if (!query.empty()) {
307    ProcessQueryToConditions(
308        condition_factory, query, allow, &query_conditions);
309  }
310
311  scoped_ptr<URLMatcherSchemeFilter> scheme_filter;
312  if (!scheme.empty())
313    scheme_filter.reset(new URLMatcherSchemeFilter(scheme));
314
315  scoped_ptr<URLMatcherPortFilter> port_filter;
316  if (port != 0) {
317    std::vector<URLMatcherPortFilter::Range> ranges;
318    ranges.push_back(URLMatcherPortFilter::CreateRange(port));
319    port_filter.reset(new URLMatcherPortFilter(ranges));
320  }
321
322  return new URLMatcherConditionSet(id,
323                                    conditions,
324                                    query_conditions,
325                                    scheme_filter.Pass(),
326                                    port_filter.Pass());
327}
328
329// static
330bool URLBlacklist::FilterTakesPrecedence(const FilterComponents& lhs,
331                                         const FilterComponents& rhs) {
332  if (lhs.match_subdomains && !rhs.match_subdomains)
333    return false;
334  if (!lhs.match_subdomains && rhs.match_subdomains)
335    return true;
336
337  size_t host_length = lhs.host.length();
338  size_t other_host_length = rhs.host.length();
339  if (host_length != other_host_length)
340    return host_length > other_host_length;
341
342  size_t path_length = lhs.path.length();
343  size_t other_path_length = rhs.path.length();
344  if (path_length != other_path_length)
345    return path_length > other_path_length;
346
347  if (lhs.number_of_key_value_pairs != rhs.number_of_key_value_pairs)
348    return lhs.number_of_key_value_pairs > rhs.number_of_key_value_pairs;
349
350  if (lhs.allow && !rhs.allow)
351    return true;
352
353  return false;
354}
355
356URLBlacklistManager::URLBlacklistManager(
357    PrefService* pref_service,
358    const scoped_refptr<base::SequencedTaskRunner>& background_task_runner,
359    const scoped_refptr<base::SequencedTaskRunner>& io_task_runner,
360    URLBlacklist::SegmentURLCallback segment_url,
361    OverrideBlacklistCallback override_blacklist)
362    : ui_weak_ptr_factory_(this),
363      pref_service_(pref_service),
364      background_task_runner_(background_task_runner),
365      io_task_runner_(io_task_runner),
366      segment_url_(segment_url),
367      override_blacklist_(override_blacklist),
368      io_weak_ptr_factory_(this),
369      ui_task_runner_(base::MessageLoopProxy::current()),
370      blacklist_(new URLBlacklist(segment_url)) {
371  pref_change_registrar_.Init(pref_service_);
372  base::Closure callback = base::Bind(&URLBlacklistManager::ScheduleUpdate,
373                                      base::Unretained(this));
374  pref_change_registrar_.Add(policy_prefs::kUrlBlacklist, callback);
375  pref_change_registrar_.Add(policy_prefs::kUrlWhitelist, callback);
376
377  // Start enforcing the policies without a delay when they are present at
378  // startup.
379  if (pref_service_->HasPrefPath(policy_prefs::kUrlBlacklist))
380    Update();
381}
382
// UI-thread half of the shutdown (the thread is DCHECKed): drops any update
// that ScheduleUpdate() posted but that has not run yet, and unregisters the
// pref observers added in the constructor.
void URLBlacklistManager::ShutdownOnUIThread() {
  DCHECK(ui_task_runner_->RunsTasksOnCurrentThread());
  // Cancel any pending updates, and stop listening for pref change updates.
  ui_weak_ptr_factory_.InvalidateWeakPtrs();
  pref_change_registrar_.RemoveAll();
}
389
// No explicit teardown is performed here; the UI-thread cleanup lives in
// ShutdownOnUIThread().
URLBlacklistManager::~URLBlacklistManager() {
}
392
393void URLBlacklistManager::ScheduleUpdate() {
394  DCHECK(ui_task_runner_->RunsTasksOnCurrentThread());
395  // Cancel pending updates, if any. This can happen if two preferences that
396  // change the blacklist are updated in one message loop cycle. In those cases,
397  // only rebuild the blacklist after all the preference updates are processed.
398  ui_weak_ptr_factory_.InvalidateWeakPtrs();
399  ui_task_runner_->PostTask(
400      FROM_HERE,
401      base::Bind(&URLBlacklistManager::Update,
402                 ui_weak_ptr_factory_.GetWeakPtr()));
403}
404
405void URLBlacklistManager::Update() {
406  DCHECK(ui_task_runner_->RunsTasksOnCurrentThread());
407
408  // The preferences can only be read on the UI thread.
409  scoped_ptr<base::ListValue> block(
410      pref_service_->GetList(policy_prefs::kUrlBlacklist)->DeepCopy());
411  scoped_ptr<base::ListValue> allow(
412      pref_service_->GetList(policy_prefs::kUrlWhitelist)->DeepCopy());
413
414  // Go through the IO thread to grab a WeakPtr to |this|. This is safe from
415  // here, since this task will always execute before a potential deletion of
416  // ProfileIOData on IO.
417  io_task_runner_->PostTask(FROM_HERE,
418                            base::Bind(&URLBlacklistManager::UpdateOnIO,
419                                       base::Unretained(this),
420                                       base::Passed(&block),
421                                       base::Passed(&allow)));
422}
423
424void URLBlacklistManager::UpdateOnIO(scoped_ptr<base::ListValue> block,
425                                     scoped_ptr<base::ListValue> allow) {
426  DCHECK(io_task_runner_->RunsTasksOnCurrentThread());
427  // The URLBlacklist is built on a worker thread. Once it's ready, it is passed
428  // to the URLBlacklistManager on IO.
429  base::PostTaskAndReplyWithResult(
430      background_task_runner_,
431      FROM_HERE,
432      base::Bind(&BuildBlacklist,
433                 base::Passed(&block),
434                 base::Passed(&allow),
435                 segment_url_),
436      base::Bind(&URLBlacklistManager::SetBlacklist,
437                 io_weak_ptr_factory_.GetWeakPtr()));
438}
439
// Reply half of the background build started in UpdateOnIO(): takes ownership
// of the freshly built |blacklist| and makes it the active one. Runs on the
// IO thread (DCHECKed).
void URLBlacklistManager::SetBlacklist(scoped_ptr<URLBlacklist> blacklist) {
  DCHECK(io_task_runner_->RunsTasksOnCurrentThread());
  blacklist_ = blacklist.Pass();
}
444
// Returns whether the currently active blacklist blocks |url|. Must be called
// on the IO thread (DCHECKed).
bool URLBlacklistManager::IsURLBlocked(const GURL& url) const {
  DCHECK(io_task_runner_->RunsTasksOnCurrentThread());
  return blacklist_->IsURLBlocked(url);
}
449
450bool URLBlacklistManager::IsRequestBlocked(
451    const net::URLRequest& request, int* reason) const {
452  DCHECK(io_task_runner_->RunsTasksOnCurrentThread());
453#if !defined(OS_IOS)
454  // TODO(joaodasilva): iOS doesn't set these flags. http://crbug.com/338283
455  int filter_flags = net::LOAD_MAIN_FRAME | net::LOAD_SUB_FRAME;
456  if ((request.load_flags() & filter_flags) == 0)
457    return false;
458#endif
459
460  bool block = false;
461  if (override_blacklist_.Run(request.url(), &block, reason))
462    return block;
463
464  *reason = net::ERR_BLOCKED_BY_ADMINISTRATOR;
465  return IsURLBlocked(request.url());
466}
467
// Registers the blacklist and whitelist list prefs with |registry|; both are
// registered as unsyncable.
// static
void URLBlacklistManager::RegisterProfilePrefs(
    user_prefs::PrefRegistrySyncable* registry) {
  registry->RegisterListPref(policy_prefs::kUrlBlacklist,
                             user_prefs::PrefRegistrySyncable::UNSYNCABLE_PREF);
  registry->RegisterListPref(policy_prefs::kUrlWhitelist,
                             user_prefs::PrefRegistrySyncable::UNSYNCABLE_PREF);
}
476
477}  // namespace policy
478