1// Copyright 2014 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "components/policy/core/browser/url_blacklist_manager.h"
6
7#include "base/bind.h"
8#include "base/files/file_path.h"
9#include "base/location.h"
10#include "base/message_loop/message_loop_proxy.h"
11#include "base/prefs/pref_service.h"
12#include "base/sequenced_task_runner.h"
13#include "base/stl_util.h"
14#include "base/strings/string_number_conversions.h"
15#include "base/task_runner_util.h"
16#include "base/values.h"
17#include "components/policy/core/common/policy_pref_names.h"
18#include "components/pref_registry/pref_registry_syncable.h"
19#include "net/base/filename_util.h"
20#include "net/base/load_flags.h"
21#include "net/base/net_errors.h"
22#include "net/url_request/url_request.h"
23#include "url/url_constants.h"
24#include "url/url_parse.h"
25
26using url_matcher::URLMatcher;
27using url_matcher::URLMatcherCondition;
28using url_matcher::URLMatcherConditionFactory;
29using url_matcher::URLMatcherConditionSet;
30using url_matcher::URLMatcherPortFilter;
31using url_matcher::URLMatcherSchemeFilter;
32using url_matcher::URLQueryElementMatcherCondition;
33
34namespace policy {
35
36namespace {
37
// Schemes whose URLs are exempt from the catch-all "*" blacklist entry. A URL
// with one of these schemes can still be blocked by a more specific filter,
// e.g. "chrome-extension://*".
// The scheme names are spelled out here, rather than referenced through
// constants, to avoid dependencies on //extensions and //chrome.
// Both the characters and the pointers are const so the table cannot be
// modified at runtime.
const char* const kBypassBlacklistWildcardForSchemes[] = {
  // For internal extension URLs e.g. the Bookmark Manager and the File
  // Manager on Chrome OS.
  "chrome-extension",

  // NTP on Android.
  "chrome-native",

  // NTP on other platforms.
  "chrome-search",
};
54
// Upper bound on the number of filters honoured per policy list. Entries at
// or beyond this index are not processed.
const size_t kMaxFiltersPerPolicy = 1000;
57
58// A task that builds the blacklist on a background thread.
59scoped_ptr<URLBlacklist> BuildBlacklist(
60    scoped_ptr<base::ListValue> block,
61    scoped_ptr<base::ListValue> allow,
62    URLBlacklist::SegmentURLCallback segment_url) {
63  scoped_ptr<URLBlacklist> blacklist(new URLBlacklist(segment_url));
64  blacklist->Block(block.get());
65  blacklist->Allow(allow.get());
66  return blacklist.Pass();
67}
68
69// Tokenise the parameter |query| and add appropriate query element matcher
70// conditions to the |query_conditions|.
71void ProcessQueryToConditions(
72    url_matcher::URLMatcherConditionFactory* condition_factory,
73    const std::string& query,
74    bool allow,
75    std::set<URLQueryElementMatcherCondition>* query_conditions) {
76  url::Component query_left = url::MakeRange(0, query.length());
77  url::Component key;
78  url::Component value;
79  // Depending on the filter type being black-list or white-list, the matcher
80  // choose any or every match. The idea is a URL should be black-listed if
81  // there is any occurrence of the key value pair. It should be white-listed
82  // only if every occurrence of the key is followed by the value. This avoids
83  // situations such as a user appending a white-listed video parameter in the
84  // end of the query and watching a video of his choice (the last parameter is
85  // ignored by some web servers like youtube's).
86  URLQueryElementMatcherCondition::Type match_type =
87      allow ? URLQueryElementMatcherCondition::MATCH_ALL
88            : URLQueryElementMatcherCondition::MATCH_ANY;
89
90  while (ExtractQueryKeyValue(query.data(), &query_left, &key, &value)) {
91    URLQueryElementMatcherCondition::QueryElementType query_element_type =
92        value.len ? URLQueryElementMatcherCondition::ELEMENT_TYPE_KEY_VALUE
93                  : URLQueryElementMatcherCondition::ELEMENT_TYPE_KEY;
94    URLQueryElementMatcherCondition::QueryValueMatchType query_value_match_type;
95    if (!value.len && key.len && query[key.end() - 1] == '*') {
96      --key.len;
97      query_value_match_type =
98          URLQueryElementMatcherCondition::QUERY_VALUE_MATCH_PREFIX;
99    } else if (value.len && query[value.end() - 1] == '*') {
100      --value.len;
101      query_value_match_type =
102          URLQueryElementMatcherCondition::QUERY_VALUE_MATCH_PREFIX;
103    } else {
104      query_value_match_type =
105          URLQueryElementMatcherCondition::QUERY_VALUE_MATCH_EXACT;
106    }
107    query_conditions->insert(
108        URLQueryElementMatcherCondition(query.substr(key.begin, key.len),
109                                        query.substr(value.begin, value.len),
110                                        query_value_match_type,
111                                        query_element_type,
112                                        match_type,
113                                        condition_factory));
114  }
115}
116
117bool BypassBlacklistWildcardForURL(const GURL& url) {
118  const std::string& scheme = url.scheme();
119  for (size_t i = 0; i < arraysize(kBypassBlacklistWildcardForSchemes); ++i) {
120    if (scheme == kBypassBlacklistWildcardForSchemes[i])
121      return true;
122  }
123  return false;
124}
125
126}  // namespace
127
128struct URLBlacklist::FilterComponents {
129  FilterComponents() : port(0), match_subdomains(true), allow(true) {}
130  ~FilterComponents() {}
131
132  // Returns true if |this| represents the "*" filter in the blacklist.
133  bool IsBlacklistWildcard() const {
134    return !allow && host.empty() && scheme.empty() && path.empty() &&
135           query.empty() && port == 0 && number_of_key_value_pairs == 0 &&
136           match_subdomains;
137  }
138
139  std::string scheme;
140  std::string host;
141  uint16 port;
142  std::string path;
143  std::string query;
144  int number_of_key_value_pairs;
145  bool match_subdomains;
146  bool allow;
147};
148
149URLBlacklist::URLBlacklist(SegmentURLCallback segment_url)
150    : segment_url_(segment_url), id_(0), url_matcher_(new URLMatcher) {}
151
152URLBlacklist::~URLBlacklist() {}
153
154void URLBlacklist::AddFilters(bool allow,
155                              const base::ListValue* list) {
156  URLMatcherConditionSet::Vector all_conditions;
157  size_t size = std::min(kMaxFiltersPerPolicy, list->GetSize());
158  for (size_t i = 0; i < size; ++i) {
159    std::string pattern;
160    bool success = list->GetString(i, &pattern);
161    DCHECK(success);
162    FilterComponents components;
163    components.allow = allow;
164    if (!FilterToComponents(segment_url_,
165                            pattern,
166                            &components.scheme,
167                            &components.host,
168                            &components.match_subdomains,
169                            &components.port,
170                            &components.path,
171                            &components.query)) {
172      LOG(ERROR) << "Invalid pattern " << pattern;
173      continue;
174    }
175
176    scoped_refptr<URLMatcherConditionSet> condition_set =
177        CreateConditionSet(url_matcher_.get(),
178                           ++id_,
179                           components.scheme,
180                           components.host,
181                           components.match_subdomains,
182                           components.port,
183                           components.path,
184                           components.query,
185                           allow);
186    components.number_of_key_value_pairs =
187        condition_set->query_conditions().size();
188    all_conditions.push_back(condition_set);
189    filters_[id_] = components;
190  }
191  url_matcher_->AddConditionSets(all_conditions);
192}
193
194void URLBlacklist::Block(const base::ListValue* filters) {
195  AddFilters(false, filters);
196}
197
198void URLBlacklist::Allow(const base::ListValue* filters) {
199  AddFilters(true, filters);
200}
201
202bool URLBlacklist::IsURLBlocked(const GURL& url) const {
203  std::set<URLMatcherConditionSet::ID> matching_ids =
204      url_matcher_->MatchURL(url);
205
206  const FilterComponents* max = NULL;
207  for (std::set<URLMatcherConditionSet::ID>::iterator id = matching_ids.begin();
208       id != matching_ids.end(); ++id) {
209    std::map<int, FilterComponents>::const_iterator it = filters_.find(*id);
210    DCHECK(it != filters_.end());
211    const FilterComponents& filter = it->second;
212    if (!max || FilterTakesPrecedence(filter, *max))
213      max = &filter;
214  }
215
216  // Default to allow.
217  if (!max)
218    return false;
219
220  // Some of the internal Chrome URLs are not affected by the "*" in the
221  // blacklist. Note that the "*" is the lowest priority filter possible, so
222  // any higher priority filter will be applied first.
223  if (max->IsBlacklistWildcard() && BypassBlacklistWildcardForURL(url))
224    return false;
225
226  return !max->allow;
227}
228
229size_t URLBlacklist::Size() const {
230  return filters_.size();
231}
232
233// static
234bool URLBlacklist::FilterToComponents(SegmentURLCallback segment_url,
235                                      const std::string& filter,
236                                      std::string* scheme,
237                                      std::string* host,
238                                      bool* match_subdomains,
239                                      uint16* port,
240                                      std::string* path,
241                                      std::string* query) {
242  url::Parsed parsed;
243
244  if (segment_url(filter, &parsed) == url::kFileScheme) {
245    base::FilePath file_path;
246    if (!net::FileURLToFilePath(GURL(filter), &file_path))
247      return false;
248
249    *scheme = url::kFileScheme;
250    host->clear();
251    *match_subdomains = true;
252    *port = 0;
253    // Special path when the |filter| is 'file://*'.
254    *path = (filter == "file://*") ? "" : file_path.AsUTF8Unsafe();
255#if defined(FILE_PATH_USES_WIN_SEPARATORS)
256    // Separators have to be canonicalized on Windows.
257    std::replace(path->begin(), path->end(), '\\', '/');
258    *path = "/" + *path;
259#endif
260    return true;
261  }
262
263  if (!parsed.host.is_nonempty())
264    return false;
265
266  if (parsed.scheme.is_nonempty())
267    scheme->assign(filter, parsed.scheme.begin, parsed.scheme.len);
268  else
269    scheme->clear();
270
271  host->assign(filter, parsed.host.begin, parsed.host.len);
272  // Special '*' host, matches all hosts.
273  if (*host == "*") {
274    host->clear();
275    *match_subdomains = true;
276  } else if ((*host)[0] == '.') {
277    // A leading dot in the pattern syntax means that we don't want to match
278    // subdomains.
279    host->erase(0, 1);
280    *match_subdomains = false;
281  } else {
282    url::RawCanonOutputT<char> output;
283    url::CanonHostInfo host_info;
284    url::CanonicalizeHostVerbose(filter.c_str(), parsed.host, &output,
285                                 &host_info);
286    if (host_info.family == url::CanonHostInfo::NEUTRAL) {
287      // We want to match subdomains. Add a dot in front to make sure we only
288      // match at domain component boundaries.
289      *host = "." + *host;
290      *match_subdomains = true;
291    } else {
292      *match_subdomains = false;
293    }
294  }
295
296  if (parsed.port.is_nonempty()) {
297    int int_port;
298    if (!base::StringToInt(filter.substr(parsed.port.begin, parsed.port.len),
299                           &int_port)) {
300      return false;
301    }
302    if (int_port <= 0 || int_port > kuint16max)
303      return false;
304    *port = int_port;
305  } else {
306    // Match any port.
307    *port = 0;
308  }
309
310  if (parsed.path.is_nonempty())
311    path->assign(filter, parsed.path.begin, parsed.path.len);
312  else
313    path->clear();
314
315  if (query) {
316    if (parsed.query.is_nonempty())
317      query->assign(filter, parsed.query.begin, parsed.query.len);
318    else
319      query->clear();
320  }
321
322  return true;
323}
324
325// static
326scoped_refptr<URLMatcherConditionSet> URLBlacklist::CreateConditionSet(
327    URLMatcher* url_matcher,
328    int id,
329    const std::string& scheme,
330    const std::string& host,
331    bool match_subdomains,
332    uint16 port,
333    const std::string& path,
334    const std::string& query,
335    bool allow) {
336  URLMatcherConditionFactory* condition_factory =
337      url_matcher->condition_factory();
338  std::set<URLMatcherCondition> conditions;
339  conditions.insert(match_subdomains ?
340      condition_factory->CreateHostSuffixPathPrefixCondition(host, path) :
341      condition_factory->CreateHostEqualsPathPrefixCondition(host, path));
342
343  std::set<URLQueryElementMatcherCondition> query_conditions;
344  if (!query.empty()) {
345    ProcessQueryToConditions(
346        condition_factory, query, allow, &query_conditions);
347  }
348
349  scoped_ptr<URLMatcherSchemeFilter> scheme_filter;
350  if (!scheme.empty())
351    scheme_filter.reset(new URLMatcherSchemeFilter(scheme));
352
353  scoped_ptr<URLMatcherPortFilter> port_filter;
354  if (port != 0) {
355    std::vector<URLMatcherPortFilter::Range> ranges;
356    ranges.push_back(URLMatcherPortFilter::CreateRange(port));
357    port_filter.reset(new URLMatcherPortFilter(ranges));
358  }
359
360  return new URLMatcherConditionSet(id,
361                                    conditions,
362                                    query_conditions,
363                                    scheme_filter.Pass(),
364                                    port_filter.Pass());
365}
366
367// static
368bool URLBlacklist::FilterTakesPrecedence(const FilterComponents& lhs,
369                                         const FilterComponents& rhs) {
370  // The "*" wildcard is the lowest priority filter.
371  if (rhs.IsBlacklistWildcard())
372    return true;
373
374  if (lhs.match_subdomains && !rhs.match_subdomains)
375    return false;
376  if (!lhs.match_subdomains && rhs.match_subdomains)
377    return true;
378
379  size_t host_length = lhs.host.length();
380  size_t other_host_length = rhs.host.length();
381  if (host_length != other_host_length)
382    return host_length > other_host_length;
383
384  size_t path_length = lhs.path.length();
385  size_t other_path_length = rhs.path.length();
386  if (path_length != other_path_length)
387    return path_length > other_path_length;
388
389  if (lhs.number_of_key_value_pairs != rhs.number_of_key_value_pairs)
390    return lhs.number_of_key_value_pairs > rhs.number_of_key_value_pairs;
391
392  if (lhs.allow && !rhs.allow)
393    return true;
394
395  return false;
396}
397
398URLBlacklistManager::URLBlacklistManager(
399    PrefService* pref_service,
400    const scoped_refptr<base::SequencedTaskRunner>& background_task_runner,
401    const scoped_refptr<base::SequencedTaskRunner>& io_task_runner,
402    URLBlacklist::SegmentURLCallback segment_url,
403    OverrideBlacklistCallback override_blacklist)
404    : pref_service_(pref_service),
405      background_task_runner_(background_task_runner),
406      io_task_runner_(io_task_runner),
407      segment_url_(segment_url),
408      override_blacklist_(override_blacklist),
409      ui_task_runner_(base::MessageLoopProxy::current()),
410      blacklist_(new URLBlacklist(segment_url)),
411      ui_weak_ptr_factory_(this),
412      io_weak_ptr_factory_(this) {
413  pref_change_registrar_.Init(pref_service_);
414  base::Closure callback = base::Bind(&URLBlacklistManager::ScheduleUpdate,
415                                      base::Unretained(this));
416  pref_change_registrar_.Add(policy_prefs::kUrlBlacklist, callback);
417  pref_change_registrar_.Add(policy_prefs::kUrlWhitelist, callback);
418
419  // Start enforcing the policies without a delay when they are present at
420  // startup.
421  if (pref_service_->HasPrefPath(policy_prefs::kUrlBlacklist))
422    Update();
423}
424
425void URLBlacklistManager::ShutdownOnUIThread() {
426  DCHECK(ui_task_runner_->RunsTasksOnCurrentThread());
427  // Cancel any pending updates, and stop listening for pref change updates.
428  ui_weak_ptr_factory_.InvalidateWeakPtrs();
429  pref_change_registrar_.RemoveAll();
430}
431
432URLBlacklistManager::~URLBlacklistManager() {
433}
434
435void URLBlacklistManager::ScheduleUpdate() {
436  DCHECK(ui_task_runner_->RunsTasksOnCurrentThread());
437  // Cancel pending updates, if any. This can happen if two preferences that
438  // change the blacklist are updated in one message loop cycle. In those cases,
439  // only rebuild the blacklist after all the preference updates are processed.
440  ui_weak_ptr_factory_.InvalidateWeakPtrs();
441  ui_task_runner_->PostTask(
442      FROM_HERE,
443      base::Bind(&URLBlacklistManager::Update,
444                 ui_weak_ptr_factory_.GetWeakPtr()));
445}
446
447void URLBlacklistManager::Update() {
448  DCHECK(ui_task_runner_->RunsTasksOnCurrentThread());
449
450  // The preferences can only be read on the UI thread.
451  scoped_ptr<base::ListValue> block(
452      pref_service_->GetList(policy_prefs::kUrlBlacklist)->DeepCopy());
453  scoped_ptr<base::ListValue> allow(
454      pref_service_->GetList(policy_prefs::kUrlWhitelist)->DeepCopy());
455
456  // Go through the IO thread to grab a WeakPtr to |this|. This is safe from
457  // here, since this task will always execute before a potential deletion of
458  // ProfileIOData on IO.
459  io_task_runner_->PostTask(FROM_HERE,
460                            base::Bind(&URLBlacklistManager::UpdateOnIO,
461                                       base::Unretained(this),
462                                       base::Passed(&block),
463                                       base::Passed(&allow)));
464}
465
466void URLBlacklistManager::UpdateOnIO(scoped_ptr<base::ListValue> block,
467                                     scoped_ptr<base::ListValue> allow) {
468  DCHECK(io_task_runner_->RunsTasksOnCurrentThread());
469  // The URLBlacklist is built on a worker thread. Once it's ready, it is passed
470  // to the URLBlacklistManager on IO.
471  base::PostTaskAndReplyWithResult(
472      background_task_runner_.get(),
473      FROM_HERE,
474      base::Bind(&BuildBlacklist,
475                 base::Passed(&block),
476                 base::Passed(&allow),
477                 segment_url_),
478      base::Bind(&URLBlacklistManager::SetBlacklist,
479                 io_weak_ptr_factory_.GetWeakPtr()));
480}
481
482void URLBlacklistManager::SetBlacklist(scoped_ptr<URLBlacklist> blacklist) {
483  DCHECK(io_task_runner_->RunsTasksOnCurrentThread());
484  blacklist_ = blacklist.Pass();
485}
486
487bool URLBlacklistManager::IsURLBlocked(const GURL& url) const {
488  DCHECK(io_task_runner_->RunsTasksOnCurrentThread());
489  return blacklist_->IsURLBlocked(url);
490}
491
492bool URLBlacklistManager::IsRequestBlocked(
493    const net::URLRequest& request, int* reason) const {
494  DCHECK(io_task_runner_->RunsTasksOnCurrentThread());
495#if !defined(OS_IOS)
496  // TODO(joaodasilva): iOS doesn't set these flags. http://crbug.com/338283
497  int filter_flags = net::LOAD_MAIN_FRAME | net::LOAD_SUB_FRAME;
498  if ((request.load_flags() & filter_flags) == 0)
499    return false;
500#endif
501
502  bool block = false;
503  if (override_blacklist_.Run(request.url(), &block, reason))
504    return block;
505
506  *reason = net::ERR_BLOCKED_BY_ADMINISTRATOR;
507  return IsURLBlocked(request.url());
508}
509
510// static
511void URLBlacklistManager::RegisterProfilePrefs(
512    user_prefs::PrefRegistrySyncable* registry) {
513  registry->RegisterListPref(policy_prefs::kUrlBlacklist,
514                             user_prefs::PrefRegistrySyncable::UNSYNCABLE_PREF);
515  registry->RegisterListPref(policy_prefs::kUrlWhitelist,
516                             user_prefs::PrefRegistrySyncable::UNSYNCABLE_PREF);
517}
518
519}  // namespace policy
520