1a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)// Copyright 2013 The Chromium Authors. All rights reserved.
2a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
3a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)// found in the LICENSE file.
4a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
5a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)#include "net/tools/tld_cleanup/tld_cleanup_util.h"
6a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
71320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci#include "base/files/file_util.h"
8a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)#include "base/logging.h"
97d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/strings/string_number_conversions.h"
10868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h"
117dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "url/gurl.h"
127dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "url/url_parse.h"
13a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
namespace {

// Comment lines in the input data that open and close the section of
// privately-registered domains.
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Bit flags written into the "type" column of the generated output.
const int kExceptionRule = 1 << 0;
const int kWildcardRule = 1 << 1;
const int kPrivateRule = 1 << 2;

}  // namespace
23a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
24a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)namespace net {
25a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)namespace tld_cleanup {
26a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
27a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)// Writes the list of domain rules contained in the 'rules' set to the
28a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)// 'outfile', with each rule terminated by a LF.  The file must already have
29a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)// been created with write access.
30a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
31a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  std::string data;
327dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  data.append("%{\n"
337dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch              "// Copyright 2012 The Chromium Authors. All rights reserved.\n"
347dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch              "// Use of this source code is governed by a BSD-style license "
357dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch              "that can be\n"
367dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch              "// found in the LICENSE file.\n\n"
377dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch              "// This file is generated by net/tools/tld_cleanup/.\n"
387dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch              "// DO NOT MANUALLY EDIT!\n"
397dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch              "%}\n"
407dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch              "struct DomainRule {\n"
417dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch              "  int name_offset;\n"
427dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch              "  int type;  // flags: 1: exception, 2: wildcard, 4: private\n"
437dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch              "};\n"
447dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch              "%%\n");
45a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
46a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
47a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    data.append(i->first);
48a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    data.append(", ");
497d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    int type = 0;
50a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    if (i->second.exception) {
517d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)      type = kExceptionRule;
52a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    } else if (i->second.wildcard) {
537d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)      type = kWildcardRule;
54a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    }
55a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    if (i->second.is_private) {
567d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)      type += kPrivateRule;
57a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    }
587d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    data.append(base::IntToString(type));
59a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    data.append("\n");
60a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  }
61a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
62a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  data.append("%%\n");
63a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
64a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)  int written = base::WriteFile(outfile,
65a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)                                     data.data(),
66a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)                                     static_cast<int>(data.size()));
67a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
68a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  return written == static_cast<int>(data.size());
69a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)}
70a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
71a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)// Adjusts the rule to a standard form: removes single extraneous dots and
72a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
73a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)// valid; logs a warning and returns kWarning if it is probably invalid; and
74a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)// logs an error and returns kError if the rule is (almost) certainly invalid.
75a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
76a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  NormalizeResult result = kSuccess;
77a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
78a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  // Strip single leading and trailing dots.
79a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  if (domain->at(0) == '.')
80a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    domain->erase(0, 1);
81a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  if (domain->empty()) {
82a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    LOG(WARNING) << "Ignoring empty rule";
83a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    return kWarning;
84a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  }
85a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  if (domain->at(domain->size() - 1) == '.')
86a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    domain->erase(domain->size() - 1, 1);
87a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  if (domain->empty()) {
88a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    LOG(WARNING) << "Ignoring empty rule";
89a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    return kWarning;
90a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  }
91a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
92a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  // Allow single leading '*.' or '!', saved here so it's not canonicalized.
93a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  size_t start_offset = 0;
94a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  if (domain->at(0) == '!') {
95a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    domain->erase(0, 1);
96a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    rule->exception = true;
97a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  } else if (domain->find("*.") == 0) {
98a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    domain->erase(0, 2);
99a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    rule->wildcard = true;
100a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  }
101a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  if (domain->empty()) {
102a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    LOG(WARNING) << "Ignoring empty rule";
103a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    return kWarning;
104a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  }
105a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
106a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  // Warn about additional '*.' or '!'.
107a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  if (domain->find("*.", start_offset) != std::string::npos ||
108a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      domain->find('!', start_offset) != std::string::npos) {
109a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
110a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    result = kWarning;
111a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  }
112a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
113a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  // Make a GURL and normalize it, then get the host back out.
114a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  std::string url = "http://";
115a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  url.append(*domain);
116a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  GURL gurl(url);
117a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  const std::string& spec = gurl.possibly_invalid_spec();
1185c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
119a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  if (host.len < 0) {
120a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
121a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    return kError;
122a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  }
123a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  if (!gurl.is_valid()) {
124a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
125a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    result = kWarning;
126a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  }
127a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  domain->assign(spec.substr(host.begin, host.len));
128a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
129a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  return result;
130a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)}
131a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
132a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)NormalizeResult NormalizeDataToRuleMap(const std::string data,
133a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)                                       RuleMap* rules) {
134a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  CHECK(rules);
135a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  // We do a lot of string assignment during parsing, but simplicity is more
136a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  // important than performance here.
137a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  std::string domain;
138a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  NormalizeResult result = kSuccess;
139a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  size_t line_start = 0;
140a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  size_t line_end = 0;
141a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  bool is_private = false;
142a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  RuleMap extra_rules;
143a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
144a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
145a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  while (line_start < data.size()) {
146a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    if (line_start + begin_private_length < data.size() &&
147a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        !data.compare(line_start, begin_private_length,
148a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)                      kBeginPrivateDomainsComment)) {
149a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      is_private = true;
150a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      line_end = line_start + begin_private_length;
151a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    } else if (line_start + end_private_length < data.size() &&
152a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        !data.compare(line_start, end_private_length,
153a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)                      kEndPrivateDomainsComment)) {
154a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      is_private = false;
155a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      line_end = line_start + end_private_length;
156a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    } else if (line_start + 1 < data.size() &&
157a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        data[line_start] == '/' &&
158a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        data[line_start + 1] == '/') {
159a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      // Skip comments.
160a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      line_end = data.find_first_of("\r\n", line_start);
161a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      if (line_end == std::string::npos)
162a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        line_end = data.size();
163a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    } else {
164a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      // Truncate at first whitespace.
165a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      line_end = data.find_first_of("\r\n \t", line_start);
166a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      if (line_end == std::string::npos)
167a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        line_end = data.size();
168a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      domain.assign(data.data(), line_start, line_end - line_start);
169a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
170a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      Rule rule;
171a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      rule.wildcard = false;
172a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      rule.exception = false;
173a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      rule.is_private = is_private;
174a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      NormalizeResult new_result = NormalizeRule(&domain, &rule);
175a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      if (new_result != kError) {
176a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        // Check the existing rules to make sure we don't have an exception and
177a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        // wildcard for the same rule, or that the same domain is listed as both
178a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        // private and not private. If we did, we'd have to update our
179a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        // parsing code to handle this case.
1805d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        CHECK(rules->find(domain) == rules->end())
1815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            << "Duplicate rule found for " << domain;
182a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
183a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        (*rules)[domain] = rule;
184a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        // Add true TLD for multi-level rules.  We don't add them right now, in
185a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        // case there's an exception or wild card that either exists or might be
186a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        // added in a later iteration.  In those cases, there's no need to add
187a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        // it and it would just slow down parsing the data.
188a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        size_t tld_start = domain.find_last_of('.');
189a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
190a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          std::string extra_rule_domain = domain.substr(tld_start + 1);
191a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
192a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          Rule extra_rule;
193a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          extra_rule.exception = false;
194a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          extra_rule.wildcard = false;
195a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          if (iter == extra_rules.end()) {
196a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)            extra_rule.is_private = is_private;
197a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          } else {
198a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)            // A rule already exists, so we ensure that if any of the entries is
199a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)            // not private the result should be that the entry is not private.
200a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)            // An example is .au which is not listed as a real TLD, but only
201a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)            // lists second-level domains such as com.au. Subdomains of .au
202a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)            // (eg. blogspot.com.au) are also listed in the private section,
203a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)            // which is processed later, so this ensures that the real TLD
204a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)            // (eg. .au) is listed as public.
205a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)            extra_rule.is_private = is_private && iter->second.is_private;
206a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          }
207a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          extra_rules[extra_rule_domain] = extra_rule;
208a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        }
209a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      }
210a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      result = std::max(result, new_result);
211a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    }
212a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
213a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    // Find beginning of next non-empty line.
214a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    line_start = data.find_first_of("\r\n", line_end);
215a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    if (line_start == std::string::npos)
216a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      line_start = data.size();
217a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    line_start = data.find_first_not_of("\r\n", line_start);
218a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    if (line_start == std::string::npos)
219a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      line_start = data.size();
220a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  }
221a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
222a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  for (RuleMap::const_iterator iter = extra_rules.begin();
223a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)       iter != extra_rules.end();
224a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)       ++iter) {
225a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    if (rules->find(iter->first) == rules->end()) {
226a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      (*rules)[iter->first] = iter->second;
227a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    }
228a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  }
229a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
230a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  return result;
231a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)}
232a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
233a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)NormalizeResult NormalizeFile(const base::FilePath& in_filename,
234a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)                              const base::FilePath& out_filename) {
235a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  RuleMap rules;
236a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  std::string data;
23758537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)  if (!base::ReadFileToString(in_filename, &data)) {
238a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    LOG(ERROR) << "Unable to read file";
239a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    // We return success since we've already reported the error.
240a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    return kSuccess;
241a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  }
242a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
243a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
244a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
245a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  if (!WriteRules(rules, out_filename)) {
246a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    LOG(ERROR) << "Error(s) writing output file";
247a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    result = kError;
248a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  }
249a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
250a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  return result;
251a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)}
252a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
253a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)
254a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)}  // namespace tld_cleanup
255a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)}  // namespace net
256