1// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// This command-line program converts an effective-TLD data file in UTF-8 from
6// the format provided by Mozilla to the format expected by Chrome.  This
7// program generates an intermediate file which is then used by gperf to
8// generate a perfect hash map.  The benefit of this approach is that no time is
9// spent on program initialization to generate the map of this data.
10//
// Running this program finds "effective_tld_names.dat" in the expected location
// in the source checkout and generates "effective_tld_names.gperf" next to it.
13//
14// Any errors or warnings from this program are recorded in tld_cleanup.log.
15//
16// In particular, it
17//  * Strips blank lines and comments, as well as notes for individual rules.
18//  * Strips a single leading and/or trailing dot from each rule, if present.
19//  * Logs a warning if a rule contains '!' or '*.' other than at the beginning
20//    of the rule.  (This also catches multiple ! or *. at the start of a rule.)
21//  * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
22//  * Canonicalizes each rule's domain by converting it to a GURL and back.
23//  * Adds explicit rules for true TLDs found in any rule.
24
25#include <map>
26#include <set>
27#include <string>
28
29#include "base/at_exit.h"
30#include "base/command_line.h"
31#include "base/file_util.h"
32#include "base/i18n/icu_util.h"
33#include "base/logging.h"
34#include "base/file_path.h"
35#include "base/file_util.h"
36#include "base/path_service.h"
37#include "base/process_util.h"
38#include "base/string_util.h"
39#include "googleurl/src/gurl.h"
40#include "googleurl/src/url_parse.h"
41
namespace {

// Flags describing how a single effective-TLD rule was written in the input.
struct Rule {
  // Start with both flags cleared so that a default-constructed Rule describes
  // a plain rule instead of holding indeterminate values.
  Rule() : exception(false), wildcard(false) {}

  bool exception;  // The rule was prefixed with '!'.
  bool wildcard;   // The rule was prefixed with '*.'.
};

// Maps a normalized rule domain to the flags parsed for it.
typedef std::map<std::string, Rule> RuleMap;

// A set of domain strings, used to collect the true TLDs seen in the rules.
typedef std::set<std::string> RuleSet;

}  // namespace
51
52// Writes the list of domain rules contained in the 'rules' set to the
53// 'outfile', with each rule terminated by a LF.  The file must already have
54// been created with write access.
55bool WriteRules(const RuleMap& rules, FilePath outfile) {
56  std::string data;
57  data.append(
58      "%{\n"
59      "// Copyright (c) 2009 The Chromium Authors. All rights reserved.\n"
60      "// Use of this source code is governed by a BSD-style license that\n"
61      "// can be found in the LICENSE file.\n\n"
62      "// This file is generated by net/tools/tld_cleanup/.\n"
63      "// DO NOT MANUALLY EDIT!\n"
64      "%}\n"
65      "struct DomainRule {\n"
66      "  const char *name;\n"
67      "  int type;  // 1: exception, 2: wildcard\n"
68      "};\n"
69      "%%\n"
70  );
71
72  for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
73    data.append(i->first);
74    data.append(", ");
75    if (i->second.exception) {
76      data.append("1");
77    } else if (i->second.wildcard) {
78      data.append("2");
79    } else {
80      data.append("0");
81    }
82    data.append("\n");
83  }
84
85  data.append("%%\n");
86
87  int written = file_util::WriteFile(outfile, data.data(), data.size());
88
89  return written == static_cast<int>(data.size());
90}
91
// Outcome of normalizing a rule or a whole file.  The enumerators are declared
// from least to most severe so that the worst of several outcomes can be
// selected by comparing them; preserve that ordering when adding new values.
enum NormalizeResult {
  kSuccess,
  kWarning,
  kError
};
98
99// Adjusts the rule to a standard form: removes single extraneous dots and
100// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
101// valid; logs a warning and returns kWarning if it is probably invalid; and
102// logs an error and returns kError if the rule is (almost) certainly invalid.
103NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
104  NormalizeResult result = kSuccess;
105
106  // Strip single leading and trailing dots.
107  if (domain->at(0) == '.')
108    domain->erase(0, 1);
109  if (domain->empty()) {
110    LOG(WARNING) << "Ignoring empty rule";
111    return kWarning;
112  }
113  if (domain->at(domain->size() - 1) == '.')
114    domain->erase(domain->size() - 1, 1);
115  if (domain->empty()) {
116    LOG(WARNING) << "Ignoring empty rule";
117    return kWarning;
118  }
119
120  // Allow single leading '*.' or '!', saved here so it's not canonicalized.
121  size_t start_offset = 0;
122  if (domain->at(0) == '!') {
123    domain->erase(0, 1);
124    rule->exception = true;
125  } else if (domain->find("*.") == 0) {
126    domain->erase(0, 2);
127    rule->wildcard = true;
128  }
129  if (domain->empty()) {
130    LOG(WARNING) << "Ignoring empty rule";
131    return kWarning;
132  }
133
134  // Warn about additional '*.' or '!'.
135  if (domain->find("*.", start_offset) != std::string::npos ||
136      domain->find('!', start_offset) != std::string::npos) {
137    LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
138    result = kWarning;
139  }
140
141  // Make a GURL and normalize it, then get the host back out.
142  std::string url = "http://";
143  url.append(*domain);
144  GURL gurl(url);
145  const std::string& spec = gurl.possibly_invalid_spec();
146  url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host;
147  if (host.len < 0) {
148    LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
149    return kError;
150  }
151  if (!gurl.is_valid()) {
152    LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
153    result = kWarning;
154  }
155  domain->assign(spec.substr(host.begin, host.len));
156
157  return result;
158}
159
160// Loads the file described by 'in_filename', converts it to the desired format
161// (see the file comments above), and saves it into 'out_filename'.  Returns
162// the most severe of the result codes encountered when normalizing the rules.
163NormalizeResult NormalizeFile(const FilePath& in_filename,
164                              const FilePath& out_filename) {
165  std::string data;
166  if (!file_util::ReadFileToString(in_filename, &data)) {
167    LOG(ERROR) << "Unable to read file";
168    // We return success since we've already reported the error.
169    return kSuccess;
170  }
171
172  // We do a lot of string assignment during parsing, but simplicity is more
173  // important than performance here.
174  std::string domain;
175  NormalizeResult result = kSuccess;
176  size_t line_start = 0;
177  size_t line_end = 0;
178  RuleMap rules;
179  RuleSet extra_rules;
180  while (line_start < data.size()) {
181    // Skip comments.
182    if (line_start + 1 < data.size() &&
183        data[line_start] == '/' &&
184        data[line_start + 1] == '/') {
185      line_end = data.find_first_of("\r\n", line_start);
186      if (line_end == std::string::npos)
187        line_end = data.size();
188    } else {
189      // Truncate at first whitespace.
190      line_end = data.find_first_of("\r\n \t", line_start);
191      if (line_end == std::string::npos)
192        line_end = data.size();
193      domain.assign(data.data(), line_start, line_end - line_start);
194
195      Rule rule;
196      rule.wildcard = false;
197      rule.exception = false;
198      NormalizeResult new_result = NormalizeRule(&domain, &rule);
199      if (new_result != kError) {
200        // Check the existing rules to make sure we don't have an exception and
201        // wildcard for the same rule.  If we did, we'd have to update our
202        // parsing code to handle this case.
203        CHECK(rules.find(domain) == rules.end());
204
205        rules[domain] = rule;
206        // Add true TLD for multi-level rules.  We don't add them right now, in
207        // case there's an exception or wild card that either exists or might be
208        // added in a later iteration.  In those cases, there's no need to add
209        // it and it would just slow down parsing the data.
210        size_t tld_start = domain.find_last_of('.');
211        if (tld_start != std::string::npos && tld_start + 1 < domain.size())
212          extra_rules.insert(domain.substr(tld_start + 1));
213      }
214      result = std::max(result, new_result);
215    }
216
217    // Find beginning of next non-empty line.
218    line_start = data.find_first_of("\r\n", line_end);
219    if (line_start == std::string::npos)
220      line_start = data.size();
221    line_start = data.find_first_not_of("\r\n", line_start);
222    if (line_start == std::string::npos)
223      line_start = data.size();
224  }
225
226  for (RuleSet::const_iterator iter = extra_rules.begin();
227       iter != extra_rules.end();
228       ++iter) {
229    if (rules.find(*iter) == rules.end()) {
230      Rule rule;
231      rule.exception = false;
232      rule.wildcard = false;
233      rules[*iter] = rule;
234    }
235  }
236
237  if (!WriteRules(rules, out_filename)) {
238    LOG(ERROR) << "Error(s) writing output file";
239    result = kError;
240  }
241
242  return result;
243}
244
245int main(int argc, const char* argv[]) {
246  base::EnableTerminationOnHeapCorruption();
247  if (argc != 1) {
248    fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");
249    fprintf(stderr, "Usage: %s\n", argv[0]);
250    return 1;
251  }
252
253  // Manages the destruction of singletons.
254  base::AtExitManager exit_manager;
255
256  // Only use OutputDebugString in debug mode.
257#ifdef NDEBUG
258  logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE;
259#else
260  logging::LoggingDestination destination =
261      logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG;
262#endif
263
264  CommandLine::Init(argc, argv);
265
266  FilePath log_filename;
267  PathService::Get(base::DIR_EXE, &log_filename);
268  log_filename = log_filename.AppendASCII("tld_cleanup.log");
269  logging::InitLogging(
270      log_filename.value().c_str(),
271      destination,
272      logging::LOCK_LOG_FILE,
273      logging::DELETE_OLD_LOG_FILE,
274      logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS);
275
276  icu_util::Initialize();
277
278  FilePath input_file;
279  PathService::Get(base::DIR_SOURCE_ROOT, &input_file);
280  input_file = input_file.Append(FILE_PATH_LITERAL("net"))
281                         .Append(FILE_PATH_LITERAL("base"))
282                         .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));
283  FilePath output_file;
284  PathService::Get(base::DIR_SOURCE_ROOT, &output_file);
285  output_file = output_file.Append(FILE_PATH_LITERAL("net"))
286                           .Append(FILE_PATH_LITERAL("base"))
287                           .Append(FILE_PATH_LITERAL(
288                               "effective_tld_names.gperf"));
289  NormalizeResult result = NormalizeFile(input_file, output_file);
290  if (result != kSuccess) {
291    fprintf(stderr,
292            "Errors or warnings processing file.  See log in tld_cleanup.log.");
293  }
294
295  if (result == kError)
296    return 1;
297  return 0;
298}
299