1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// This command-line program converts an effective-TLD data file in UTF-8 from
6// the format provided by Mozilla to the format expected by Chrome.  This
7// program generates an intermediate file which is then used by gperf to
8// generate a perfect hash map.  The benefit of this approach is that no time is
9// spent on program initialization to generate the map of this data.
10//
11// Running this program finds "effective_tld_names.dat" in the expected location
12// in the source checkout and generates "effective_tld_names.gperf" next to it.
13//
14// Any errors or warnings from this program are recorded in tld_cleanup.log.
15//
16// In particular, it
17//  * Strips blank lines and comments, as well as notes for individual rules.
18//  * Strips a single leading and/or trailing dot from each rule, if present.
19//  * Logs a warning if a rule contains '!' or '*.' other than at the beginning
20//    of the rule.  (This also catches multiple ! or *. at the start of a rule.)
21//  * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
22//  * Canonicalizes each rule's domain by converting it to a GURL and back.
23//  * Adds explicit rules for true TLDs found in any rule.
24//  * Marks entries in the file between "// ===BEGIN PRIVATE DOMAINS==="
25//    and "// ===END PRIVATE DOMAINS===" as private.
26
27#include "base/at_exit.h"
28#include "base/command_line.h"
29#include "base/files/file_path.h"
30#include "base/files/file_util.h"
31#include "base/i18n/icu_util.h"
32#include "base/logging.h"
33#include "base/path_service.h"
34#include "base/process/memory.h"
35#include "net/tools/tld_cleanup/tld_cleanup_util.h"
36
37int main(int argc, const char* argv[]) {
38  base::EnableTerminationOnHeapCorruption();
39  if (argc != 1) {
40    fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");
41    fprintf(stderr, "Usage: %s\n", argv[0]);
42    return 1;
43  }
44
45  // Manages the destruction of singletons.
46  base::AtExitManager exit_manager;
47
48  // Only use OutputDebugString in debug mode.
49#ifdef NDEBUG
50  logging::LoggingDestination destination = logging::LOG_TO_FILE;
51#else
52  logging::LoggingDestination destination =
53      logging::LOG_TO_ALL;
54#endif
55
56  base::CommandLine::Init(argc, argv);
57
58  base::FilePath log_filename;
59  PathService::Get(base::DIR_EXE, &log_filename);
60  log_filename = log_filename.AppendASCII("tld_cleanup.log");
61  logging::LoggingSettings settings;
62  settings.logging_dest = destination;
63  settings.log_file = log_filename.value().c_str();
64  settings.delete_old = logging::DELETE_OLD_LOG_FILE;
65  logging::InitLogging(settings);
66
67  base::i18n::InitializeICU();
68
69  base::FilePath input_file;
70  PathService::Get(base::DIR_SOURCE_ROOT, &input_file);
71  input_file = input_file.Append(FILE_PATH_LITERAL("net"))
72                         .Append(FILE_PATH_LITERAL("base"))
73                         .Append(FILE_PATH_LITERAL(
74                             "registry_controlled_domains"))
75                         .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));
76  base::FilePath output_file;
77  PathService::Get(base::DIR_SOURCE_ROOT, &output_file);
78  output_file = output_file.Append(FILE_PATH_LITERAL("net"))
79                           .Append(FILE_PATH_LITERAL("base"))
80                           .Append(FILE_PATH_LITERAL(
81                               "registry_controlled_domains"))
82                           .Append(FILE_PATH_LITERAL(
83                               "effective_tld_names.gperf"));
84  net::tld_cleanup::NormalizeResult result =
85      net::tld_cleanup::NormalizeFile(input_file, output_file);
86  if (result != net::tld_cleanup::kSuccess) {
87    fprintf(stderr,
88            "Errors or warnings processing file.  See log in tld_cleanup.log.");
89  }
90
91  if (result == net::tld_cleanup::kError)
92    return 1;
93  return 0;
94}
95