1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "net/dns/dns_hosts.h"
6
7#include "base/files/file_util.h"
8#include "base/logging.h"
9#include "base/metrics/histogram.h"
10#include "base/strings/string_util.h"
11
12using base::StringPiece;
13
14namespace net {
15
16namespace {
17
18// Parses the contents of a hosts file.  Returns one token (IP or hostname) at
19// a time.  Doesn't copy anything; accepts the file as a StringPiece and
20// returns tokens as StringPieces.
21class HostsParser {
22 public:
23  explicit HostsParser(const StringPiece& text, ParseHostsCommaMode comma_mode)
24      : text_(text),
25        data_(text.data()),
26        end_(text.size()),
27        pos_(0),
28        token_is_ip_(false),
29        comma_mode_(comma_mode) {}
30
31  // Advances to the next token (IP or hostname).  Returns whether another
32  // token was available.  |token_is_ip| and |token| can be used to find out
33  // the type and text of the token.
34  bool Advance() {
35    bool next_is_ip = (pos_ == 0);
36    while (pos_ < end_ && pos_ != std::string::npos) {
37      switch (text_[pos_]) {
38        case ' ':
39        case '\t':
40          SkipWhitespace();
41          break;
42
43        case '\r':
44        case '\n':
45          next_is_ip = true;
46          pos_++;
47          break;
48
49        case '#':
50          SkipRestOfLine();
51          break;
52
53        case ',':
54          if (comma_mode_ == PARSE_HOSTS_COMMA_IS_WHITESPACE) {
55            SkipWhitespace();
56            break;
57          }
58
59          // If comma_mode_ is COMMA_IS_TOKEN, fall through:
60
61        default: {
62          size_t token_start = pos_;
63          SkipToken();
64          size_t token_end = (pos_ == std::string::npos) ? end_ : pos_;
65
66          token_ = StringPiece(data_ + token_start, token_end - token_start);
67          token_is_ip_ = next_is_ip;
68
69          return true;
70        }
71      }
72    }
73
74    return false;
75  }
76
77  // Fast-forwards the parser to the next line.  Should be called if an IP
78  // address doesn't parse, to avoid wasting time tokenizing hostnames that
79  // will be ignored.
80  void SkipRestOfLine() {
81    pos_ = text_.find("\n", pos_);
82  }
83
84  // Returns whether the last-parsed token is an IP address (true) or a
85  // hostname (false).
86  bool token_is_ip() { return token_is_ip_; }
87
88  // Returns the text of the last-parsed token as a StringPiece referencing
89  // the same underlying memory as the StringPiece passed to the constructor.
90  // Returns an empty StringPiece if no token has been parsed or the end of
91  // the input string has been reached.
92  const StringPiece& token() { return token_; }
93
94 private:
95  void SkipToken() {
96    switch (comma_mode_) {
97      case PARSE_HOSTS_COMMA_IS_TOKEN:
98        pos_ = text_.find_first_of(" \t\n\r#", pos_);
99        break;
100      case PARSE_HOSTS_COMMA_IS_WHITESPACE:
101        pos_ = text_.find_first_of(" ,\t\n\r#", pos_);
102        break;
103    }
104  }
105
106  void SkipWhitespace() {
107    switch (comma_mode_) {
108      case PARSE_HOSTS_COMMA_IS_TOKEN:
109        pos_ = text_.find_first_not_of(" \t", pos_);
110        break;
111      case PARSE_HOSTS_COMMA_IS_WHITESPACE:
112        pos_ = text_.find_first_not_of(" ,\t", pos_);
113        break;
114    }
115  }
116
117  const StringPiece text_;
118  const char* data_;
119  const size_t end_;
120
121  size_t pos_;
122  StringPiece token_;
123  bool token_is_ip_;
124
125  const ParseHostsCommaMode comma_mode_;
126
127  DISALLOW_COPY_AND_ASSIGN(HostsParser);
128};
129
130void ParseHostsWithCommaMode(const std::string& contents,
131                             DnsHosts* dns_hosts,
132                             ParseHostsCommaMode comma_mode) {
133  CHECK(dns_hosts);
134  DnsHosts& hosts = *dns_hosts;
135
136  StringPiece ip_text;
137  IPAddressNumber ip;
138  AddressFamily family = ADDRESS_FAMILY_IPV4;
139  HostsParser parser(contents, comma_mode);
140  while (parser.Advance()) {
141    if (parser.token_is_ip()) {
142      StringPiece new_ip_text = parser.token();
143      // Some ad-blocking hosts files contain thousands of entries pointing to
144      // the same IP address (usually 127.0.0.1).  Don't bother parsing the IP
145      // again if it's the same as the one above it.
146      if (new_ip_text != ip_text) {
147        IPAddressNumber new_ip;
148        if (ParseIPLiteralToNumber(parser.token().as_string(), &new_ip)) {
149          ip_text = new_ip_text;
150          ip.swap(new_ip);
151          family = (ip.size() == 4) ? ADDRESS_FAMILY_IPV4 : ADDRESS_FAMILY_IPV6;
152        } else {
153          parser.SkipRestOfLine();
154        }
155      }
156    } else {
157      DnsHostsKey key(parser.token().as_string(), family);
158      base::StringToLowerASCII(&key.first);
159      IPAddressNumber& mapped_ip = hosts[key];
160      if (mapped_ip.empty())
161        mapped_ip = ip;
162      // else ignore this entry (first hit counts)
163    }
164  }
165}
166
167}  // namespace
168
169void ParseHostsWithCommaModeForTesting(const std::string& contents,
170                                       DnsHosts* dns_hosts,
171                                       ParseHostsCommaMode comma_mode) {
172  ParseHostsWithCommaMode(contents, dns_hosts, comma_mode);
173}
174
175void ParseHosts(const std::string& contents, DnsHosts* dns_hosts) {
176  ParseHostsCommaMode comma_mode;
177#if defined(OS_MACOSX)
178  // Mac OS X allows commas to separate hostnames.
179  comma_mode = PARSE_HOSTS_COMMA_IS_WHITESPACE;
180#else
181  // Linux allows commas in hostnames.
182  comma_mode = PARSE_HOSTS_COMMA_IS_TOKEN;
183#endif
184
185  ParseHostsWithCommaMode(contents, dns_hosts, comma_mode);
186}
187
188bool ParseHostsFile(const base::FilePath& path, DnsHosts* dns_hosts) {
189  dns_hosts->clear();
190  // Missing file indicates empty HOSTS.
191  if (!base::PathExists(path))
192    return true;
193
194  int64 size;
195  if (!base::GetFileSize(path, &size))
196    return false;
197
198  UMA_HISTOGRAM_COUNTS("AsyncDNS.HostsSize", size);
199
200  // Reject HOSTS files larger than |kMaxHostsSize| bytes.
201  const int64 kMaxHostsSize = 1 << 25;  // 32MB
202  if (size > kMaxHostsSize)
203    return false;
204
205  std::string contents;
206  if (!base::ReadFileToString(path, &contents))
207    return false;
208
209  ParseHosts(contents, dns_hosts);
210  return true;
211}
212
213}  // namespace net
214
215