1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "content/child/site_isolation_policy.h"
6
7#include "base/basictypes.h"
8#include "base/command_line.h"
9#include "base/lazy_instance.h"
10#include "base/logging.h"
11#include "base/metrics/histogram.h"
12#include "base/strings/string_util.h"
13#include "content/public/common/content_switches.h"
14#include "content/public/common/resource_response_info.h"
15#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
16#include "net/http/http_response_headers.h"
17
18using base::StringPiece;
19
20namespace content {
21
22namespace {
23
// Master switch for cross-site document blocking and its UMA data collection.
// It starts out disabled and is flipped through
// SiteIsolationPolicy::SetPolicyEnabled(); the policy is meant to be turned on
// only in renderer processes. Internal linkage comes from the enclosing
// unnamed namespace.
bool g_policy_enabled = false;
27
// MIME types recognized by the policy, grouped by the canonical type they are
// mapped to in SiteIsolationPolicy::GetCanonicalMimeType().

// HTML documents.
const char kTextHtml[] = "text/html";

// Plain text documents.
const char kTextPlain[] = "text/plain";

// JSON documents.
const char kAppJson[] = "application/json";
const char kTextJson[] = "text/json";
const char kTextXjson[] = "text/x-json";

// XML documents.
const char kTextXml[] = "text/xml";
const char xAppRssXml[] = "application/rss+xml";
const char kAppXml[] = "application/xml";
37
// Returns true when |status_code| is one of the HTTP status codes for which
// Chrome actually uses the response body as CSS/JavaScript. (For images,
// Chrome ignores the status code altogether.)
//
// TODO(dsjang): this is only needed for collecting UMA stat. Will be deleted
// when this class is used for actual blocking.
bool IsRenderableStatusCode(int status_code) {
  switch (status_code) {
    case 200:
    case 201:
    case 202:
    case 203:
    case 206:
    case 300:
    case 301:
    case 302:
    case 303:
    case 305:
    case 306:
    case 307:
      return true;
    default:
      return false;
  }
}
51
52bool MatchesSignature(StringPiece data,
53                      const StringPiece signatures[],
54                      size_t arr_size) {
55
56  size_t offset = data.find_first_not_of(" \t\r\n");
57  // There is no not-whitespace character in this document.
58  if (offset == base::StringPiece::npos)
59    return false;
60
61  data.remove_prefix(offset);
62  size_t length = data.length();
63
64  for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
65    const StringPiece& signature = signatures[sig_index];
66    size_t signature_length = signature.length();
67    if (length < signature_length)
68      continue;
69
70    if (LowerCaseEqualsASCII(
71            data.begin(), data.begin() + signature_length, signature.data()))
72      return true;
73  }
74  return false;
75}
76
77void IncrementHistogramCount(const std::string& name) {
78  // The default value of min, max, bucket_count are copied from histogram.h.
79  base::HistogramBase* histogram_pointer = base::Histogram::FactoryGet(
80      name, 1, 100000, 50, base::HistogramBase::kUmaTargetedHistogramFlag);
81  histogram_pointer->Add(1);
82}
83
84void IncrementHistogramEnum(const std::string& name,
85                          uint32 sample,
86                          uint32 boundary_value) {
87  // The default value of min, max, bucket_count are copied from histogram.h.
88  base::HistogramBase* histogram_pointer = base::LinearHistogram::FactoryGet(
89      name,
90      1,
91      boundary_value,
92      boundary_value + 1,
93      base::HistogramBase::kUmaTargetedHistogramFlag);
94  histogram_pointer->Add(sample);
95}
96
97void HistogramCountBlockedResponse(
98    const std::string& bucket_prefix,
99    linked_ptr<SiteIsolationResponseMetaData>& resp_data,
100    bool nosniff_block) {
101  std::string block_label(nosniff_block ? ".NoSniffBlocked" : ".Blocked");
102  IncrementHistogramCount(bucket_prefix + block_label);
103
104  // The content is blocked if it is sniffed as HTML/JSON/XML. When
105  // the blocked response is with an error status code, it is not
106  // disruptive for the following reasons : 1) the blocked content is
107  // not a binary object (such as an image) since it is sniffed as
108  // text; 2) then, this blocking only breaks the renderer behavior
109  // only if it is either JavaScript or CSS. However, the renderer
110  // doesn't use the contents of JS/CSS with unaffected status code
111  // (e.g, 404). 3) the renderer is expected not to use the cross-site
112  // document content for purposes other than JS/CSS (e.g, XHR).
113  bool renderable_status_code =
114      IsRenderableStatusCode(resp_data->http_status_code);
115
116  if (renderable_status_code) {
117    IncrementHistogramEnum(
118        bucket_prefix + block_label + ".RenderableStatusCode",
119        resp_data->resource_type,
120        RESOURCE_TYPE_LAST_TYPE);
121  } else {
122    IncrementHistogramCount(bucket_prefix + block_label +
123                            ".NonRenderableStatusCode");
124  }
125}
126
127void HistogramCountNotBlockedResponse(const std::string& bucket_prefix,
128                                      bool sniffed_as_js) {
129  IncrementHistogramCount(bucket_prefix + ".NotBlocked");
130  if (sniffed_as_js)
131    IncrementHistogramCount(bucket_prefix + ".NotBlocked.MaybeJS");
132}
133
134}  // namespace
135
136SiteIsolationResponseMetaData::SiteIsolationResponseMetaData() {}
137
138void SiteIsolationPolicy::SetPolicyEnabled(bool enabled) {
139  g_policy_enabled = enabled;
140}
141
142linked_ptr<SiteIsolationResponseMetaData>
143SiteIsolationPolicy::OnReceivedResponse(const GURL& frame_origin,
144                                        const GURL& response_url,
145                                        ResourceType resource_type,
146                                        int origin_pid,
147                                        const ResourceResponseInfo& info) {
148  if (!g_policy_enabled)
149    return linked_ptr<SiteIsolationResponseMetaData>();
150
151  // if |origin_pid| is non-zero, it means that this response is for a plugin
152  // spawned from this renderer process. We exclude responses for plugins for
153  // now, but eventually, we're going to make plugin processes directly talk to
154  // the browser process so that we don't apply cross-site document blocking to
155  // them.
156  if (origin_pid)
157    return linked_ptr<SiteIsolationResponseMetaData>();
158
159  UMA_HISTOGRAM_COUNTS("SiteIsolation.AllResponses", 1);
160
161  // See if this is for navigation. If it is, don't block it, under the
162  // assumption that we will put it in an appropriate process.
163  if (IsResourceTypeFrame(resource_type))
164    return linked_ptr<SiteIsolationResponseMetaData>();
165
166  if (!IsBlockableScheme(response_url))
167    return linked_ptr<SiteIsolationResponseMetaData>();
168
169  if (IsSameSite(frame_origin, response_url))
170    return linked_ptr<SiteIsolationResponseMetaData>();
171
172  SiteIsolationResponseMetaData::CanonicalMimeType canonical_mime_type =
173      GetCanonicalMimeType(info.mime_type);
174
175  if (canonical_mime_type == SiteIsolationResponseMetaData::Others)
176    return linked_ptr<SiteIsolationResponseMetaData>();
177
178  // Every CORS request should have the Access-Control-Allow-Origin header even
179  // if it is preceded by a pre-flight request. Therefore, if this is a CORS
180  // request, it has this header.  response.httpHeaderField() internally uses
181  // case-insensitive matching for the header name.
182  std::string access_control_origin;
183
184  // We can use a case-insensitive header name for EnumerateHeader().
185  info.headers->EnumerateHeader(
186      NULL, "access-control-allow-origin", &access_control_origin);
187  if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin))
188    return linked_ptr<SiteIsolationResponseMetaData>();
189
190  // Real XSD data collection starts from here.
191  std::string no_sniff;
192  info.headers->EnumerateHeader(NULL, "x-content-type-options", &no_sniff);
193
194  linked_ptr<SiteIsolationResponseMetaData> resp_data(
195      new SiteIsolationResponseMetaData);
196  resp_data->frame_origin = frame_origin.spec();
197  resp_data->response_url = response_url;
198  resp_data->resource_type = resource_type;
199  resp_data->canonical_mime_type = canonical_mime_type;
200  resp_data->http_status_code = info.headers->response_code();
201  resp_data->no_sniff = LowerCaseEqualsASCII(no_sniff, "nosniff");
202
203  return resp_data;
204}
205
206bool SiteIsolationPolicy::ShouldBlockResponse(
207    linked_ptr<SiteIsolationResponseMetaData>& resp_data,
208    const char* raw_data,
209    int raw_length,
210    std::string* alternative_data) {
211  if (!g_policy_enabled)
212    return false;
213
214  DCHECK(resp_data.get());
215
216  StringPiece data(raw_data, raw_length);
217
218  // Record the length of the first received network packet to see if it's
219  // enough for sniffing.
220  UMA_HISTOGRAM_COUNTS("SiteIsolation.XSD.DataLength", raw_length);
221
222  // Record the number of cross-site document responses with a specific mime
223  // type (text/html, text/xml, etc).
224  UMA_HISTOGRAM_ENUMERATION(
225      "SiteIsolation.XSD.MimeType",
226      resp_data->canonical_mime_type,
227      SiteIsolationResponseMetaData::MaxCanonicalMimeType);
228
229  // Store the result of cross-site document blocking analysis.
230  bool is_blocked = false;
231  bool sniffed_as_js = SniffForJS(data);
232
233  // Record the number of responses whose content is sniffed for what its mime
234  // type claims it to be. For example, we apply a HTML sniffer for a document
235  // tagged with text/html here. Whenever this check becomes true, we'll block
236  // the response.
237  if (resp_data->canonical_mime_type !=
238          SiteIsolationResponseMetaData::Plain) {
239    std::string bucket_prefix;
240    bool sniffed_as_target_document = false;
241    if (resp_data->canonical_mime_type ==
242            SiteIsolationResponseMetaData::HTML) {
243      bucket_prefix = "SiteIsolation.XSD.HTML";
244      sniffed_as_target_document = SniffForHTML(data);
245    } else if (resp_data->canonical_mime_type ==
246                   SiteIsolationResponseMetaData::XML) {
247      bucket_prefix = "SiteIsolation.XSD.XML";
248      sniffed_as_target_document = SniffForXML(data);
249    } else if (resp_data->canonical_mime_type ==
250                   SiteIsolationResponseMetaData::JSON) {
251      bucket_prefix = "SiteIsolation.XSD.JSON";
252      sniffed_as_target_document = SniffForJSON(data);
253    } else {
254      NOTREACHED() << "Not a blockable mime type: "
255                   << resp_data->canonical_mime_type;
256    }
257
258    if (sniffed_as_target_document) {
259      is_blocked = true;
260      HistogramCountBlockedResponse(bucket_prefix, resp_data, false);
261    } else {
262      if (resp_data->no_sniff) {
263        is_blocked = true;
264        HistogramCountBlockedResponse(bucket_prefix, resp_data, true);
265      } else {
266        HistogramCountNotBlockedResponse(bucket_prefix, sniffed_as_js);
267      }
268    }
269  } else {
270    // This block is for plain text documents. We apply our HTML, XML,
271    // and JSON sniffer to a text document in the order, and block it
272    // if any of them succeeds in sniffing.
273    std::string bucket_prefix;
274    if (SniffForHTML(data))
275      bucket_prefix = "SiteIsolation.XSD.Plain.HTML";
276    else if (SniffForXML(data))
277      bucket_prefix = "SiteIsolation.XSD.Plain.XML";
278    else if (SniffForJSON(data))
279      bucket_prefix = "SiteIsolation.XSD.Plain.JSON";
280
281    if (bucket_prefix.size() > 0) {
282      is_blocked = true;
283      HistogramCountBlockedResponse(bucket_prefix, resp_data, false);
284    } else if (resp_data->no_sniff) {
285      is_blocked = true;
286      HistogramCountBlockedResponse("SiteIsolation.XSD.Plain", resp_data, true);
287    } else {
288      HistogramCountNotBlockedResponse("SiteIsolation.XSD.Plain",
289                                       sniffed_as_js);
290    }
291  }
292
293  if (!CommandLine::ForCurrentProcess()->HasSwitch(
294           switches::kBlockCrossSiteDocuments))
295    is_blocked = false;
296
297  if (is_blocked) {
298    alternative_data->erase();
299    alternative_data->insert(0, " ");
300    LOG(ERROR) << resp_data->response_url
301               << " is blocked as an illegal cross-site document from "
302               << resp_data->frame_origin;
303  }
304  return is_blocked;
305}
306
307SiteIsolationResponseMetaData::CanonicalMimeType
308SiteIsolationPolicy::GetCanonicalMimeType(const std::string& mime_type) {
309  if (LowerCaseEqualsASCII(mime_type, kTextHtml)) {
310    return SiteIsolationResponseMetaData::HTML;
311  }
312
313  if (LowerCaseEqualsASCII(mime_type, kTextPlain)) {
314    return SiteIsolationResponseMetaData::Plain;
315  }
316
317  if (LowerCaseEqualsASCII(mime_type, kAppJson) ||
318      LowerCaseEqualsASCII(mime_type, kTextJson) ||
319      LowerCaseEqualsASCII(mime_type, kTextXjson)) {
320    return SiteIsolationResponseMetaData::JSON;
321  }
322
323  if (LowerCaseEqualsASCII(mime_type, kTextXml) ||
324      LowerCaseEqualsASCII(mime_type, xAppRssXml) ||
325      LowerCaseEqualsASCII(mime_type, kAppXml)) {
326    return SiteIsolationResponseMetaData::XML;
327  }
328
329 return SiteIsolationResponseMetaData::Others;
330}
331
332bool SiteIsolationPolicy::IsBlockableScheme(const GURL& url) {
333  // We exclude ftp:// from here. FTP doesn't provide a Content-Type
334  // header which our policy depends on, so we cannot protect any
335  // document from FTP servers.
336  return url.SchemeIs(url::kHttpScheme) || url.SchemeIs(url::kHttpsScheme);
337}
338
339bool SiteIsolationPolicy::IsSameSite(const GURL& frame_origin,
340                                     const GURL& response_url) {
341
342  if (!frame_origin.is_valid() || !response_url.is_valid())
343    return false;
344
345  if (frame_origin.scheme() != response_url.scheme())
346    return false;
347
348  // SameDomainOrHost() extracts the effective domains (public suffix plus one)
349  // from the two URLs and compare them.
350  return net::registry_controlled_domains::SameDomainOrHost(
351      frame_origin,
352      response_url,
353      net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
354}
355
356// We don't use Webkit's existing CORS policy implementation since
357// their policy works in terms of origins, not sites. For example,
358// when frame is sub.a.com and it is not allowed to access a document
359// with sub1.a.com. But under Site Isolation, it's allowed.
360bool SiteIsolationPolicy::IsValidCorsHeaderSet(
361    const GURL& frame_origin,
362    const GURL& website_origin,
363    const std::string& access_control_origin) {
364  // Many websites are sending back "\"*\"" instead of "*". This is
365  // non-standard practice, and not supported by Chrome. Refer to
366  // CrossOriginAccessControl::passesAccessControlCheck().
367
368  // TODO(dsjang): * is not allowed for the response from a request
369  // with cookies. This allows for more than what the renderer will
370  // eventually be able to receive, so we won't see illegal cross-site
371  // documents allowed by this. We have to find a way to see if this
372  // response is from a cookie-tagged request or not in the future.
373  if (access_control_origin == "*")
374    return true;
375
376  // TODO(dsjang): The CORS spec only treats a fully specified URL, except for
377  // "*", but many websites are using just a domain for access_control_origin,
378  // and this is blocked by Webkit's CORS logic here :
379  // CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
380  // is_valid() to false when it is created from a URL containing * in the
381  // domain part.
382
383  GURL cors_origin(access_control_origin);
384  return IsSameSite(frame_origin, cors_origin);
385}
386
387// This function is a slight modification of |net::SniffForHTML|.
388bool SiteIsolationPolicy::SniffForHTML(StringPiece data) {
389  // The content sniffer used by Chrome and Firefox are using "<!--"
390  // as one of the HTML signatures, but it also appears in valid
391  // JavaScript, considered as well-formed JS by the browser.  Since
392  // we do not want to block any JS, we exclude it from our HTML
393  // signatures. This can weaken our document block policy, but we can
394  // break less websites.
395  // TODO(dsjang): parameterize |net::SniffForHTML| with an option
396  // that decides whether to include <!-- or not, so that we can
397  // remove this function.
398  // TODO(dsjang): Once SiteIsolationPolicy is moved into the browser
399  // process, we should do single-thread checking here for the static
400  // initializer.
401  static const StringPiece kHtmlSignatures[] = {
402    StringPiece("<!DOCTYPE html"),  // HTML5 spec
403    StringPiece("<script"),  // HTML5 spec, Mozilla
404    StringPiece("<html"),    // HTML5 spec, Mozilla
405    StringPiece("<head"),    // HTML5 spec, Mozilla
406    StringPiece("<iframe"),  // Mozilla
407    StringPiece("<h1"),      // Mozilla
408    StringPiece("<div"),     // Mozilla
409    StringPiece("<font"),    // Mozilla
410    StringPiece("<table"),   // Mozilla
411    StringPiece("<a"),       // Mozilla
412    StringPiece("<style"),   // Mozilla
413    StringPiece("<title"),   // Mozilla
414    StringPiece("<b"),       // Mozilla
415    StringPiece("<body"),    // Mozilla
416    StringPiece("<br"),      // Mozilla
417    StringPiece("<p"),       // Mozilla
418    StringPiece("<?xml")     // Mozilla
419  };
420
421  while (data.length() > 0) {
422    if (MatchesSignature(
423          data, kHtmlSignatures, arraysize(kHtmlSignatures)))
424      return true;
425
426    // If we cannot find "<!--", we fail sniffing this as HTML.
427    static const StringPiece kCommentBegins[] = { StringPiece("<!--") };
428    if (!MatchesSignature(data, kCommentBegins, arraysize(kCommentBegins)))
429      break;
430
431    // Search for --> and do SniffForHTML after that. If we can find the
432    // comment's end, we start HTML sniffing from there again.
433    static const char kEndComment[] = "-->";
434    size_t offset = data.find(kEndComment);
435    if (offset == base::StringPiece::npos)
436      break;
437
438    // Proceed to the index next to the ending comment (-->).
439    data.remove_prefix(offset + strlen(kEndComment));
440  }
441
442  return false;
443}
444
445bool SiteIsolationPolicy::SniffForXML(base::StringPiece data) {
446  // TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
447  // this signature. However, XML is case-sensitive. Don't we have to
448  // be more lenient only to block documents starting with the exact
449  // string <?xml rather than <?XML ?
450  // TODO(dsjang): Once SiteIsolationPolicy is moved into the browser
451  // process, we should do single-thread checking here for the static
452  // initializer.
453  static const StringPiece kXmlSignatures[] = { StringPiece("<?xml") };
454  return MatchesSignature(data, kXmlSignatures, arraysize(kXmlSignatures));
455}
456
457bool SiteIsolationPolicy::SniffForJSON(base::StringPiece data) {
458  // TODO(dsjang): We have to come up with a better way to sniff
459  // JSON. However, even RE cannot help us that much due to the fact
460  // that we don't do full parsing.  This DFA starts with state 0, and
461  // finds {, "/' and : in that order. We're avoiding adding a
462  // dependency on a regular expression library.
463  enum {
464    kStartState,
465    kLeftBraceState,
466    kLeftQuoteState,
467    kColonState,
468    kTerminalState,
469  } state = kStartState;
470
471  size_t length = data.length();
472  for (size_t i = 0; i < length && state < kColonState; ++i) {
473    const char c = data[i];
474    if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
475      continue;
476
477    switch (state) {
478      case kStartState:
479        if (c == '{')
480          state = kLeftBraceState;
481        else
482          state = kTerminalState;
483        break;
484      case kLeftBraceState:
485        if (c == '\"' || c == '\'')
486          state = kLeftQuoteState;
487        else
488          state = kTerminalState;
489        break;
490      case kLeftQuoteState:
491        if (c == ':')
492          state = kColonState;
493        break;
494      case kColonState:
495      case kTerminalState:
496        NOTREACHED();
497        break;
498    }
499  }
500  return state == kColonState;
501}
502
503bool SiteIsolationPolicy::SniffForJS(StringPiece data) {
504  // TODO(dsjang): This is a real hack. The only purpose of this function is to
505  // try to see if there's any possibility that this data can be JavaScript
506  // (superset of JS). This function will be removed once UMA stats are
507  // gathered.
508
509  // Search for "var " for JS detection.
510  return data.find("var ") != base::StringPiece::npos;
511}
512
513}  // namespace content
514