1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Detecting mime types is a tricky business because we need to balance
6// compatibility concerns with security issues.  Here is a survey of how other
7// browsers behave and then a description of how we intend to behave.
8//
9// HTML payload, no Content-Type header:
10// * IE 7: Render as HTML
11// * Firefox 2: Render as HTML
12// * Safari 3: Render as HTML
13// * Opera 9: Render as HTML
14//
15// Here the choice seems clear:
16// => Chrome: Render as HTML
17//
18// HTML payload, Content-Type: "text/plain":
19// * IE 7: Render as HTML
20// * Firefox 2: Render as text
21// * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
22//                                   has an HTML extension)
23// * Opera 9: Render as text
24//
25// Here we choose to follow the majority (and break some compatibility with IE).
26// Many folks dislike IE's behavior here.
27// => Chrome: Render as text
28// We generalize this as follows.  If the Content-Type header is text/plain
29// we won't detect dangerous mime types (those that can execute script).
30//
31// HTML payload, Content-Type: "application/octet-stream":
32// * IE 7: Render as HTML
33// * Firefox 2: Download as application/octet-stream
34// * Safari 3: Render as HTML
35// * Opera 9: Render as HTML
36//
37// We follow Firefox.
38// => Chrome: Download as application/octet-stream
39// One factor in this decision is that IIS 4 and 5 will send
40// application/octet-stream for .xhtml files (because they don't recognize
41// the extension).  We did some experiments and it looks like this doesn't occur
42// very often on the web.  We choose the more secure option.
43//
44// GIF payload, no Content-Type header:
45// * IE 7: Render as GIF
46// * Firefox 2: Render as GIF
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
49// * Opera 9: Render as GIF
50//
51// The choice is clear.
52// => Chrome: Render as GIF
53// Once we decide to render HTML without a Content-Type header, there isn't much
54// reason not to render GIFs.
55//
56// GIF payload, Content-Type: "text/plain":
57// * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//                              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
62// * Opera 9: Render as GIF
63//
64// Displaying as text/plain makes little sense as the content will look like
65// gibberish.  Here, we could change our minds and download.
66// => Chrome: Render as GIF
67//
68// GIF payload, Content-Type: "application/octet-stream":
69// * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//                              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
74// * Opera 9: Render as GIF
75//
76// We used to render as GIF here, but the problem is that some sites want to
77// trigger downloads by sending application/octet-stream (even though they
78// should be sending Content-Disposition: attachment).  Although it is safe
79// to render as GIF from a security perspective, we actually get better
// compatibility if we don't sniff from application/octet-stream at all.
81// => Chrome: Download as application/octet-stream
82//
83// XHTML payload, Content-Type: "text/xml":
84// * IE 7: Render as XML
85// * Firefox 2: Render as HTML
86// * Safari 3: Render as HTML
87// * Opera 9: Render as HTML
88// The layout tests rely on us rendering this as HTML.
89// But we're conservative in XHTML detection, as this runs afoul of the
90// "don't detect dangerous mime types" rule.
91//
92// Note that our definition of HTML payload is much stricter than IE's
93// definition and roughly the same as Firefox's definition.
94
95#include <string>
96
97#include "net/base/mime_sniffer.h"
98
99#include "base/basictypes.h"
100#include "base/logging.h"
101#include "base/metrics/histogram.h"
102#include "base/string_util.h"
103#include "googleurl/src/gurl.h"
104#include "net/base/mime_util.h"
105
106namespace net {
107
// The number of content bytes we need to use all our magic numbers.  Feel free
// to increase this number if you add a longer magic number.
// MatchMagicNumber() DCHECKs every table entry's length against this value;
// the longest entry at present is the 42-character XHTML string in kMagicXML.
static const size_t kBytesRequiredForMagic = 42;
111
// One entry of a sniffing table: a pattern expected at the start of the
// content and the mime type to report when the pattern matches.
struct MagicNumber {
  // Mime type copied into the result string on a match.
  const char* mime_type;
  // Pattern compared against the beginning of the content.
  const char* magic;
  // Length of |magic| in bytes, not counting the terminating '\0'.
  size_t magic_len;
  // True: compare as a case-insensitive string.  False: compare byte-wise,
  // with '.' in |magic| matching any single byte (see MagicCmp()).
  bool is_string;
};
118
// Builds a byte-wise (case-sensitive) MagicNumber initializer.  The length is
// taken from sizeof, so |magic| must be a string literal and may contain
// embedded '\0' bytes.  The expansion ends with a comma, so table entries are
// written one after another without separators.
#define MAGIC_NUMBER(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic)-1, false },

// Magic strings are case insensitive and must not include '\0' characters
// Like MAGIC_NUMBER, |magic| must be a string literal and the expansion
// carries its own trailing comma.
#define MAGIC_STRING(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic)-1, true },
125
// General table of magic numbers consulted by SniffForMagicNumbers().
// Entries are tried in order and the first match wins, so a longer pattern
// must precede any shorter pattern that is a prefix of it (e.g.
// "%!PS-Adobe-" before "%!").  The index of the matching entry is recorded in
// the "mime_sniffer.kMagicNumbers2" histogram, so reordering entries changes
// the meaning of its buckets.
static const MagicNumber kMagicNumbers[] = {
  // Source: HTML 5 specification
  MAGIC_NUMBER("application/pdf", "%PDF-")
  MAGIC_NUMBER("application/postscript", "%!PS-Adobe-")
  MAGIC_NUMBER("image/gif", "GIF87a")
  MAGIC_NUMBER("image/gif", "GIF89a")
  MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A")
  MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF")
  MAGIC_NUMBER("image/bmp", "BM")
  // Source: Mozilla
  MAGIC_NUMBER("text/plain", "#!")  // Script
  MAGIC_NUMBER("text/plain", "%!")  // Script, similar to PS
  MAGIC_NUMBER("text/plain", "From")
  MAGIC_NUMBER("text/plain", ">From")
  // Chrome specific
  MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08")
  MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46")
  MAGIC_NUMBER("video/x-ms-asf",
      "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C")
  MAGIC_NUMBER("image/tiff", "I I")
  MAGIC_NUMBER("image/tiff", "II*")
  MAGIC_NUMBER("image/tiff", "MM\x00*")
  MAGIC_NUMBER("audio/mpeg", "ID3")
  // The '.' bytes below are wildcards (see MagicCmp()); they skip four bytes
  // between "RIFF" and "WEBP".
  MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ")
  MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3")
  // TODO(abarth): we don't handle partial byte matches yet
  // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB")
  // MAGIC_NUMBER("audio/mpeg", "\xFF\xE")
  // MAGIC_NUMBER("audio/mpeg", "\xFF\xF")
  MAGIC_NUMBER("application/zip", "PK\x03\x04")
  MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00")
  MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A")
  MAGIC_NUMBER("application/octet-stream", "MZ")  // EXE
  // Sniffing for Flash:
  //
  //   MAGIC_NUMBER("application/x-shockwave-flash", "CWS")
  //   MAGIC_NUMBER("application/x-shockwave-flash", "FLV")
  //   MAGIC_NUMBER("application/x-shockwave-flash", "FWS")
  //
  // Including these magic number for Flash is a trade off.
  //
  // Pros:
  //   * Flash is an important and popular file format
  //
  // Cons:
  //   * These patterns are fairly weak
  //   * If we mistakenly decide something is Flash, we will execute it
  //     in the origin of an unsuspecting site.  This could be a security
  //     vulnerability if the site allows users to upload content.
  //
  // On balance, we do not include these patterns.
};
178
179// Our HTML sniffer differs slightly from Mozilla.  For example, Mozilla will
180// decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is
181// HTML, but we will not.
182
// Builds a case-insensitive "text/html" entry for an opening tag.  |tag| is
// the tag name without the leading '<' and must be a string literal, since it
// is concatenated with "<" at compile time.
#define MAGIC_HTML_TAG(tag) \
  MAGIC_STRING("text/html", "<" tag)
185
// Patterns that SniffForHTML() looks for at the first non-whitespace byte of
// the content.  Each entry is a pure prefix match: nothing checks what follows
// the tag name.  Entries are tried in order and the index of the match is
// recorded in the "mime_sniffer.kSniffableTags2" histogram, so reordering
// entries changes the meaning of its buckets.
static const MagicNumber kSniffableTags[] = {
  // XML processing directive.  Although this is not an HTML mime type, we sniff
  // for this in the HTML phase because text/xml is just as powerful as HTML and
  // we want to leverage our white space skipping technology.
  // Unlike the HTML tags below, this entry is matched case-sensitively.
  MAGIC_NUMBER("text/xml", "<?xml")  // Mozilla
  // DOCTYPEs
  MAGIC_HTML_TAG("!DOCTYPE html")  // HTML5 spec
  // Sniffable tags, ordered by how often they occur in sniffable documents.
  MAGIC_HTML_TAG("script")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("html")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("!--")
  MAGIC_HTML_TAG("head")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("iframe")  // Mozilla
  MAGIC_HTML_TAG("h1")  // Mozilla
  MAGIC_HTML_TAG("div")  // Mozilla
  MAGIC_HTML_TAG("font")  // Mozilla
  MAGIC_HTML_TAG("table")  // Mozilla
  MAGIC_HTML_TAG("a")  // Mozilla
  MAGIC_HTML_TAG("style")  // Mozilla
  MAGIC_HTML_TAG("title")  // Mozilla
  MAGIC_HTML_TAG("b")  // Mozilla
  MAGIC_HTML_TAG("body")  // Mozilla
  MAGIC_HTML_TAG("br")
  MAGIC_HTML_TAG("p")  // Mozilla
};
211
212static base::Histogram* UMASnifferHistogramGet(const char* name,
213                                               int array_size) {
214  base::Histogram* counter =
215      base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size,
216      base::Histogram::kUmaTargetedHistogramFlag);
217  return counter;
218}
219
// Returns true if the first |len| bytes of |content| match |magic_entry|.
// A '.' in |magic_entry| is a wildcard that matches any single byte, which
// lets a pattern skip over bytes it does not care about.  Both buffers must
// hold at least |len| bytes.
static bool MagicCmp(const char* magic_entry, const char* content, size_t len) {
  for (size_t i = 0; i < len; ++i) {
    const char expected = magic_entry[i];
    if (expected == '.')
      continue;  // Wildcard: accept whatever byte the content has here.
    if (expected != content[i])
      return false;
  }
  return true;
}
232
233static bool MatchMagicNumber(const char* content, size_t size,
234                             const MagicNumber* magic_entry,
235                             std::string* result) {
236  const size_t len = magic_entry->magic_len;
237
238  // Keep kBytesRequiredForMagic honest.
239  DCHECK_LE(len, kBytesRequiredForMagic);
240
241  // To compare with magic strings, we need to compute strlen(content), but
242  // content might not actually have a null terminator.  In that case, we
243  // pretend the length is content_size.
244  const char* end =
245      static_cast<const char*>(memchr(content, '\0', size));
246  const size_t content_strlen =
247      (end != NULL) ? static_cast<size_t>(end - content) : size;
248
249  bool match = false;
250  if (magic_entry->is_string) {
251    if (content_strlen >= len) {
252      // String comparisons are case-insensitive
253      match = (base::strncasecmp(magic_entry->magic, content, len) == 0);
254    }
255  } else {
256    if (size >= len)
257      match = MagicCmp(magic_entry->magic, content, len);
258  }
259
260  if (match) {
261    result->assign(magic_entry->mime_type);
262    return true;
263  }
264  return false;
265}
266
267static bool CheckForMagicNumbers(const char* content, size_t size,
268                                 const MagicNumber* magic, size_t magic_len,
269                                 base::Histogram* counter,
270                                 std::string* result) {
271  for (size_t i = 0; i < magic_len; ++i) {
272    if (MatchMagicNumber(content, size, &(magic[i]), result)) {
273      if (counter) counter->Add(static_cast<int>(i));
274      return true;
275    }
276  }
277  return false;
278}
279
280// Truncates |size| to |max_size| and returns true if |size| is at least
281// |max_size|.
282static bool TruncateSize(const size_t max_size, size_t* size) {
283  // Keep kMaxBytesToSniff honest.
284  DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
285
286  if (*size >= max_size) {
287    *size = max_size;
288    return true;
289  }
290  return false;
291}
292
293// Returns true and sets result if the content appears to be HTML.
294// Clears have_enough_content if more data could possibly change the result.
295static bool SniffForHTML(const char* content,
296                         size_t size,
297                         bool* have_enough_content,
298                         std::string* result) {
299  // For HTML, we are willing to consider up to 512 bytes. This may be overly
300  // conservative as IE only considers 256.
301  *have_enough_content &= TruncateSize(512, &size);
302
303  // We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
304  // but with some modifications to better match the HTML5 spec.
305  const char* const end = content + size;
306  const char* pos;
307  for (pos = content; pos < end; ++pos) {
308    if (!IsAsciiWhitespace(*pos))
309      break;
310  }
311  static base::Histogram* counter(NULL);
312  if (!counter)
313    counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2",
314                                     arraysize(kSniffableTags));
315  // |pos| now points to first non-whitespace character (or at end).
316  return CheckForMagicNumbers(pos, end - pos,
317                              kSniffableTags, arraysize(kSniffableTags),
318                              counter, result);
319}
320
321// Returns true and sets result if the content matches any of kMagicNumbers.
322// Clears have_enough_content if more data could possibly change the result.
323static bool SniffForMagicNumbers(const char* content,
324                                 size_t size,
325                                 bool* have_enough_content,
326                                 std::string* result) {
327  *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
328
329  // Check our big table of Magic Numbers
330  static base::Histogram* counter(NULL);
331  if (!counter)
332    counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2",
333                                     arraysize(kMagicNumbers));
334  return CheckForMagicNumbers(content, size,
335                              kMagicNumbers, arraysize(kMagicNumbers),
336                              counter, result);
337}
338
// Opening tags that SniffXML() recognizes in content served as plain XML.
// All entries are matched case-insensitively as prefixes.
static const MagicNumber kMagicXML[] = {
  // We want to be very conservative in interpreting text/xml content as
  // XHTML -- we just want to sniff enough to make unit tests pass.
  // So we match explicitly on this, and don't match other ways of writing
  // it in semantically-equivalent ways.
  MAGIC_STRING("application/xhtml+xml",
               "<html xmlns=\"http://www.w3.org/1999/xhtml\"")
  MAGIC_STRING("application/atom+xml", "<feed")
  MAGIC_STRING("application/rss+xml", "<rss")
};
350
351// Returns true and sets result if the content appears to contain XHTML or a
352// feed.
353// Clears have_enough_content if more data could possibly change the result.
354//
355// TODO(evanm): this is similar but more conservative than what Safari does,
356// while HTML5 has a different recommendation -- what should we do?
357// TODO(evanm): this is incorrect for documents whose encoding isn't a superset
358// of ASCII -- do we care?
359static bool SniffXML(const char* content,
360                     size_t size,
361                     bool* have_enough_content,
362                     std::string* result) {
363  // We allow at most 300 bytes of content before we expect the opening tag.
364  *have_enough_content &= TruncateSize(300, &size);
365  const char* pos = content;
366  const char* const end = content + size;
367
368  // This loop iterates through tag-looking offsets in the file.
369  // We want to skip XML processing instructions (of the form "<?xml ...")
370  // and stop at the first "plain" tag, then make a decision on the mime-type
371  // based on the name (or possibly attributes) of that tag.
372  static base::Histogram* counter(NULL);
373  if (!counter)
374    counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2",
375                                     arraysize(kMagicXML));
376  const int kMaxTagIterations = 5;
377  for (int i = 0; i < kMaxTagIterations && pos < end; ++i) {
378    pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos));
379    if (!pos)
380      return false;
381
382    if (base::strncasecmp(pos, "<?xml", sizeof("<?xml")-1) == 0) {
383      // Skip XML declarations.
384      ++pos;
385      continue;
386    } else if (base::strncasecmp(pos, "<!DOCTYPE",
387                                 sizeof("<!DOCTYPE")-1) == 0) {
388      // Skip DOCTYPE declarations.
389      ++pos;
390      continue;
391    }
392
393    if (CheckForMagicNumbers(pos, end - pos,
394                             kMagicXML, arraysize(kMagicXML),
395                             counter, result))
396      return true;
397
398    // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult
399    // to identify.
400
401    // If we get here, we've hit an initial tag that hasn't matched one of the
402    // above tests.  Abort.
403    return true;
404  }
405
406  // We iterated too far without finding a start tag.
407  // If we have more content to look at, we aren't going to change our mind by
408  // seeing more bytes from the network.
409  return pos < end;
410}
411
// Byte order marks
// SniffBinary() treats content that starts with one of these as text.  Only
// the fact of a match is used there; the mime type stored in the entry is
// discarded.
static const MagicNumber kByteOrderMark[] = {
  MAGIC_NUMBER("text/plain", "\xFE\xFF")  // UTF-16BE
  MAGIC_NUMBER("text/plain", "\xFF\xFE")  // UTF-16LE
  MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF")  // UTF-8
};
418
// Whether a given byte looks like it might be part of binary content.
// Indexed by the byte's unsigned value; a non-zero entry means "binary".
// The entries set to 1 are the control characters other than TAB, LF, FF, CR
// and ESC.  The table is read-only, hence const.
// Source: HTML5 spec
static const char kByteLooksBinary[] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,  // 0x00 - 0x0F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  // 0x10 - 0x1F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20 - 0x2F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30 - 0x3F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40 - 0x4F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50 - 0x5F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60 - 0x6F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70 - 0x7F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0 - 0xAF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0 - 0xBF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xC0 - 0xCF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xD0 - 0xDF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xE0 - 0xEF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0 - 0xFF
};
439
440// Returns true and sets result to "application/octet-stream" if the content
441// appears to be binary data. Otherwise, returns false and sets "text/plain".
442// Clears have_enough_content if more data could possibly change the result.
443static bool SniffBinary(const char* content,
444                        size_t size,
445                        bool* have_enough_content,
446                        std::string* result) {
447  // There is no concensus about exactly how to sniff for binary content.
448  // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
449  // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
450  // Here, we side with FF, but with a smaller buffer. This size was chosen
451  // because it is small enough to comfortably fit into a single packet (after
452  // allowing for headers) and yet large enough to account for binary formats
453  // that have a significant amount of ASCII at the beginning (crbug.com/15314).
454  const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size);
455
456  // First, we look for a BOM.
457  static base::Histogram* counter(NULL);
458  if (!counter)
459    counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2",
460                                     arraysize(kByteOrderMark));
461  std::string unused;
462  if (CheckForMagicNumbers(content, size,
463                           kByteOrderMark, arraysize(kByteOrderMark),
464                           counter, &unused)) {
465    // If there is BOM, we think the buffer is not binary.
466    result->assign("text/plain");
467    return false;
468  }
469
470  // Next we look to see if any of the bytes "look binary."
471  for (size_t i = 0; i < size; ++i) {
472    // If we a see a binary-looking byte, we think the content is binary.
473    if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) {
474      result->assign("application/octet-stream");
475      return true;
476    }
477  }
478
479  // No evidence either way. Default to non-binary and, if truncated, clear
480  // have_enough_content because there could be a binary looking byte in the
481  // truncated data.
482  *have_enough_content &= is_truncated;
483  result->assign("text/plain");
484  return false;
485}
486
487static bool IsUnknownMimeType(const std::string& mime_type) {
488  // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
489  // If we do, please be careful not to alter the semantics at all.
490  static const char* kUnknownMimeTypes[] = {
491    // Empty mime types are as unknown as they get.
492    "",
493    // The unknown/unknown type is popular and uninformative
494    "unknown/unknown",
495    // The second most popular unknown mime type is application/unknown
496    "application/unknown",
497    // Firefox rejects a mime type if it is exactly */*
498    "*/*",
499  };
500  static base::Histogram* counter(NULL);
501  if (!counter)
502    counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2",
503                                     arraysize(kUnknownMimeTypes) + 1);
504  for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) {
505    if (mime_type == kUnknownMimeTypes[i]) {
506      counter->Add(i);
507      return true;
508    }
509  }
510  if (mime_type.find('/') == std::string::npos) {
511    // Firefox rejects a mime type if it does not contain a slash
512    counter->Add(arraysize(kUnknownMimeTypes));
513    return true;
514  }
515  return false;
516}
517
518// Returns true and sets result if the content appears to be a crx (chrome
519// extension) file.
520// Clears have_enough_content if more data could possibly change the result.
521static bool SniffCRX(const char* content,
522                     size_t size,
523                     const GURL& url,
524                     const std::string& type_hint,
525                     bool* have_enough_content,
526                     std::string* result) {
527  static base::Histogram* counter(NULL);
528  if (!counter)
529    counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3);
530
531  // Technically, the crx magic number is just Cr24, but the bytes after that
532  // are a version number which changes infrequently. Including it in the
533  // sniffing gives us less room for error. If the version number ever changes,
534  // we can just add an entry to this list.
535  //
536  // TODO(aa): If we ever have another magic number, we'll want to pass a
537  // histogram into CheckForMagicNumbers(), below, to see which one matched.
538  static const struct MagicNumber kCRXMagicNumbers[] = {
539    MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00")
540  };
541
542  // Only consider files that have the extension ".crx".
543  static const char kCRXExtension[] = ".crx";
544  // Ignore null by subtracting 1.
545  static const int kExtensionLength = arraysize(kCRXExtension) - 1;
546  if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) ==
547      url.path().size() - kExtensionLength) {
548    counter->Add(1);
549  } else {
550    return false;
551  }
552
553  *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
554  if (CheckForMagicNumbers(content, size,
555                           kCRXMagicNumbers, arraysize(kCRXMagicNumbers),
556                           NULL, result)) {
557    counter->Add(2);
558  } else {
559    return false;
560  }
561
562  return true;
563}
564
565bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) {
566  static base::Histogram* should_sniff_counter(NULL);
567  if (!should_sniff_counter)
568    should_sniff_counter =
569        UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3);
570  // We are willing to sniff the mime type for HTTP, HTTPS, and FTP
571  bool sniffable_scheme = url.is_empty() ||
572                          url.SchemeIs("http") ||
573                          url.SchemeIs("https") ||
574                          url.SchemeIs("ftp") ||
575                          url.SchemeIsFile();
576  if (!sniffable_scheme) {
577    should_sniff_counter->Add(1);
578    return false;
579  }
580
581  static const char* kSniffableTypes[] = {
582    // Many web servers are misconfigured to send text/plain for many
583    // different types of content.
584    "text/plain",
585    // We want to sniff application/octet-stream for
586    // application/x-chrome-extension, but nothing else.
587    "application/octet-stream",
588    // XHTML and Atom/RSS feeds are often served as plain xml instead of
589    // their more specific mime types.
590    "text/xml",
591    "application/xml",
592  };
593  static base::Histogram* counter(NULL);
594  if (!counter)
595    counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2",
596                                     arraysize(kSniffableTypes) + 1);
597  for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) {
598    if (mime_type == kSniffableTypes[i]) {
599      counter->Add(i);
600      should_sniff_counter->Add(2);
601      return true;
602    }
603  }
604  if (IsUnknownMimeType(mime_type)) {
605    // The web server didn't specify a content type or specified a mime
606    // type that we ignore.
607    counter->Add(arraysize(kSniffableTypes));
608    should_sniff_counter->Add(2);
609    return true;
610  }
611  should_sniff_counter->Add(1);
612  return false;
613}
614
615bool SniffMimeType(const char* content, size_t content_size,
616                   const GURL& url, const std::string& type_hint,
617                   std::string* result) {
618  DCHECK_LT(content_size, 1000000U);  // sanity check
619  DCHECK(content);
620  DCHECK(result);
621
622  // By default, we assume we have enough content.
623  // Each sniff routine may unset this if it wasn't provided enough content.
624  bool have_enough_content = true;
625
626  // By default, we'll return the type hint.
627  // Each sniff routine may modify this if it has a better guess..
628  result->assign(type_hint);
629
630  // Cache information about the type_hint
631  const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);
632
633  // First check for HTML
634  if (hint_is_unknown_mime_type) {
635    // We're only willing to sniff HTML if the server has not supplied a mime
636    // type, or if the type it did supply indicates that it doesn't know what
637    // the type should be.
638    if (SniffForHTML(content, content_size, &have_enough_content, result))
639      return true;  // We succeeded in sniffing HTML.  No more content needed.
640  }
641
642  // We're only willing to sniff for binary in 3 cases:
643  // 1. The server has not supplied a mime type.
644  // 2. The type it did supply indicates that it doesn't know what the type
645  //    should be.
646  // 3. The type is "text/plain" which is the default on some web servers and
647  //    could be indicative of a mis-configuration that we shield the user from.
648  const bool hint_is_text_plain = (type_hint == "text/plain");
649  if (hint_is_unknown_mime_type || hint_is_text_plain) {
650    if (!SniffBinary(content, content_size, &have_enough_content, result)) {
651      // If the server said the content was text/plain and it doesn't appear
652      // to be binary, then we trust it.
653      if (hint_is_text_plain) {
654        return have_enough_content;
655      }
656    }
657  }
658
659  // If we have plain XML, sniff XML subtypes.
660  if (type_hint == "text/xml" || type_hint == "application/xml") {
661    // We're not interested in sniffing these types for images and the like.
662    // Instead, we're looking explicitly for a feed.  If we don't find one
663    // we're done and return early.
664    if (SniffXML(content, content_size, &have_enough_content, result))
665      return true;
666    return have_enough_content;
667  }
668
669  // CRX files (chrome extensions) have a special sniffing algorithm. It is
670  // tighter than the others because we don't have to match legacy behavior.
671  if (SniffCRX(content, content_size, url, type_hint,
672               &have_enough_content, result))
673    return true;
674
675  // We're not interested in sniffing for magic numbers when the type_hint
676  // is application/octet-stream.  Time to bail out.
677  if (type_hint == "application/octet-stream")
678    return have_enough_content;
679
680  // Now we look in our large table of magic numbers to see if we can find
681  // anything that matches the content.
682  if (SniffForMagicNumbers(content, content_size,
683                           &have_enough_content, result))
684    return true;  // We've matched a magic number.  No more content needed.
685
686  return have_enough_content;
687}
688
689}  // namespace net
690