// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Detecting mime types is a tricky business because we need to balance
// compatibility concerns with security issues.  Here is a survey of how other
// browsers behave and then a description of how we intend to behave.
//
// HTML payload, no Content-Type header:
// * IE 7: Render as HTML
// * Firefox 2: Render as HTML
// * Safari 3: Render as HTML
// * Opera 9: Render as HTML
//
// Here the choice seems clear:
// => Chrome: Render as HTML
//
// HTML payload, Content-Type: "text/plain":
// * IE 7: Render as HTML
// * Firefox 2: Render as text
// * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
//                                   has an HTML extension)
// * Opera 9: Render as text
//
// Here we choose to follow the majority (and break some compatibility with IE).
// Many folks dislike IE's behavior here.
// => Chrome: Render as text
// We generalize this as follows.  If the Content-Type header is text/plain
// we won't detect dangerous mime types (those that can execute script).
//
// HTML payload, Content-Type: "application/octet-stream":
// * IE 7: Render as HTML
// * Firefox 2: Download as application/octet-stream
// * Safari 3: Render as HTML
// * Opera 9: Render as HTML
//
// We follow Firefox.
// => Chrome: Download as application/octet-stream
// One factor in this decision is that IIS 4 and 5 will send
// application/octet-stream for .xhtml files (because they don't recognize
// the extension).  We did some experiments and it looks like this doesn't occur
// very often on the web.  We choose the more secure option.
//
// GIF payload, no Content-Type header:
// * IE 7: Render as GIF
// * Firefox 2: Render as GIF
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
// * Opera 9: Render as GIF
//
// The choice is clear.
// => Chrome: Render as GIF
// Once we decide to render HTML without a Content-Type header, there isn't much
// reason not to render GIFs.
//
// GIF payload, Content-Type: "text/plain":
// * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//                              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
// * Opera 9: Render as GIF
//
// Displaying as text/plain makes little sense as the content will look like
// gibberish.  Here, we could change our minds and download.
// => Chrome: Render as GIF
//
// GIF payload, Content-Type: "application/octet-stream":
// * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//                              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
// * Opera 9: Render as GIF
//
// We used to render as GIF here, but the problem is that some sites want to
// trigger downloads by sending application/octet-stream (even though they
// should be sending Content-Disposition: attachment).  Although it is safe
// to render as GIF from a security perspective, we actually get better
// compatibility if we don't sniff from application/octet-stream at all.
// => Chrome: Download as application/octet-stream
//
// XHTML payload, Content-Type: "text/xml":
// * IE 7: Render as XML
// * Firefox 2: Render as HTML
// * Safari 3: Render as HTML
// * Opera 9: Render as HTML
// The layout tests rely on us rendering this as HTML.
89// But we're conservative in XHTML detection, as this runs afoul of the 90// "don't detect dangerous mime types" rule. 91// 92// Note that our definition of HTML payload is much stricter than IE's 93// definition and roughly the same as Firefox's definition. 94 95#include <string> 96 97#include "net/base/mime_sniffer.h" 98 99#include "base/basictypes.h" 100#include "base/logging.h" 101#include "base/metrics/histogram.h" 102#include "base/string_util.h" 103#include "googleurl/src/gurl.h" 104#include "net/base/mime_util.h" 105 106namespace net { 107 108// The number of content bytes we need to use all our magic numbers. Feel free 109// to increase this number if you add a longer magic number. 110static const size_t kBytesRequiredForMagic = 42; 111 112struct MagicNumber { 113 const char* mime_type; 114 const char* magic; 115 size_t magic_len; 116 bool is_string; 117}; 118 119#define MAGIC_NUMBER(mime_type, magic) \ 120 { (mime_type), (magic), sizeof(magic)-1, false }, 121 122// Magic strings are case insensitive and must not include '\0' characters 123#define MAGIC_STRING(mime_type, magic) \ 124 { (mime_type), (magic), sizeof(magic)-1, true }, 125 126static const MagicNumber kMagicNumbers[] = { 127 // Source: HTML 5 specification 128 MAGIC_NUMBER("application/pdf", "%PDF-") 129 MAGIC_NUMBER("application/postscript", "%!PS-Adobe-") 130 MAGIC_NUMBER("image/gif", "GIF87a") 131 MAGIC_NUMBER("image/gif", "GIF89a") 132 MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A") 133 MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF") 134 MAGIC_NUMBER("image/bmp", "BM") 135 // Source: Mozilla 136 MAGIC_NUMBER("text/plain", "#!") // Script 137 MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS 138 MAGIC_NUMBER("text/plain", "From") 139 MAGIC_NUMBER("text/plain", ">From") 140 // Chrome specific 141 MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08") 142 MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46") 143 MAGIC_NUMBER("video/x-ms-asf", 144 
"\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C") 145 MAGIC_NUMBER("image/tiff", "I I") 146 MAGIC_NUMBER("image/tiff", "II*") 147 MAGIC_NUMBER("image/tiff", "MM\x00*") 148 MAGIC_NUMBER("audio/mpeg", "ID3") 149 MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ") 150 MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3") 151 // TODO(abarth): we don't handle partial byte matches yet 152 // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB") 153 // MAGIC_NUMBER("audio/mpeg", "\xFF\xE") 154 // MAGIC_NUMBER("audio/mpeg", "\xFF\xF") 155 MAGIC_NUMBER("application/zip", "PK\x03\x04") 156 MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00") 157 MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A") 158 MAGIC_NUMBER("application/octet-stream", "MZ") // EXE 159 // Sniffing for Flash: 160 // 161 // MAGIC_NUMBER("application/x-shockwave-flash", "CWS") 162 // MAGIC_NUMBER("application/x-shockwave-flash", "FLV") 163 // MAGIC_NUMBER("application/x-shockwave-flash", "FWS") 164 // 165 // Including these magic number for Flash is a trade off. 166 // 167 // Pros: 168 // * Flash is an important and popular file format 169 // 170 // Cons: 171 // * These patterns are fairly weak 172 // * If we mistakenly decide something is Flash, we will execute it 173 // in the origin of an unsuspecting site. This could be a security 174 // vulnerability if the site allows users to upload content. 175 // 176 // On balance, we do not include these patterns. 177}; 178 179// Our HTML sniffer differs slightly from Mozilla. For example, Mozilla will 180// decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is 181// HTML, but we will not. 182 183#define MAGIC_HTML_TAG(tag) \ 184 MAGIC_STRING("text/html", "<" tag) 185 186static const MagicNumber kSniffableTags[] = { 187 // XML processing directive. 
Although this is not an HTML mime type, we sniff 188 // for this in the HTML phase because text/xml is just as powerful as HTML and 189 // we want to leverage our white space skipping technology. 190 MAGIC_NUMBER("text/xml", "<?xml") // Mozilla 191 // DOCTYPEs 192 MAGIC_HTML_TAG("!DOCTYPE html") // HTML5 spec 193 // Sniffable tags, ordered by how often they occur in sniffable documents. 194 MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla 195 MAGIC_HTML_TAG("html") // HTML5 spec, Mozilla 196 MAGIC_HTML_TAG("!--") 197 MAGIC_HTML_TAG("head") // HTML5 spec, Mozilla 198 MAGIC_HTML_TAG("iframe") // Mozilla 199 MAGIC_HTML_TAG("h1") // Mozilla 200 MAGIC_HTML_TAG("div") // Mozilla 201 MAGIC_HTML_TAG("font") // Mozilla 202 MAGIC_HTML_TAG("table") // Mozilla 203 MAGIC_HTML_TAG("a") // Mozilla 204 MAGIC_HTML_TAG("style") // Mozilla 205 MAGIC_HTML_TAG("title") // Mozilla 206 MAGIC_HTML_TAG("b") // Mozilla 207 MAGIC_HTML_TAG("body") // Mozilla 208 MAGIC_HTML_TAG("br") 209 MAGIC_HTML_TAG("p") // Mozilla 210}; 211 212static base::Histogram* UMASnifferHistogramGet(const char* name, 213 int array_size) { 214 base::Histogram* counter = 215 base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size, 216 base::Histogram::kUmaTargetedHistogramFlag); 217 return counter; 218} 219 220// Compare content header to a magic number where magic_entry can contain '.' 221// for single character of anything, allowing some bytes to be skipped. 222static bool MagicCmp(const char* magic_entry, const char* content, size_t len) { 223 while (len) { 224 if ((*magic_entry != '.') && (*magic_entry != *content)) 225 return false; 226 ++magic_entry; 227 ++content; 228 --len; 229 } 230 return true; 231} 232 233static bool MatchMagicNumber(const char* content, size_t size, 234 const MagicNumber* magic_entry, 235 std::string* result) { 236 const size_t len = magic_entry->magic_len; 237 238 // Keep kBytesRequiredForMagic honest. 
239 DCHECK_LE(len, kBytesRequiredForMagic); 240 241 // To compare with magic strings, we need to compute strlen(content), but 242 // content might not actually have a null terminator. In that case, we 243 // pretend the length is content_size. 244 const char* end = 245 static_cast<const char*>(memchr(content, '\0', size)); 246 const size_t content_strlen = 247 (end != NULL) ? static_cast<size_t>(end - content) : size; 248 249 bool match = false; 250 if (magic_entry->is_string) { 251 if (content_strlen >= len) { 252 // String comparisons are case-insensitive 253 match = (base::strncasecmp(magic_entry->magic, content, len) == 0); 254 } 255 } else { 256 if (size >= len) 257 match = MagicCmp(magic_entry->magic, content, len); 258 } 259 260 if (match) { 261 result->assign(magic_entry->mime_type); 262 return true; 263 } 264 return false; 265} 266 267static bool CheckForMagicNumbers(const char* content, size_t size, 268 const MagicNumber* magic, size_t magic_len, 269 base::Histogram* counter, 270 std::string* result) { 271 for (size_t i = 0; i < magic_len; ++i) { 272 if (MatchMagicNumber(content, size, &(magic[i]), result)) { 273 if (counter) counter->Add(static_cast<int>(i)); 274 return true; 275 } 276 } 277 return false; 278} 279 280// Truncates |size| to |max_size| and returns true if |size| is at least 281// |max_size|. 282static bool TruncateSize(const size_t max_size, size_t* size) { 283 // Keep kMaxBytesToSniff honest. 284 DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff); 285 286 if (*size >= max_size) { 287 *size = max_size; 288 return true; 289 } 290 return false; 291} 292 293// Returns true and sets result if the content appears to be HTML. 294// Clears have_enough_content if more data could possibly change the result. 295static bool SniffForHTML(const char* content, 296 size_t size, 297 bool* have_enough_content, 298 std::string* result) { 299 // For HTML, we are willing to consider up to 512 bytes. 
This may be overly 300 // conservative as IE only considers 256. 301 *have_enough_content &= TruncateSize(512, &size); 302 303 // We adopt a strategy similar to that used by Mozilla to sniff HTML tags, 304 // but with some modifications to better match the HTML5 spec. 305 const char* const end = content + size; 306 const char* pos; 307 for (pos = content; pos < end; ++pos) { 308 if (!IsAsciiWhitespace(*pos)) 309 break; 310 } 311 static base::Histogram* counter(NULL); 312 if (!counter) 313 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2", 314 arraysize(kSniffableTags)); 315 // |pos| now points to first non-whitespace character (or at end). 316 return CheckForMagicNumbers(pos, end - pos, 317 kSniffableTags, arraysize(kSniffableTags), 318 counter, result); 319} 320 321// Returns true and sets result if the content matches any of kMagicNumbers. 322// Clears have_enough_content if more data could possibly change the result. 323static bool SniffForMagicNumbers(const char* content, 324 size_t size, 325 bool* have_enough_content, 326 std::string* result) { 327 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); 328 329 // Check our big table of Magic Numbers 330 static base::Histogram* counter(NULL); 331 if (!counter) 332 counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2", 333 arraysize(kMagicNumbers)); 334 return CheckForMagicNumbers(content, size, 335 kMagicNumbers, arraysize(kMagicNumbers), 336 counter, result); 337} 338 339// Byte order marks 340static const MagicNumber kMagicXML[] = { 341 // We want to be very conservative in interpreting text/xml content as 342 // XHTML -- we just want to sniff enough to make unit tests pass. 343 // So we match explicitly on this, and don't match other ways of writing 344 // it in semantically-equivalent ways. 
345 MAGIC_STRING("application/xhtml+xml", 346 "<html xmlns=\"http://www.w3.org/1999/xhtml\"") 347 MAGIC_STRING("application/atom+xml", "<feed") 348 MAGIC_STRING("application/rss+xml", "<rss") // UTF-8 349}; 350 351// Returns true and sets result if the content appears to contain XHTML or a 352// feed. 353// Clears have_enough_content if more data could possibly change the result. 354// 355// TODO(evanm): this is similar but more conservative than what Safari does, 356// while HTML5 has a different recommendation -- what should we do? 357// TODO(evanm): this is incorrect for documents whose encoding isn't a superset 358// of ASCII -- do we care? 359static bool SniffXML(const char* content, 360 size_t size, 361 bool* have_enough_content, 362 std::string* result) { 363 // We allow at most 300 bytes of content before we expect the opening tag. 364 *have_enough_content &= TruncateSize(300, &size); 365 const char* pos = content; 366 const char* const end = content + size; 367 368 // This loop iterates through tag-looking offsets in the file. 369 // We want to skip XML processing instructions (of the form "<?xml ...") 370 // and stop at the first "plain" tag, then make a decision on the mime-type 371 // based on the name (or possibly attributes) of that tag. 372 static base::Histogram* counter(NULL); 373 if (!counter) 374 counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2", 375 arraysize(kMagicXML)); 376 const int kMaxTagIterations = 5; 377 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { 378 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); 379 if (!pos) 380 return false; 381 382 if (base::strncasecmp(pos, "<?xml", sizeof("<?xml")-1) == 0) { 383 // Skip XML declarations. 384 ++pos; 385 continue; 386 } else if (base::strncasecmp(pos, "<!DOCTYPE", 387 sizeof("<!DOCTYPE")-1) == 0) { 388 // Skip DOCTYPE declarations. 
389 ++pos; 390 continue; 391 } 392 393 if (CheckForMagicNumbers(pos, end - pos, 394 kMagicXML, arraysize(kMagicXML), 395 counter, result)) 396 return true; 397 398 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult 399 // to identify. 400 401 // If we get here, we've hit an initial tag that hasn't matched one of the 402 // above tests. Abort. 403 return true; 404 } 405 406 // We iterated too far without finding a start tag. 407 // If we have more content to look at, we aren't going to change our mind by 408 // seeing more bytes from the network. 409 return pos < end; 410} 411 412// Byte order marks 413static const MagicNumber kByteOrderMark[] = { 414 MAGIC_NUMBER("text/plain", "\xFE\xFF") // UTF-16BE 415 MAGIC_NUMBER("text/plain", "\xFF\xFE") // UTF-16LE 416 MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8 417}; 418 419// Whether a given byte looks like it might be part of binary content. 420// Source: HTML5 spec 421static char kByteLooksBinary[] = { 422 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, // 0x00 - 0x0F 423 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, // 0x10 - 0x1F 424 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20 - 0x2F 425 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30 - 0x3F 426 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40 - 0x4F 427 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50 - 0x5F 428 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60 - 0x6F 429 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70 - 0x7F 430 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8F 431 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9F 432 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 - 0xAF 433 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 - 0xBF 434 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0 - 0xCF 435 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0 - 0xDF 436 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0 - 0xEF 437 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF 438}; 439 440// Returns true and sets result to "application/octet-stream" if the content 441// appears to be binary data. Otherwise, returns false and sets "text/plain". 442// Clears have_enough_content if more data could possibly change the result. 443static bool SniffBinary(const char* content, 444 size_t size, 445 bool* have_enough_content, 446 std::string* result) { 447 // There is no concensus about exactly how to sniff for binary content. 448 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension. 449 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte. 450 // Here, we side with FF, but with a smaller buffer. This size was chosen 451 // because it is small enough to comfortably fit into a single packet (after 452 // allowing for headers) and yet large enough to account for binary formats 453 // that have a significant amount of ASCII at the beginning (crbug.com/15314). 454 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size); 455 456 // First, we look for a BOM. 457 static base::Histogram* counter(NULL); 458 if (!counter) 459 counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2", 460 arraysize(kByteOrderMark)); 461 std::string unused; 462 if (CheckForMagicNumbers(content, size, 463 kByteOrderMark, arraysize(kByteOrderMark), 464 counter, &unused)) { 465 // If there is BOM, we think the buffer is not binary. 466 result->assign("text/plain"); 467 return false; 468 } 469 470 // Next we look to see if any of the bytes "look binary." 471 for (size_t i = 0; i < size; ++i) { 472 // If we a see a binary-looking byte, we think the content is binary. 473 if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { 474 result->assign("application/octet-stream"); 475 return true; 476 } 477 } 478 479 // No evidence either way. 
Default to non-binary and, if truncated, clear 480 // have_enough_content because there could be a binary looking byte in the 481 // truncated data. 482 *have_enough_content &= is_truncated; 483 result->assign("text/plain"); 484 return false; 485} 486 487static bool IsUnknownMimeType(const std::string& mime_type) { 488 // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here. 489 // If we do, please be careful not to alter the semantics at all. 490 static const char* kUnknownMimeTypes[] = { 491 // Empty mime types are as unknown as they get. 492 "", 493 // The unknown/unknown type is popular and uninformative 494 "unknown/unknown", 495 // The second most popular unknown mime type is application/unknown 496 "application/unknown", 497 // Firefox rejects a mime type if it is exactly */* 498 "*/*", 499 }; 500 static base::Histogram* counter(NULL); 501 if (!counter) 502 counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2", 503 arraysize(kUnknownMimeTypes) + 1); 504 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { 505 if (mime_type == kUnknownMimeTypes[i]) { 506 counter->Add(i); 507 return true; 508 } 509 } 510 if (mime_type.find('/') == std::string::npos) { 511 // Firefox rejects a mime type if it does not contain a slash 512 counter->Add(arraysize(kUnknownMimeTypes)); 513 return true; 514 } 515 return false; 516} 517 518// Returns true and sets result if the content appears to be a crx (chrome 519// extension) file. 520// Clears have_enough_content if more data could possibly change the result. 
521static bool SniffCRX(const char* content, 522 size_t size, 523 const GURL& url, 524 const std::string& type_hint, 525 bool* have_enough_content, 526 std::string* result) { 527 static base::Histogram* counter(NULL); 528 if (!counter) 529 counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3); 530 531 // Technically, the crx magic number is just Cr24, but the bytes after that 532 // are a version number which changes infrequently. Including it in the 533 // sniffing gives us less room for error. If the version number ever changes, 534 // we can just add an entry to this list. 535 // 536 // TODO(aa): If we ever have another magic number, we'll want to pass a 537 // histogram into CheckForMagicNumbers(), below, to see which one matched. 538 static const struct MagicNumber kCRXMagicNumbers[] = { 539 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00") 540 }; 541 542 // Only consider files that have the extension ".crx". 543 static const char kCRXExtension[] = ".crx"; 544 // Ignore null by subtracting 1. 
545 static const int kExtensionLength = arraysize(kCRXExtension) - 1; 546 if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) == 547 url.path().size() - kExtensionLength) { 548 counter->Add(1); 549 } else { 550 return false; 551 } 552 553 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); 554 if (CheckForMagicNumbers(content, size, 555 kCRXMagicNumbers, arraysize(kCRXMagicNumbers), 556 NULL, result)) { 557 counter->Add(2); 558 } else { 559 return false; 560 } 561 562 return true; 563} 564 565bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { 566 static base::Histogram* should_sniff_counter(NULL); 567 if (!should_sniff_counter) 568 should_sniff_counter = 569 UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3); 570 // We are willing to sniff the mime type for HTTP, HTTPS, and FTP 571 bool sniffable_scheme = url.is_empty() || 572 url.SchemeIs("http") || 573 url.SchemeIs("https") || 574 url.SchemeIs("ftp") || 575 url.SchemeIsFile(); 576 if (!sniffable_scheme) { 577 should_sniff_counter->Add(1); 578 return false; 579 } 580 581 static const char* kSniffableTypes[] = { 582 // Many web servers are misconfigured to send text/plain for many 583 // different types of content. 584 "text/plain", 585 // We want to sniff application/octet-stream for 586 // application/x-chrome-extension, but nothing else. 587 "application/octet-stream", 588 // XHTML and Atom/RSS feeds are often served as plain xml instead of 589 // their more specific mime types. 
590 "text/xml", 591 "application/xml", 592 }; 593 static base::Histogram* counter(NULL); 594 if (!counter) 595 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2", 596 arraysize(kSniffableTypes) + 1); 597 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { 598 if (mime_type == kSniffableTypes[i]) { 599 counter->Add(i); 600 should_sniff_counter->Add(2); 601 return true; 602 } 603 } 604 if (IsUnknownMimeType(mime_type)) { 605 // The web server didn't specify a content type or specified a mime 606 // type that we ignore. 607 counter->Add(arraysize(kSniffableTypes)); 608 should_sniff_counter->Add(2); 609 return true; 610 } 611 should_sniff_counter->Add(1); 612 return false; 613} 614 615bool SniffMimeType(const char* content, size_t content_size, 616 const GURL& url, const std::string& type_hint, 617 std::string* result) { 618 DCHECK_LT(content_size, 1000000U); // sanity check 619 DCHECK(content); 620 DCHECK(result); 621 622 // By default, we assume we have enough content. 623 // Each sniff routine may unset this if it wasn't provided enough content. 624 bool have_enough_content = true; 625 626 // By default, we'll return the type hint. 627 // Each sniff routine may modify this if it has a better guess.. 628 result->assign(type_hint); 629 630 // Cache information about the type_hint 631 const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint); 632 633 // First check for HTML 634 if (hint_is_unknown_mime_type) { 635 // We're only willing to sniff HTML if the server has not supplied a mime 636 // type, or if the type it did supply indicates that it doesn't know what 637 // the type should be. 638 if (SniffForHTML(content, content_size, &have_enough_content, result)) 639 return true; // We succeeded in sniffing HTML. No more content needed. 640 } 641 642 // We're only willing to sniff for binary in 3 cases: 643 // 1. The server has not supplied a mime type. 644 // 2. 
The type it did supply indicates that it doesn't know what the type 645 // should be. 646 // 3. The type is "text/plain" which is the default on some web servers and 647 // could be indicative of a mis-configuration that we shield the user from. 648 const bool hint_is_text_plain = (type_hint == "text/plain"); 649 if (hint_is_unknown_mime_type || hint_is_text_plain) { 650 if (!SniffBinary(content, content_size, &have_enough_content, result)) { 651 // If the server said the content was text/plain and it doesn't appear 652 // to be binary, then we trust it. 653 if (hint_is_text_plain) { 654 return have_enough_content; 655 } 656 } 657 } 658 659 // If we have plain XML, sniff XML subtypes. 660 if (type_hint == "text/xml" || type_hint == "application/xml") { 661 // We're not interested in sniffing these types for images and the like. 662 // Instead, we're looking explicitly for a feed. If we don't find one 663 // we're done and return early. 664 if (SniffXML(content, content_size, &have_enough_content, result)) 665 return true; 666 return have_enough_content; 667 } 668 669 // CRX files (chrome extensions) have a special sniffing algorithm. It is 670 // tighter than the others because we don't have to match legacy behavior. 671 if (SniffCRX(content, content_size, url, type_hint, 672 &have_enough_content, result)) 673 return true; 674 675 // We're not interested in sniffing for magic numbers when the type_hint 676 // is application/octet-stream. Time to bail out. 677 if (type_hint == "application/octet-stream") 678 return have_enough_content; 679 680 // Now we look in our large table of magic numbers to see if we can find 681 // anything that matches the content. 682 if (SniffForMagicNumbers(content, content_size, 683 &have_enough_content, result)) 684 return true; // We've matched a magic number. No more content needed. 685 686 return have_enough_content; 687} 688 689} // namespace net 690