mime_sniffer.cc revision b2df76ea8fec9e32f6f3718986dba0d95315b29c
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Detecting mime types is a tricky business because we need to balance
// compatibility concerns with security issues.  Here is a survey of how other
// browsers behave and then a description of how we intend to behave.
//
// HTML payload, no Content-Type header:
// * IE 7: Render as HTML
// * Firefox 2: Render as HTML
// * Safari 3: Render as HTML
// * Opera 9: Render as HTML
//
// Here the choice seems clear:
// => Chrome: Render as HTML
//
// HTML payload, Content-Type: "text/plain":
// * IE 7: Render as HTML
// * Firefox 2: Render as text
// * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
//                                   has an HTML extension)
// * Opera 9: Render as text
//
// Here we choose to follow the majority (and break some compatibility with IE).
// Many folks dislike IE's behavior here.
// => Chrome: Render as text
// We generalize this as follows.  If the Content-Type header is text/plain
// we won't detect dangerous mime types (those that can execute script).
//
// HTML payload, Content-Type: "application/octet-stream":
// * IE 7: Render as HTML
// * Firefox 2: Download as application/octet-stream
// * Safari 3: Render as HTML
// * Opera 9: Render as HTML
//
// We follow Firefox.
// => Chrome: Download as application/octet-stream
// One factor in this decision is that IIS 4 and 5 will send
// application/octet-stream for .xhtml files (because they don't recognize
// the extension).  We did some experiments and it looks like this doesn't occur
// very often on the web.  We choose the more secure option.
//
// GIF payload, no Content-Type header:
// * IE 7: Render as GIF
// * Firefox 2: Render as GIF
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
// * Opera 9: Render as GIF
//
// The choice is clear.
// => Chrome: Render as GIF
// Once we decide to render HTML without a Content-Type header, there isn't much
// reason not to render GIFs.
//
// GIF payload, Content-Type: "text/plain":
// * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//                              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
// * Opera 9: Render as GIF
//
// Displaying as text/plain makes little sense as the content will look like
// gibberish.  Here, we could change our minds and download.
// => Chrome: Render as GIF
//
// GIF payload, Content-Type: "application/octet-stream":
// * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//                              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
// * Opera 9: Render as GIF
//
// We used to render as GIF here, but the problem is that some sites want to
// trigger downloads by sending application/octet-stream (even though they
// should be sending Content-Disposition: attachment).  Although it is safe
// to render as GIF from a security perspective, we actually get better
// compatibility if we don't sniff from application/octet-stream at all.
// => Chrome: Download as application/octet-stream
//
// XHTML payload, Content-Type: "text/xml":
// * IE 7: Render as XML
// * Firefox 2: Render as HTML
// * Safari 3: Render as HTML
// * Opera 9: Render as HTML
// The layout tests rely on us rendering this as HTML.
// But we're conservative in XHTML detection, as this runs afoul of the
// "don't detect dangerous mime types" rule.
//
// Note that our definition of HTML payload is much stricter than IE's
// definition and roughly the same as Firefox's definition.
94f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 95f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include <string> 96f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 97f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "net/base/mime_sniffer.h" 98f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 99f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "base/basictypes.h" 100f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "base/logging.h" 101f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "base/metrics/histogram.h" 102f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "base/string_util.h" 103f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "googleurl/src/gurl.h" 104f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "net/base/mime_util.h" 105f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 106f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)namespace net { 107f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 108f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// The number of content bytes we need to use all our magic numbers. Feel free 109f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// to increase this number if you add a longer magic number. 
110f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static const size_t kBytesRequiredForMagic = 42; 111f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 112f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)struct MagicNumber { 113f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const char* mime_type; 114f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const char* magic; 115f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) size_t magic_len; 116f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) bool is_string; 117f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}; 118f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 119f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#define MAGIC_NUMBER(mime_type, magic) \ 120f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) { (mime_type), (magic), sizeof(magic)-1, false }, 121f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 122f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Magic strings are case insensitive and must not include '\0' characters 123f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#define MAGIC_STRING(mime_type, magic) \ 124f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) { (mime_type), (magic), sizeof(magic)-1, true }, 125f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 126f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static const MagicNumber kMagicNumbers[] = { 127f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Source: HTML 5 specification 128f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("application/pdf", "%PDF-") 129f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("application/postscript", "%!PS-Adobe-") 130f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("image/gif", "GIF87a") 
131f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("image/gif", "GIF89a") 132f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A") 133f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF") 134f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("image/bmp", "BM") 135f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Source: Mozilla 136f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("text/plain", "#!") // Script 137a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS 138f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("text/plain", "From") 139f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("text/plain", ">From") 140f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Chrome specific 141a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08") 142f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46") 143f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("video/x-ms-asf", 144f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C") 145f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("image/tiff", "I I") 146f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("image/tiff", "II*") 147f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("image/tiff", "MM\x00*") 148f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("audio/mpeg", "ID3") 149f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("image/webp", 
"RIFF....WEBPVP8 ") 150f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3") 151f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // TODO(abarth): we don't handle partial byte matches yet 152f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB") 153f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // MAGIC_NUMBER("audio/mpeg", "\xFF\xE") 154f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // MAGIC_NUMBER("audio/mpeg", "\xFF\xF") 155f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("application/zip", "PK\x03\x04") 1565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00") 1575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A") 158f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("application/octet-stream", "MZ") // EXE 159f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Sniffing for Flash: 160f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // 161f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // MAGIC_NUMBER("application/x-shockwave-flash", "CWS") 162f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // MAGIC_NUMBER("application/x-shockwave-flash", "FLV") 163f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // MAGIC_NUMBER("application/x-shockwave-flash", "FWS") 164f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // 165f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Including these magic number for Flash is a trade off. 
166f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // 167f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Pros: 168f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // * Flash is an important and popular file format 169f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // 170f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Cons: 171f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // * These patterns are fairly weak 172f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // * If we mistakenly decide something is Flash, we will execute it 173f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // in the origin of an unsuspecting site. This could be a security 174a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) // vulnerability if the site allows users to upload content. 175f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // 176f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // On balance, we do not include these patterns. 177f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}; 178f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 179f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// The number of content bytes we need to use all our Microsoft Office magic 180f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// numbers. 
181f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static const size_t kBytesRequiredForOfficeMagic = 8; 182f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 183f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static const MagicNumber kOfficeMagicNumbers[] = { 184f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1") 185f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("OOXML", "PK\x03\x04") 186f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}; 187f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 188f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)enum OfficeDocType { 189f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) DOC_TYPE_WORD, 190f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) DOC_TYPE_EXCEL, 191f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) DOC_TYPE_POWERPOINT, 192f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) DOC_TYPE_NONE 193f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}; 194f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 195f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)struct OfficeExtensionType { 196f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) OfficeDocType doc_type; 197f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const char* extension; 198f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) size_t extension_len; 199f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}; 200f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 2015d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#define OFFICE_EXTENSION(type, extension) \ 202f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) { (type), (extension), sizeof(extension) - 1 }, 203f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 
204f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static const OfficeExtensionType kOfficeExtensionTypes[] = { 205f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc") 206f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls") 2075d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt") 208f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx") 209f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx") 210f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx") 211f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}; 212f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 213f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Our HTML sniffer differs slightly from Mozilla. For example, Mozilla will 214f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is 215f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// HTML, but we will not. 216f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 217f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#define MAGIC_HTML_TAG(tag) \ 218f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_STRING("text/html", "<" tag) 219f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 220f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static const MagicNumber kSniffableTags[] = { 221f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // XML processing directive. 
Although this is not an HTML mime type, we sniff 222f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // for this in the HTML phase because text/xml is just as powerful as HTML and 223f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // we want to leverage our white space skipping technology. 224f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_NUMBER("text/xml", "<?xml") // Mozilla 225f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // DOCTYPEs 226f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("!DOCTYPE html") // HTML5 spec 227f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Sniffable tags, ordered by how often they occur in sniffable documents. 228f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla 229f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("html") // HTML5 spec, Mozilla 230f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("!--") 231f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("head") // HTML5 spec, Mozilla 232f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("iframe") // Mozilla 233f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("h1") // Mozilla 234f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("div") // Mozilla 235f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("font") // Mozilla 236a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) MAGIC_HTML_TAG("table") // Mozilla 237f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("a") // Mozilla 238f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("style") // Mozilla 239a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) MAGIC_HTML_TAG("title") // Mozilla 
240f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("b") // Mozilla 241f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("body") // Mozilla 242a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) MAGIC_HTML_TAG("br") 243f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) MAGIC_HTML_TAG("p") // Mozilla 244f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}; 245a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) 246f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static base::HistogramBase* UMASnifferHistogramGet(const char* name, 247f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) int array_size) { 248f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) base::HistogramBase* counter = 249f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size, 250a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) base::HistogramBase::kUmaTargetedHistogramFlag); 251a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) return counter; 252a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)} 253f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 254f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Compare content header to a magic number where magic_entry can contain '.' 255f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// for single character of anything, allowing some bytes to be skipped. 
256f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool MagicCmp(const char* magic_entry, const char* content, size_t len) { 257f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) while (len) { 258f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) if ((*magic_entry != '.') && (*magic_entry != *content)) 259f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) return false; 260f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) ++magic_entry; 261f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) ++content; 262f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) --len; 263f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) } 264f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) return true; 265a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)} 266f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 267f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool MatchMagicNumber(const char* content, size_t size, 268a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) const MagicNumber* magic_entry, 269f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) std::string* result) { 270f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const size_t len = magic_entry->magic_len; 271f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 272f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Keep kBytesRequiredForMagic honest. 273f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) DCHECK_LE(len, kBytesRequiredForMagic); 274f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 275f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // To compare with magic strings, we need to compute strlen(content), but 276f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // content might not actually have a null terminator. 
In that case, we 277f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // pretend the length is content_size. 278a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) const char* end = 279f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) static_cast<const char*>(memchr(content, '\0', size)); 280f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const size_t content_strlen = 281a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) (end != NULL) ? static_cast<size_t>(end - content) : size; 282f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 283f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) bool match = false; 284f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) if (magic_entry->is_string) { 285a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) if (content_strlen >= len) { 286f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // String comparisons are case-insensitive 287f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) match = (base::strncasecmp(magic_entry->magic, content, len) == 0); 288a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) } 289a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) } else { 290a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) if (size >= len) 291f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) match = MagicCmp(magic_entry->magic, content, len); 292f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) } 293f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 294f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) if (match) { 295f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) result->assign(magic_entry->mime_type); 296f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) return true; 297f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) } 298f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) return 
false; 299f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)} 300f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 301f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool CheckForMagicNumbers(const char* content, size_t size, 302f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const MagicNumber* magic, size_t magic_len, 303f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) base::HistogramBase* counter, 304f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) std::string* result) { 305a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) for (size_t i = 0; i < magic_len; ++i) { 306f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) if (MatchMagicNumber(content, size, &(magic[i]), result)) { 307f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) if (counter) counter->Add(static_cast<int>(i)); 308a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) return true; 309f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) } 310f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) } 311a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) return false; 312f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)} 313f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 314a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// Truncates |size| to |max_size| and returns true if |size| is at least 315f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// |max_size|. 316f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool TruncateSize(const size_t max_size, size_t* size) { 317f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Keep kMaxBytesToSniff honest. 
318f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff); 319f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 320f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) if (*size >= max_size) { 321f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) *size = max_size; 322f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) return true; 323a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) } 324f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) return false; 325f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)} 326f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 327f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Returns true and sets result if the content appears to be HTML. 328a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// Clears have_enough_content if more data could possibly change the result. 329f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool SniffForHTML(const char* content, 330f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) size_t size, 331a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) bool* have_enough_content, 332a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) std::string* result) { 333a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) // For HTML, we are willing to consider up to 512 bytes. This may be overly 334f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // conservative as IE only considers 256. 
335f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) *have_enough_content &= TruncateSize(512, &size); 336f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 337f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // We adopt a strategy similar to that used by Mozilla to sniff HTML tags, 338f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // but with some modifications to better match the HTML5 spec. 339f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const char* const end = content + size; 340f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const char* pos; 341f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) for (pos = content; pos < end; ++pos) { 342f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) if (!IsAsciiWhitespace(*pos)) 343a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) break; 344f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) } 345f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) static base::HistogramBase* counter(NULL); 346f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) if (!counter) 347f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2", 348a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) arraysize(kSniffableTags)); 349f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // |pos| now points to first non-whitespace character (or at end). 
350f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) return CheckForMagicNumbers(pos, end - pos, 351a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) kSniffableTags, arraysize(kSniffableTags), 352a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) counter, result); 353a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)} 354a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) 355f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Returns true and sets result if the content matches any of kMagicNumbers. 356f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Clears have_enough_content if more data could possibly change the result. 357f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool SniffForMagicNumbers(const char* content, 358f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) size_t size, 359f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) bool* have_enough_content, 360f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) std::string* result) { 361f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); 362f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 363f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Check our big table of Magic Numbers 364f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) static base::HistogramBase* counter(NULL); 365f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) if (!counter) 366f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2", 367a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) arraysize(kMagicNumbers)); 368f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) return CheckForMagicNumbers(content, size, 369f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) kMagicNumbers, 
arraysize(kMagicNumbers), 370f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) counter, result); 371a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)} 372f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 373f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Returns true and sets result if the content matches any of 374f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// kOfficeMagicNumbers, and the URL has the proper extension. 375a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// Clears |have_enough_content| if more data could possibly change the result. 376f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool SniffForOfficeDocs(const char* content, 377f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) size_t size, 378a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) const GURL& url, 379a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) bool* have_enough_content, 380a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) std::string* result) { 381f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size); 382f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 383f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) // Check our table of magic numbers for Office file types. 
384f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) std::string office_version; 385f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) if (!CheckForMagicNumbers(content, size, 386f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers), 387f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) NULL, &office_version)) 388f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) return false; 389f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 390a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) OfficeDocType type = DOC_TYPE_NONE; 391f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) for (size_t i = 0; i < arraysize(kOfficeExtensionTypes); ++i) { 392f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) std::string url_path = url.path(); 393f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 394f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) if (url_path.length() < kOfficeExtensionTypes[i].extension_len) 395a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) continue; 396a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) 397a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) const char* extension = 398a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) &url_path[url_path.length() - 399a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) kOfficeExtensionTypes[i].extension_len]; 400a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) 401a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) if (0 == base::strncasecmp(extension, kOfficeExtensionTypes[i].extension, 402a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) kOfficeExtensionTypes[i].extension_len)) { 403a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) type = kOfficeExtensionTypes[i].doc_type; 404a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) break; 
405a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) } 406a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) } 407a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) 408a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) if (type == DOC_TYPE_NONE) 409a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) return false; 410a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) 411a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) if (office_version == "CFB") { 412a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) switch (type) { 413a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) case DOC_TYPE_WORD: 414a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) *result = "application/msword"; 415a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) return true; 416f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) case DOC_TYPE_EXCEL: 417f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) *result = "application/vnd.ms-excel"; 418a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) return true; 419a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) case DOC_TYPE_POWERPOINT: 420a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) *result = "application/vnd.ms-powerpoint"; 421a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) return true; 422f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) case DOC_TYPE_NONE: 423f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) NOTREACHED(); 424a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) return false; 425a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) } 426a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) } else if (office_version == "OOXML") { 427a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) switch (type) { 428a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) case DOC_TYPE_WORD: 
429a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) *result = "application/vnd.openxmlformats-officedocument." 430a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) "wordprocessingml.document"; 431a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) return true; 432a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) case DOC_TYPE_EXCEL: 433a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) *result = "application/vnd.openxmlformats-officedocument." 434a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) "spreadsheetml.sheet"; 435a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) return true; 436a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) case DOC_TYPE_POWERPOINT: 437a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) *result = "application/vnd.openxmlformats-officedocument." 438a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) "presentationml.presentation"; 439a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) return true; 440a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) case DOC_TYPE_NONE: 441a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) NOTREACHED(); 442a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) return false; 443a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) } 444a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) } 445a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) 446a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) NOTREACHED(); 447a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) return false; 448a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)} 449a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) 450a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// Byte order marks 451a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)static const MagicNumber kMagicXML[] = { 
452a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) // We want to be very conservative in interpreting text/xml content as 453a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) // XHTML -- we just want to sniff enough to make unit tests pass. 454a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) // So we match explicitly on this, and don't match other ways of writing 455a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) // it in semantically-equivalent ways. 456a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) MAGIC_STRING("application/xhtml+xml", 457a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) "<html xmlns=\"http://www.w3.org/1999/xhtml\"") 458a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) MAGIC_STRING("application/atom+xml", "<feed") 459a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) MAGIC_STRING("application/rss+xml", "<rss") // UTF-8 460a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)}; 461a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) 462a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// Returns true and sets result if the content appears to contain XHTML or a 463a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// feed. 464a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// Clears have_enough_content if more data could possibly change the result. 465a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// 466a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// TODO(evanm): this is similar but more conservative than what Safari does, 467a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// while HTML5 has a different recommendation -- what should we do? 
468a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// TODO(evanm): this is incorrect for documents whose encoding isn't a superset 469a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// of ASCII -- do we care? 470f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool SniffXML(const char* content, 471f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) size_t size, 472f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) bool* have_enough_content, 473 std::string* result) { 474 // We allow at most 300 bytes of content before we expect the opening tag. 475 *have_enough_content &= TruncateSize(300, &size); 476 const char* pos = content; 477 const char* const end = content + size; 478 479 // This loop iterates through tag-looking offsets in the file. 480 // We want to skip XML processing instructions (of the form "<?xml ...") 481 // and stop at the first "plain" tag, then make a decision on the mime-type 482 // based on the name (or possibly attributes) of that tag. 483 static base::HistogramBase* counter(NULL); 484 if (!counter) 485 counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2", 486 arraysize(kMagicXML)); 487 const int kMaxTagIterations = 5; 488 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { 489 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); 490 if (!pos) 491 return false; 492 493 if (base::strncasecmp(pos, "<?xml", sizeof("<?xml")-1) == 0) { 494 // Skip XML declarations. 495 ++pos; 496 continue; 497 } else if (base::strncasecmp(pos, "<!DOCTYPE", 498 sizeof("<!DOCTYPE")-1) == 0) { 499 // Skip DOCTYPE declarations. 500 ++pos; 501 continue; 502 } 503 504 if (CheckForMagicNumbers(pos, end - pos, 505 kMagicXML, arraysize(kMagicXML), 506 counter, result)) 507 return true; 508 509 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult 510 // to identify. 
511 512 // If we get here, we've hit an initial tag that hasn't matched one of the 513 // above tests. Abort. 514 return true; 515 } 516 517 // We iterated too far without finding a start tag. 518 // If we have more content to look at, we aren't going to change our mind by 519 // seeing more bytes from the network. 520 return pos < end; 521} 522 523// Byte order marks 524static const MagicNumber kByteOrderMark[] = { 525 MAGIC_NUMBER("text/plain", "\xFE\xFF") // UTF-16BE 526 MAGIC_NUMBER("text/plain", "\xFF\xFE") // UTF-16LE 527 MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8 528}; 529 530// Whether a given byte looks like it might be part of binary content. 531// Source: HTML5 spec 532static char kByteLooksBinary[] = { 533 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, // 0x00 - 0x0F 534 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, // 0x10 - 0x1F 535 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20 - 0x2F 536 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30 - 0x3F 537 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40 - 0x4F 538 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50 - 0x5F 539 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60 - 0x6F 540 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70 - 0x7F 541 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8F 542 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9F 543 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 - 0xAF 544 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 - 0xBF 545 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0 - 0xCF 546 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0 - 0xDF 547 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0 - 0xEF 548 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF 549}; 550 551// Returns true and sets result to "application/octet-stream" if the content 552// appears to be binary data. Otherwise, returns false and sets "text/plain". 
553// Clears have_enough_content if more data could possibly change the result. 554static bool SniffBinary(const char* content, 555 size_t size, 556 bool* have_enough_content, 557 std::string* result) { 558 // There is no concensus about exactly how to sniff for binary content. 559 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension. 560 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte. 561 // Here, we side with FF, but with a smaller buffer. This size was chosen 562 // because it is small enough to comfortably fit into a single packet (after 563 // allowing for headers) and yet large enough to account for binary formats 564 // that have a significant amount of ASCII at the beginning (crbug.com/15314). 565 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size); 566 567 // First, we look for a BOM. 568 static base::HistogramBase* counter(NULL); 569 if (!counter) 570 counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2", 571 arraysize(kByteOrderMark)); 572 std::string unused; 573 if (CheckForMagicNumbers(content, size, 574 kByteOrderMark, arraysize(kByteOrderMark), 575 counter, &unused)) { 576 // If there is BOM, we think the buffer is not binary. 577 result->assign("text/plain"); 578 return false; 579 } 580 581 // Next we look to see if any of the bytes "look binary." 582 for (size_t i = 0; i < size; ++i) { 583 // If we a see a binary-looking byte, we think the content is binary. 584 if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { 585 result->assign("application/octet-stream"); 586 return true; 587 } 588 } 589 590 // No evidence either way. Default to non-binary and, if truncated, clear 591 // have_enough_content because there could be a binary looking byte in the 592 // truncated data. 
593 *have_enough_content &= is_truncated; 594 result->assign("text/plain"); 595 return false; 596} 597 598static bool IsUnknownMimeType(const std::string& mime_type) { 599 // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here. 600 // If we do, please be careful not to alter the semantics at all. 601 static const char* kUnknownMimeTypes[] = { 602 // Empty mime types are as unknown as they get. 603 "", 604 // The unknown/unknown type is popular and uninformative 605 "unknown/unknown", 606 // The second most popular unknown mime type is application/unknown 607 "application/unknown", 608 // Firefox rejects a mime type if it is exactly */* 609 "*/*", 610 }; 611 static base::HistogramBase* counter(NULL); 612 if (!counter) 613 counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2", 614 arraysize(kUnknownMimeTypes) + 1); 615 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { 616 if (mime_type == kUnknownMimeTypes[i]) { 617 counter->Add(i); 618 return true; 619 } 620 } 621 if (mime_type.find('/') == std::string::npos) { 622 // Firefox rejects a mime type if it does not contain a slash 623 counter->Add(arraysize(kUnknownMimeTypes)); 624 return true; 625 } 626 return false; 627} 628 629// Returns true and sets result if the content appears to be a crx (Chrome 630// extension) file. 631// Clears have_enough_content if more data could possibly change the result. 632static bool SniffCRX(const char* content, 633 size_t size, 634 const GURL& url, 635 const std::string& type_hint, 636 bool* have_enough_content, 637 std::string* result) { 638 static base::HistogramBase* counter(NULL); 639 if (!counter) 640 counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3); 641 642 // Technically, the crx magic number is just Cr24, but the bytes after that 643 // are a version number which changes infrequently. Including it in the 644 // sniffing gives us less room for error. 
If the version number ever changes, 645 // we can just add an entry to this list. 646 // 647 // TODO(aa): If we ever have another magic number, we'll want to pass a 648 // histogram into CheckForMagicNumbers(), below, to see which one matched. 649 static const struct MagicNumber kCRXMagicNumbers[] = { 650 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00") 651 }; 652 653 // Only consider files that have the extension ".crx". 654 static const char kCRXExtension[] = ".crx"; 655 // Ignore null by subtracting 1. 656 static const int kExtensionLength = arraysize(kCRXExtension) - 1; 657 if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) == 658 url.path().size() - kExtensionLength) { 659 counter->Add(1); 660 } else { 661 return false; 662 } 663 664 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); 665 if (CheckForMagicNumbers(content, size, 666 kCRXMagicNumbers, arraysize(kCRXMagicNumbers), 667 NULL, result)) { 668 counter->Add(2); 669 } else { 670 return false; 671 } 672 673 return true; 674} 675 676bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { 677 static base::HistogramBase* should_sniff_counter(NULL); 678 if (!should_sniff_counter) 679 should_sniff_counter = 680 UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3); 681 bool sniffable_scheme = url.is_empty() || 682 url.SchemeIs("http") || 683 url.SchemeIs("https") || 684 url.SchemeIs("ftp") || 685 url.SchemeIsFile() || 686 url.SchemeIsFileSystem(); 687 if (!sniffable_scheme) { 688 should_sniff_counter->Add(1); 689 return false; 690 } 691 692 static const char* kSniffableTypes[] = { 693 // Many web servers are misconfigured to send text/plain for many 694 // different types of content. 695 "text/plain", 696 // We want to sniff application/octet-stream for 697 // application/x-chrome-extension, but nothing else. 
698 "application/octet-stream", 699 // XHTML and Atom/RSS feeds are often served as plain xml instead of 700 // their more specific mime types. 701 "text/xml", 702 "application/xml", 703 }; 704 static base::HistogramBase* counter(NULL); 705 if (!counter) 706 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2", 707 arraysize(kSniffableTypes) + 1); 708 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { 709 if (mime_type == kSniffableTypes[i]) { 710 counter->Add(i); 711 should_sniff_counter->Add(2); 712 return true; 713 } 714 } 715 if (IsUnknownMimeType(mime_type)) { 716 // The web server didn't specify a content type or specified a mime 717 // type that we ignore. 718 counter->Add(arraysize(kSniffableTypes)); 719 should_sniff_counter->Add(2); 720 return true; 721 } 722 should_sniff_counter->Add(1); 723 return false; 724} 725 726bool SniffMimeType(const char* content, size_t content_size, 727 const GURL& url, const std::string& type_hint, 728 std::string* result) { 729 DCHECK_LT(content_size, 1000000U); // sanity check 730 DCHECK(content); 731 DCHECK(result); 732 733 // By default, we assume we have enough content. 734 // Each sniff routine may unset this if it wasn't provided enough content. 735 bool have_enough_content = true; 736 737 // By default, we'll return the type hint. 738 // Each sniff routine may modify this if it has a better guess.. 739 result->assign(type_hint); 740 741 // Cache information about the type_hint 742 const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint); 743 744 // First check for HTML 745 if (hint_is_unknown_mime_type) { 746 // We're only willing to sniff HTML if the server has not supplied a mime 747 // type, or if the type it did supply indicates that it doesn't know what 748 // the type should be. 749 if (SniffForHTML(content, content_size, &have_enough_content, result)) 750 return true; // We succeeded in sniffing HTML. No more content needed. 
751 } 752 753 // We're only willing to sniff for binary in 3 cases: 754 // 1. The server has not supplied a mime type. 755 // 2. The type it did supply indicates that it doesn't know what the type 756 // should be. 757 // 3. The type is "text/plain" which is the default on some web servers and 758 // could be indicative of a mis-configuration that we shield the user from. 759 const bool hint_is_text_plain = (type_hint == "text/plain"); 760 if (hint_is_unknown_mime_type || hint_is_text_plain) { 761 if (!SniffBinary(content, content_size, &have_enough_content, result)) { 762 // If the server said the content was text/plain and it doesn't appear 763 // to be binary, then we trust it. 764 if (hint_is_text_plain) { 765 return have_enough_content; 766 } 767 } 768 } 769 770 // If we have plain XML, sniff XML subtypes. 771 if (type_hint == "text/xml" || type_hint == "application/xml") { 772 // We're not interested in sniffing these types for images and the like. 773 // Instead, we're looking explicitly for a feed. If we don't find one 774 // we're done and return early. 775 if (SniffXML(content, content_size, &have_enough_content, result)) 776 return true; 777 return have_enough_content; 778 } 779 780 // CRX files (Chrome extensions) have a special sniffing algorithm. It is 781 // tighter than the others because we don't have to match legacy behavior. 782 if (SniffCRX(content, content_size, url, type_hint, 783 &have_enough_content, result)) 784 return true; 785 786 // Check the file extension and magic numbers to see if this is an Office 787 // document. This needs to be checked before the general magic numbers 788 // because zip files and Office documents (OOXML) have the same magic number. 789 if (SniffForOfficeDocs(content, content_size, url, 790 &have_enough_content, result)) 791 return true; // We've matched a magic number. No more content needed. 792 793 // We're not interested in sniffing for magic numbers when the type_hint 794 // is application/octet-stream. 
Time to bail out. 795 if (type_hint == "application/octet-stream") 796 return have_enough_content; 797 798 // Now we look in our large table of magic numbers to see if we can find 799 // anything that matches the content. 800 if (SniffForMagicNumbers(content, content_size, 801 &have_enough_content, result)) 802 return true; // We've matched a magic number. No more content needed. 803 804 return have_enough_content; 805} 806 807} // namespace net 808