mime_sniffer.cc revision b2df76ea8fec9e32f6f3718986dba0d95315b29c
1f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
3f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// found in the LICENSE file.
4f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
5f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Detecting mime types is a tricky business because we need to balance
6f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// compatibility concerns with security issues.  Here is a survey of how other
7f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// browsers behave and then a description of how we intend to behave.
8f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)//
9f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// HTML payload, no Content-Type header:
10f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * IE 7: Render as HTML
11f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Firefox 2: Render as HTML
12f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Safari 3: Render as HTML
13f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Opera 9: Render as HTML
14f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)//
15f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Here the choice seems clear:
16f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// => Chrome: Render as HTML
17f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)//
185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// HTML payload, Content-Type: "text/plain":
19f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * IE 7: Render as HTML
20f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Firefox 2: Render as text
21f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
22f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)//                                   has an HTML extension)
23f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Opera 9: Render as text
24f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)//
25f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Here we choose to follow the majority (and break some compatibility with IE).
26f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Many folks dislike IE's behavior here.
27a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// => Chrome: Render as text
28f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// We generalize this as follows.  If the Content-Type header is text/plain
295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// we won't detect dangerous mime types (those that can execute script).
305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//
31f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// HTML payload, Content-Type: "application/octet-stream":
32f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * IE 7: Render as HTML
33f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Firefox 2: Download as application/octet-stream
34f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Safari 3: Render as HTML
35f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Opera 9: Render as HTML
36f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)//
37f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// We follow Firefox.
38f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// => Chrome: Download as application/octet-stream
39f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// One factor in this decision is that IIS 4 and 5 will send
40f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// application/octet-stream for .xhtml files (because they don't recognize
41f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// the extension).  We did some experiments and it looks like this doesn't occur
42f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// very often on the web.  We choose the more secure option.
43f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)//
44f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// GIF payload, no Content-Type header:
45f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * IE 7: Render as GIF
46f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Firefox 2: Render as GIF
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
49f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Opera 9: Render as GIF
50f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)//
51f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// The choice is clear.
52f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// => Chrome: Render as GIF
53f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Once we decide to render HTML without a Content-Type header, there isn't much
54f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// reason not to render GIFs.
55f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)//
565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// GIF payload, Content-Type: "text/plain":
57f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//                              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
62f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Opera 9: Render as GIF
63f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)//
64f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Displaying as text/plain makes little sense as the content will look like
65f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// gibberish.  Here, we could change our minds and download.
66f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// => Chrome: Render as GIF
675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//
685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// GIF payload, Content-Type: "application/octet-stream":
69f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//                              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
74f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Opera 9: Render as GIF
75f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)//
76f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// We used to render as GIF here, but the problem is that some sites want to
77f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// trigger downloads by sending application/octet-stream (even though they
78f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// should be sending Content-Disposition: attachment).  Although it is safe
79f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// to render as GIF from a security perspective, we actually get better
// compatibility if we don't sniff from application/octet-stream at all.
81f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// => Chrome: Download as application/octet-stream
82f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)//
83f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// XHTML payload, Content-Type: "text/xml":
84f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * IE 7: Render as XML
85f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// * Firefox 2: Render as HTML
86a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// * Safari 3: Render as HTML
87a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// * Opera 9: Render as HTML
88f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// The layout tests rely on us rendering this as HTML.
895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// But we're conservative in XHTML detection, as this runs afoul of the
905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// "don't detect dangerous mime types" rule.
91f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)//
92f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Note that our definition of HTML payload is much stricter than IE's
93f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// definition and roughly the same as Firefox's definition.
94f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
95f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include <string>
96f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
97f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "net/base/mime_sniffer.h"
98f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
99f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "base/basictypes.h"
100f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "base/logging.h"
101f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "base/metrics/histogram.h"
102f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "base/string_util.h"
103f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "googleurl/src/gurl.h"
104f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "net/base/mime_util.h"
105f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
106f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)namespace net {
107f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
// Upper bound, in bytes, on the length of any single magic number.
// MatchMagicNumber() DCHECKs every entry it is given against this value, so
// raise it whenever a longer magic number is added to one of the tables.
static const size_t kBytesRequiredForMagic = 42;
111f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
// One sniffing rule: a byte pattern plus the label reported when it matches.
// Instances are normally built with the MAGIC_NUMBER / MAGIC_STRING macros.
struct MagicNumber {
  // Label copied into the result when the pattern matches.
  const char* mime_type;
  // The pattern itself; exactly |magic_len| bytes of it are compared.
  const char* magic;
  // Length of |magic| in bytes, not counting the literal's terminator.
  size_t magic_len;
  // True when the pattern is text to be compared case-insensitively; false
  // when it is a binary pattern compared byte by byte.
  bool is_string;
};
118f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
// Expands to one MagicNumber initializer for a binary pattern.  The length is
// computed with sizeof, so |magic| must be a string literal; embedded '\0'
// bytes are counted, only the implicit terminator is excluded.  The expansion
// ends with a comma so that entries can be listed back to back in a table.
#define MAGIC_NUMBER(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic) - 1, false },
121f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
// Expands to one MagicNumber initializer for a text pattern.  Magic strings
// are case insensitive and must not include '\0' characters.  As with
// MAGIC_NUMBER, |magic| must be a string literal and the expansion carries its
// own trailing comma.
#define MAGIC_STRING(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic) - 1, true },
125f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
126f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static const MagicNumber kMagicNumbers[] = {
127f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Source: HTML 5 specification
128f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("application/pdf", "%PDF-")
129f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("application/postscript", "%!PS-Adobe-")
130f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("image/gif", "GIF87a")
131f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("image/gif", "GIF89a")
132f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A")
133f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF")
134f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("image/bmp", "BM")
135f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Source: Mozilla
136f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("text/plain", "#!")  // Script
137a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  MAGIC_NUMBER("text/plain", "%!")  // Script, similar to PS
138f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("text/plain", "From")
139f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("text/plain", ">From")
140f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Chrome specific
141a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08")
142f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46")
143f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("video/x-ms-asf",
144f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)      "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C")
145f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("image/tiff", "I I")
146f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("image/tiff", "II*")
147f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("image/tiff", "MM\x00*")
148f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("audio/mpeg", "ID3")
149f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ")
150f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3")
151f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // TODO(abarth): we don't handle partial byte matches yet
152f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB")
153f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // MAGIC_NUMBER("audio/mpeg", "\xFF\xE")
154f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // MAGIC_NUMBER("audio/mpeg", "\xFF\xF")
155f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("application/zip", "PK\x03\x04")
1565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00")
1575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A")
158f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("application/octet-stream", "MZ")  // EXE
159f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Sniffing for Flash:
160f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  //
161f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  //   MAGIC_NUMBER("application/x-shockwave-flash", "CWS")
162f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  //   MAGIC_NUMBER("application/x-shockwave-flash", "FLV")
163f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  //   MAGIC_NUMBER("application/x-shockwave-flash", "FWS")
164f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  //
165f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Including these magic number for Flash is a trade off.
166f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  //
167f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Pros:
168f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  //   * Flash is an important and popular file format
169f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  //
170f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Cons:
171f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  //   * These patterns are fairly weak
172f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  //   * If we mistakenly decide something is Flash, we will execute it
173f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  //     in the origin of an unsuspecting site.  This could be a security
174a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  //     vulnerability if the site allows users to upload content.
175f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  //
176f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // On balance, we do not include these patterns.
177f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)};
178f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
// Number of leading content bytes needed to test every Microsoft Office magic
// number; this equals the length of the longest entry in kOfficeMagicNumbers.
static const size_t kBytesRequiredForOfficeMagic = 8;
182f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
183f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static const MagicNumber kOfficeMagicNumbers[] = {
184f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1")
185f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("OOXML", "PK\x03\x04")
186f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)};
187f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
// Families of Office documents that kOfficeExtensionTypes distinguishes.
enum OfficeDocType {
  DOC_TYPE_WORD,        // ".doc" / ".docx"
  DOC_TYPE_EXCEL,       // ".xls" / ".xlsx"
  DOC_TYPE_POWERPOINT,  // ".ppt" / ".pptx"
  DOC_TYPE_NONE         // No extension listed in kOfficeExtensionTypes.
};
194f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
195f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)struct OfficeExtensionType {
196f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  OfficeDocType doc_type;
197f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  const char* extension;
198f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  size_t extension_len;
199f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)};
200f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
// Expands to one OfficeExtensionType initializer.  The length is computed with
// sizeof, so |extension| must be a string literal.  The expansion ends with a
// comma so that entries can be listed back to back in a table.
#define OFFICE_EXTENSION(type, extension) \
  { (type), (extension), sizeof(extension) - 1 },
203f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
204f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static const OfficeExtensionType kOfficeExtensionTypes[] = {
205f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc")
206f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls")
2075d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt")
208f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx")
209f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx")
210f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx")
211f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)};
212f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
213f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Our HTML sniffer differs slightly from Mozilla.  For example, Mozilla will
214f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is
215f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// HTML, but we will not.
216f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
// Expands to a case-insensitive "text/html" entry whose pattern is '<'
// followed by |tag|.  |tag| must be a string literal because it is joined to
// "<" by literal concatenation.
#define MAGIC_HTML_TAG(tag) \
  MAGIC_STRING("text/html", "<" tag)
219f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
220f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static const MagicNumber kSniffableTags[] = {
221f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // XML processing directive.  Although this is not an HTML mime type, we sniff
222f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // for this in the HTML phase because text/xml is just as powerful as HTML and
223f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // we want to leverage our white space skipping technology.
224f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_NUMBER("text/xml", "<?xml")  // Mozilla
225f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // DOCTYPEs
226f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("!DOCTYPE html")  // HTML5 spec
227f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Sniffable tags, ordered by how often they occur in sniffable documents.
228f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("script")  // HTML5 spec, Mozilla
229f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("html")  // HTML5 spec, Mozilla
230f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("!--")
231f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("head")  // HTML5 spec, Mozilla
232f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("iframe")  // Mozilla
233f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("h1")  // Mozilla
234f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("div")  // Mozilla
235f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("font")  // Mozilla
236a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  MAGIC_HTML_TAG("table")  // Mozilla
237f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("a")  // Mozilla
238f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("style")  // Mozilla
239a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  MAGIC_HTML_TAG("title")  // Mozilla
240f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("b")  // Mozilla
241f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("body")  // Mozilla
242a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  MAGIC_HTML_TAG("br")
243f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  MAGIC_HTML_TAG("p")  // Mozilla
244f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)};
245a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
246f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static base::HistogramBase* UMASnifferHistogramGet(const char* name,
247f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                                                   int array_size) {
248f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  base::HistogramBase* counter =
249f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)      base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size,
250a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      base::HistogramBase::kUmaTargetedHistogramFlag);
251a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  return counter;
252a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)}
253f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
// Returns true when the first |len| bytes of |content| match |magic_entry|.
// A '.' in |magic_entry| is a wildcard that accepts any single byte, which
// lets a pattern skip over bytes it does not care about.  A |len| of zero
// always matches.  Both buffers must hold at least |len| bytes.
static bool MagicCmp(const char* magic_entry, const char* content, size_t len) {
  for (size_t i = 0; i < len; ++i) {
    const char expected = magic_entry[i];
    if (expected == '.')
      continue;  // Wildcard position: any byte is acceptable.
    if (expected != content[i])
      return false;
  }
  return true;
}
266f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
267f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool MatchMagicNumber(const char* content, size_t size,
268a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                             const MagicNumber* magic_entry,
269f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                             std::string* result) {
270f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  const size_t len = magic_entry->magic_len;
271f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
272f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Keep kBytesRequiredForMagic honest.
273f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  DCHECK_LE(len, kBytesRequiredForMagic);
274f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
275f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // To compare with magic strings, we need to compute strlen(content), but
276f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // content might not actually have a null terminator.  In that case, we
277f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // pretend the length is content_size.
278a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  const char* end =
279f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)      static_cast<const char*>(memchr(content, '\0', size));
280f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  const size_t content_strlen =
281a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      (end != NULL) ? static_cast<size_t>(end - content) : size;
282f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
283f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  bool match = false;
284f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  if (magic_entry->is_string) {
285a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    if (content_strlen >= len) {
286f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)      // String comparisons are case-insensitive
287f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)      match = (base::strncasecmp(magic_entry->magic, content, len) == 0);
288a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    }
289a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  } else {
290a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    if (size >= len)
291f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)      match = MagicCmp(magic_entry->magic, content, len);
292f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  }
293f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
294f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  if (match) {
295f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    result->assign(magic_entry->mime_type);
296f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    return true;
297f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  }
298f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  return false;
299f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}
300f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
301f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool CheckForMagicNumbers(const char* content, size_t size,
302f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                                 const MagicNumber* magic, size_t magic_len,
303f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                                 base::HistogramBase* counter,
304f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                                 std::string* result) {
305a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  for (size_t i = 0; i < magic_len; ++i) {
306f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    if (MatchMagicNumber(content, size, &(magic[i]), result)) {
307f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)      if (counter) counter->Add(static_cast<int>(i));
308a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      return true;
309f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    }
310f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  }
311a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  return false;
312f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}
313f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
314a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// Truncates |size| to |max_size| and returns true if |size| is at least
315f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// |max_size|.
316f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool TruncateSize(const size_t max_size, size_t* size) {
317f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Keep kMaxBytesToSniff honest.
318f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
319f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
320f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  if (*size >= max_size) {
321f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    *size = max_size;
322f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    return true;
323a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  }
324f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  return false;
325f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}
326f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
327f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Returns true and sets result if the content appears to be HTML.
328a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// Clears have_enough_content if more data could possibly change the result.
329f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool SniffForHTML(const char* content,
330f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                         size_t size,
331a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                         bool* have_enough_content,
332a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                         std::string* result) {
333a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  // For HTML, we are willing to consider up to 512 bytes. This may be overly
334f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // conservative as IE only considers 256.
335f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  *have_enough_content &= TruncateSize(512, &size);
336f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
337f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
338f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // but with some modifications to better match the HTML5 spec.
339f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  const char* const end = content + size;
340f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  const char* pos;
341f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  for (pos = content; pos < end; ++pos) {
342f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    if (!IsAsciiWhitespace(*pos))
343a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      break;
344f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  }
345f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  static base::HistogramBase* counter(NULL);
346f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  if (!counter)
347f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2",
348a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                                     arraysize(kSniffableTags));
349f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // |pos| now points to first non-whitespace character (or at end).
350f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  return CheckForMagicNumbers(pos, end - pos,
351a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                              kSniffableTags, arraysize(kSniffableTags),
352a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                              counter, result);
353a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)}
354a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
355f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Returns true and sets result if the content matches any of kMagicNumbers.
356f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Clears have_enough_content if more data could possibly change the result.
357f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool SniffForMagicNumbers(const char* content,
358f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                                 size_t size,
359f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                                 bool* have_enough_content,
360f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                                 std::string* result) {
361f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
362f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
363f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Check our big table of Magic Numbers
364f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  static base::HistogramBase* counter(NULL);
365f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  if (!counter)
366f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2",
367a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                                     arraysize(kMagicNumbers));
368f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  return CheckForMagicNumbers(content, size,
369f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                              kMagicNumbers, arraysize(kMagicNumbers),
370f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                              counter, result);
371a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)}
372f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
373f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// Returns true and sets result if the content matches any of
374f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)// kOfficeMagicNumbers, and the URL has the proper extension.
375a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// Clears |have_enough_content| if more data could possibly change the result.
376f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool SniffForOfficeDocs(const char* content,
377f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                               size_t size,
378a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                               const GURL& url,
379a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                               bool* have_enough_content,
380a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                               std::string* result) {
381f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size);
382f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
383f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  // Check our table of magic numbers for Office file types.
384f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  std::string office_version;
385f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  if (!CheckForMagicNumbers(content, size,
386f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                            kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers),
387f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                            NULL, &office_version))
388f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    return false;
389f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
390a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  OfficeDocType type = DOC_TYPE_NONE;
391f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  for (size_t i = 0; i < arraysize(kOfficeExtensionTypes); ++i) {
392f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    std::string url_path = url.path();
393f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
394f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    if (url_path.length() < kOfficeExtensionTypes[i].extension_len)
395a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      continue;
396a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
397a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    const char* extension =
398a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        &url_path[url_path.length() -
399a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                  kOfficeExtensionTypes[i].extension_len];
400a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
401a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    if (0 == base::strncasecmp(extension, kOfficeExtensionTypes[i].extension,
402a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                               kOfficeExtensionTypes[i].extension_len)) {
403a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      type = kOfficeExtensionTypes[i].doc_type;
404a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      break;
405a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    }
406a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  }
407a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
408a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  if (type == DOC_TYPE_NONE)
409a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    return false;
410a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
411a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  if (office_version == "CFB") {
412a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    switch (type) {
413a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      case DOC_TYPE_WORD:
414a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        *result = "application/msword";
415a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        return true;
416f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)      case DOC_TYPE_EXCEL:
417f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)        *result = "application/vnd.ms-excel";
418a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        return true;
419a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      case DOC_TYPE_POWERPOINT:
420a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        *result = "application/vnd.ms-powerpoint";
421a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        return true;
422f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)      case DOC_TYPE_NONE:
423f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)        NOTREACHED();
424a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        return false;
425a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    }
426a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  } else if (office_version == "OOXML") {
427a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    switch (type) {
428a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      case DOC_TYPE_WORD:
429a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        *result = "application/vnd.openxmlformats-officedocument."
430a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                  "wordprocessingml.document";
431a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        return true;
432a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      case DOC_TYPE_EXCEL:
433a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        *result = "application/vnd.openxmlformats-officedocument."
434a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                  "spreadsheetml.sheet";
435a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        return true;
436a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      case DOC_TYPE_POWERPOINT:
437a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        *result = "application/vnd.openxmlformats-officedocument."
438a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                  "presentationml.presentation";
439a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        return true;
440a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      case DOC_TYPE_NONE:
441a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        NOTREACHED();
442a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        return false;
443a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    }
444a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  }
445a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
446a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  NOTREACHED();
447a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  return false;
448a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)}
449a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
450a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// Opening tags that SniffXML() maps to a more specific XML mime type.
451a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)static const MagicNumber kMagicXML[] = {
452a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  // We want to be very conservative in interpreting text/xml content as
453a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  // XHTML -- we just want to sniff enough to make unit tests pass.
454a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  // So we match explicitly on this, and don't match other ways of writing
455a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  // it in semantically-equivalent ways.
456a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  MAGIC_STRING("application/xhtml+xml",
457a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)               "<html xmlns=\"http://www.w3.org/1999/xhtml\"")
458a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  MAGIC_STRING("application/atom+xml", "<feed")
459a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  MAGIC_STRING("application/rss+xml", "<rss")
460a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)};
461a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
462a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// Returns true and sets result if the content appears to contain XHTML or a
463a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// feed.
464a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// Clears have_enough_content if more data could possibly change the result.
465a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)//
466a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// TODO(evanm): this is similar but more conservative than what Safari does,
467a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// while HTML5 has a different recommendation -- what should we do?
468a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// TODO(evanm): this is incorrect for documents whose encoding isn't a superset
469a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)// of ASCII -- do we care?
470f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)static bool SniffXML(const char* content,
471f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                     size_t size,
472f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                     bool* have_enough_content,
473                     std::string* result) {
474  // We allow at most 300 bytes of content before we expect the opening tag.
475  *have_enough_content &= TruncateSize(300, &size);
476  const char* pos = content;
477  const char* const end = content + size;
478
479  // This loop iterates through tag-looking offsets in the file.
480  // We want to skip XML processing instructions (of the form "<?xml ...")
481  // and stop at the first "plain" tag, then make a decision on the mime-type
482  // based on the name (or possibly attributes) of that tag.
483  static base::HistogramBase* counter(NULL);
484  if (!counter)
485    counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2",
486                                     arraysize(kMagicXML));
487  const int kMaxTagIterations = 5;
488  for (int i = 0; i < kMaxTagIterations && pos < end; ++i) {
489    pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos));
490    if (!pos)
491      return false;
492
493    if (base::strncasecmp(pos, "<?xml", sizeof("<?xml")-1) == 0) {
494      // Skip XML declarations.
495      ++pos;
496      continue;
497    } else if (base::strncasecmp(pos, "<!DOCTYPE",
498                                 sizeof("<!DOCTYPE")-1) == 0) {
499      // Skip DOCTYPE declarations.
500      ++pos;
501      continue;
502    }
503
504    if (CheckForMagicNumbers(pos, end - pos,
505                             kMagicXML, arraysize(kMagicXML),
506                             counter, result))
507      return true;
508
509    // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult
510    // to identify.
511
512    // If we get here, we've hit an initial tag that hasn't matched one of the
513    // above tests.  Abort.
514    return true;
515  }
516
517  // We iterated too far without finding a start tag.
518  // If we have more content to look at, we aren't going to change our mind by
519  // seeing more bytes from the network.
520  return pos < end;
521}
522
523// Byte order marks. SniffBinary() treats content starting with one as text.
524static const MagicNumber kByteOrderMark[] = {
525  MAGIC_NUMBER("text/plain", "\xFE\xFF")  // UTF-16BE
526  MAGIC_NUMBER("text/plain", "\xFF\xFE")  // UTF-16LE
527  MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF")  // UTF-8
528};
529
// Whether a given byte looks like it might be part of binary content.
// Indexed by the byte's unsigned value (0x00 - 0xFF); a non-zero entry means
// the byte "looks binary". The table is read-only.
// Source: HTML5 spec
static const char kByteLooksBinary[] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,  // 0x00 - 0x0F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  // 0x10 - 0x1F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20 - 0x2F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30 - 0x3F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40 - 0x4F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50 - 0x5F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60 - 0x6F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70 - 0x7F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0 - 0xAF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0 - 0xBF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xC0 - 0xCF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xD0 - 0xDF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xE0 - 0xEF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0 - 0xFF
};
550
551// Returns true and sets result to "application/octet-stream" if the content
552// appears to be binary data. Otherwise, returns false and sets "text/plain".
553// Clears have_enough_content if more data could possibly change the result.
554static bool SniffBinary(const char* content,
555                        size_t size,
556                        bool* have_enough_content,
557                        std::string* result) {
558  // There is no concensus about exactly how to sniff for binary content.
559  // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
560  // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
561  // Here, we side with FF, but with a smaller buffer. This size was chosen
562  // because it is small enough to comfortably fit into a single packet (after
563  // allowing for headers) and yet large enough to account for binary formats
564  // that have a significant amount of ASCII at the beginning (crbug.com/15314).
565  const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size);
566
567  // First, we look for a BOM.
568  static base::HistogramBase* counter(NULL);
569  if (!counter)
570    counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2",
571                                     arraysize(kByteOrderMark));
572  std::string unused;
573  if (CheckForMagicNumbers(content, size,
574                           kByteOrderMark, arraysize(kByteOrderMark),
575                           counter, &unused)) {
576    // If there is BOM, we think the buffer is not binary.
577    result->assign("text/plain");
578    return false;
579  }
580
581  // Next we look to see if any of the bytes "look binary."
582  for (size_t i = 0; i < size; ++i) {
583    // If we a see a binary-looking byte, we think the content is binary.
584    if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) {
585      result->assign("application/octet-stream");
586      return true;
587    }
588  }
589
590  // No evidence either way. Default to non-binary and, if truncated, clear
591  // have_enough_content because there could be a binary looking byte in the
592  // truncated data.
593  *have_enough_content &= is_truncated;
594  result->assign("text/plain");
595  return false;
596}
597
598static bool IsUnknownMimeType(const std::string& mime_type) {
599  // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
600  // If we do, please be careful not to alter the semantics at all.
601  static const char* kUnknownMimeTypes[] = {
602    // Empty mime types are as unknown as they get.
603    "",
604    // The unknown/unknown type is popular and uninformative
605    "unknown/unknown",
606    // The second most popular unknown mime type is application/unknown
607    "application/unknown",
608    // Firefox rejects a mime type if it is exactly */*
609    "*/*",
610  };
611  static base::HistogramBase* counter(NULL);
612  if (!counter)
613    counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2",
614                                     arraysize(kUnknownMimeTypes) + 1);
615  for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) {
616    if (mime_type == kUnknownMimeTypes[i]) {
617      counter->Add(i);
618      return true;
619    }
620  }
621  if (mime_type.find('/') == std::string::npos) {
622    // Firefox rejects a mime type if it does not contain a slash
623    counter->Add(arraysize(kUnknownMimeTypes));
624    return true;
625  }
626  return false;
627}
628
629// Returns true and sets result if the content appears to be a crx (Chrome
630// extension) file.
631// Clears have_enough_content if more data could possibly change the result.
632static bool SniffCRX(const char* content,
633                     size_t size,
634                     const GURL& url,
635                     const std::string& type_hint,
636                     bool* have_enough_content,
637                     std::string* result) {
638  static base::HistogramBase* counter(NULL);
639  if (!counter)
640    counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3);
641
642  // Technically, the crx magic number is just Cr24, but the bytes after that
643  // are a version number which changes infrequently. Including it in the
644  // sniffing gives us less room for error. If the version number ever changes,
645  // we can just add an entry to this list.
646  //
647  // TODO(aa): If we ever have another magic number, we'll want to pass a
648  // histogram into CheckForMagicNumbers(), below, to see which one matched.
649  static const struct MagicNumber kCRXMagicNumbers[] = {
650    MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00")
651  };
652
653  // Only consider files that have the extension ".crx".
654  static const char kCRXExtension[] = ".crx";
655  // Ignore null by subtracting 1.
656  static const int kExtensionLength = arraysize(kCRXExtension) - 1;
657  if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) ==
658      url.path().size() - kExtensionLength) {
659    counter->Add(1);
660  } else {
661    return false;
662  }
663
664  *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
665  if (CheckForMagicNumbers(content, size,
666                           kCRXMagicNumbers, arraysize(kCRXMagicNumbers),
667                           NULL, result)) {
668    counter->Add(2);
669  } else {
670    return false;
671  }
672
673  return true;
674}
675
676bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) {
677  static base::HistogramBase* should_sniff_counter(NULL);
678  if (!should_sniff_counter)
679    should_sniff_counter =
680        UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3);
681  bool sniffable_scheme = url.is_empty() ||
682                          url.SchemeIs("http") ||
683                          url.SchemeIs("https") ||
684                          url.SchemeIs("ftp") ||
685                          url.SchemeIsFile() ||
686                          url.SchemeIsFileSystem();
687  if (!sniffable_scheme) {
688    should_sniff_counter->Add(1);
689    return false;
690  }
691
692  static const char* kSniffableTypes[] = {
693    // Many web servers are misconfigured to send text/plain for many
694    // different types of content.
695    "text/plain",
696    // We want to sniff application/octet-stream for
697    // application/x-chrome-extension, but nothing else.
698    "application/octet-stream",
699    // XHTML and Atom/RSS feeds are often served as plain xml instead of
700    // their more specific mime types.
701    "text/xml",
702    "application/xml",
703  };
704  static base::HistogramBase* counter(NULL);
705  if (!counter)
706    counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2",
707                                     arraysize(kSniffableTypes) + 1);
708  for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) {
709    if (mime_type == kSniffableTypes[i]) {
710      counter->Add(i);
711      should_sniff_counter->Add(2);
712      return true;
713    }
714  }
715  if (IsUnknownMimeType(mime_type)) {
716    // The web server didn't specify a content type or specified a mime
717    // type that we ignore.
718    counter->Add(arraysize(kSniffableTypes));
719    should_sniff_counter->Add(2);
720    return true;
721  }
722  should_sniff_counter->Add(1);
723  return false;
724}
725
726bool SniffMimeType(const char* content, size_t content_size,
727                   const GURL& url, const std::string& type_hint,
728                   std::string* result) {
729  DCHECK_LT(content_size, 1000000U);  // sanity check
730  DCHECK(content);
731  DCHECK(result);
732
733  // By default, we assume we have enough content.
734  // Each sniff routine may unset this if it wasn't provided enough content.
735  bool have_enough_content = true;
736
737  // By default, we'll return the type hint.
738  // Each sniff routine may modify this if it has a better guess..
739  result->assign(type_hint);
740
741  // Cache information about the type_hint
742  const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);
743
744  // First check for HTML
745  if (hint_is_unknown_mime_type) {
746    // We're only willing to sniff HTML if the server has not supplied a mime
747    // type, or if the type it did supply indicates that it doesn't know what
748    // the type should be.
749    if (SniffForHTML(content, content_size, &have_enough_content, result))
750      return true;  // We succeeded in sniffing HTML.  No more content needed.
751  }
752
753  // We're only willing to sniff for binary in 3 cases:
754  // 1. The server has not supplied a mime type.
755  // 2. The type it did supply indicates that it doesn't know what the type
756  //    should be.
757  // 3. The type is "text/plain" which is the default on some web servers and
758  //    could be indicative of a mis-configuration that we shield the user from.
759  const bool hint_is_text_plain = (type_hint == "text/plain");
760  if (hint_is_unknown_mime_type || hint_is_text_plain) {
761    if (!SniffBinary(content, content_size, &have_enough_content, result)) {
762      // If the server said the content was text/plain and it doesn't appear
763      // to be binary, then we trust it.
764      if (hint_is_text_plain) {
765        return have_enough_content;
766      }
767    }
768  }
769
770  // If we have plain XML, sniff XML subtypes.
771  if (type_hint == "text/xml" || type_hint == "application/xml") {
772    // We're not interested in sniffing these types for images and the like.
773    // Instead, we're looking explicitly for a feed.  If we don't find one
774    // we're done and return early.
775    if (SniffXML(content, content_size, &have_enough_content, result))
776      return true;
777    return have_enough_content;
778  }
779
780  // CRX files (Chrome extensions) have a special sniffing algorithm. It is
781  // tighter than the others because we don't have to match legacy behavior.
782  if (SniffCRX(content, content_size, url, type_hint,
783               &have_enough_content, result))
784    return true;
785
786  // Check the file extension and magic numbers to see if this is an Office
787  // document.  This needs to be checked before the general magic numbers
788  // because zip files and Office documents (OOXML) have the same magic number.
789  if (SniffForOfficeDocs(content, content_size, url,
790                         &have_enough_content, result))
791    return true;  // We've matched a magic number.  No more content needed.
792
793  // We're not interested in sniffing for magic numbers when the type_hint
794  // is application/octet-stream.  Time to bail out.
795  if (type_hint == "application/octet-stream")
796    return have_enough_content;
797
798  // Now we look in our large table of magic numbers to see if we can find
799  // anything that matches the content.
800  if (SniffForMagicNumbers(content, content_size,
801                           &have_enough_content, result))
802    return true;  // We've matched a magic number.  No more content needed.
803
804  return have_enough_content;
805}
806
807}  // namespace net
808