1/*
2    Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies)
3
4    This library is free software; you can redistribute it and/or
5    modify it under the terms of the GNU Library General Public
6    License as published by the Free Software Foundation; either
7    version 2 of the License, or (at your option) any later version.
8
9    This library is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12    Library General Public License for more details.
13
14    You should have received a copy of the GNU Library General Public License
15    along with this library; see the file COPYING.LIB.  If not, write to
16    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17    Boston, MA 02110-1301, USA.
18*/
19
20#include "config.h"
21#include "MIMESniffing.h"
22
23#include <cstring>
24#include <stdint.h>
25
26// MIME type sniffing implementation based on http://tools.ietf.org/html/draft-abarth-mime-sniff-06
27
28namespace {
29
30static inline bool isTextInList(const char* text, size_t size, const char** data)
31{
32    for (size_t i = 0; i < size; ++i) {
33        if (!strcmp(text, data[i]))
34            return true;
35    }
36    return false;
37
38}
39
40// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-6
41const char* textTypes[] = {
42    "text/plain",
43    "text/plain; charset=ISO-8859-1",
44    "text/plain; charset=iso-8859-1",
45    "text/plain; charset=UTF-8"
46};
47const size_t textTypesSize = sizeof(textTypes) / sizeof(textTypes[0]);
48
49static inline bool isTextOrBinaryType(const char* type)
50{
51    return isTextInList(type, textTypesSize, textTypes);
52}
53
54// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-6
55const char* unknownTypes[] = {
56    "",
57    "unknown/unknown",
58    "application/unknown",
59    "*/*"
60};
61const size_t unknownTypesSize = sizeof(unknownTypes) / sizeof(unknownTypes[0]);
62
63static inline bool isUnknownType(const char* type)
64{
65    return isTextInList(type, unknownTypesSize, unknownTypes);
66}
67
68const char* xmlTypes[] = {
69    "text/xml",
70    "application/xml"
71};
72const size_t xmlTypesSize = sizeof(xmlTypes) / sizeof(xmlTypes[0]);
73
74const char xmlSuffix[] = "+xml";
75
76static inline bool isXMLType(const char* type)
77{
78    const size_t xmlSuffixSize = sizeof(xmlSuffix) - 1;
79    size_t typeSize = strlen(type);
80    if (typeSize >= xmlSuffixSize && !memcmp(type + typeSize - xmlSuffixSize, xmlSuffix, xmlSuffixSize))
81        return true;
82
83    return isTextInList(type, xmlTypesSize, xmlTypes);
84}
85
86// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-8
87const char binaryFlags[256] = {
88    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
89    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
90    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
91    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
92    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
93    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
94    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
98    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
99    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
104};
105
106static inline bool isBinaryChar(unsigned char data)
107{
108    return binaryFlags[data];
109}
110
111static inline bool isBinaryData(const char* data, size_t size)
112{
113    for (size_t i = 0; i < size; ++i) {
114        if (isBinaryChar(data[i]))
115            return true;
116    }
117    return false;
118}
119
120// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-11
121const char whiteSpaceChars[256] = {
122    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
123    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
124    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
125    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
126    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
127    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
128    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
129    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
130    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
131    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
132    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
133    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
134    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
135    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
136    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
137    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
138};
139
140static inline bool isWhiteSpace(unsigned char data)
141{
142    return whiteSpaceChars[data];
143}
144
145static inline void skipWhiteSpace(const char* data, size_t& pos, size_t dataSize)
146{
147    while (pos < dataSize && isWhiteSpace(data[pos]))
148        ++pos;
149}
150
151enum {
152    SkipWhiteSpace = 1,
153    TrailingSpaceOrBracket = 2
154};
155
156struct MagicNumbers {
157    const char* pattern;
158    const char* mask;
159    const char* mimeType;
160    size_t size;
161    int flags;
162};
163
164#define MAGIC_NUMBERS_MASKED(pattern, mask, mimeType, flags) {(pattern), (mask), (mimeType), sizeof(pattern) - 1, (flags)}
165#define MAGIC_NUMBERS_SIMPLE(pattern, mimeType) {(pattern), 0, (mimeType), sizeof(pattern) - 1, 0}
166
167// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-12
168const MagicNumbers securityConstrainedTypes[] = {
169    MAGIC_NUMBERS_MASKED("<!DOCTYPE HTML", "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
170    MAGIC_NUMBERS_MASKED("<HTML", "\xFF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
171    MAGIC_NUMBERS_MASKED("<HEAD", "\xFF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
172    MAGIC_NUMBERS_MASKED("<SCRIPT", "\xFF\xDF\xDF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
173    MAGIC_NUMBERS_MASKED("<IFRAME", "\xFF\xDF\xDF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
174    MAGIC_NUMBERS_MASKED("<H1", "\xFF\xDF\xFF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
175    MAGIC_NUMBERS_MASKED("<DIV", "\xFF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
176    MAGIC_NUMBERS_MASKED("<FONT", "\xFF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
177    MAGIC_NUMBERS_MASKED("<TABLE", "\xFF\xDF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
178    MAGIC_NUMBERS_MASKED("<A", "\xFF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
179    MAGIC_NUMBERS_MASKED("<STYLE", "\xFF\xDF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
180    MAGIC_NUMBERS_MASKED("<TITLE", "\xFF\xDF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
181    MAGIC_NUMBERS_MASKED("<B", "\xFF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
182    MAGIC_NUMBERS_MASKED("<BODY", "\xFF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
183    MAGIC_NUMBERS_MASKED("<BR", "\xFF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
184    MAGIC_NUMBERS_MASKED("<P", "\xFF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
185    MAGIC_NUMBERS_MASKED("<!--", 0, "text/html", SkipWhiteSpace | TrailingSpaceOrBracket),
186    MAGIC_NUMBERS_MASKED("<?xml", 0, "text/xml", SkipWhiteSpace),
187    MAGIC_NUMBERS_SIMPLE("%PDF-", "application/pdf")
188};
189const size_t securityConstrainedTypesSize = sizeof(securityConstrainedTypes) / sizeof(securityConstrainedTypes[0]);
190
191// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-8
192const MagicNumbers bomTypes[] = {
193    MAGIC_NUMBERS_SIMPLE("\xFE\xFF", "text/plain"), // UTF-16BE BOM
194    MAGIC_NUMBERS_SIMPLE("\xFF\xFE", "text/plain"), // UTF-16LE BOM
195    MAGIC_NUMBERS_SIMPLE("\xEF\xBB\xBF", "text/plain") // UTF-8 BOM
196};
197const size_t bomTypesSize = sizeof(bomTypes) / sizeof(bomTypes[0]);
198
199// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-13
200const MagicNumbers safeTypes[] = {
201    MAGIC_NUMBERS_SIMPLE("%!PS-Adobe-", "application/postscript"),
202    MAGIC_NUMBERS_SIMPLE("\x4F\x67\x67\x53\x00", "application/ogg"), // An Ogg Vorbis audio or video signature.
203    MAGIC_NUMBERS_MASKED("RIFF\x00\x00\x00\x00WAVE", "\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", "audio/x-wave", 0), // "RIFF" followed by four bytes, followed by "WAVE".
204    MAGIC_NUMBERS_SIMPLE("\x1A\x45\xDF\xA3", "video/webm"), // The WebM signature.
205    MAGIC_NUMBERS_SIMPLE("Rar!\x1A\x07\x00", "application/x-rar-compressed"), // A RAR archive.
206    MAGIC_NUMBERS_SIMPLE("\x50\x4B\x03\x04", "application/zip"), // A ZIP archive.
207    MAGIC_NUMBERS_SIMPLE("\x1F\x8B\x08", "application/x-gzip") // A GZIP archive.
208};
209const size_t safeTypesSize = sizeof(safeTypes) / sizeof(safeTypes[0]);
210
211// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-16
212const MagicNumbers imageTypes[] = {
213    MAGIC_NUMBERS_MASKED("RIFF\x00\x00\x00\x00WEBPVP", "\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF", "image/webp", 0), // "RIFF" followed by four bytes, followed by "WEBPVP".
214    MAGIC_NUMBERS_SIMPLE("GIF87a", "image/gif"),
215    MAGIC_NUMBERS_SIMPLE("GIF89a", "image/gif"),
216    MAGIC_NUMBERS_SIMPLE("\x89\x50\x4E\x47\x0D\x0A\x1A\x0A", "image/png"),
217    MAGIC_NUMBERS_SIMPLE("\xFF\xD8\xFF", "image/jpeg"),
218    MAGIC_NUMBERS_SIMPLE("BM", "image/bmp"),
219    MAGIC_NUMBERS_SIMPLE("\x00\x00\x01\x00", "image/vnd.microsoft.icon") // A Windows Icon signature.
220};
221const size_t imageTypesSize = sizeof(imageTypes) / sizeof(imageTypes[0]);
222
223static inline size_t dataSizeNeededForImageSniffing()
224{
225    size_t result = 0;
226    for (int i = 0; i < imageTypesSize; ++i) {
227        if (imageTypes[i].size > result)
228            result = imageTypes[i].size;
229    }
230    return result;
231}
232
233static inline bool maskedCompare(const MagicNumbers& info, const char* data, size_t dataSize)
234{
235    if (dataSize < info.size)
236        return false;
237
238    const uint32_t* pattern32 = reinterpret_cast<const uint32_t*>(info.pattern);
239    const uint32_t* mask32 = reinterpret_cast<const uint32_t*>(info.mask);
240    const uint32_t* data32 = reinterpret_cast<const uint32_t*>(data);
241
242    size_t count = info.size >> 2;
243
244    for (size_t i = 0; i < count; ++i) {
245        if ((*data32++ & *mask32++) != *pattern32++)
246            return false;
247    }
248
249    const char* p = reinterpret_cast<const char*>(pattern32);
250    const char* m = reinterpret_cast<const char*>(mask32);
251    const char* d = reinterpret_cast<const char*>(data32);
252
253    count = info.size & 3;
254
255    for (size_t i = 0; i < count; ++i) {
256        if ((*d++ & *m++) != *p++)
257            return false;
258    }
259
260    return true;
261}
262
263// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-11
264static inline bool checkSpaceOrBracket(const char* data)
265{
266    return isWhiteSpace(*data) || *data == 0x3E;
267}
268
269static inline bool compare(const MagicNumbers& info, const char* data, size_t dataSize)
270{
271    if (info.flags & SkipWhiteSpace) {
272        size_t pos = 0;
273        skipWhiteSpace(data, pos, dataSize);
274        data += pos;
275        dataSize -= pos;
276    }
277
278    bool result;
279    if (info.mask)
280        result = maskedCompare(info, data, info.size);
281    else
282        result = dataSize >= info.size && !memcmp(data, info.pattern, info.size);
283
284    return result && (!(info.flags & TrailingSpaceOrBracket) || checkSpaceOrBracket(data + info.size));
285}
286
287static inline const char* findMIMEType(const char* data, size_t dataSize, const MagicNumbers* types, size_t typesCount)
288{
289    for (size_t i = 0; i < typesCount; ++i) {
290        if (compare(types[i], data, dataSize))
291            return types[i].mimeType;
292    }
293    return 0;
294}
295
296static inline const char* findSimpleMIMEType(const char* data, size_t dataSize, const MagicNumbers* types, size_t typesCount)
297{
298    for (size_t i = 0; i < typesCount; ++i) {
299        ASSERT(!types[i].mask);
300        ASSERT(!types[i].flags);
301
302        if (dataSize >= types[i].size && !memcmp(data, types[i].pattern, types[i].size))
303            return types[i].mimeType;
304    }
305    return 0;
306}
307
308bool isTypeInList(const char* type, const MagicNumbers* types, size_t typesCount)
309{
310    for (size_t i = 0; i < typesCount; ++i) {
311        if (!strcmp(type, types[i].mimeType))
312            return true;
313    }
314    return false;
315}
316
317// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-8
318static const char* internalTextOrBinaryTypeSniffingProcedure(const char* data, size_t dataSize)
319{
320    const char* mimeType = 0;
321
322    mimeType = findSimpleMIMEType(data, dataSize, bomTypes, bomTypesSize);
323    if (mimeType)
324        return mimeType;
325
326    if (!isBinaryData(data, dataSize))
327        return "text/plain";
328
329    mimeType = findMIMEType(data, dataSize, safeTypes, safeTypesSize);
330    if (mimeType)
331        return mimeType;
332
333    mimeType = findMIMEType(data, dataSize, imageTypes, imageTypesSize);
334    if (mimeType)
335        return mimeType;
336
337    return "application/octet-stream";
338}
339
340static const char* textOrBinaryTypeSniffingProcedure(const char* data, size_t dataSize)
341{
342    const char* result = internalTextOrBinaryTypeSniffingProcedure(data, dataSize);
343    ASSERT(!isTypeInList(result, securityConstrainedTypes, securityConstrainedTypesSize));
344    return result;
345}
346
347// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-10
348static const char* unknownTypeSniffingProcedure(const char* data, size_t dataSize)
349{
350    const char* mimeType = 0;
351
352    mimeType = findMIMEType(data, dataSize, securityConstrainedTypes, securityConstrainedTypesSize);
353    if (mimeType)
354        return mimeType;
355
356    mimeType = findSimpleMIMEType(data, dataSize, bomTypes, bomTypesSize);
357    if (mimeType)
358        return mimeType;
359
360    mimeType = findMIMEType(data, dataSize, safeTypes, safeTypesSize);
361    if (mimeType)
362        return mimeType;
363
364    mimeType = findMIMEType(data, dataSize, imageTypes, imageTypesSize);
365    if (mimeType)
366        return mimeType;
367
368    if (!isBinaryData(data, dataSize))
369        return "text/plain";
370
371    return "application/octet-stream";
372}
373
374// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-16
375static const char* imageTypeSniffingProcedure(const char* data, size_t dataSize)
376{
377    return findMIMEType(data, dataSize, imageTypes, imageTypesSize);
378}
379
380static inline bool checkText(const char* data, size_t& pos, size_t dataSize, const char* text, size_t textSize)
381{
382    if (dataSize - pos < textSize || memcmp(data + pos, text, textSize))
383        return false;
384
385    pos += textSize;
386    return true;
387}
388
389const char rssUrl[] = "http://purl.org/rss/1.0";
390const char rdfUrl[] = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
391
392static inline const char* checkRDF(const char* data, size_t pos, size_t dataSize)
393{
394    bool isRDF = false;
395    bool isRSS = false;
396
397    while (pos <= dataSize) {
398        if (checkText(data, pos, dataSize, rssUrl, sizeof(rssUrl) - 1)) {
399            isRSS = true;
400            continue;
401        }
402
403        if (checkText(data, pos, dataSize, rdfUrl, sizeof(rdfUrl) - 1)) {
404            isRDF = true;
405            continue;
406        }
407
408        ++pos;
409
410        if (isRSS && isRDF)
411            return "application/rdf+xml";
412    }
413
414    return 0;
415}
416
417static inline bool skipTag(const char*& data, size_t& pos, size_t dataSize, const char* tag, size_t tagSize, const char* tagEnd, size_t tagEndSize)
418{
419    if (!checkText(data, pos, dataSize, tag, tagSize))
420        return false;
421
422    while (pos < dataSize && !checkText(data, pos, dataSize, tagEnd, tagEndSize))
423        ++pos;
424
425    return true;
426}
427
428// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-17
429static const char* feedTypeSniffingProcedure(const char* data, size_t dataSize)
430{
431    size_t pos = 0;
432
433    if (dataSize >= 3 && !memcmp(data, "\xEF\xBB\xBF", 3))
434        pos += 3;
435
436    while (pos < dataSize) {
437        skipWhiteSpace(data, pos, dataSize);
438
439        if (!skipTag(data, pos, dataSize, "<!--", 4, "-->", 3) && !skipTag(data, pos, dataSize, "<!", 2, "!>", 2) && !skipTag(data, pos, dataSize, "<?", 2, "?>", 2))
440            break;
441    }
442
443    if (checkText(data, pos, dataSize, "<rss", 4))
444        return "application/rss+xml";
445
446    if (checkText(data, pos, dataSize, "<feed", 5))
447        return "application/atom+xml";
448
449    if (checkText(data, pos, dataSize, "<rdf:RDF", 8))
450        return checkRDF(data, pos, dataSize);
451
452    return 0;
453}
454
455}
456
457// http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-6
458MIMESniffer::MIMESniffer(const char* advertisedMIMEType, bool isSupportedImageType)
459    : m_dataSize(0)
460    , m_function(0)
461{
462    if (!advertisedMIMEType) {
463        m_dataSize = 512;
464        m_function = &unknownTypeSniffingProcedure;
465        return;
466    }
467
468    if (isTextOrBinaryType(advertisedMIMEType)) {
469        m_dataSize = 512;
470        m_function = &textOrBinaryTypeSniffingProcedure;
471        return;
472    }
473
474    if (isUnknownType(advertisedMIMEType)) {
475        m_dataSize = 512;
476        m_function = &unknownTypeSniffingProcedure;
477        return;
478    }
479
480    if (isXMLType(advertisedMIMEType))
481        return;
482
483    if (isSupportedImageType) {
484        static const size_t dataSize = dataSizeNeededForImageSniffing();
485        m_dataSize = dataSize;
486        m_function = &imageTypeSniffingProcedure;
487        return;
488    }
489
490    if (!strcmp(advertisedMIMEType, "text/html")) {
491        m_dataSize = 512;
492        m_function = &feedTypeSniffingProcedure;
493        return;
494    }
495}
496