1/*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32#include "platform/mhtml/MHTMLParser.h"
33
34#include "platform/MIMETypeRegistry.h"
35#include "platform/mhtml/ArchiveResource.h"
36#include "platform/mhtml/MHTMLArchive.h"
37#include "platform/network/ParsedContentType.h"
38#include "platform/text/QuotedPrintable.h"
39#include "wtf/HashMap.h"
40#include "wtf/RefCounted.h"
41#include "wtf/text/Base64.h"
42#include "wtf/text/StringBuilder.h"
43#include "wtf/text/StringConcatenate.h"
44#include "wtf/text/StringHash.h"
45#include "wtf/text/WTFString.h"
46
47namespace blink {
48
49// This class is a limited MIME parser used to parse the MIME headers of MHTML files.
50class MIMEHeader : public RefCountedWillBeGarbageCollectedFinalized<MIMEHeader> {
51public:
52    static PassRefPtrWillBeRawPtr<MIMEHeader> create()
53    {
54        return adoptRefWillBeNoop(new MIMEHeader());
55    }
56
57    enum Encoding {
58        QuotedPrintable,
59        Base64,
60        EightBit,
61        SevenBit,
62        Binary,
63        Unknown
64    };
65
66    static PassRefPtrWillBeRawPtr<MIMEHeader> parseHeader(SharedBufferChunkReader* crLFLineReader);
67
68    bool isMultipart() const { return m_contentType.startsWith("multipart/"); }
69
70    String contentType() const { return m_contentType; }
71    String charset() const { return m_charset; }
72    Encoding contentTransferEncoding() const { return m_contentTransferEncoding; }
73    String contentLocation() const { return m_contentLocation; }
74
75    // Multi-part type and boundaries are only valid for multipart MIME headers.
76    String multiPartType() const { return m_multipartType; }
77    String endOfPartBoundary() const { return m_endOfPartBoundary; }
78    String endOfDocumentBoundary() const { return m_endOfDocumentBoundary; }
79
80    void trace(Visitor*) { }
81
82private:
83    MIMEHeader();
84
85    static Encoding parseContentTransferEncoding(const String&);
86
87    String m_contentType;
88    String m_charset;
89    Encoding m_contentTransferEncoding;
90    String m_contentLocation;
91    String m_multipartType;
92    String m_endOfPartBoundary;
93    String m_endOfDocumentBoundary;
94};
95
96typedef HashMap<String, String> KeyValueMap;
97
98static KeyValueMap retrieveKeyValuePairs(blink::SharedBufferChunkReader* buffer)
99{
100    KeyValueMap keyValuePairs;
101    String line;
102    String key;
103    StringBuilder value;
104    while (!(line = buffer->nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
105        if (line.isEmpty())
106            break; // Empty line means end of key/value section.
107        if (line[0] == '\t') {
108            ASSERT(!key.isEmpty());
109            value.append(line.substring(1));
110            continue;
111        }
112        // New key/value, store the previous one if any.
113        if (!key.isEmpty()) {
114            if (keyValuePairs.find(key) != keyValuePairs.end())
115                WTF_LOG_ERROR("Key duplicate found in MIME header. Key is '%s', previous value replaced.", key.ascii().data());
116            keyValuePairs.add(key, value.toString().stripWhiteSpace());
117            key = String();
118            value.clear();
119        }
120        size_t semiColonIndex = line.find(':');
121        if (semiColonIndex == kNotFound) {
122            // This is not a key value pair, ignore.
123            continue;
124        }
125        key = line.substring(0, semiColonIndex).lower().stripWhiteSpace();
126        value.append(line.substring(semiColonIndex + 1));
127    }
128    // Store the last property if there is one.
129    if (!key.isEmpty())
130        keyValuePairs.set(key, value.toString().stripWhiteSpace());
131    return keyValuePairs;
132}
133
134PassRefPtrWillBeRawPtr<MIMEHeader> MIMEHeader::parseHeader(SharedBufferChunkReader* buffer)
135{
136    RefPtrWillBeRawPtr<MIMEHeader> mimeHeader = MIMEHeader::create();
137    KeyValueMap keyValuePairs = retrieveKeyValuePairs(buffer);
138    KeyValueMap::iterator mimeParametersIterator = keyValuePairs.find("content-type");
139    if (mimeParametersIterator != keyValuePairs.end()) {
140        ParsedContentType parsedContentType(mimeParametersIterator->value);
141        mimeHeader->m_contentType = parsedContentType.mimeType();
142        if (!mimeHeader->isMultipart()) {
143            mimeHeader->m_charset = parsedContentType.charset().stripWhiteSpace();
144        } else {
145            mimeHeader->m_multipartType = parsedContentType.parameterValueForName("type");
146            mimeHeader->m_endOfPartBoundary = parsedContentType.parameterValueForName("boundary");
147            if (mimeHeader->m_endOfPartBoundary.isNull()) {
148                WTF_LOG_ERROR("No boundary found in multipart MIME header.");
149                return nullptr;
150            }
151            mimeHeader->m_endOfPartBoundary.insert("--", 0);
152            mimeHeader->m_endOfDocumentBoundary = mimeHeader->m_endOfPartBoundary;
153            mimeHeader->m_endOfDocumentBoundary.append("--");
154        }
155    }
156
157    mimeParametersIterator = keyValuePairs.find("content-transfer-encoding");
158    if (mimeParametersIterator != keyValuePairs.end())
159        mimeHeader->m_contentTransferEncoding = parseContentTransferEncoding(mimeParametersIterator->value);
160
161    mimeParametersIterator = keyValuePairs.find("content-location");
162    if (mimeParametersIterator != keyValuePairs.end())
163        mimeHeader->m_contentLocation = mimeParametersIterator->value;
164
165    return mimeHeader.release();
166}
167
168MIMEHeader::Encoding MIMEHeader::parseContentTransferEncoding(const String& text)
169{
170    String encoding = text.stripWhiteSpace().lower();
171    if (encoding == "base64")
172        return Base64;
173    if (encoding == "quoted-printable")
174        return QuotedPrintable;
175    if (encoding == "8bit")
176        return EightBit;
177    if (encoding == "7bit")
178        return SevenBit;
179    if (encoding == "binary")
180        return Binary;
181    WTF_LOG_ERROR("Unknown encoding '%s' found in MIME header.", text.ascii().data());
182    return Unknown;
183}
184
185MIMEHeader::MIMEHeader()
186    : m_contentTransferEncoding(Unknown)
187{
188}
189
190static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary)
191{
192    String line;
193    while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
194        if (line == boundary)
195            return true;
196    }
197    return false;
198}
199
200MHTMLParser::MHTMLParser(SharedBuffer* data)
201    : m_lineReader(data, "\r\n")
202{
203}
204
205PassRefPtrWillBeRawPtr<MHTMLArchive> MHTMLParser::parseArchive()
206{
207    RefPtrWillBeRawPtr<MIMEHeader> header = MIMEHeader::parseHeader(&m_lineReader);
208    return parseArchiveWithHeader(header.get());
209}
210
211PassRefPtrWillBeRawPtr<MHTMLArchive> MHTMLParser::parseArchiveWithHeader(MIMEHeader* header)
212{
213    if (!header) {
214        WTF_LOG_ERROR("Failed to parse MHTML part: no header.");
215        return nullptr;
216    }
217
218    RefPtrWillBeRawPtr<MHTMLArchive> archive = MHTMLArchive::create();
219    if (!header->isMultipart()) {
220        // With IE a page with no resource is not multi-part.
221        bool endOfArchiveReached = false;
222        RefPtrWillBeRawPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached);
223        if (!resource)
224            return nullptr;
225        archive->setMainResource(resource);
226        return archive;
227    }
228
229    // Skip the message content (it's a generic browser specific message).
230    skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
231
232    bool endOfArchive = false;
233    while (!endOfArchive) {
234        RefPtrWillBeRawPtr<MIMEHeader> resourceHeader = MIMEHeader::parseHeader(&m_lineReader);
235        if (!resourceHeader) {
236            WTF_LOG_ERROR("Failed to parse MHTML, invalid MIME header.");
237            return nullptr;
238        }
239        if (resourceHeader->contentType() == "multipart/alternative") {
240            // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames).
241            RefPtrWillBeRawPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get());
242            if (!subframeArchive) {
243                WTF_LOG_ERROR("Failed to parse MHTML subframe.");
244                return nullptr;
245            }
246            bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
247            ASSERT_UNUSED(endOfPartReached, endOfPartReached);
248            // The top-frame is the first frame found, regardless of the nesting level.
249            if (subframeArchive->mainResource())
250                addResourceToArchive(subframeArchive->mainResource(), archive.get());
251            archive->addSubframeArchive(subframeArchive);
252            continue;
253        }
254
255        RefPtrWillBeRawPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive);
256        if (!resource) {
257            WTF_LOG_ERROR("Failed to parse MHTML part.");
258            return nullptr;
259        }
260        addResourceToArchive(resource.get(), archive.get());
261    }
262
263    return archive.release();
264}
265
266void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive)
267{
268    const AtomicString& mimeType = resource->mimeType();
269    if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css") {
270        m_resources.append(resource);
271        return;
272    }
273
274    // The first document suitable resource is the main frame.
275    if (!archive->mainResource()) {
276        archive->setMainResource(resource);
277        m_frames.append(archive);
278        return;
279    }
280
281    RefPtrWillBeRawPtr<MHTMLArchive> subframe = MHTMLArchive::create();
282    subframe->setMainResource(resource);
283    m_frames.append(subframe);
284}
285
286PassRefPtrWillBeRawPtr<ArchiveResource> MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached)
287{
288    ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty());
289
290    // If no content transfer encoding is specified, default to binary encoding.
291    MIMEHeader::Encoding contentTransferEncoding = mimeHeader.contentTransferEncoding();
292    if (contentTransferEncoding == MIMEHeader::Unknown)
293        contentTransferEncoding = MIMEHeader::Binary;
294
295    RefPtr<SharedBuffer> content = SharedBuffer::create();
296    const bool checkBoundary = !endOfPartBoundary.isEmpty();
297    bool endOfPartReached = false;
298    if (contentTransferEncoding == MIMEHeader::Binary) {
299        if (!checkBoundary) {
300            WTF_LOG_ERROR("Binary contents requires end of part");
301            return nullptr;
302        }
303        m_lineReader.setSeparator(endOfPartBoundary.utf8().data());
304        Vector<char> part;
305        if (!m_lineReader.nextChunk(part)) {
306            WTF_LOG_ERROR("Binary contents requires end of part");
307            return nullptr;
308        }
309        content->append(part);
310        m_lineReader.setSeparator("\r\n");
311        Vector<char> nextChars;
312        if (m_lineReader.peek(nextChars, 2) != 2) {
313            WTF_LOG_ERROR("Invalid seperator.");
314            return nullptr;
315        }
316        endOfPartReached = true;
317        ASSERT(nextChars.size() == 2);
318        endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-');
319        if (!endOfArchiveReached) {
320            String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback();
321            if (!line.isEmpty()) {
322                WTF_LOG_ERROR("No CRLF at end of binary section.");
323                return nullptr;
324            }
325        }
326    } else {
327        String line;
328        while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
329            endOfArchiveReached = (line == endOfDocumentBoundary);
330            if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) {
331                endOfPartReached = true;
332                break;
333            }
334            // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'.
335            content->append(line.utf8().data(), line.length());
336            if (contentTransferEncoding == MIMEHeader::QuotedPrintable) {
337                // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines.
338                content->append("\r\n", 2);
339            }
340        }
341    }
342    if (!endOfPartReached && checkBoundary) {
343        WTF_LOG_ERROR("No bounday found for MHTML part.");
344        return nullptr;
345    }
346
347    Vector<char> data;
348    switch (contentTransferEncoding) {
349    case MIMEHeader::Base64:
350        if (!base64Decode(content->data(), content->size(), data)) {
351            WTF_LOG_ERROR("Invalid base64 content for MHTML part.");
352            return nullptr;
353        }
354        break;
355    case MIMEHeader::QuotedPrintable:
356        quotedPrintableDecode(content->data(), content->size(), data);
357        break;
358    case MIMEHeader::EightBit:
359    case MIMEHeader::SevenBit:
360    case MIMEHeader::Binary:
361        data.append(content->data(), content->size());
362        break;
363    default:
364        WTF_LOG_ERROR("Invalid encoding for MHTML part.");
365        return nullptr;
366    }
367    RefPtr<SharedBuffer> contentBuffer = SharedBuffer::adoptVector(data);
368    // FIXME: the URL in the MIME header could be relative, we should resolve it if it is.
369    // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5
370    // IE and Firefox (UNMht) seem to generate only absolute URLs.
371    KURL location = KURL(KURL(), mimeHeader.contentLocation());
372    return ArchiveResource::create(contentBuffer, location, AtomicString(mimeHeader.contentType()), AtomicString(mimeHeader.charset()), String());
373}
374
375size_t MHTMLParser::frameCount() const
376{
377    return m_frames.size();
378}
379
380MHTMLArchive* MHTMLParser::frameAt(size_t index) const
381{
382    return m_frames[index].get();
383}
384
385size_t MHTMLParser::subResourceCount() const
386{
387    return m_resources.size();
388}
389
390ArchiveResource* MHTMLParser::subResourceAt(size_t index) const
391{
392    return m_resources[index].get();
393}
394
395}
396