1/*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32
33#include "core/loader/archive/MHTMLArchive.h"
34
35#include "core/loader/archive/MHTMLParser.h"
36#include "core/platform/MIMETypeRegistry.h"
37#include "core/platform/SerializedResource.h"
38#include "core/platform/SharedBuffer.h"
39#include "core/platform/text/QuotedPrintable.h"
40#include "weborigin/SchemeRegistry.h"
41#include "wtf/CryptographicallyRandomNumber.h"
42#include "wtf/DateMath.h"
43#include "wtf/GregorianDateTime.h"
44#include "wtf/text/Base64.h"
45#include "wtf/text/StringBuilder.h"
46
47namespace WebCore {
48
// Content-Transfer-Encoding tokens written into the header of each MHTML part
// by generateMHTMLData(), which also compares against them to pick the body
// encoding routine.
const char* const quotedPrintable = "quoted-printable";
const char* const base64 = "base64";
// Selected for every resource when the caller passes UseBinaryEncoding.
const char* const binary = "binary";
52
53static String generateRandomBoundary()
54{
55    // Trying to generate random boundaries similar to IE/UnMHT (ex: ----=_NextPart_000_001B_01CC157B.96F808A0).
56    const size_t randomValuesLength = 10;
57    char randomValues[randomValuesLength];
58    cryptographicallyRandomValues(&randomValues, randomValuesLength);
59    StringBuilder stringBuilder;
60    stringBuilder.append("----=_NextPart_000_");
61    for (size_t i = 0; i < randomValuesLength; ++i) {
62        if (i == 2)
63            stringBuilder.append('_');
64        else if (i == 6)
65            stringBuilder.append('.');
66        stringBuilder.append(lowerNibbleToASCIIHexDigit(randomValues[i]));
67        stringBuilder.append(upperNibbleToASCIIHexDigit(randomValues[i]));
68    }
69    return stringBuilder.toString();
70}
71
72static String replaceNonPrintableCharacters(const String& text)
73{
74    StringBuilder stringBuilder;
75    for (size_t i = 0; i < text.length(); ++i) {
76        if (isASCIIPrintable(text[i]))
77            stringBuilder.append(text[i]);
78        else
79            stringBuilder.append('?');
80    }
81    return stringBuilder.toString();
82}
83
// Default constructor; the body performs no initialization of its own.
// The create() factories in this file are the visible callers.
MHTMLArchive::MHTMLArchive()
{
}
87
// Drops the references this archive holds on its subframe archives, and
// recursively the references those archives hold, before the object goes away.
MHTMLArchive::~MHTMLArchive()
{
    // Because all frames know about each other we need to perform a deep clearing of the archives graph.
    clearAllSubframeArchives();
}
93
94PassRefPtr<MHTMLArchive> MHTMLArchive::create()
95{
96    return adoptRef(new MHTMLArchive);
97}
98
99PassRefPtr<MHTMLArchive> MHTMLArchive::create(const KURL& url, SharedBuffer* data)
100{
101    // For security reasons we only load MHTML pages from local URLs.
102    if (!SchemeRegistry::shouldTreatURLSchemeAsLocal(url.protocol()))
103        return 0;
104
105    MHTMLParser parser(data);
106    RefPtr<MHTMLArchive> mainArchive = parser.parseArchive();
107    if (!mainArchive)
108        return 0; // Invalid MHTML file.
109
110    // Since MHTML is a flat format, we need to make all frames aware of all resources.
111    for (size_t i = 0; i < parser.frameCount(); ++i) {
112        RefPtr<MHTMLArchive> archive = parser.frameAt(i);
113        for (size_t j = 1; j < parser.frameCount(); ++j) {
114            if (i != j)
115                archive->addSubframeArchive(parser.frameAt(j));
116        }
117        for (size_t j = 0; j < parser.subResourceCount(); ++j)
118            archive->addSubresource(parser.subResourceAt(j));
119    }
120    return mainArchive.release();
121}
122
123PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLData(const Vector<SerializedResource>& resources, EncodingPolicy encodingPolicy, const String& title, const String& mimeType)
124{
125    String boundary = generateRandomBoundary();
126    String endOfResourceBoundary = "--" + boundary + "\r\n";
127
128    GregorianDateTime now;
129    now.setToCurrentLocalTime();
130    String dateString = makeRFC2822DateString(now.weekDay(), now.monthDay(), now.month(), now.year(), now.hour(), now.minute(), now.second(), now.utcOffset() / 60);
131
132    StringBuilder stringBuilder;
133    stringBuilder.append("From: <Saved by WebKit>\r\n");
134    stringBuilder.append("Subject: ");
135    // We replace non ASCII characters with '?' characters to match IE's behavior.
136    stringBuilder.append(replaceNonPrintableCharacters(title));
137    stringBuilder.append("\r\nDate: ");
138    stringBuilder.append(dateString);
139    stringBuilder.append("\r\nMIME-Version: 1.0\r\n");
140    stringBuilder.append("Content-Type: multipart/related;\r\n");
141    stringBuilder.append("\ttype=\"");
142    stringBuilder.append(mimeType);
143    stringBuilder.append("\";\r\n");
144    stringBuilder.append("\tboundary=\"");
145    stringBuilder.append(boundary);
146    stringBuilder.append("\"\r\n\r\n");
147
148    // We use utf8() below instead of ascii() as ascii() replaces CRLFs with ?? (we still only have put ASCII characters in it).
149    ASSERT(stringBuilder.toString().containsOnlyASCII());
150    CString asciiString = stringBuilder.toString().utf8();
151    RefPtr<SharedBuffer> mhtmlData = SharedBuffer::create();
152    mhtmlData->append(asciiString.data(), asciiString.length());
153
154    for (size_t i = 0; i < resources.size(); ++i) {
155        const SerializedResource& resource = resources[i];
156
157        stringBuilder.clear();
158        stringBuilder.append(endOfResourceBoundary);
159        stringBuilder.append("Content-Type: ");
160        stringBuilder.append(resource.mimeType);
161
162        const char* contentEncoding = 0;
163        if (encodingPolicy == UseBinaryEncoding)
164            contentEncoding = binary;
165        else if (MIMETypeRegistry::isSupportedJavaScriptMIMEType(resource.mimeType) || MIMETypeRegistry::isSupportedNonImageMIMEType(resource.mimeType))
166            contentEncoding = quotedPrintable;
167        else
168            contentEncoding = base64;
169
170        stringBuilder.append("\r\nContent-Transfer-Encoding: ");
171        stringBuilder.append(contentEncoding);
172        stringBuilder.append("\r\nContent-Location: ");
173        stringBuilder.append(resource.url);
174        stringBuilder.append("\r\n\r\n");
175
176        asciiString = stringBuilder.toString().utf8();
177        mhtmlData->append(asciiString.data(), asciiString.length());
178
179        if (!strcmp(contentEncoding, binary)) {
180            const char* data;
181            size_t position = 0;
182            while (size_t length = resource.data->getSomeData(data, position)) {
183                mhtmlData->append(data, length);
184                position += length;
185            }
186        } else {
187            // FIXME: ideally we would encode the content as a stream without having to fetch it all.
188            const char* data = resource.data->data();
189            size_t dataLength = resource.data->size();
190            Vector<char> encodedData;
191            if (!strcmp(contentEncoding, quotedPrintable)) {
192                quotedPrintableEncode(data, dataLength, encodedData);
193                mhtmlData->append(encodedData.data(), encodedData.size());
194                mhtmlData->append("\r\n", 2);
195            } else {
196                ASSERT(!strcmp(contentEncoding, base64));
197                // We are not specifying insertLFs = true below as it would cut the lines with LFs and MHTML requires CRLFs.
198                base64Encode(data, dataLength, encodedData);
199                const size_t maximumLineLength = 76;
200                size_t index = 0;
201                size_t encodedDataLength = encodedData.size();
202                do {
203                    size_t lineLength = std::min(encodedDataLength - index, maximumLineLength);
204                    mhtmlData->append(encodedData.data() + index, lineLength);
205                    mhtmlData->append("\r\n", 2);
206                    index += maximumLineLength;
207                } while (index < encodedDataLength);
208            }
209        }
210    }
211
212    asciiString = String("--" + boundary + "--\r\n").utf8();
213    mhtmlData->append(asciiString.data(), asciiString.length());
214
215    return mhtmlData.release();
216}
217
218void MHTMLArchive::clearAllSubframeArchives()
219{
220    Vector<RefPtr<MHTMLArchive> > clearedArchives;
221    clearAllSubframeArchivesImpl(&clearedArchives);
222}
223
224void MHTMLArchive::clearAllSubframeArchivesImpl(Vector<RefPtr<MHTMLArchive> >* clearedArchives)
225{
226    for (Vector<RefPtr<MHTMLArchive> >::iterator it = m_subframeArchives.begin(); it != m_subframeArchives.end(); ++it) {
227        if (!clearedArchives->contains(*it)) {
228            clearedArchives->append(*it);
229            (*it)->clearAllSubframeArchivesImpl(clearedArchives);
230        }
231    }
232    m_subframeArchives.clear();
233}
234
235}
236