1/* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31#include "config.h" 32 33#include "core/loader/archive/MHTMLArchive.h" 34 35#include "core/loader/archive/MHTMLParser.h" 36#include "core/platform/MIMETypeRegistry.h" 37#include "core/platform/SerializedResource.h" 38#include "core/platform/SharedBuffer.h" 39#include "core/platform/text/QuotedPrintable.h" 40#include "weborigin/SchemeRegistry.h" 41#include "wtf/CryptographicallyRandomNumber.h" 42#include "wtf/DateMath.h" 43#include "wtf/GregorianDateTime.h" 44#include "wtf/text/Base64.h" 45#include "wtf/text/StringBuilder.h" 46 47namespace WebCore { 48 49const char* const quotedPrintable = "quoted-printable"; 50const char* const base64 = "base64"; 51const char* const binary = "binary"; 52 53static String generateRandomBoundary() 54{ 55 // Trying to generate random boundaries similar to IE/UnMHT (ex: ----=_NextPart_000_001B_01CC157B.96F808A0). 56 const size_t randomValuesLength = 10; 57 char randomValues[randomValuesLength]; 58 cryptographicallyRandomValues(&randomValues, randomValuesLength); 59 StringBuilder stringBuilder; 60 stringBuilder.append("----=_NextPart_000_"); 61 for (size_t i = 0; i < randomValuesLength; ++i) { 62 if (i == 2) 63 stringBuilder.append('_'); 64 else if (i == 6) 65 stringBuilder.append('.'); 66 stringBuilder.append(lowerNibbleToASCIIHexDigit(randomValues[i])); 67 stringBuilder.append(upperNibbleToASCIIHexDigit(randomValues[i])); 68 } 69 return stringBuilder.toString(); 70} 71 72static String replaceNonPrintableCharacters(const String& text) 73{ 74 StringBuilder stringBuilder; 75 for (size_t i = 0; i < text.length(); ++i) { 76 if (isASCIIPrintable(text[i])) 77 stringBuilder.append(text[i]); 78 else 79 stringBuilder.append('?'); 80 } 81 return stringBuilder.toString(); 82} 83 84MHTMLArchive::MHTMLArchive() 85{ 86} 87 88MHTMLArchive::~MHTMLArchive() 89{ 90 // Because all frames know about each other we need to perform a deep clearing of the archives graph. 91 clearAllSubframeArchives(); 92} 93 94PassRefPtr<MHTMLArchive> MHTMLArchive::create() 95{ 96 return adoptRef(new MHTMLArchive); 97} 98 99PassRefPtr<MHTMLArchive> MHTMLArchive::create(const KURL& url, SharedBuffer* data) 100{ 101 // For security reasons we only load MHTML pages from local URLs. 102 if (!SchemeRegistry::shouldTreatURLSchemeAsLocal(url.protocol())) 103 return 0; 104 105 MHTMLParser parser(data); 106 RefPtr<MHTMLArchive> mainArchive = parser.parseArchive(); 107 if (!mainArchive) 108 return 0; // Invalid MHTML file. 109 110 // Since MHTML is a flat format, we need to make all frames aware of all resources. 111 for (size_t i = 0; i < parser.frameCount(); ++i) { 112 RefPtr<MHTMLArchive> archive = parser.frameAt(i); 113 for (size_t j = 1; j < parser.frameCount(); ++j) { 114 if (i != j) 115 archive->addSubframeArchive(parser.frameAt(j)); 116 } 117 for (size_t j = 0; j < parser.subResourceCount(); ++j) 118 archive->addSubresource(parser.subResourceAt(j)); 119 } 120 return mainArchive.release(); 121} 122 123PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLData(const Vector<SerializedResource>& resources, EncodingPolicy encodingPolicy, const String& title, const String& mimeType) 124{ 125 String boundary = generateRandomBoundary(); 126 String endOfResourceBoundary = "--" + boundary + "\r\n"; 127 128 GregorianDateTime now; 129 now.setToCurrentLocalTime(); 130 String dateString = makeRFC2822DateString(now.weekDay(), now.monthDay(), now.month(), now.year(), now.hour(), now.minute(), now.second(), now.utcOffset() / 60); 131 132 StringBuilder stringBuilder; 133 stringBuilder.append("From: <Saved by WebKit>\r\n"); 134 stringBuilder.append("Subject: "); 135 // We replace non ASCII characters with '?' characters to match IE's behavior. 136 stringBuilder.append(replaceNonPrintableCharacters(title)); 137 stringBuilder.append("\r\nDate: "); 138 stringBuilder.append(dateString); 139 stringBuilder.append("\r\nMIME-Version: 1.0\r\n"); 140 stringBuilder.append("Content-Type: multipart/related;\r\n"); 141 stringBuilder.append("\ttype=\""); 142 stringBuilder.append(mimeType); 143 stringBuilder.append("\";\r\n"); 144 stringBuilder.append("\tboundary=\""); 145 stringBuilder.append(boundary); 146 stringBuilder.append("\"\r\n\r\n"); 147 148 // We use utf8() below instead of ascii() as ascii() replaces CRLFs with ?? (we still only have put ASCII characters in it). 149 ASSERT(stringBuilder.toString().containsOnlyASCII()); 150 CString asciiString = stringBuilder.toString().utf8(); 151 RefPtr<SharedBuffer> mhtmlData = SharedBuffer::create(); 152 mhtmlData->append(asciiString.data(), asciiString.length()); 153 154 for (size_t i = 0; i < resources.size(); ++i) { 155 const SerializedResource& resource = resources[i]; 156 157 stringBuilder.clear(); 158 stringBuilder.append(endOfResourceBoundary); 159 stringBuilder.append("Content-Type: "); 160 stringBuilder.append(resource.mimeType); 161 162 const char* contentEncoding = 0; 163 if (encodingPolicy == UseBinaryEncoding) 164 contentEncoding = binary; 165 else if (MIMETypeRegistry::isSupportedJavaScriptMIMEType(resource.mimeType) || MIMETypeRegistry::isSupportedNonImageMIMEType(resource.mimeType)) 166 contentEncoding = quotedPrintable; 167 else 168 contentEncoding = base64; 169 170 stringBuilder.append("\r\nContent-Transfer-Encoding: "); 171 stringBuilder.append(contentEncoding); 172 stringBuilder.append("\r\nContent-Location: "); 173 stringBuilder.append(resource.url); 174 stringBuilder.append("\r\n\r\n"); 175 176 asciiString = stringBuilder.toString().utf8(); 177 mhtmlData->append(asciiString.data(), asciiString.length()); 178 179 if (!strcmp(contentEncoding, binary)) { 180 const char* data; 181 size_t position = 0; 182 while (size_t length = resource.data->getSomeData(data, position)) { 183 mhtmlData->append(data, length); 184 position += length; 185 } 186 } else { 187 // FIXME: ideally we would encode the content as a stream without having to fetch it all. 188 const char* data = resource.data->data(); 189 size_t dataLength = resource.data->size(); 190 Vector<char> encodedData; 191 if (!strcmp(contentEncoding, quotedPrintable)) { 192 quotedPrintableEncode(data, dataLength, encodedData); 193 mhtmlData->append(encodedData.data(), encodedData.size()); 194 mhtmlData->append("\r\n", 2); 195 } else { 196 ASSERT(!strcmp(contentEncoding, base64)); 197 // We are not specifying insertLFs = true below as it would cut the lines with LFs and MHTML requires CRLFs. 198 base64Encode(data, dataLength, encodedData); 199 const size_t maximumLineLength = 76; 200 size_t index = 0; 201 size_t encodedDataLength = encodedData.size(); 202 do { 203 size_t lineLength = std::min(encodedDataLength - index, maximumLineLength); 204 mhtmlData->append(encodedData.data() + index, lineLength); 205 mhtmlData->append("\r\n", 2); 206 index += maximumLineLength; 207 } while (index < encodedDataLength); 208 } 209 } 210 } 211 212 asciiString = String("--" + boundary + "--\r\n").utf8(); 213 mhtmlData->append(asciiString.data(), asciiString.length()); 214 215 return mhtmlData.release(); 216} 217 218void MHTMLArchive::clearAllSubframeArchives() 219{ 220 Vector<RefPtr<MHTMLArchive> > clearedArchives; 221 clearAllSubframeArchivesImpl(&clearedArchives); 222} 223 224void MHTMLArchive::clearAllSubframeArchivesImpl(Vector<RefPtr<MHTMLArchive> >* clearedArchives) 225{ 226 for (Vector<RefPtr<MHTMLArchive> >::iterator it = m_subframeArchives.begin(); it != m_subframeArchives.end(); ++it) { 227 if (!clearedArchives->contains(*it)) { 228 clearedArchives->append(*it); 229 (*it)->clearAllSubframeArchivesImpl(clearedArchives); 230 } 231 } 232 m_subframeArchives.clear(); 233} 234 235} 236