// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This is a port of ManifestParser.cc from WebKit/WebCore/loader/appcache.

/*
 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "webkit/browser/appcache/manifest_parser.h"

#include "base/command_line.h"
#include "base/i18n/icu_string_conversions.h"
#include "base/logging.h"
#include "base/strings/utf_string_conversions.h"
#include "url/gurl.h"

namespace appcache {

namespace {

// Helper function used to identify 'isPattern' annotations.
45bool HasPatternMatchingAnnotation(const wchar_t* line_p, 46 const wchar_t* line_end) { 47 // Skip whitespace separating the resource url from the annotation. 48 // Note: trailing whitespace has already been trimmed from the line. 49 while (line_p < line_end && (*line_p == '\t' || *line_p == ' ')) 50 ++line_p; 51 if (line_p == line_end) 52 return false; 53 std::wstring annotation(line_p, line_end - line_p); 54 return annotation == L"isPattern"; 55} 56 57} 58 59enum Mode { 60 EXPLICIT, 61 INTERCEPT, 62 FALLBACK, 63 ONLINE_WHITELIST, 64 UNKNOWN_MODE, 65}; 66 67enum InterceptVerb { 68 RETURN, 69 EXECUTE, 70 UNKNOWN_VERB, 71}; 72 73Manifest::Manifest() 74 : online_whitelist_all(false), 75 did_ignore_intercept_namespaces(false) { 76} 77 78Manifest::~Manifest() {} 79 80bool ParseManifest(const GURL& manifest_url, const char* data, int length, 81 ParseMode parse_mode, Manifest& manifest) { 82 // This is an implementation of the parsing algorithm specified in 83 // the HTML5 offline web application docs: 84 // http://www.w3.org/TR/html5/offline.html 85 // Do not modify it without consulting those docs. 86 // Though you might be tempted to convert these wstrings to UTF-8 or 87 // base::string16, this implementation seems simpler given the constraints. 88 89 const wchar_t kSignature[] = L"CACHE MANIFEST"; 90 const size_t kSignatureLength = arraysize(kSignature) - 1; 91 const wchar_t kChromiumSignature[] = L"CHROMIUM CACHE MANIFEST"; 92 const size_t kChromiumSignatureLength = arraysize(kChromiumSignature) - 1; 93 94 DCHECK(manifest.explicit_urls.empty()); 95 DCHECK(manifest.fallback_namespaces.empty()); 96 DCHECK(manifest.online_whitelist_namespaces.empty()); 97 DCHECK(!manifest.online_whitelist_all); 98 DCHECK(!manifest.did_ignore_intercept_namespaces); 99 100 Mode mode = EXPLICIT; 101 102 std::wstring data_string; 103 // TODO(jennb): cannot do UTF8ToWide(data, length, &data_string); 104 // until UTF8ToWide uses 0xFFFD Unicode replacement character. 
105 base::CodepageToWide(std::string(data, length), base::kCodepageUTF8, 106 base::OnStringConversionError::SUBSTITUTE, &data_string); 107 const wchar_t* p = data_string.c_str(); 108 const wchar_t* end = p + data_string.length(); 109 110 // Look for the magic signature: "^\xFEFF?CACHE MANIFEST[ \t]?" 111 // Example: "CACHE MANIFEST #comment" is a valid signature. 112 // Example: "CACHE MANIFEST;V2" is not. 113 114 // When the input data starts with a UTF-8 Byte-Order-Mark 115 // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a 116 // Unicode BOM (U+FEFF). Skip a converted Unicode BOM if it exists. 117 int bom_offset = 0; 118 if (!data_string.empty() && data_string[0] == 0xFEFF) { 119 bom_offset = 1; 120 ++p; 121 } 122 123 if (p >= end) 124 return false; 125 126 // Check for a supported signature and skip p past it. 127 if (0 == data_string.compare(bom_offset, kSignatureLength, 128 kSignature)) { 129 p += kSignatureLength; 130 } else if (0 == data_string.compare(bom_offset, kChromiumSignatureLength, 131 kChromiumSignature)) { 132 p += kChromiumSignatureLength; 133 } else { 134 return false; 135 } 136 137 // Character after "CACHE MANIFEST" must be whitespace. 138 if (p < end && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r') 139 return false; 140 141 // Skip to the end of the line. 
142 while (p < end && *p != '\r' && *p != '\n') 143 ++p; 144 145 while (1) { 146 // Skip whitespace 147 while (p < end && (*p == '\n' || *p == '\r' || *p == ' ' || *p == '\t')) 148 ++p; 149 150 if (p == end) 151 break; 152 153 const wchar_t* line_start = p; 154 155 // Find the end of the line 156 while (p < end && *p != '\r' && *p != '\n') 157 ++p; 158 159 // Check if we have a comment 160 if (*line_start == '#') 161 continue; 162 163 // Get rid of trailing whitespace 164 const wchar_t* tmp = p - 1; 165 while (tmp > line_start && (*tmp == ' ' || *tmp == '\t')) 166 --tmp; 167 168 std::wstring line(line_start, tmp - line_start + 1); 169 170 if (line == L"CACHE:") { 171 mode = EXPLICIT; 172 } else if (line == L"FALLBACK:") { 173 mode = FALLBACK; 174 } else if (line == L"NETWORK:") { 175 mode = ONLINE_WHITELIST; 176 } else if (line == L"CHROMIUM-INTERCEPT:") { 177 mode = INTERCEPT; 178 } else if (*(line.end() - 1) == ':') { 179 mode = UNKNOWN_MODE; 180 } else if (mode == UNKNOWN_MODE) { 181 continue; 182 } else if (line == L"*" && mode == ONLINE_WHITELIST) { 183 manifest.online_whitelist_all = true; 184 continue; 185 } else if (mode == EXPLICIT || mode == ONLINE_WHITELIST) { 186 const wchar_t *line_p = line.c_str(); 187 const wchar_t *line_end = line_p + line.length(); 188 189 // Look for whitespace separating the URL from subsequent ignored tokens. 190 while (line_p < line_end && *line_p != '\t' && *line_p != ' ') 191 ++line_p; 192 193 base::string16 url16; 194 base::WideToUTF16(line.c_str(), line_p - line.c_str(), &url16); 195 GURL url = manifest_url.Resolve(url16); 196 if (!url.is_valid()) 197 continue; 198 if (url.has_ref()) { 199 GURL::Replacements replacements; 200 replacements.ClearRef(); 201 url = url.ReplaceComponents(replacements); 202 } 203 204 // Scheme component must be the same as the manifest URL's. 
205 if (url.scheme() != manifest_url.scheme()) { 206 continue; 207 } 208 209 // See http://code.google.com/p/chromium/issues/detail?id=69594 210 // We willfully violate the HTML5 spec at this point in order 211 // to support the appcaching of cross-origin HTTPS resources. 212 // Per the spec, EXPLICIT cross-origin HTTS resources should be 213 // ignored here. We've opted for a milder constraint and allow 214 // caching unless the resource has a "no-store" header. That 215 // condition is enforced in AppCacheUpdateJob. 216 217 if (mode == EXPLICIT) { 218 manifest.explicit_urls.insert(url.spec()); 219 } else { 220 bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end); 221 manifest.online_whitelist_namespaces.push_back( 222 Namespace(APPCACHE_NETWORK_NAMESPACE, url, GURL(), is_pattern)); 223 } 224 } else if (mode == INTERCEPT) { 225 if (parse_mode != PARSE_MANIFEST_ALLOWING_INTERCEPTS) { 226 manifest.did_ignore_intercept_namespaces = true; 227 continue; 228 } 229 230 // Lines of the form, 231 // <urlnamespace> <intercept_type> <targeturl> 232 const wchar_t* line_p = line.c_str(); 233 const wchar_t* line_end = line_p + line.length(); 234 235 // Look for first whitespace separating the url namespace from 236 // the intercept type. 237 while (line_p < line_end && *line_p != '\t' && *line_p != ' ') 238 ++line_p; 239 240 if (line_p == line_end) 241 continue; // There was no whitespace separating the URLs. 242 243 base::string16 namespace_url16; 244 base::WideToUTF16(line.c_str(), line_p - line.c_str(), &namespace_url16); 245 GURL namespace_url = manifest_url.Resolve(namespace_url16); 246 if (!namespace_url.is_valid()) 247 continue; 248 if (namespace_url.has_ref()) { 249 GURL::Replacements replacements; 250 replacements.ClearRef(); 251 namespace_url = namespace_url.ReplaceComponents(replacements); 252 } 253 254 // The namespace URL must have the same scheme, host and port 255 // as the manifest's URL. 
256 if (manifest_url.GetOrigin() != namespace_url.GetOrigin()) 257 continue; 258 259 // Skip whitespace separating namespace from the type. 260 while (line_p < line_end && (*line_p == '\t' || *line_p == ' ')) 261 ++line_p; 262 263 // Look for whitespace separating the type from the target url. 264 const wchar_t* type_start = line_p; 265 while (line_p < line_end && *line_p != '\t' && *line_p != ' ') 266 ++line_p; 267 268 // Look for a type value we understand, otherwise skip the line. 269 InterceptVerb verb = UNKNOWN_VERB; 270 std::wstring type(type_start, line_p - type_start); 271 if (type == L"return") { 272 verb = RETURN; 273 } else if (type == L"execute" && 274 CommandLine::ForCurrentProcess()->HasSwitch( 275 kEnableExecutableHandlers)) { 276 verb = EXECUTE; 277 } 278 if (verb == UNKNOWN_VERB) 279 continue; 280 281 // Skip whitespace separating type from the target_url. 282 while (line_p < line_end && (*line_p == '\t' || *line_p == ' ')) 283 ++line_p; 284 285 // Look for whitespace separating the URL from subsequent ignored tokens. 
286 const wchar_t* target_url_start = line_p; 287 while (line_p < line_end && *line_p != '\t' && *line_p != ' ') 288 ++line_p; 289 290 base::string16 target_url16; 291 base::WideToUTF16(target_url_start, line_p - target_url_start, 292 &target_url16); 293 GURL target_url = manifest_url.Resolve(target_url16); 294 if (!target_url.is_valid()) 295 continue; 296 297 if (target_url.has_ref()) { 298 GURL::Replacements replacements; 299 replacements.ClearRef(); 300 target_url = target_url.ReplaceComponents(replacements); 301 } 302 if (manifest_url.GetOrigin() != target_url.GetOrigin()) 303 continue; 304 305 bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end); 306 manifest.intercept_namespaces.push_back( 307 Namespace(APPCACHE_INTERCEPT_NAMESPACE, namespace_url, 308 target_url, is_pattern, verb == EXECUTE)); 309 } else if (mode == FALLBACK) { 310 const wchar_t* line_p = line.c_str(); 311 const wchar_t* line_end = line_p + line.length(); 312 313 // Look for whitespace separating the two URLs 314 while (line_p < line_end && *line_p != '\t' && *line_p != ' ') 315 ++line_p; 316 317 if (line_p == line_end) { 318 // There was no whitespace separating the URLs. 319 continue; 320 } 321 322 base::string16 namespace_url16; 323 base::WideToUTF16(line.c_str(), line_p - line.c_str(), &namespace_url16); 324 GURL namespace_url = manifest_url.Resolve(namespace_url16); 325 if (!namespace_url.is_valid()) 326 continue; 327 if (namespace_url.has_ref()) { 328 GURL::Replacements replacements; 329 replacements.ClearRef(); 330 namespace_url = namespace_url.ReplaceComponents(replacements); 331 } 332 333 // Fallback namespace URL must have the same scheme, host and port 334 // as the manifest's URL. 335 if (manifest_url.GetOrigin() != namespace_url.GetOrigin()) { 336 continue; 337 } 338 339 // Skip whitespace separating fallback namespace from URL. 
340 while (line_p < line_end && (*line_p == '\t' || *line_p == ' ')) 341 ++line_p; 342 343 // Look for whitespace separating the URL from subsequent ignored tokens. 344 const wchar_t* fallback_start = line_p; 345 while (line_p < line_end && *line_p != '\t' && *line_p != ' ') 346 ++line_p; 347 348 base::string16 fallback_url16; 349 base::WideToUTF16(fallback_start, line_p - fallback_start, 350 &fallback_url16); 351 GURL fallback_url = manifest_url.Resolve(fallback_url16); 352 if (!fallback_url.is_valid()) 353 continue; 354 if (fallback_url.has_ref()) { 355 GURL::Replacements replacements; 356 replacements.ClearRef(); 357 fallback_url = fallback_url.ReplaceComponents(replacements); 358 } 359 360 // Fallback entry URL must have the same scheme, host and port 361 // as the manifest's URL. 362 if (manifest_url.GetOrigin() != fallback_url.GetOrigin()) { 363 continue; 364 } 365 366 bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end); 367 368 // Store regardless of duplicate namespace URL. Only first match 369 // will ever be used. 370 manifest.fallback_namespaces.push_back( 371 Namespace(APPCACHE_FALLBACK_NAMESPACE, namespace_url, 372 fallback_url, is_pattern)); 373 } else { 374 NOTREACHED(); 375 } 376 } 377 378 return true; 379} 380 381} // namespace appcache 382