1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
5// This is a port of ManifestParser.cc from WebKit/WebCore/loader/appcache.
6
7/*
8 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
27 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "webkit/browser/appcache/manifest_parser.h"
33
34#include "base/command_line.h"
35#include "base/i18n/icu_string_conversions.h"
36#include "base/logging.h"
37#include "base/strings/utf_string_conversions.h"
38#include "url/gurl.h"
39
40namespace appcache {
41
42namespace {
43
// Reports whether the text in [line_p, line_end), once any leading spaces
// and tabs are skipped, is exactly the 'isPattern' annotation.
bool HasPatternMatchingAnnotation(const wchar_t* line_p,
                                  const wchar_t* line_end) {
  // Step over the blanks that sit between the resource url and the
  // annotation. Callers hand in a line whose trailing whitespace is already
  // gone, so whatever follows the blanks runs up to line_end.
  const wchar_t* cursor = line_p;
  for (; cursor < line_end; ++cursor) {
    if (*cursor != ' ' && *cursor != '\t')
      break;
  }

  // Nothing but blanks (or nothing at all) means there is no annotation.
  if (cursor == line_end)
    return false;

  const std::wstring expected(L"isPattern");
  const std::wstring remainder(cursor, line_end);
  return remainder == expected;
}
56
57}
58
// Identifies which manifest section the parser is currently inside.
enum Mode {
  EXPLICIT = 0,          // "CACHE:" section; also the mode parsing starts in.
  INTERCEPT = 1,         // "CHROMIUM-INTERCEPT:" section.
  FALLBACK = 2,          // "FALLBACK:" section.
  ONLINE_WHITELIST = 3,  // "NETWORK:" section.
  UNKNOWN_MODE = 4       // Any other section header; its lines are skipped.
};
66
// The action named by the middle token of an intercept entry.
enum InterceptVerb {
  RETURN = 0,       // The token was "return".
  EXECUTE = 1,      // The token was "execute" (and it is enabled).
  UNKNOWN_VERB = 2  // Anything else; the entry is skipped.
};
72
73Manifest::Manifest()
74    : online_whitelist_all(false),
75      did_ignore_intercept_namespaces(false) {
76}
77
78Manifest::~Manifest() {}
79
80bool ParseManifest(const GURL& manifest_url, const char* data, int length,
81                   ParseMode parse_mode, Manifest& manifest) {
82  // This is an implementation of the parsing algorithm specified in
83  // the HTML5 offline web application docs:
84  //   http://www.w3.org/TR/html5/offline.html
85  // Do not modify it without consulting those docs.
86  // Though you might be tempted to convert these wstrings to UTF-8 or
87  // base::string16, this implementation seems simpler given the constraints.
88
89  const wchar_t kSignature[] = L"CACHE MANIFEST";
90  const size_t kSignatureLength = arraysize(kSignature) - 1;
91  const wchar_t kChromiumSignature[] = L"CHROMIUM CACHE MANIFEST";
92  const size_t kChromiumSignatureLength = arraysize(kChromiumSignature) - 1;
93
94  DCHECK(manifest.explicit_urls.empty());
95  DCHECK(manifest.fallback_namespaces.empty());
96  DCHECK(manifest.online_whitelist_namespaces.empty());
97  DCHECK(!manifest.online_whitelist_all);
98  DCHECK(!manifest.did_ignore_intercept_namespaces);
99
100  Mode mode = EXPLICIT;
101
102  std::wstring data_string;
103  // TODO(jennb): cannot do UTF8ToWide(data, length, &data_string);
104  // until UTF8ToWide uses 0xFFFD Unicode replacement character.
105  base::CodepageToWide(std::string(data, length), base::kCodepageUTF8,
106                       base::OnStringConversionError::SUBSTITUTE, &data_string);
107  const wchar_t* p = data_string.c_str();
108  const wchar_t* end = p + data_string.length();
109
110  // Look for the magic signature: "^\xFEFF?CACHE MANIFEST[ \t]?"
111  // Example: "CACHE MANIFEST #comment" is a valid signature.
112  // Example: "CACHE MANIFEST;V2" is not.
113
114  // When the input data starts with a UTF-8 Byte-Order-Mark
115  // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a
116  // Unicode BOM (U+FEFF). Skip a converted Unicode BOM if it exists.
117  int bom_offset = 0;
118  if (!data_string.empty() && data_string[0] == 0xFEFF) {
119    bom_offset = 1;
120    ++p;
121  }
122
123  if (p >= end)
124    return false;
125
126  // Check for a supported signature and skip p past it.
127  if (0 == data_string.compare(bom_offset, kSignatureLength,
128                               kSignature)) {
129    p += kSignatureLength;
130  } else if (0 == data_string.compare(bom_offset, kChromiumSignatureLength,
131                                      kChromiumSignature)) {
132    p += kChromiumSignatureLength;
133  } else {
134    return false;
135  }
136
137  // Character after "CACHE MANIFEST" must be whitespace.
138  if (p < end && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r')
139    return false;
140
141  // Skip to the end of the line.
142  while (p < end && *p != '\r' && *p != '\n')
143    ++p;
144
145  while (1) {
146    // Skip whitespace
147    while (p < end && (*p == '\n' || *p == '\r' || *p == ' ' || *p == '\t'))
148      ++p;
149
150    if (p == end)
151      break;
152
153    const wchar_t* line_start = p;
154
155    // Find the end of the line
156    while (p < end && *p != '\r' && *p != '\n')
157      ++p;
158
159    // Check if we have a comment
160    if (*line_start == '#')
161      continue;
162
163    // Get rid of trailing whitespace
164    const wchar_t* tmp = p - 1;
165    while (tmp > line_start && (*tmp == ' ' || *tmp == '\t'))
166      --tmp;
167
168    std::wstring line(line_start, tmp - line_start + 1);
169
170    if (line == L"CACHE:") {
171      mode = EXPLICIT;
172    } else if (line == L"FALLBACK:") {
173      mode = FALLBACK;
174    } else if (line == L"NETWORK:") {
175      mode = ONLINE_WHITELIST;
176    } else if (line == L"CHROMIUM-INTERCEPT:") {
177      mode = INTERCEPT;
178    } else if (*(line.end() - 1) == ':') {
179      mode = UNKNOWN_MODE;
180    } else if (mode == UNKNOWN_MODE) {
181      continue;
182    } else if (line == L"*" && mode == ONLINE_WHITELIST) {
183      manifest.online_whitelist_all = true;
184      continue;
185    } else if (mode == EXPLICIT || mode == ONLINE_WHITELIST) {
186      const wchar_t *line_p = line.c_str();
187      const wchar_t *line_end = line_p + line.length();
188
189      // Look for whitespace separating the URL from subsequent ignored tokens.
190      while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
191        ++line_p;
192
193      base::string16 url16;
194      base::WideToUTF16(line.c_str(), line_p - line.c_str(), &url16);
195      GURL url = manifest_url.Resolve(url16);
196      if (!url.is_valid())
197        continue;
198      if (url.has_ref()) {
199        GURL::Replacements replacements;
200        replacements.ClearRef();
201        url = url.ReplaceComponents(replacements);
202      }
203
204      // Scheme component must be the same as the manifest URL's.
205      if (url.scheme() != manifest_url.scheme()) {
206        continue;
207      }
208
209      // See http://code.google.com/p/chromium/issues/detail?id=69594
210      // We willfully violate the HTML5 spec at this point in order
211      // to support the appcaching of cross-origin HTTPS resources.
212      // Per the spec, EXPLICIT cross-origin HTTS resources should be
213      // ignored here. We've opted for a milder constraint and allow
214      // caching unless the resource has a "no-store" header. That
215      // condition is enforced in AppCacheUpdateJob.
216
217      if (mode == EXPLICIT) {
218        manifest.explicit_urls.insert(url.spec());
219      } else {
220        bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end);
221        manifest.online_whitelist_namespaces.push_back(
222            Namespace(APPCACHE_NETWORK_NAMESPACE, url, GURL(), is_pattern));
223      }
224    } else if (mode == INTERCEPT) {
225      if (parse_mode != PARSE_MANIFEST_ALLOWING_INTERCEPTS) {
226        manifest.did_ignore_intercept_namespaces = true;
227        continue;
228      }
229
230      // Lines of the form,
231      // <urlnamespace> <intercept_type> <targeturl>
232      const wchar_t* line_p = line.c_str();
233      const wchar_t* line_end = line_p + line.length();
234
235      // Look for first whitespace separating the url namespace from
236      // the intercept type.
237      while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
238        ++line_p;
239
240      if (line_p == line_end)
241        continue;  // There was no whitespace separating the URLs.
242
243      base::string16 namespace_url16;
244      base::WideToUTF16(line.c_str(), line_p - line.c_str(), &namespace_url16);
245      GURL namespace_url = manifest_url.Resolve(namespace_url16);
246      if (!namespace_url.is_valid())
247        continue;
248      if (namespace_url.has_ref()) {
249        GURL::Replacements replacements;
250        replacements.ClearRef();
251        namespace_url = namespace_url.ReplaceComponents(replacements);
252      }
253
254      // The namespace URL must have the same scheme, host and port
255      // as the manifest's URL.
256      if (manifest_url.GetOrigin() != namespace_url.GetOrigin())
257        continue;
258
259      // Skip whitespace separating namespace from the type.
260      while (line_p < line_end && (*line_p == '\t' || *line_p == ' '))
261        ++line_p;
262
263      // Look for whitespace separating the type from the target url.
264      const wchar_t* type_start = line_p;
265      while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
266        ++line_p;
267
268      // Look for a type value we understand, otherwise skip the line.
269      InterceptVerb verb = UNKNOWN_VERB;
270      std::wstring type(type_start, line_p - type_start);
271      if (type == L"return") {
272        verb = RETURN;
273      } else if (type == L"execute" &&
274                 CommandLine::ForCurrentProcess()->HasSwitch(
275                    kEnableExecutableHandlers)) {
276        verb = EXECUTE;
277      }
278      if (verb == UNKNOWN_VERB)
279        continue;
280
281      // Skip whitespace separating type from the target_url.
282      while (line_p < line_end && (*line_p == '\t' || *line_p == ' '))
283        ++line_p;
284
285      // Look for whitespace separating the URL from subsequent ignored tokens.
286      const wchar_t* target_url_start = line_p;
287      while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
288        ++line_p;
289
290      base::string16 target_url16;
291      base::WideToUTF16(target_url_start, line_p - target_url_start,
292                        &target_url16);
293      GURL target_url = manifest_url.Resolve(target_url16);
294      if (!target_url.is_valid())
295        continue;
296
297      if (target_url.has_ref()) {
298        GURL::Replacements replacements;
299        replacements.ClearRef();
300        target_url = target_url.ReplaceComponents(replacements);
301      }
302      if (manifest_url.GetOrigin() != target_url.GetOrigin())
303        continue;
304
305      bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end);
306      manifest.intercept_namespaces.push_back(
307          Namespace(APPCACHE_INTERCEPT_NAMESPACE, namespace_url,
308                    target_url, is_pattern, verb == EXECUTE));
309    } else if (mode == FALLBACK) {
310      const wchar_t* line_p = line.c_str();
311      const wchar_t* line_end = line_p + line.length();
312
313      // Look for whitespace separating the two URLs
314      while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
315        ++line_p;
316
317      if (line_p == line_end) {
318        // There was no whitespace separating the URLs.
319        continue;
320      }
321
322      base::string16 namespace_url16;
323      base::WideToUTF16(line.c_str(), line_p - line.c_str(), &namespace_url16);
324      GURL namespace_url = manifest_url.Resolve(namespace_url16);
325      if (!namespace_url.is_valid())
326        continue;
327      if (namespace_url.has_ref()) {
328        GURL::Replacements replacements;
329        replacements.ClearRef();
330        namespace_url = namespace_url.ReplaceComponents(replacements);
331      }
332
333      // Fallback namespace URL must have the same scheme, host and port
334      // as the manifest's URL.
335      if (manifest_url.GetOrigin() != namespace_url.GetOrigin()) {
336        continue;
337      }
338
339      // Skip whitespace separating fallback namespace from URL.
340      while (line_p < line_end && (*line_p == '\t' || *line_p == ' '))
341        ++line_p;
342
343      // Look for whitespace separating the URL from subsequent ignored tokens.
344      const wchar_t* fallback_start = line_p;
345      while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
346        ++line_p;
347
348      base::string16 fallback_url16;
349      base::WideToUTF16(fallback_start, line_p - fallback_start,
350                        &fallback_url16);
351      GURL fallback_url = manifest_url.Resolve(fallback_url16);
352      if (!fallback_url.is_valid())
353        continue;
354      if (fallback_url.has_ref()) {
355        GURL::Replacements replacements;
356        replacements.ClearRef();
357        fallback_url = fallback_url.ReplaceComponents(replacements);
358      }
359
360      // Fallback entry URL must have the same scheme, host and port
361      // as the manifest's URL.
362      if (manifest_url.GetOrigin() != fallback_url.GetOrigin()) {
363        continue;
364      }
365
366      bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end);
367
368      // Store regardless of duplicate namespace URL. Only first match
369      // will ever be used.
370      manifest.fallback_namespaces.push_back(
371          Namespace(APPCACHE_FALLBACK_NAMESPACE, namespace_url,
372                    fallback_url, is_pattern));
373    } else {
374      NOTREACHED();
375    }
376  }
377
378  return true;
379}
380
381}  // namespace appcache
382