1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
5// This is a port of ManifestParser.cc from WebKit/WebCore/loader/appcache.
6
7/*
8 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
27 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "webkit/browser/appcache/manifest_parser.h"
33
34#include "base/command_line.h"
35#include "base/i18n/icu_string_conversions.h"
36#include "base/logging.h"
37#include "base/strings/utf_string_conversions.h"
38#include "url/gurl.h"
39
40namespace appcache {
41
namespace {

// Reports whether the text that follows a url on a manifest line is exactly
// the 'isPattern' annotation.
//
// |line_p| points just past the url token and |line_end| one past the last
// character of the line. Spaces and tabs in front of the annotation are
// ignored. Trailing whitespace is expected to have been trimmed by the caller,
// so whatever remains has to match the annotation in full.
bool HasPatternMatchingAnnotation(const wchar_t* line_p,
                                  const wchar_t* line_end) {
  const wchar_t* cursor = line_p;
  for (; cursor < line_end; ++cursor) {
    const bool is_separator = (*cursor == ' ' || *cursor == '\t');
    if (!is_separator)
      break;
  }

  // Nothing but whitespace (or nothing at all) followed the url.
  if (cursor == line_end)
    return false;

  const std::wstring remainder(cursor, line_end - cursor);
  return remainder.compare(L"isPattern") == 0;
}

}  // namespace
58
// Section of the manifest that the parser is currently reading. Entry lines
// are interpreted according to the most recent section header seen.
enum Mode {
  EXPLICIT = 0,          // "CACHE:" section, also the mode parsing starts in.
  INTERCEPT = 1,         // "CHROMIUM-INTERCEPT:" section.
  FALLBACK = 2,          // "FALLBACK:" section.
  ONLINE_WHITELIST = 3,  // "NETWORK:" section.
  UNKNOWN_MODE = 4       // Any other line ending in ':'; entries are skipped.
};
66
// Second token of a line in the intercept section, naming what should happen
// for urls in the intercepted namespace.
enum InterceptVerb {
  RETURN = 0,       // "return"
  EXECUTE = 1,      // "execute"
  UNKNOWN_VERB = 2  // Anything else; the line is skipped.
};
72
73Manifest::Manifest() : online_whitelist_all(false) {}
74
75Manifest::~Manifest() {}
76
77bool ParseManifest(const GURL& manifest_url, const char* data, int length,
78                   Manifest& manifest) {
79  // This is an implementation of the parsing algorithm specified in
80  // the HTML5 offline web application docs:
81  //   http://www.w3.org/TR/html5/offline.html
82  // Do not modify it without consulting those docs.
83  // Though you might be tempted to convert these wstrings to UTF-8 or
84  // base::string16, this implementation seems simpler given the constraints.
85
86  const wchar_t kSignature[] = L"CACHE MANIFEST";
87  const size_t kSignatureLength = arraysize(kSignature) - 1;
88  const wchar_t kChromiumSignature[] = L"CHROMIUM CACHE MANIFEST";
89  const size_t kChromiumSignatureLength = arraysize(kChromiumSignature) - 1;
90
91  DCHECK(manifest.explicit_urls.empty());
92  DCHECK(manifest.fallback_namespaces.empty());
93  DCHECK(manifest.online_whitelist_namespaces.empty());
94  DCHECK(!manifest.online_whitelist_all);
95
96  Mode mode = EXPLICIT;
97
98  std::wstring data_string;
99  // TODO(jennb): cannot do UTF8ToWide(data, length, &data_string);
100  // until UTF8ToWide uses 0xFFFD Unicode replacement character.
101  base::CodepageToWide(std::string(data, length), base::kCodepageUTF8,
102                       base::OnStringConversionError::SUBSTITUTE, &data_string);
103  const wchar_t* p = data_string.c_str();
104  const wchar_t* end = p + data_string.length();
105
106  // Look for the magic signature: "^\xFEFF?CACHE MANIFEST[ \t]?"
107  // Example: "CACHE MANIFEST #comment" is a valid signature.
108  // Example: "CACHE MANIFEST;V2" is not.
109
110  // When the input data starts with a UTF-8 Byte-Order-Mark
111  // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a
112  // Unicode BOM (U+FEFF). Skip a converted Unicode BOM if it exists.
113  int bom_offset = 0;
114  if (!data_string.empty() && data_string[0] == 0xFEFF) {
115    bom_offset = 1;
116    ++p;
117  }
118
119  if (p >= end)
120    return false;
121
122  // Check for a supported signature and skip p past it.
123  if (0 == data_string.compare(bom_offset, kSignatureLength,
124                               kSignature)) {
125    p += kSignatureLength;
126  } else if (0 == data_string.compare(bom_offset, kChromiumSignatureLength,
127                                      kChromiumSignature)) {
128    p += kChromiumSignatureLength;
129  } else {
130    return false;
131  }
132
133  // Character after "CACHE MANIFEST" must be whitespace.
134  if (p < end && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r')
135    return false;
136
137  // Skip to the end of the line.
138  while (p < end && *p != '\r' && *p != '\n')
139    ++p;
140
141  while (1) {
142    // Skip whitespace
143    while (p < end && (*p == '\n' || *p == '\r' || *p == ' ' || *p == '\t'))
144      ++p;
145
146    if (p == end)
147      break;
148
149    const wchar_t* line_start = p;
150
151    // Find the end of the line
152    while (p < end && *p != '\r' && *p != '\n')
153      ++p;
154
155    // Check if we have a comment
156    if (*line_start == '#')
157      continue;
158
159    // Get rid of trailing whitespace
160    const wchar_t* tmp = p - 1;
161    while (tmp > line_start && (*tmp == ' ' || *tmp == '\t'))
162      --tmp;
163
164    std::wstring line(line_start, tmp - line_start + 1);
165
166    if (line == L"CACHE:") {
167      mode = EXPLICIT;
168    } else if (line == L"FALLBACK:") {
169      mode = FALLBACK;
170    } else if (line == L"NETWORK:") {
171      mode = ONLINE_WHITELIST;
172    } else if (line == L"CHROMIUM-INTERCEPT:") {
173      mode = INTERCEPT;
174    } else if (*(line.end() - 1) == ':') {
175      mode = UNKNOWN_MODE;
176    } else if (mode == UNKNOWN_MODE) {
177      continue;
178    } else if (line == L"*" && mode == ONLINE_WHITELIST) {
179      manifest.online_whitelist_all = true;
180      continue;
181    } else if (mode == EXPLICIT || mode == ONLINE_WHITELIST) {
182      const wchar_t *line_p = line.c_str();
183      const wchar_t *line_end = line_p + line.length();
184
185      // Look for whitespace separating the URL from subsequent ignored tokens.
186      while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
187        ++line_p;
188
189      base::string16 url16;
190      WideToUTF16(line.c_str(), line_p - line.c_str(), &url16);
191      GURL url = manifest_url.Resolve(url16);
192      if (!url.is_valid())
193        continue;
194      if (url.has_ref()) {
195        GURL::Replacements replacements;
196        replacements.ClearRef();
197        url = url.ReplaceComponents(replacements);
198      }
199
200      // Scheme component must be the same as the manifest URL's.
201      if (url.scheme() != manifest_url.scheme()) {
202        continue;
203      }
204
205      // See http://code.google.com/p/chromium/issues/detail?id=69594
206      // We willfully violate the HTML5 spec at this point in order
207      // to support the appcaching of cross-origin HTTPS resources.
208      // Per the spec, EXPLICIT cross-origin HTTS resources should be
209      // ignored here. We've opted for a milder constraint and allow
210      // caching unless the resource has a "no-store" header. That
211      // condition is enforced in AppCacheUpdateJob.
212
213      if (mode == EXPLICIT) {
214        manifest.explicit_urls.insert(url.spec());
215      } else {
216        bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end);
217        manifest.online_whitelist_namespaces.push_back(
218            Namespace(NETWORK_NAMESPACE, url, GURL(), is_pattern));
219      }
220    } else if (mode == INTERCEPT) {
221      // Lines of the form,
222      // <urlnamespace> <intercept_type> <targeturl>
223      const wchar_t* line_p = line.c_str();
224      const wchar_t* line_end = line_p + line.length();
225
226      // Look for first whitespace separating the url namespace from
227      // the intercept type.
228      while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
229        ++line_p;
230
231      if (line_p == line_end)
232        continue;  // There was no whitespace separating the URLs.
233
234      base::string16 namespace_url16;
235      WideToUTF16(line.c_str(), line_p - line.c_str(), &namespace_url16);
236      GURL namespace_url = manifest_url.Resolve(namespace_url16);
237      if (!namespace_url.is_valid())
238        continue;
239      if (namespace_url.has_ref()) {
240        GURL::Replacements replacements;
241        replacements.ClearRef();
242        namespace_url = namespace_url.ReplaceComponents(replacements);
243      }
244
245      // The namespace URL must have the same scheme, host and port
246      // as the manifest's URL.
247      if (manifest_url.GetOrigin() != namespace_url.GetOrigin())
248        continue;
249
250      // Skip whitespace separating namespace from the type.
251      while (line_p < line_end && (*line_p == '\t' || *line_p == ' '))
252        ++line_p;
253
254      // Look for whitespace separating the type from the target url.
255      const wchar_t* type_start = line_p;
256      while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
257        ++line_p;
258
259      // Look for a type value we understand, otherwise skip the line.
260      InterceptVerb verb = UNKNOWN_VERB;
261      std::wstring type(type_start, line_p - type_start);
262      if (type == L"return") {
263        verb = RETURN;
264      } else if (type == L"execute" &&
265                 CommandLine::ForCurrentProcess()->HasSwitch(
266                    kEnableExecutableHandlers)) {
267        verb = EXECUTE;
268      }
269      if (verb == UNKNOWN_VERB)
270        continue;
271
272      // Skip whitespace separating type from the target_url.
273      while (line_p < line_end && (*line_p == '\t' || *line_p == ' '))
274        ++line_p;
275
276      // Look for whitespace separating the URL from subsequent ignored tokens.
277      const wchar_t* target_url_start = line_p;
278      while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
279        ++line_p;
280
281      base::string16 target_url16;
282      WideToUTF16(target_url_start, line_p - target_url_start, &target_url16);
283      GURL target_url = manifest_url.Resolve(target_url16);
284      if (!target_url.is_valid())
285        continue;
286
287      if (target_url.has_ref()) {
288        GURL::Replacements replacements;
289        replacements.ClearRef();
290        target_url = target_url.ReplaceComponents(replacements);
291      }
292      if (manifest_url.GetOrigin() != target_url.GetOrigin())
293        continue;
294
295      bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end);
296      manifest.intercept_namespaces.push_back(
297          Namespace(INTERCEPT_NAMESPACE, namespace_url,
298                    target_url, is_pattern, verb == EXECUTE));
299    } else if (mode == FALLBACK) {
300      const wchar_t* line_p = line.c_str();
301      const wchar_t* line_end = line_p + line.length();
302
303      // Look for whitespace separating the two URLs
304      while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
305        ++line_p;
306
307      if (line_p == line_end) {
308        // There was no whitespace separating the URLs.
309        continue;
310      }
311
312      base::string16 namespace_url16;
313      WideToUTF16(line.c_str(), line_p - line.c_str(), &namespace_url16);
314      GURL namespace_url = manifest_url.Resolve(namespace_url16);
315      if (!namespace_url.is_valid())
316        continue;
317      if (namespace_url.has_ref()) {
318        GURL::Replacements replacements;
319        replacements.ClearRef();
320        namespace_url = namespace_url.ReplaceComponents(replacements);
321      }
322
323      // Fallback namespace URL must have the same scheme, host and port
324      // as the manifest's URL.
325      if (manifest_url.GetOrigin() != namespace_url.GetOrigin()) {
326        continue;
327      }
328
329      // Skip whitespace separating fallback namespace from URL.
330      while (line_p < line_end && (*line_p == '\t' || *line_p == ' '))
331        ++line_p;
332
333      // Look for whitespace separating the URL from subsequent ignored tokens.
334      const wchar_t* fallback_start = line_p;
335      while (line_p < line_end && *line_p != '\t' && *line_p != ' ')
336        ++line_p;
337
338      base::string16 fallback_url16;
339      WideToUTF16(fallback_start, line_p - fallback_start, &fallback_url16);
340      GURL fallback_url = manifest_url.Resolve(fallback_url16);
341      if (!fallback_url.is_valid())
342        continue;
343      if (fallback_url.has_ref()) {
344        GURL::Replacements replacements;
345        replacements.ClearRef();
346        fallback_url = fallback_url.ReplaceComponents(replacements);
347      }
348
349      // Fallback entry URL must have the same scheme, host and port
350      // as the manifest's URL.
351      if (manifest_url.GetOrigin() != fallback_url.GetOrigin()) {
352        continue;
353      }
354
355      bool is_pattern = HasPatternMatchingAnnotation(line_p, line_end);
356
357      // Store regardless of duplicate namespace URL. Only first match
358      // will ever be used.
359      manifest.fallback_namespaces.push_back(
360          Namespace(FALLBACK_NAMESPACE, namespace_url,
361                    fallback_url, is_pattern));
362    } else {
363      NOTREACHED();
364    }
365  }
366
367  return true;
368}
369
370}  // namespace appcache
371