url_to_filename_encoder.h revision c407dc5cd9bdc5668497f21b26b09d988ab439de
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// URL filename encoder goals:
6//
7// 1. Allow URLs with arbitrary path-segment length, generating filenames
8//    with a maximum of 128 characters.
// 2. Provide somewhat human-readable filenames, for easy debugging flow.
10// 3. Provide reverse-mapping from filenames back to URLs.
11// 4. Be able to distinguish http://x from http://x/ from http://x/index.html.
12//    Those can all be different URLs.
13// 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen
14//    with Facebook Connect.
15//
16// We need an escape-character for representing characters that are legal
17// in URL paths, but not in filenames, such as '?'.  Illegal characters
18// in Windows are <>:"/\|?*.  For reference, see
19//   http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
20//
21// We can pick any legal character as an escape, as long as we escape it too.
22// But as we have a goal of having filenames that humans can correlate with
23// URLs, we should pick one that doesn't show up frequently in URLs. Candidates
24// are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are
25// shell escapes, and characters that occur frequently in URLs.
26//
27// .#&%-=_+ occur frequently in URLs.
28// ~`!$^&(){}[] are special to Unix shells
29//
// @ might seem like a reasonable option, but some build tools don't appreciate
// filenames with @ in testdata.  Perforce does not appreciate # in a filename.
32//
33// Though a web-site http://www.vias.org/linux-knowhow/lnag_05_05_09.html
34// identifies ^ as a special shell character, it did not appear to be an
35// issue to use it unquoted as a filename in bash or tcsh.
36//
// Here are some frequencies of some special characters in a data set from Fall
// '09.  We find only 3 occurrences of "x5E" (^ is ascii 0x5E):
39//   ^   3               build tools don't like ^ in testdata filenames
40//   @   10              build tools don't like @ in testdata filenames
41//   .   1676            too frequent in URLs
42//   ,   76              THE WINNER
//   #   0               build tools don't like it
44//   &   487             Prefer to avoid shell escapes
45//   %   374             g4 doesn't like it
46//   =   579             very frequent in URLs -- leave unmodified
47//   -   464             very frequent in URLs -- leave unmodified
48//   _   798             very frequent in URLs -- leave unmodified
49//
// It is interesting that there were no slurped URLs with #, but I suspect this
// might be due to the slurping methodology.  So let's stick with the relatively
// rare ','.
53//
54// Here's the escaping methodology:
55//
56//     URL               File
57//     /                 /,
58//     /.                /.,
59//     //                /,/,
60//     /./               /,./,
61//     /../              /,../,
62//     /,                /,2C,
63//     /,/               /,2C/,
64//     /a/b              /a/b,     (, at the end of a name indicates a leaf).
65//     /a/b/             /a/b/,
66//
67// path segments greater than 128 characters (after escape expansion) are
68// suffixed with ,- so we can know that the next "/" is not part of the URL:
69//
70//    /verylongname/    /verylong,-/name
71
72// NOTE: we avoid using some classes here (like FilePath and GURL) because we
73//       share this code with other projects externally.
74
75#ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H_
76#define NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H_
77
78#include <string>
79
80#include "base/file_path.h"
81#include "base/file_util.h"
82#include "base/string_util.h"
83#include "net/tools/dump_cache/url_utilities.h"
84
85namespace net {
86
87// Helper class for converting a URL into a filename.
88class UrlToFilenameEncoder {
89 public:
90  // Given a |url| and a |base_path|, returns a string which represents this
91  // |url|.
92  // |legacy_escape| indicates that this function should use the old-style
93  // of encoding.
94  // TODO(mbelshe): delete the legacy_escape code.
95  static std::string Encode(const std::string& url, std::string base_path,
96                            bool legacy_escape) {
97    std::string clean_url(url);
98    if (clean_url.length() && clean_url[clean_url.length()-1] == '/')
99      clean_url.append("index.html");
100
101    std::string host = UrlUtilities::GetUrlHost(clean_url);
102    std::string filename(base_path);
103    filename.append("\\");
104    filename = filename.append(host);
105    filename.append("\\");
106
107    std::string url_filename = UrlUtilities::GetUrlPath(clean_url);
108    // Strip the leading '/'
109    if (url_filename[0] == '/')
110      url_filename = url_filename.substr(1);
111
112    // replace '/' with '\'
113    ConvertToSlashes(&url_filename);
114
115    // strip double slashes ("\\")
116    StripDoubleSlashes(&url_filename);
117
118    // Save path as filesystem-safe characters
119    if (legacy_escape) {
120      url_filename = LegacyEscape(url_filename);
121    } else {
122      url_filename = Escape(url_filename);
123    }
124    filename = filename.append(url_filename);
125
126#ifndef WIN32
127    // Last step - convert to native slashes!
128    const std::string slash("/");
129    const std::string backslash("\\");
130    ReplaceAll(&filename, backslash, slash);
131#endif
132
133    return filename;
134  }
135
136  // Rewrite HTML in a form that the SPDY in-memory server
137  // can read.
138  // |filename_prefix| is prepended without escaping.
139  // |filename_ending| is the URL to be encoded into a filename.
140  // |dir_separator| is "/" on Unix, "\" on Windows.
141  // |encoded_filename| is the resultant filename.
142  static void EncodeSegment(
143      const std::string& filename_prefix,
144      const std::string& filename_ending,
145      char dir_separator,
146      std::string* encoded_filename);
147
148  // Decodes a filename that was encoded with EncodeSegment,
149  // yielding back the original URL.
150  static bool Decode(const std::string& encoded_filename,
151                     char dir_separator,
152                     std::string* decoded_url);
153
154 private:
155  // Appends a segment of the path, special-casing ".", "..", and "", and
156  // ensuring that the segment does not exceed the path length.  If it does,
157  // it chops the end off the segment, writes the segment with a separator of
158  // ",-/", and then rewrites segment to contain just the truncated piece so
159  // it can be used in the next iteration.
160  // |dir_separator| is "/" on Unix, "\" on Windows.
161  // |segment| is a read/write parameter containing segment to write
162  static void AppendSegment(
163      char dir_separator,
164      std::string* segment,
165      std::string* dest);
166
167  // Escapes the given input |path| and chop any individual components
168  // of the path which are greater than kMaximumSubdirectoryLength characters
169  // into two chunks.
170  static std::string Escape(const std::string& path) {
171    std::string output;
172    EncodeSegment("", path, '\\', &output);
173    return output;
174  }
175
176  // Allow reading of old slurped files.
177  static std::string LegacyEscape(const std::string& path);
178
179  // Replace all instances of |from| within |str| as |to|.
180  static void ReplaceAll(std::string* str, const std::string& from,
181                  const std::string& to) {
182    std::string::size_type pos(0);
183    while ((pos = str->find(from, pos)) != std::string::npos) {
184      str->replace(pos, from.size(), to);
185      pos += from.size();
186    }
187  }
188
189  // Replace all instances of "/" with "\" in |path|.
190  static void ConvertToSlashes(std::string* path) {
191    const std::string slash("/");
192    const std::string backslash("\\");
193    ReplaceAll(path, slash, backslash);
194  }
195
196  // Replace all instances of "\\" with "%5C%5C" in |path|.
197  static void StripDoubleSlashes(std::string* path) {
198    const std::string doubleslash("\\\\");
199    const std::string escaped_doubleslash("%5C%5C");
200    ReplaceAll(path, doubleslash, escaped_doubleslash);
201  }
202};
203
204}  // namespace net
205
206#endif  // NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H_
207
208