url_to_filename_encoder.h revision 3345a6884c488ff3a535c2c9acdd33d74b37e311
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// URL filename encoder goals:
6//
7// 1. Allow URLs with arbitrary path-segment length, generating filenames
8//    with a maximum of 128 characters.
// 2. Provide somewhat human-readable filenames, for easy debugging.
10// 3. Provide reverse-mapping from filenames back to URLs.
11// 4. Be able to distinguish http://x from http://x/ from http://x/index.html.
12//    Those can all be different URLs.
13// 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen
14//    with Facebook Connect.
15//
16// We need an escape-character for representing characters that are legal
17// in URL paths, but not in filenames, such as '?'.
18//
19// We can pick any legal character as an escape, as long as we escape it too.
20// But as we have a goal of having filenames that humans can correlate with
21// URLs, we should pick one that doesn't show up frequently in URLs. Candidates
22// are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are
23// shell escapes or that various build tools use.
24//
25// .#&%-=_+ occur frequently in URLs.
26// <>:"/\|?* are illegal in Windows
27//   See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
28// ~`!$^&(){}[]'; are special to Unix shells
29// In addition, build tools do not like ^@#%
30//
// Josh took a quick look at the frequency of some special characters in
// Sadeesh's slurped directory from Fall 09 and found the following occurrences:
33//
34//   ^   3               build tool doesn't like ^ in testdata filenames
35//   @   10              build tool doesn't like @ in testdata filenames
36//   .   1676            too frequent in URLs
37//   ,   76              THE WINNER
38//   #   0               build tool doesn't like it
39//   &   487             Prefer to avoid shell escapes
40//   %   374             g4 doesn't like it
41//   =   579             very frequent in URLs -- leave unmodified
42//   -   464             very frequent in URLs -- leave unmodified
43//   _   798             very frequent in URLs -- leave unmodified
44//
45//
46// The escaping algorithm is:
47//  1) Escape all unfriendly symbols as ,XX where XX is the hex code.
48//  2) Add a ',' at the end (We do not allow ',' at end of any directory name,
49//     so this assures that e.g. /a and /a/b can coexist in the filesystem).
50//  3) Go through the path segment by segment (where a segment is one directory
51//     or leaf in the path) and
52//     3a) If the segment is empty, escape the second slash. i.e. if it was
53//         www.foo.com//a then we escape the second / like www.foo.com/,2Fa,
//     3b) If it is "." or ".." prepend with ',' (so that we have a non-
//         empty and non-reserved filename).
//     3c) If it is over 128 characters, break it up into smaller segments by
//         inserting ,-/ (Windows limits paths to 128 chars, other OSes also
//         have limits that would restrict us)
59//
60// For example:
61//     URL               File
62//     /                 /,
63//     /index.html       /index.html,
64//     /.                /.,
65//     /a/b              /a/b,
66//     /a/b/             /a/b/,
67//     /a/b/c            /a/b/c,   Note: no prefix problem
68//     /u?foo=bar        /u,3Ffoo=bar,
69//     //                /,2F,
70//     /./               /,./,
71//     /../              /,../,
72//     /,                /,2C,
73//     /,./              /,2C./,
74//     /very...longname/ /very...long,-/name   If very...long is about 126 long.
75
76// NOTE: we avoid using some classes here (like FilePath and GURL) because we
77//       share this code with other projects externally.
78
79#ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
80#define NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
81#pragma once
82
83#include <string>
84
85#include "base/string_util.h"
86#include "net/tools/dump_cache/url_utilities.h"
87
88namespace net {
89
90// Helper class for converting a URL into a filename.
91class UrlToFilenameEncoder {
92 public:
93  // Given a |url| and a |base_path|, returns a filename which represents this
94  // |url|. |url| may include URL escaping such as %21 for !
95  // |legacy_escape| indicates that this function should use the old-style
96  // of encoding.
97  // TODO(mbelshe): delete the legacy_escape code.
98  static std::string Encode(const std::string& url, std::string base_path,
99                            bool legacy_escape) {
100    std::string filename;
101    if (!legacy_escape) {
102      std::string url_no_scheme = UrlUtilities::GetUrlHostPath(url);
103      EncodeSegment(base_path, url_no_scheme, '/', &filename);
104#ifdef WIN32
105      ReplaceAll(&filename, "/", "\\");
106#endif
107    } else {
108      std::string clean_url(url);
109      if (clean_url.length() && clean_url[clean_url.length()-1] == '/')
110        clean_url.append("index.html");
111
112      std::string host = UrlUtilities::GetUrlHost(clean_url);
113      filename.append(base_path);
114      filename.append(host);
115#ifdef WIN32
116      filename.append("\\");
117#else
118      filename.append("/");
119#endif
120
121      std::string url_filename = UrlUtilities::GetUrlPath(clean_url);
122      // Strip the leading '/'.
123      if (url_filename[0] == '/')
124        url_filename = url_filename.substr(1);
125
126      // Replace '/' with '\'.
127      ConvertToSlashes(&url_filename);
128
129      // Strip double back-slashes ("\\\\").
130      StripDoubleSlashes(&url_filename);
131
132      // Save path as filesystem-safe characters.
133      url_filename = LegacyEscape(url_filename);
134      filename.append(url_filename);
135
136#ifndef WIN32
137      // Last step - convert to native slashes.
138      const std::string slash("/");
139      const std::string backslash("\\");
140      ReplaceAll(&filename, backslash, slash);
141#endif
142    }
143
144    return filename;
145  }
146
147  // Rewrite HTML in a form that the SPDY in-memory server
148  // can read.
149  // |filename_prefix| is prepended without escaping.
150  // |escaped_ending| is the URL to be encoded into a filename. It may have URL
151  // escaped characters (like %21 for !).
152  // |dir_separator| is "/" on Unix, "\" on Windows.
153  // |encoded_filename| is the resultant filename.
154  static void EncodeSegment(
155      const std::string& filename_prefix,
156      const std::string& escaped_ending,
157      char dir_separator,
158      std::string* encoded_filename);
159
160  // Decodes a filename that was encoded with EncodeSegment,
161  // yielding back the original URL.
162  static bool Decode(const std::string& encoded_filename,
163                     char dir_separator,
164                     std::string* decoded_url);
165
166  static const char kEscapeChar;
167  static const char kTruncationChar;
168  static const size_t kMaximumSubdirectoryLength;
169
170  friend class UrlToFilenameEncoderTest;
171
172 private:
173  // Appends a segment of the path, special-casing "." and "..", and
174  // ensuring that the segment does not exceed the path length.  If it does,
175  // it chops the end off the segment, writes the segment with a separator of
176  // ",-/", and then rewrites segment to contain just the truncated piece so
177  // it can be used in the next iteration.
178  // |segment| is a read/write parameter containing segment to write
179  // Note: this should not be called with empty segment.
180  static void AppendSegment(std::string* segment, std::string* dest);
181
182  // Allow reading of old slurped files.
183  static std::string LegacyEscape(const std::string& path);
184
185  // Replace all instances of |from| within |str| as |to|.
186  static void ReplaceAll(std::string* str, const std::string& from,
187                         const std::string& to) {
188    std::string::size_type pos(0);
189    while ((pos = str->find(from, pos)) != std::string::npos) {
190      str->replace(pos, from.size(), to);
191      pos += from.size();
192    }
193  }
194
195  // Replace all instances of "/" with "\" in |path|.
196  static void ConvertToSlashes(std::string* path) {
197    const std::string slash("/");
198    const std::string backslash("\\");
199    ReplaceAll(path, slash, backslash);
200  }
201
202  // Replace all instances of "\\" with "%5C%5C" in |path|.
203  static void StripDoubleSlashes(std::string* path) {
204    const std::string doubleslash("\\\\");
205    const std::string escaped_doubleslash("%5C%5C");
206    ReplaceAll(path, doubleslash, escaped_doubleslash);
207  }
208};
209
210}  // namespace net
211
212#endif  // NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
213