url_to_filename_encoder.h revision c407dc5cd9bdc5668497f21b26b09d988ab439de
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// URL filename encoder goals:
//
// 1. Allow URLs with arbitrary path-segment length, generating filenames
//    with a maximum of 128 characters.
// 2. Provide somewhat human-readable filenames, for easy debugging flow.
// 3. Provide reverse-mapping from filenames back to URLs.
// 4. Be able to distinguish http://x from http://x/ from http://x/index.html.
//    Those can all be different URLs.
// 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen
//    with Facebook Connect.
//
// We need an escape-character for representing characters that are legal
// in URL paths, but not in filenames, such as '?'.  Illegal characters
// in Windows are <>:"/\|?*.  For reference, see
//   http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
//
// We can pick any legal character as an escape, as long as we escape it too.
// But as we have a goal of having filenames that humans can correlate with
// URLs, we should pick one that doesn't show up frequently in URLs. Candidates
// are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are
// shell escapes, and characters that occur frequently in URLs.
//
// .#&%-=_+ occur frequently in URLs.
// ~`!$^&(){}[] are special to Unix shells
//
// @ might seem like a reasonable option, but some build tools don't appreciate
// filenames with @ in testdata.  Perforce does not appreciate # in a filename.
//
// Though a web-site http://www.vias.org/linux-knowhow/lnag_05_05_09.html
// identifies ^ as a special shell character, it did not appear to be an
// issue to use it unquoted as a filename in bash or tcsh.
//
// Here are some frequencies of some special characters in a data set from Fall
// '09.
We find only 3 occurences of "x5E" (^ is ascii 0x53): 39// ^ 3 build tools don't like ^ in testdata filenames 40// @ 10 build tools don't like @ in testdata filenames 41// . 1676 too frequent in URLs 42// , 76 THE WINNER 43// # 0 build tools doesn't like it 44// & 487 Prefer to avoid shell escapes 45// % 374 g4 doesn't like it 46// = 579 very frequent in URLs -- leave unmodified 47// - 464 very frequent in URLs -- leave unmodified 48// _ 798 very frequent in URLs -- leave unmodified 49// 50// It is interesting that there were no slurped URLs with #, but I suspect this 51// might be due to the slurping methdology. So let's stick with the relatively 52// rare ','. 53// 54// Here's the escaping methodology: 55// 56// URL File 57// / /, 58// /. /., 59// // /,/, 60// /./ /,./, 61// /../ /,../, 62// /, /,2C, 63// /,/ /,2C/, 64// /a/b /a/b, (, at the end of a name indicates a leaf). 65// /a/b/ /a/b/, 66// 67// path segments greater than 128 characters (after escape expansion) are 68// suffixed with ,- so we can know that the next "/" is not part of the URL: 69// 70// /verylongname/ /verylong,-/name 71 72// NOTE: we avoid using some classes here (like FilePath and GURL) because we 73// share this code with other projects externally. 74 75#ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H_ 76#define NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H_ 77 78#include <string> 79 80#include "base/file_path.h" 81#include "base/file_util.h" 82#include "base/string_util.h" 83#include "net/tools/dump_cache/url_utilities.h" 84 85namespace net { 86 87// Helper class for converting a URL into a filename. 88class UrlToFilenameEncoder { 89 public: 90 // Given a |url| and a |base_path|, returns a string which represents this 91 // |url|. 92 // |legacy_escape| indicates that this function should use the old-style 93 // of encoding. 94 // TODO(mbelshe): delete the legacy_escape code. 
95 static std::string Encode(const std::string& url, std::string base_path, 96 bool legacy_escape) { 97 std::string clean_url(url); 98 if (clean_url.length() && clean_url[clean_url.length()-1] == '/') 99 clean_url.append("index.html"); 100 101 std::string host = UrlUtilities::GetUrlHost(clean_url); 102 std::string filename(base_path); 103 filename.append("\\"); 104 filename = filename.append(host); 105 filename.append("\\"); 106 107 std::string url_filename = UrlUtilities::GetUrlPath(clean_url); 108 // Strip the leading '/' 109 if (url_filename[0] == '/') 110 url_filename = url_filename.substr(1); 111 112 // replace '/' with '\' 113 ConvertToSlashes(&url_filename); 114 115 // strip double slashes ("\\") 116 StripDoubleSlashes(&url_filename); 117 118 // Save path as filesystem-safe characters 119 if (legacy_escape) { 120 url_filename = LegacyEscape(url_filename); 121 } else { 122 url_filename = Escape(url_filename); 123 } 124 filename = filename.append(url_filename); 125 126#ifndef WIN32 127 // Last step - convert to native slashes! 128 const std::string slash("/"); 129 const std::string backslash("\\"); 130 ReplaceAll(&filename, backslash, slash); 131#endif 132 133 return filename; 134 } 135 136 // Rewrite HTML in a form that the SPDY in-memory server 137 // can read. 138 // |filename_prefix| is prepended without escaping. 139 // |filename_ending| is the URL to be encoded into a filename. 140 // |dir_separator| is "/" on Unix, "\" on Windows. 141 // |encoded_filename| is the resultant filename. 142 static void EncodeSegment( 143 const std::string& filename_prefix, 144 const std::string& filename_ending, 145 char dir_separator, 146 std::string* encoded_filename); 147 148 // Decodes a filename that was encoded with EncodeSegment, 149 // yielding back the original URL. 
150 static bool Decode(const std::string& encoded_filename, 151 char dir_separator, 152 std::string* decoded_url); 153 154 private: 155 // Appends a segment of the path, special-casing ".", "..", and "", and 156 // ensuring that the segment does not exceed the path length. If it does, 157 // it chops the end off the segment, writes the segment with a separator of 158 // ",-/", and then rewrites segment to contain just the truncated piece so 159 // it can be used in the next iteration. 160 // |dir_separator| is "/" on Unix, "\" on Windows. 161 // |segment| is a read/write parameter containing segment to write 162 static void AppendSegment( 163 char dir_separator, 164 std::string* segment, 165 std::string* dest); 166 167 // Escapes the given input |path| and chop any individual components 168 // of the path which are greater than kMaximumSubdirectoryLength characters 169 // into two chunks. 170 static std::string Escape(const std::string& path) { 171 std::string output; 172 EncodeSegment("", path, '\\', &output); 173 return output; 174 } 175 176 // Allow reading of old slurped files. 177 static std::string LegacyEscape(const std::string& path); 178 179 // Replace all instances of |from| within |str| as |to|. 180 static void ReplaceAll(std::string* str, const std::string& from, 181 const std::string& to) { 182 std::string::size_type pos(0); 183 while ((pos = str->find(from, pos)) != std::string::npos) { 184 str->replace(pos, from.size(), to); 185 pos += from.size(); 186 } 187 } 188 189 // Replace all instances of "/" with "\" in |path|. 190 static void ConvertToSlashes(std::string* path) { 191 const std::string slash("/"); 192 const std::string backslash("\\"); 193 ReplaceAll(path, slash, backslash); 194 } 195 196 // Replace all instances of "\\" with "%5C%5C" in |path|. 
197 static void StripDoubleSlashes(std::string* path) { 198 const std::string doubleslash("\\\\"); 199 const std::string escaped_doubleslash("%5C%5C"); 200 ReplaceAll(path, doubleslash, escaped_doubleslash); 201 } 202}; 203 204} // namespace net 205 206#endif // NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H_ 207 208