// url_to_filename_encoder.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5#include <stdlib.h>
6
7#include "base/logging.h"
8#include "base/string_util.h"
9#include "net/base/net_util.h"
10#include "net/tools/dump_cache/url_to_filename_encoder.h"
11
12using std::string;
13
14namespace {
15
16// Returns 1 if buf is prefixed by "num_digits" of hex digits
17// Teturns 0 otherwise.
18// The function checks for '\0' for string termination.
19int HexDigitsPrefix(const char* buf, int num_digits) {
20  for (int i = 0; i < num_digits; i++) {
21    if (!IsHexDigit(buf[i]))
22      return 0;  // This also detects end of string as '\0' is not xdigit.
23  }
24  return 1;
25}
26
// When WIN32 is defined, calls to strtoull in this file are redirected to
// _strtoui64.
// NOTE(review): presumably because the Windows C runtime targeted by this
// code does not provide strtoull -- confirm before removing.
#ifdef WIN32
#define strtoull _strtoui64
#endif
30
31// A simple parser for long long values. Returns the parsed value if a
32// valid integer is found; else returns deflt
33// UInt64 and Int64 cannot handle decimal numbers with leading 0s.
34uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) {
35  char *error = NULL;
36  const uint64 value = strtoull(str, &error, 16);
37  return (error == str) ? deflt : value;
38}
39
40}
41
42namespace net {
43
44// The escape character choice is made here -- all code and tests in this
45// directory are based off of this constant.  However, our testdata
46// has tons of dependencies on this, so it cannot be changed without
47// re-running those tests and fixing them.
48const char UrlToFilenameEncoder::kEscapeChar = ',';
49const char UrlToFilenameEncoder::kTruncationChar = '-';
50const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128;
51
52void UrlToFilenameEncoder::AppendSegment(string* segment, string* dest) {
53  CHECK(!segment->empty());
54  if ((*segment == ".") || (*segment == "..")) {
55    dest->append(1, kEscapeChar);
56    dest->append(*segment);
57    segment->clear();
58  } else {
59    size_t segment_size = segment->size();
60    if (segment_size > kMaximumSubdirectoryLength) {
61      // We need to inject ",-" at the end of the segment to signify that
62      // we are inserting an artificial '/'.  This means we have to chop
63      // off at least two characters to make room.
64      segment_size = kMaximumSubdirectoryLength - 2;
65
66      // But we don't want to break up an escape sequence that happens to lie at
67      // the end.  Escape sequences are at most 2 characters.
68      if ((*segment)[segment_size - 1] == kEscapeChar) {
69        segment_size -= 1;
70      } else if ((*segment)[segment_size - 2] == kEscapeChar) {
71        segment_size -= 2;
72      }
73      dest->append(segment->data(), segment_size);
74      dest->append(1, kEscapeChar);
75      dest->append(1, kTruncationChar);
76      segment->erase(0, segment_size);
77
78      // At this point, if we had segment_size=3, and segment="abcd",
79      // then after this erase, we will have written "abc,-" and set segment="d"
80    } else {
81      dest->append(*segment);
82      segment->clear();
83    }
84  }
85}
86
87void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix,
88                                         const string& escaped_ending,
89                                         char dir_separator,
90                                         string* encoded_filename) {
91  string filename_ending = UrlUtilities::Unescape(escaped_ending);
92
93  char encoded[3];
94  int encoded_len;
95  string segment;
96
97  // TODO(jmarantz): This code would be a bit simpler if we disallowed
98  // Instaweb allowing filename_prefix to not end in "/".  We could
99  // then change the is routine to just take one input string.
100  size_t start_of_segment = filename_prefix.find_last_of(dir_separator);
101  if (start_of_segment == string::npos) {
102    segment = filename_prefix;
103  } else {
104    segment = filename_prefix.substr(start_of_segment + 1);
105    *encoded_filename = filename_prefix.substr(0, start_of_segment + 1);
106  }
107
108  size_t index = 0;
109  // Special case the first / to avoid adding a leading kEscapeChar.
110  if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) {
111    encoded_filename->append(segment);
112    segment.clear();
113    encoded_filename->append(1, dir_separator);
114    ++index;
115  }
116
117  for (; index < filename_ending.length(); ++index) {
118    unsigned char ch = static_cast<unsigned char>(filename_ending[index]);
119
120    // Note: instead of outputing an empty segment, we let the second slash
121    // be escaped below.
122    if ((ch == dir_separator) && !segment.empty()) {
123      AppendSegment(&segment, encoded_filename);
124      encoded_filename->append(1, dir_separator);
125      segment.clear();
126    } else {
127      // After removing unsafe chars the only safe ones are _.=+- and alphanums.
128      if ((ch == '_') || (ch == '.') || (ch == '=') || (ch == '+') ||
129          (ch == '-') || (('0' <= ch) && (ch <= '9')) ||
130          (('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z'))) {
131        encoded[0] = ch;
132        encoded_len = 1;
133      } else {
134        encoded[0] = kEscapeChar;
135        encoded[1] = ch / 16;
136        encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
137        encoded[2] = ch % 16;
138        encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
139        encoded_len = 3;
140      }
141      segment.append(encoded, encoded_len);
142
143      // If segment is too big, we must chop it into chunks.
144      if (segment.size() > kMaximumSubdirectoryLength) {
145        AppendSegment(&segment, encoded_filename);
146        encoded_filename->append(1, dir_separator);
147      }
148    }
149  }
150
151  // Append "," to the leaf filename so the leaf can also be a branch., e.g.
152  // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and
153  // /a/b/c/d".  So we will rename the "d" here to "d,".  If doing that pushed
154  // us over the 128 char limit, then we will need to append "/" and the
155  // remaining chars.
156  segment += kEscapeChar;
157  AppendSegment(&segment, encoded_filename);
158  if (!segment.empty()) {
159    // The last overflow segment is special, because we appended in
160    // kEscapeChar above.  We won't need to check it again for size
161    // or further escaping.
162    encoded_filename->append(1, dir_separator);
163    encoded_filename->append(segment);
164  }
165}
166
167// Note: this decoder is not the exact inverse of the EncodeSegment above,
168// because it does not take into account a prefix.
169bool UrlToFilenameEncoder::Decode(const string& encoded_filename,
170                                  char dir_separator,
171                                  string* decoded_url) {
172  enum State {
173    kStart,
174    kEscape,
175    kFirstDigit,
176    kTruncate,
177    kEscapeDot
178  };
179  State state = kStart;
180  char hex_buffer[3];
181  hex_buffer[2] = '\0';
182  for (size_t i = 0; i < encoded_filename.size(); ++i) {
183    char ch = encoded_filename[i];
184    switch (state) {
185      case kStart:
186        if (ch == kEscapeChar) {
187          state = kEscape;
188        } else if (ch == dir_separator) {
189          decoded_url->append(1, '/');  // URLs only use '/' not '\\'
190        } else {
191          decoded_url->append(1, ch);
192        }
193        break;
194      case kEscape:
195        if (HexDigitsPrefix(&ch, 1) == 1) {
196          hex_buffer[0] = ch;
197          state = kFirstDigit;
198        } else if (ch == kTruncationChar) {
199          state = kTruncate;
200        } else if (ch == '.') {
201          decoded_url->append(1, '.');
202          state = kEscapeDot;  // Look for at most one more dot.
203        } else if (ch == dir_separator) {
204          // Consider url "//x".  This was once encoded to "/,/x,".
205          // This code is what skips the first Escape.
206          decoded_url->append(1, '/');  // URLs only use '/' not '\\'
207          state = kStart;
208        } else {
209          return false;
210        }
211        break;
212      case kFirstDigit:
213        if (HexDigitsPrefix(&ch, 1) == 1) {
214          hex_buffer[1] = ch;
215          uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0);
216          decoded_url->append(1, static_cast<char>(hex_value));
217          state = kStart;
218        } else {
219          return false;
220        }
221        break;
222      case kTruncate:
223        if (ch == dir_separator) {
224          // Skip this separator, it was only put in to break up long
225          // path segments, but is not part of the URL.
226          state = kStart;
227        } else {
228          return false;
229        }
230        break;
231      case kEscapeDot:
232        decoded_url->append(1, ch);
233        state = kStart;
234        break;
235    }
236  }
237
238  // All legal encoded filenames end in kEscapeChar.
239  return (state == kEscape);
240}
241
242// Escape the given input |path| and chop any individual components
243// of the path which are greater than kMaximumSubdirectoryLength characters
244// into two chunks.
245//
246// This legacy version has several issues with aliasing of different URLs,
247// inability to represent both /a/b/c and /a/b/c/d, and inability to decode
248// the filenames back into URLs.
249//
250// But there is a large body of slurped data which depends on this format,
251// so leave it as the default for spdy_in_mem_edsm_server.
252string UrlToFilenameEncoder::LegacyEscape(const string& path) {
253  string output;
254
255  // Note:  We also chop paths into medium sized 'chunks'.
256  //        This is due to the incompetence of the windows
257  //        filesystem, which still hasn't figured out how
258  //        to deal with long filenames.
259  int last_slash = 0;
260  for (size_t index = 0; index < path.length(); index++) {
261    char ch = path[index];
262    if (ch == 0x5C)
263      last_slash = index;
264    if ((ch == 0x2D) ||                    // hyphen
265        (ch == 0x5C) || (ch == 0x5F) ||    // backslash, underscore
266        ((0x30 <= ch) && (ch <= 0x39)) ||  // Digits [0-9]
267        ((0x41 <= ch) && (ch <= 0x5A)) ||  // Uppercase [A-Z]
268        ((0x61 <= ch) && (ch <= 0x7A))) {  // Lowercase [a-z]
269      output.append(&path[index], 1);
270    } else {
271      char encoded[3];
272      encoded[0] = 'x';
273      encoded[1] = ch / 16;
274      encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
275      encoded[2] = ch % 16;
276      encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
277      output.append(encoded, 3);
278    }
279    if (index - last_slash > kMaximumSubdirectoryLength) {
280#ifdef WIN32
281      char slash = '\\';
282#else
283      char slash = '/';
284#endif
285      output.append(&slash, 1);
286      last_slash = index;
287    }
288  }
289  return output;
290}
291
292}  // namespace net
293