url_to_filename_encoder.cc revision c407dc5cd9bdc5668497f21b26b09d988ab439de
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/logging.h"
6#include "base/string_util.h"
7#include "net/base/net_util.h"
8#include "net/tools/dump_cache/url_to_filename_encoder.h"
9
10using std::string;
11
12namespace {
13
// Returns true when |c| is an ASCII hexadecimal digit: '0'-'9', 'A'-'F' or
// 'a'-'f'.  Every other value, including '\0', yields false.
inline bool IsHexDigit(unsigned char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'A' && c <= 'F')
    return true;
  return c >= 'a' && c <= 'f';
}
18
19// Returns 1 if buf is prefixed by "num_digits" of hex digits
20// Teturns 0 otherwise.
21// The function checks for '\0' for string termination.
22int HexDigitsPrefix(const char* buf, int num_digits) {
23  for (int i = 0; i < num_digits; i++)
24    if (!IsHexDigit(buf[i]))
25      return 0;  // This also detects end of string as '\0' is not xdigit.
26  return 1;
27}
28
29#ifdef WIN32
30#define strtoull _strtoui64
31#endif
32
33// A simple parser for long long values. Returns the parsed value if a
34// valid integer is found; else returns deflt
35// UInt64 and Int64 cannot handle decimal numbers with leading 0s.
36uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) {
37  char *error = NULL;
38  const uint64 value = strtoull(str, &error, 16);
39  return (error == str) ? deflt : value;
40}
41
42}
43
44namespace net {
45
// The escape character choice is made here -- all code and tests in this
// directory are based off of these constants.  However, our test data has
// tons of dependencies on them, so they cannot be changed without re-running
// those tests and fixing them.

// Introduces every escape sequence written by the encoder.
const char kEscapeChar = ',';
// Follows kEscapeChar where an over-long path segment was artificially split.
const char kTruncationChar = '-';
// Longest path segment that is written without being split.
const size_t kMaximumSubdirectoryLength = 128;
53
54void UrlToFilenameEncoder::AppendSegment(
55    char dir_separator, string* segment, string* dest) {
56  if (segment->empty() || (*segment == ".") || (*segment == "..")) {
57    dest->append(1, kEscapeChar);
58    dest->append(*segment);
59    segment->clear();
60  } else {
61    size_t segment_size = segment->size();
62    if (segment_size > kMaximumSubdirectoryLength) {
63      // We need to inject ",-" at the end of the segment to signify that
64      // we are inserting an artificial '/'.  This means we have to chop
65      // off at least two characters to make room.
66      segment_size = kMaximumSubdirectoryLength - 2;
67
68      // But we don't want to break up an escape sequence that happens to lie at
69      // the end.  Escape sequences are at most 2 characters.
70      if ((*segment)[segment_size - 1] == kEscapeChar) {
71        segment_size -= 1;
72      } else if ((*segment)[segment_size - 2] == kEscapeChar) {
73        segment_size -= 2;
74      }
75      dest->append(segment->data(), segment_size);
76      dest->append(1, kEscapeChar);
77      dest->append(1, kTruncationChar);
78      segment->erase(0, segment_size);
79
80      // At this point, if we had segment_size=3, and segment="abcd",
81      // then after this erase, we will have written "abc,-" and set segment="d"
82    } else {
83      dest->append(*segment);
84      segment->clear();
85    }
86  }
87}
88
89void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix,
90                                         const string& filename_ending,
91                                         char dir_separator,
92                                         string* encoded_filename) {
93  char encoded[3];
94  int encoded_len;
95  string segment;
96
97  // TODO(jmarantz): This code would be a bit simpler if we disallowed
98  // Instaweb allowing filename_prefix to not end in "/".  We could
99  // then change the is routine to just take one input string.
100  size_t start_of_segment = filename_prefix.find_last_of(dir_separator);
101  if (start_of_segment == string::npos) {
102    segment = filename_prefix;
103  } else {
104    segment = filename_prefix.substr(start_of_segment + 1);
105    *encoded_filename = filename_prefix.substr(0, start_of_segment + 1);
106  }
107
108  size_t index = 0;
109  // Special case the first / to avoid adding a leading kEscapeChar.
110  if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) {
111    encoded_filename->append(segment);
112    segment.clear();
113    encoded_filename->append(1, dir_separator);
114    ++index;
115  }
116
117  for (; index < filename_ending.length(); ++index) {
118    unsigned char ch = static_cast<unsigned char>(filename_ending[index]);
119
120    if (ch == dir_separator) {
121      AppendSegment(dir_separator, &segment, encoded_filename);
122      encoded_filename->append(1, dir_separator);
123      segment.clear();
124    } else {
125      // & is common in URLs and is legal filename syntax, but is also
126      // a special Unix shell character, so let's avoid making
127      // filenames with &, as well as ?.  It's probably better to
128      // blow up query-params than it is to make it hard to work with
129      // the files in shell-scripts.
130      if ((ch == 0x5F) || (ch == 0x2E) ||    // underscore period
131          (ch == 0x25) || (ch == 0x3D) ||    // percent equals
132          (ch == 0x2B) || (ch == 0x2D) ||    // plus dash
133          ((0x30 <= ch) && (ch <= 0x39)) ||  // Digits [0-9]
134          ((0x41 <= ch) && (ch <= 0x5A)) ||  // Uppercase [A-Z]
135          ((0x61 <= ch) && (ch <= 0x7A))) {  // Lowercase [a-z]
136        encoded[0] = ch;
137        encoded_len = 1;
138      } else {
139        encoded[0] = kEscapeChar;
140        encoded[1] = ch / 16;
141        encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
142        encoded[2] = ch % 16;
143        encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
144        encoded_len = 3;
145      }
146      segment.append(encoded, encoded_len);
147
148      // Note:  We chop paths into medium sized 'chunks'.
149      //        This is due to filename limits on Windows and Unix.
150      //        The Windows limit appears to be 128 characters, and
151      //        Unix is larger, but not as large as URLs with large
152      //        numbers of query params.
153      if (segment.size() > kMaximumSubdirectoryLength) {
154        AppendSegment(dir_separator, &segment, encoded_filename);
155        encoded_filename->append(1, dir_separator);
156      }
157    }
158  }
159
160  // Append "," to the leaf filename so the leaf can also be a branch., e.g.
161  // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and
162  // /a/b/c/d".  So we will rename the "d" here to "d,".  If doing that pushed
163  // us over the 128 char limit, then we will need to append "/" and the
164  // remaining chars.
165  segment += kEscapeChar;
166  AppendSegment(dir_separator, &segment, encoded_filename);
167  if (!segment.empty()) {
168    // The last overflow segment is special, because we appended in
169    // kEscapeChar above.  We won't need to check it again for size
170    // or further escaping.
171    encoded_filename->append(1, dir_separator);
172    encoded_filename->append(segment);
173  }
174}
175
176// Note: this decoder is not the exact inverse of the EncodeSegment above,
177// because it does not take into account a prefix.
178bool UrlToFilenameEncoder::Decode(const string& encoded_filename,
179                                  char dir_separator,
180                                  string* decoded_url) {
181  enum State {
182    kStart,
183    kEscape,
184    kFirstDigit,
185    kTruncate,
186    kEscapeDot
187  };
188  State state = kStart;
189  int char_code = 0;
190  char hex_buffer[3];
191  hex_buffer[2] = '\0';
192  for (size_t i = 0; i < encoded_filename.size(); ++i) {
193    char ch = encoded_filename[i];
194    switch (state) {
195      case kStart:
196        if (ch == kEscapeChar) {
197          state = kEscape;
198        } else {
199          decoded_url->append(1, ch);
200        }
201        break;
202      case kEscape:
203        if (HexDigitsPrefix(&ch, 1) == 1) {
204          hex_buffer[0] = ch;
205          state = kFirstDigit;
206        } else if (ch == kTruncationChar) {
207          state = kTruncate;
208        } else if (ch == '.') {
209          decoded_url->append(1, '.');
210          state = kEscapeDot;  // Look for at most one more dot.
211        } else if (ch == dir_separator) {
212          // Consider url "//x".  This will get encoded to "/,/x,".
213          // This code is what skips the first Escape.
214          decoded_url->append(1, ch);
215          state = kStart;
216        } else {
217          return false;
218        }
219        break;
220      case kFirstDigit:
221        if (HexDigitsPrefix(&ch, 1) == 1) {
222          hex_buffer[1] = ch;
223          uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0);
224          decoded_url->append(1, static_cast<char>(hex_value));
225          char_code = 0;
226          state = kStart;
227        } else {
228          return false;
229        }
230        break;
231      case kTruncate:
232        if (ch == dir_separator) {
233          // Skip this separator, it was only put in to break up long
234          // path segments, but is not part of the URL.
235          state = kStart;
236        } else {
237          return false;
238        }
239        break;
240      case kEscapeDot:
241        decoded_url->append(1, ch);
242        state = kStart;
243        break;
244    }
245  }
246
247  // All legal encoded filenames end in kEscapeChar.
248  return (state == kEscape);
249}
250
251// Escapes the given input |path| and chop any individual components
252// of the path which are greater than kMaximumSubdirectoryLength characters
253// into two chunks.
254//
255// This legacy version has several issues with aliasing of different URLs,
256// inability to represent both /a/b/c and /a/b/c/d, and inability to decode
257// the filenames back into URLs.
258//
259// But there is a large body of slurped data which depends on this format,
260// so leave it as the default for spdy_in_mem_edsm_server.
261string UrlToFilenameEncoder::LegacyEscape(const string& path) {
262  string output;
263
264  // Note:  We also chop paths into medium sized 'chunks'.
265  //        This is due to the incompetence of the windows
266  //        filesystem, which still hasn't figured out how
267  //        to deal with long filenames.
268  int last_slash = 0;
269  for (size_t index = 0; index < path.length(); index++) {
270    char ch = path[index];
271    if (ch == 0x5C)
272      last_slash = index;
273    if ((ch == 0x2D) ||                    // hyphen
274        (ch == 0x5C) || (ch == 0x5F) ||    // backslash, underscore
275        ((0x30 <= ch) && (ch <= 0x39)) ||  // Digits [0-9]
276        ((0x41 <= ch) && (ch <= 0x5A)) ||  // Uppercase [A-Z]
277        ((0x61 <= ch) && (ch <= 0x7A))) {  // Lowercase [a-z]
278      output.append(&path[index], 1);
279    } else {
280      char encoded[3];
281      encoded[0] = 'x';
282      encoded[1] = ch / 16;
283      encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
284      encoded[2] = ch % 16;
285      encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
286      output.append(encoded, 3);
287    }
288    if (index - last_slash > kMaximumSubdirectoryLength) {
289#ifdef WIN32
290      char slash = '\\';
291#else
292      char slash = '/';
293#endif
294      output.append(&slash, 1);
295      last_slash = index;
296    }
297  }
298  return output;
299}
300
301}  // namespace net
302
303