1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "net/tools/dump_cache/url_to_filename_encoder.h"
6
7#include <string>
8#include <vector>
9
10#include "base/string_piece.h"
11#include "base/string_util.h"
12#include "base/stringprintf.h"
13#include "testing/gtest/include/gtest/gtest.h"
14
15using base::StringPiece;
16using std::string;
17
18namespace net {
19
// Directory separator characters used by the tests below.  kDirSeparator is
// the character placed between path components in the expected ("gold")
// filenames and handed to Decode() in ValidateUrl; kOtherDirSeparator is the
// opposite slash, which ValidateUrl passes to CheckValidChars as a character
// that must not occur in an encoded filename.  Which slash plays which role
// is selected by the WIN32 macro.
#ifdef WIN32
char kDirSeparator = '\\';
char kOtherDirSeparator = '/';
#else
char kDirSeparator = '/';
char kOtherDirSeparator = '\\';
#endif
27
28class UrlToFilenameEncoderTest : public ::testing::Test {
29 protected:
30  UrlToFilenameEncoderTest() : escape_(1, UrlToFilenameEncoder::kEscapeChar),
31                               dir_sep_(1, kDirSeparator) {
32  }
33
34  void CheckSegmentLength(const StringPiece& escaped_word) {
35    std::vector<StringPiece> components;
36    Tokenize(escaped_word, StringPiece("/"), &components);
37    for (size_t i = 0; i < components.size(); ++i) {
38      EXPECT_GE(UrlToFilenameEncoder::kMaximumSubdirectoryLength,
39                components[i].size());
40    }
41  }
42
43  void CheckValidChars(const StringPiece& escaped_word, char invalid_slash) {
44    // These characters are invalid in Windows.  We add in ', as that's pretty
45    // inconvenient in a Unix filename.
46    //
47    // See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
48    const string kInvalidChars = "<>:\"|?*'";
49    for (size_t i = 0; i < escaped_word.size(); ++i) {
50      char c = escaped_word[i];
51      EXPECT_EQ(string::npos, kInvalidChars.find(c));
52      EXPECT_NE(invalid_slash, c);
53      EXPECT_NE('\0', c);  // only invalid character in Posix
54      EXPECT_GT(0x7E, c);  // only English printable characters
55    }
56  }
57
58  void Validate(const string& in_word, const string& gold_word) {
59    string escaped_word, url;
60    UrlToFilenameEncoder::EncodeSegment("", in_word, '/', &escaped_word);
61    EXPECT_EQ(gold_word, escaped_word);
62    CheckSegmentLength(escaped_word);
63    CheckValidChars(escaped_word, '\\');
64    UrlToFilenameEncoder::Decode(escaped_word, '/', &url);
65    EXPECT_EQ(in_word, url);
66  }
67
68  void ValidateAllSegmentsSmall(const string& in_word) {
69    string escaped_word, url;
70    UrlToFilenameEncoder::EncodeSegment("", in_word, '/', &escaped_word);
71    CheckSegmentLength(escaped_word);
72    CheckValidChars(escaped_word, '\\');
73    UrlToFilenameEncoder::Decode(escaped_word, '/', &url);
74    EXPECT_EQ(in_word, url);
75  }
76
77  void ValidateNoChange(const string& word) {
78    // We always suffix the leaf with kEscapeChar, unless the leaf is empty.
79    Validate(word, word + escape_);
80  }
81
82  void ValidateEscaped(unsigned char ch) {
83    // We always suffix the leaf with kEscapeChar, unless the leaf is empty.
84    char escaped[100];
85    const char escape = UrlToFilenameEncoder::kEscapeChar;
86    base::snprintf(escaped, sizeof(escaped), "%c%02X%c", escape, ch, escape);
87    Validate(string(1, ch), escaped);
88  }
89
90  void ValidateUrl(const string& url, const string& base_path,
91                   bool legacy_escape, const string& gold_filename) {
92    string encoded_filename = UrlToFilenameEncoder::Encode(
93        url, base_path, legacy_escape);
94    EXPECT_EQ(gold_filename, encoded_filename);
95    if (!legacy_escape) {
96      CheckSegmentLength(encoded_filename);
97      CheckValidChars(encoded_filename, kOtherDirSeparator);
98      string decoded_url;
99      UrlToFilenameEncoder::Decode(encoded_filename, kDirSeparator,
100                                   &decoded_url);
101      if (url != decoded_url) {
102        EXPECT_EQ(url, "http://" + decoded_url);
103      }
104    }
105  }
106
107  void ValidateUrlOldNew(const string& url, const string& gold_old_filename,
108                         const string& gold_new_filename) {
109    ValidateUrl(url, "", true, gold_old_filename);
110    ValidateUrl(url, "", false, gold_new_filename);
111  }
112
113  void ValidateEncodeSame(const string& url1, const string& url2) {
114    string filename1 = UrlToFilenameEncoder::Encode(url1, "", false);
115    string filename2 = UrlToFilenameEncoder::Encode(url2, "", false);
116    EXPECT_EQ(filename1, filename2);
117  }
118
119  string escape_;
120  string dir_sep_;
121};
122
123TEST_F(UrlToFilenameEncoderTest, DoesNotEscape) {
124  ValidateNoChange("");
125  ValidateNoChange("abcdefg");
126  ValidateNoChange("abcdefghijklmnopqrstuvwxyz");
127  ValidateNoChange("ZYXWVUT");
128  ValidateNoChange("ZYXWVUTSRQPONMLKJIHGFEDCBA");
129  ValidateNoChange("01234567689");
130  ValidateNoChange("_.=+-");
131  ValidateNoChange("abcdefghijklmnopqrstuvwxyzZYXWVUTSRQPONMLKJIHGFEDCBA"
132                   "01234567689_.=+-");
133  ValidateNoChange("index.html");
134  ValidateNoChange("/");
135  ValidateNoChange("/.");
136  ValidateNoChange(".");
137  ValidateNoChange("..");
138}
139
140TEST_F(UrlToFilenameEncoderTest, Escapes) {
141  const string bad_chars =
142      "<>:\"\\|?*"      // Illegal on Windows
143      "~`!$^&(){}[]';"  // Bad for Unix shells
144      "^@"              // Build tool doesn't like
145      "#%"              // Tool doesn't like
146      ",";              // The escape char has to be escaped
147
148  for (size_t i = 0; i < bad_chars.size(); ++i) {
149    ValidateEscaped(bad_chars[i]);
150  }
151
152  // Check non-printable characters.
153  ValidateEscaped('\0');
154  for (size_t i = 127; i < 256; ++i) {
155    ValidateEscaped(static_cast<char>(i));
156  }
157}
158
159TEST_F(UrlToFilenameEncoderTest, DoesEscapeCorrectly) {
160  Validate("mysite.com&x", "mysite.com" + escape_ + "26x" + escape_);
161  Validate("/./", "/" + escape_ + "./" + escape_);
162  Validate("/../", "/" + escape_ + "../" + escape_);
163  Validate("//", "/" + escape_ + "2F" + escape_);
164  Validate("/./leaf", "/" + escape_ + "./leaf" + escape_);
165  Validate("/../leaf", "/" + escape_ + "../leaf" + escape_);
166  Validate("//leaf", "/" + escape_ + "2Fleaf" + escape_);
167  Validate("mysite/u?param1=x&param2=y",
168           "mysite/u" + escape_ + "3Fparam1=x" + escape_ + "26param2=y" +
169           escape_);
170  Validate("search?q=dogs&go=&form=QBLH&qs=n",  // from Latency Labs bing test.
171           "search" + escape_ + "3Fq=dogs" + escape_ + "26go=" + escape_ +
172           "26form=QBLH" + escape_ + "26qs=n" + escape_);
173  Validate("~joebob/my_neeto-website+with_stuff.asp?id=138&content=true",
174           "" + escape_ + "7Ejoebob/my_neeto-website+with_stuff.asp" + escape_ +
175           "3Fid=138" + escape_ + "26content=true" + escape_);
176}
177
178TEST_F(UrlToFilenameEncoderTest, EncodeUrlCorrectly) {
179  ValidateUrlOldNew("http://www.google.com/index.html",
180                    "www.google.com" + dir_sep_ + "indexx2Ehtml",
181                    "www.google.com" + dir_sep_ + "index.html" + escape_);
182  ValidateUrlOldNew("http://www.google.com/x/search?hl=en&q=dogs&oq=",
183                    "www.google.com" + dir_sep_ + "x" + dir_sep_ +
184                    "searchx3Fhlx3Denx26qx3Ddogsx26oqx3D",
185
186                    "www.google.com" + dir_sep_ + "x" + dir_sep_ + "search" +
187                    escape_ + "3Fhl=en" + escape_ + "26q=dogs" + escape_ +
188                    "26oq=" + escape_);
189  ValidateUrlOldNew("http://www.foo.com/a//",
190                    "www.foo.com" + dir_sep_ + "ax255Cx255Cindexx2Ehtml",
191                    "www.foo.com" + dir_sep_ + "a" + dir_sep_ + escape_ + "2F" +
192                    escape_);
193
194  // From bug: Double slash preserved.
195  ValidateUrl("http://www.foo.com/u?site=http://www.google.com/index.html",
196              "", false,
197              "www.foo.com" + dir_sep_ + "u" + escape_ + "3Fsite=http" +
198              escape_ + "3A" + dir_sep_ + escape_ + "2Fwww.google.com" +
199              dir_sep_ + "index.html" + escape_);
200  ValidateUrlOldNew(
201      "http://blogutils.net/olct/online.php?"
202      "site=http://thelwordfanfics.blogspot.&interval=600",
203
204      "blogutils.net" + dir_sep_ + "olct" + dir_sep_ + "onlinex2Ephpx3F"
205      "sitex3Dhttpx3Ax255Cx255Cthelwordfanficsx2Eblogspotx2Ex26intervalx3D600",
206
207      "blogutils.net" + dir_sep_ + "olct" + dir_sep_ + "online.php" + escape_ +
208      "3Fsite=http" + escape_ + "3A" + dir_sep_ + escape_ +
209      "2Fthelwordfanfics.blogspot." + escape_ + "26interval=600" + escape_);
210}
211
212// From bug: Escapes treated the same as normal char.
213TEST_F(UrlToFilenameEncoderTest, UnescapeUrlsBeforeEncode) {
214  for (int i = 0; i < 128; ++i) {
215    string unescaped(1, static_cast<char>(i));
216    string escaped = base::StringPrintf("%%%02X", i);
217    ValidateEncodeSame(unescaped, escaped);
218  }
219
220  ValidateEncodeSame(
221      "http://www.blogger.com/navbar.g?bName=God!&Mode=FOO&searchRoot"
222      "=http%3A%2F%2Fsurvivorscanthrive.blogspot.com%2Fsearch",
223
224      "http://www.blogger.com/navbar.g?bName=God%21&Mode=FOO&searchRoot"
225      "=http%3A%2F%2Fsurvivorscanthrive.blogspot.com%2Fsearch");
226}
227
228// From bug: Filename encoding is not prefix-free.
229TEST_F(UrlToFilenameEncoderTest, EscapeSecondSlash) {
230  Validate("/", "/" + escape_);
231  Validate("//", "/" + escape_ + "2F" + escape_);
232  Validate("///", "/" + escape_ + "2F" + "/" + escape_);
233}
234
235TEST_F(UrlToFilenameEncoderTest, LongTail) {
236  static char long_word[] =
237      "~joebob/briggs/12345678901234567890123456789012345678901234567890"
238      "1234567890123456789012345678901234567890123456789012345678901234567890"
239      "1234567890123456789012345678901234567890123456789012345678901234567890"
240      "1234567890123456789012345678901234567890123456789012345678901234567890"
241      "1234567890123456789012345678901234567890123456789012345678901234567890"
242      "1234567890123456789012345678901234567890123456789012345678901234567890";
243
244  // the long lines in the string below are 64 characters, so we can see
245  // the slashes every 128.
246  string gold_long_word =
247      escape_ + "7Ejoebob/briggs/"
248      "1234567890123456789012345678901234567890123456789012345678901234"
249      "56789012345678901234567890123456789012345678901234567890123456" +
250      escape_ + "-/"
251      "7890123456789012345678901234567890123456789012345678901234567890"
252      "12345678901234567890123456789012345678901234567890123456789012" +
253      escape_ + "-/"
254      "3456789012345678901234567890123456789012345678901234567890123456"
255      "78901234567890123456789012345678901234567890123456789012345678" +
256      escape_ + "-/"
257      "9012345678901234567890" + escape_;
258  EXPECT_LT(UrlToFilenameEncoder::kMaximumSubdirectoryLength,
259            sizeof(long_word));
260  Validate(long_word, gold_long_word);
261}
262
// Like LongTail, but the over-long last component is full of '?' characters,
// so the points at which it is cut depend on the escaped length rather than
// on the input length.
TEST_F(UrlToFilenameEncoderTest, LongTailQuestion) {
  // Here each '?' in the last path segment expands to (escape_ + "3F"),
  // making the encoded segment hit the length limit before the input segment
  // gets that big.
  static char long_word[] =
      "~joebob/briggs/1234567?1234567?1234567?1234567?1234567?"
      "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
      "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
      "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
      "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
      "1234567?1234567?1234567?1234567?1234567?1234567?1234567?";

  // Notice that at the end of the third segment, we avoid splitting
  // the (escape_ + "3F") that was generated from the "?", so that segment is
  // only 127 characters.
  string pattern = "1234567" + escape_ + "3F";  // 10 characters
  // Expected encoding: each cut is marked by escape_ + "-" followed by '/',
  // and the whole value ends with a single escape_.
  string gold_long_word =
      escape_ + "7Ejoebob/briggs/" +
      pattern + pattern + pattern + pattern + pattern + pattern + "1234"
      "567" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern +
       "123456" + escape_ + "-/"
      "7" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern +
      pattern + pattern + pattern + pattern + pattern + pattern + pattern +
      "12" +
      escape_ + "-/"
      "34567" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern
      + "1234567" + escape_ + "3F" + pattern + pattern + pattern + pattern
      + pattern + "1234567" +
      escape_ + "-/" +
      escape_ + "3F" + pattern + pattern + escape_;
  // Sanity check that the input is longer than the limit (sizeof counts the
  // terminating NUL as well).
  EXPECT_LT(UrlToFilenameEncoder::kMaximumSubdirectoryLength,
            sizeof(long_word));
  Validate(long_word, gold_long_word);
}
296
297TEST_F(UrlToFilenameEncoderTest, CornerCasesNearMaxLenNoEscape) {
298  // hit corner cases, +/- 4 characters from kMaxLen
299  for (int i = -4; i <= 4; ++i) {
300    string input;
301    input.append(i + UrlToFilenameEncoder::kMaximumSubdirectoryLength, 'x');
302    ValidateAllSegmentsSmall(input);
303  }
304}
305
306TEST_F(UrlToFilenameEncoderTest, CornerCasesNearMaxLenWithEscape) {
307  // hit corner cases, +/- 4 characters from kMaxLen.  This time we
308  // leave off the last 'x' and put in a '.', which ensures that we
309  // are truncating with '/' *after* the expansion.
310  for (int i = -4; i <= 4; ++i) {
311    string input;
312    input.append(i + UrlToFilenameEncoder::kMaximumSubdirectoryLength - 1, 'x');
313    input.append(1, '.');  // this will expand to 3 characters.
314    ValidateAllSegmentsSmall(input);
315  }
316}
317
318TEST_F(UrlToFilenameEncoderTest, LeafBranchAlias) {
319  Validate("/a/b/c", "/a/b/c" + escape_);        // c is leaf file "c,"
320  Validate("/a/b/c/d", "/a/b/c/d" + escape_);    // c is directory "c"
321  Validate("/a/b/c/d/", "/a/b/c/d/" + escape_);
322}
323
324
325TEST_F(UrlToFilenameEncoderTest, BackslashSeparator) {
326  string long_word;
327  string escaped_word;
328  long_word.append(UrlToFilenameEncoder::kMaximumSubdirectoryLength + 1, 'x');
329  UrlToFilenameEncoder::EncodeSegment("", long_word, '\\', &escaped_word);
330
331  // check that one backslash, plus the escape ",-", and the ending , got added.
332  EXPECT_EQ(long_word.size() + 4, escaped_word.size());
333  ASSERT_LT(UrlToFilenameEncoder::kMaximumSubdirectoryLength,
334            escaped_word.size());
335  // Check that the backslash got inserted at the correct spot.
336  EXPECT_EQ('\\', escaped_word[
337      UrlToFilenameEncoder::kMaximumSubdirectoryLength]);
338}
339
340}  // namespace net
341
342