ftp_directory_listing_parser_ls.cc revision 5d1f7b1de12d16ceb2c938c56701a3e8bfa558f7
1ba5b9a6411cb1792fd21f0a078d7a25cd1ceec16Ben Murdoch// Copyright (c) 2012 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "net/ftp/ftp_directory_listing_parser_ls.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector>
85d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/strings/string_number_conversions.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/strings/string_split.h"
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/strings/string_util.h"
12a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)#include "base/strings/utf_string_conversions.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/time/time.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "net/ftp/ftp_directory_listing_parser.h"
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "net/ftp/ftp_util.h"
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace {
18868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool TwoColumnDateListingToTime(const base::string16& date,
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                const base::string16& time,
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                base::Time* result) {
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::Time::Exploded time_exploded = { 0 };
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Date should be in format YYYY-MM-DD.
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<base::string16> date_parts;
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::SplitString(date, '-', &date_parts);
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (date_parts.size() != 3)
28868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    return false;
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!base::StringToInt(date_parts[0], &time_exploded.year))
30f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return false;
311320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  if (!base::StringToInt(date_parts[1], &time_exploded.month))
321320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    return false;
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!base::StringToInt(date_parts[2], &time_exploded.day_of_month))
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
35868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)
361320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  // Time should be in format HH:MM
371320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  if (time.length() != 5)
381320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    return false;
391320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci
40f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  std::vector<base::string16> time_parts;
41a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  base::SplitString(time, ':', &time_parts);
42f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  if (time_parts.size() != 2)
43f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return false;
44f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  if (!base::StringToInt(time_parts[0], &time_exploded.hour))
45f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return false;
46f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  if (!base::StringToInt(time_parts[1], &time_exploded.minute))
47f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return false;
48a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  if (!time_exploded.HasValidValues())
49f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return false;
50f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
511320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  // We don't know the time zone of the server, so just use local time.
521320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  *result = base::Time::FromLocalExploded(time_exploded);
531320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  return true;
541320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci}
551320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci
561320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci// Returns the column index of the end of the date listing and detected
57eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch// last modification time.
58868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)bool DetectColumnOffsetSizeAndModificationTime(
59868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    const std::vector<base::string16>& columns,
602a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    const base::Time& current_time,
612a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    size_t* offset,
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    base::string16* size,
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    base::Time* modification_time) {
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The column offset can be arbitrarily large if some fields
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // like owner or group name contain spaces. Try offsets from left to right
662a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  // and use the first one that matches a date listing.
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //
682a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  // Here is how a listing line should look like. A star ("*") indicates
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // a required field:
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //  * 1. permission listing
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //    2. number of links (optional)
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //  * 3. owner name (may contain spaces)
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //    4. group name (optional, may contain spaces)
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //  * 5. size in bytes
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //  * 6. month
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //  * 7. day of month
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //  * 8. year or time <-- column_offset will be the index of this column
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //    9. file name (optional, may contain spaces)
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (size_t i = 5U; i < columns.size(); i++) {
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (net::FtpUtil::LsDateListingToTime(columns[i - 2],
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                          columns[i - 1],
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                          columns[i],
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                          current_time,
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                          modification_time)) {
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      *size = columns[i - 3];
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      *offset = i;
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return true;
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Some FTP listings have swapped the "month" and "day of month" columns
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // (for example Russian listings). We try to recognize them only after making
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // sure no column offset works above (this is a more strict way).
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (size_t i = 5U; i < columns.size(); i++) {
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (net::FtpUtil::LsDateListingToTime(columns[i - 1],
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                          columns[i - 2],
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                          columns[i],
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                          current_time,
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                          modification_time)) {
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      *size = columns[i - 3];
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      *offset = i;
103a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      return true;
104a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    }
105a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  }
106a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
107a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  // Some FTP listings use a different date format.
108a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  for (size_t i = 5U; i < columns.size(); i++) {
109a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    if (TwoColumnDateListingToTime(columns[i - 1],
110a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                                   columns[i],
111a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                                   modification_time)) {
112a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      *size = columns[i - 2];
11358537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)      *offset = i;
11458537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)      return true;
115f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    }
116a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  }
11758537e28ecd584eab876aee8be7156509866d23aTorne (Richard Coles)
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return false;
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
123f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)namespace net {
124f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
1252a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)bool ParseFtpDirectoryListingLs(
126a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    const std::vector<base::string16>& lines,
127a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    const base::Time& current_time,
128a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    std::vector<FtpDirectoryListingEntry>* entries) {
1291320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  // True after we have received a "total n" listing header, where n is an
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // integer. Only one such header is allowed per listing.
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool received_total_line = false;
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (size_t i = 0; i < lines.size(); i++) {
134a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    if (lines[i].empty())
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      continue;
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
137f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    std::vector<base::string16> columns;
138f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    base::SplitString(CollapseWhitespace(lines[i], false), ' ', &columns);
1395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
140a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    // Some FTP servers put a "total n" line at the beginning of the listing
141a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    // (n is an integer). Allow such a line, but only once, and only if it's
142a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    // the first non-empty line. Do not match the word exactly, because it may
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // be in different languages (at least English and German have been seen
1442a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    // in the field).
1452a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    if (columns.size() == 2 && !received_total_line) {
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      received_total_line = true;
1475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      int64 total_number;
1495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      if (!base::StringToInt64(columns[1], &total_number))
1505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        return false;
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (total_number < 0)
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return false;
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      continue;
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
157    FtpDirectoryListingEntry entry;
158
159    size_t column_offset;
160    base::string16 size;
161    if (!DetectColumnOffsetSizeAndModificationTime(columns,
162                                                   current_time,
163                                                   &column_offset,
164                                                   &size,
165                                                   &entry.last_modified)) {
166      // Some servers send a message in one of the first few lines.
167      // All those messages have in common is the string ".:",
168      // where "." means the current directory, and ":" separates it
169      // from the rest of the message, which may be empty.
170      if (lines[i].find(base::ASCIIToUTF16(".:")) != base::string16::npos)
171        continue;
172
173      return false;
174    }
175
176    // Do not check "validity" of the permission listing. It's quirky,
177    // and some servers send garbage here while other parts of the line are OK.
178
179    if (!columns[0].empty() && columns[0][0] == 'l') {
180      entry.type = FtpDirectoryListingEntry::SYMLINK;
181    } else if (!columns[0].empty() && columns[0][0] == 'd') {
182      entry.type = FtpDirectoryListingEntry::DIRECTORY;
183    } else {
184      entry.type = FtpDirectoryListingEntry::FILE;
185    }
186
187    if (!base::StringToInt64(size, &entry.size)) {
188      // Some FTP servers do not separate owning group name from file size,
189      // like "group1234". We still want to display the file name for that
190      // entry, but can't really get the size (What if the group is named
191      // "group1", and the size is in fact 234? We can't distinguish between
192      // that and "group" with size 1234). Use a dummy value for the size.
193      // TODO(phajdan.jr): Use a value that means "unknown" instead of 0 bytes.
194      entry.size = 0;
195    }
196    if (entry.size < 0) {
197      // Some FTP servers have bugs that cause them to display the file size
198      // as negative. They're most likely big files like DVD ISO images.
199      // We still want to display them, so just say the real file size
200      // is unknown.
201      entry.size = -1;
202    }
203    if (entry.type != FtpDirectoryListingEntry::FILE)
204      entry.size = -1;
205
206    if (column_offset == columns.size() - 1) {
207      // If the end of the date listing is the last column, there is no file
208      // name. Some FTP servers send listing entries with empty names.
209      // It's not obvious how to display such an entry, so we ignore them.
210      // We don't want to make the parsing fail at this point though.
211      // Other entries can still be useful.
212      continue;
213    }
214
215    entry.name = FtpUtil::GetStringPartAfterColumns(lines[i],
216                                                    column_offset + 1);
217
218    if (entry.type == FtpDirectoryListingEntry::SYMLINK) {
219      base::string16::size_type pos =
220          entry.name.rfind(base::ASCIIToUTF16(" -> "));
221
222      // We don't require the " -> " to be present. Some FTP servers don't send
223      // the symlink target, possibly for security reasons.
224      if (pos != base::string16::npos)
225        entry.name = entry.name.substr(0, pos);
226    }
227
228    entries->push_back(entry);
229  }
230
231  return true;
232}
233
234}  // namespace net
235