ftp_directory_listing_parser_ls.cc revision 2a99a7e74a7f215066514fe81d2bfa6639d9eddd
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
5402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll#include "net/ftp/ftp_directory_listing_parser_ls.h"
6402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
7402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll#include <vector>
8402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
9402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll#include "base/string_number_conversions.h"
10402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll#include "base/string_util.h"
11402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll#include "base/strings/string_split.h"
12402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll#include "base/time.h"
13402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll#include "base/utf_string_conversions.h"
14402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll#include "net/ftp/ftp_directory_listing_parser.h"
15402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll#include "net/ftp/ftp_util.h"
16402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
17402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Mollnamespace {
18402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
19402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Mollbool TwoColumnDateListingToTime(const string16& date,
20402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll                                const string16& time,
21402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll                                base::Time* result) {
22402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  base::Time::Exploded time_exploded = { 0 };
23402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
24402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // Date should be in format YYYY-MM-DD.
25402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  std::vector<string16> date_parts;
26402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  base::SplitString(date, '-', &date_parts);
27402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  if (date_parts.size() != 3)
28402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    return false;
29402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  if (!base::StringToInt(date_parts[0], &time_exploded.year))
30402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    return false;
31402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  if (!base::StringToInt(date_parts[1], &time_exploded.month))
32402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    return false;
33402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  if (!base::StringToInt(date_parts[2], &time_exploded.day_of_month))
34402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    return false;
35402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
36402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // Time should be in format HH:MM
37402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  if (time.length() != 5)
38402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    return false;
39402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
40402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  std::vector<string16> time_parts;
41402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  base::SplitString(time, ':', &time_parts);
42402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  if (time_parts.size() != 2)
43402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    return false;
44402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  if (!base::StringToInt(time_parts[0], &time_exploded.hour))
45402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    return false;
46402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  if (!base::StringToInt(time_parts[1], &time_exploded.minute))
47402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    return false;
48402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  if (!time_exploded.HasValidValues())
49402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    return false;
50402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
51402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // We don't know the time zone of the server, so just use local time.
52402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  *result = base::Time::FromLocalExploded(time_exploded);
53402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  return true;
54402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll}
55402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
56402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll// Returns the column index of the end of the date listing and detected
57402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll// last modification time.
58402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Mollbool DetectColumnOffsetSizeAndModificationTime(
59402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    const std::vector<string16>& columns,
60402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    const base::Time& current_time,
61402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    size_t* offset,
62402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    string16* size,
63402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    base::Time* modification_time) {
64402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // The column offset can be arbitrarily large if some fields
65402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // like owner or group name contain spaces. Try offsets from left to right
66402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // and use the first one that matches a date listing.
67402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  //
68402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // Here is how a listing line should look like. A star ("*") indicates
69402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // a required field:
70402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  //
71402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  //  * 1. permission listing
72402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  //    2. number of links (optional)
73402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  //  * 3. owner name (may contain spaces)
74402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  //    4. group name (optional, may contain spaces)
75402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  //  * 5. size in bytes
76402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  //  * 6. month
77402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  //  * 7. day of month
78402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  //  * 8. year or time <-- column_offset will be the index of this column
79402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  //    9. file name (optional, may contain spaces)
80402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  for (size_t i = 5U; i < columns.size(); i++) {
81402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    if (net::FtpUtil::LsDateListingToTime(columns[i - 2],
82402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll                                          columns[i - 1],
83402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll                                          columns[i],
84402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll                                          current_time,
85402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll                                          modification_time)) {
86402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll      *size = columns[i - 3];
87402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll      *offset = i;
88402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll      return true;
89402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    }
90402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  }
91402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
92402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // Some FTP listings have swapped the "month" and "day of month" columns
93402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // (for example Russian listings). We try to recognize them only after making
94402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // sure no column offset works above (this is a more strict way).
95402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  for (size_t i = 5U; i < columns.size(); i++) {
96402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    if (net::FtpUtil::LsDateListingToTime(columns[i - 1],
97402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll                                          columns[i - 2],
98402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll                                          columns[i],
99402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll                                          current_time,
100402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll                                          modification_time)) {
101402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll      *size = columns[i - 3];
102402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll      *offset = i;
103402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll      return true;
104402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    }
105402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  }
106402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
107402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // Some FTP listings use a different date format.
108402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  for (size_t i = 5U; i < columns.size(); i++) {
109402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    if (TwoColumnDateListingToTime(columns[i - 1],
110402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll                                   columns[i],
111402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll                                   modification_time)) {
112402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll      *size = columns[i - 2];
113402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll      *offset = i;
114402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll      return true;
115402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    }
116402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  }
117402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
118402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  return false;
119402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll}
120402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
121402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll}  // namespace
122402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
123402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Mollnamespace net {
124402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
125402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Mollbool ParseFtpDirectoryListingLs(
126402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    const std::vector<string16>& lines,
127402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    const base::Time& current_time,
128402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    std::vector<FtpDirectoryListingEntry>* entries) {
129402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // True after we have received a "total n" listing header, where n is an
130402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  // integer. Only one such header is allowed per listing.
131402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  bool received_total_line = false;
132402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
133402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll  for (size_t i = 0; i < lines.size(); i++) {
134402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    if (lines[i].empty())
135402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll      continue;
136402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
137402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    std::vector<string16> columns;
138402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    base::SplitString(CollapseWhitespace(lines[i], false), ' ', &columns);
139402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
140402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    // Some FTP servers put a "total n" line at the beginning of the listing
141402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    // (n is an integer). Allow such a line, but only once, and only if it's
142402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    // the first non-empty line. Do not match the word exactly, because it may
143402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    // be in different languages (at least English and German have been seen
144402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    // in the field).
145402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll    if (columns.size() == 2 && !received_total_line) {
146402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll      received_total_line = true;
147402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll
148402794e73aed8611d62eb4b01cd155e2d76fcb87Raphael Moll      int64 total_number;
149      if (!base::StringToInt64(columns[1], &total_number))
150        return false;
151      if (total_number < 0)
152        return false;
153
154      continue;
155    }
156
157    FtpDirectoryListingEntry entry;
158
159    size_t column_offset;
160    string16 size;
161    if (!DetectColumnOffsetSizeAndModificationTime(columns,
162                                                   current_time,
163                                                   &column_offset,
164                                                   &size,
165                                                   &entry.last_modified)) {
166      // Some servers send a message in one of the first few lines.
167      // All those messages have in common is the string ".:",
168      // where "." means the current directory, and ":" separates it
169      // from the rest of the message, which may be empty.
170      if (lines[i].find(ASCIIToUTF16(".:")) != string16::npos)
171        continue;
172
173      return false;
174    }
175
176    // Do not check "validity" of the permission listing. It's quirky,
177    // and some servers send garbage here while other parts of the line are OK.
178
179    if (!columns[0].empty() && columns[0][0] == 'l') {
180      entry.type = FtpDirectoryListingEntry::SYMLINK;
181    } else if (!columns[0].empty() && columns[0][0] == 'd') {
182      entry.type = FtpDirectoryListingEntry::DIRECTORY;
183    } else {
184      entry.type = FtpDirectoryListingEntry::FILE;
185    }
186
187    if (!base::StringToInt64(size, &entry.size)) {
188      // Some FTP servers do not separate owning group name from file size,
189      // like "group1234". We still want to display the file name for that
190      // entry, but can't really get the size (What if the group is named
191      // "group1", and the size is in fact 234? We can't distinguish between
192      // that and "group" with size 1234). Use a dummy value for the size.
193      // TODO(phajdan.jr): Use a value that means "unknown" instead of 0 bytes.
194      entry.size = 0;
195    }
196    if (entry.size < 0) {
197      // Some FTP servers have bugs that cause them to display the file size
198      // as negative. They're most likely big files like DVD ISO images.
199      // We still want to display them, so just say the real file size
200      // is unknown.
201      entry.size = -1;
202    }
203    if (entry.type != FtpDirectoryListingEntry::FILE)
204      entry.size = -1;
205
206    if (column_offset == columns.size() - 1) {
207      // If the end of the date listing is the last column, there is no file
208      // name. Some FTP servers send listing entries with empty names.
209      // It's not obvious how to display such an entry, so we ignore them.
210      // We don't want to make the parsing fail at this point though.
211      // Other entries can still be useful.
212      continue;
213    }
214
215    entry.name = FtpUtil::GetStringPartAfterColumns(lines[i],
216                                                    column_offset + 1);
217
218    if (entry.type == FtpDirectoryListingEntry::SYMLINK) {
219      string16::size_type pos = entry.name.rfind(ASCIIToUTF16(" -> "));
220
221      // We don't require the " -> " to be present. Some FTP servers don't send
222      // the symlink target, possibly for security reasons.
223      if (pos != string16::npos)
224        entry.name = entry.name.substr(0, pos);
225    }
226
227    entries->push_back(entry);
228  }
229
230  return true;
231}
232
233}  // namespace net
234