1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "net/http/http_content_disposition.h"
6
7#include "base/base64.h"
8#include "base/logging.h"
9#include "base/strings/string_tokenizer.h"
10#include "base/strings/string_util.h"
11#include "base/strings/sys_string_conversions.h"
12#include "base/strings/utf_string_conversions.h"
13#include "net/base/net_string_util.h"
14#include "net/base/net_util.h"
15#include "net/http/http_util.h"
16
17namespace {
18
// The payload encodings recognized inside an RFC 2047 encoded-word
// (=?charset?<E>?<encoded string>?=), selected by the <E> field.
enum RFC2047EncodingType {
  // <E> is 'q' or 'Q'; the payload is decoded by DecodeQEncoding().
  Q_ENCODING,
  // <E> is 'b' or 'B'; the payload is decoded with base::Base64Decode().
  B_ENCODING
};
23
24// Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
25// decoding a quoted-printable string.  Returns true if the input was valid.
26bool DecodeQEncoding(const std::string& input, std::string* output) {
27  std::string temp;
28  temp.reserve(input.size());
29  for (std::string::const_iterator it = input.begin(); it != input.end();
30       ++it) {
31    if (*it == '_') {
32      temp.push_back(' ');
33    } else if (*it == '=') {
34      if ((input.end() - it < 3) ||
35          !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
36          !IsHexDigit(static_cast<unsigned char>(*(it + 2))))
37        return false;
38      unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +
39                         HexDigitToInt(*(it + 2));
40      temp.push_back(static_cast<char>(ch));
41      ++it;
42      ++it;
43    } else if (0x20 < *it && *it < 0x7F && *it != '?') {
44      // In a Q-encoded word, only printable ASCII characters
45      // represent themselves. Besides, space, '=', '_' and '?' are
46      // not allowed, but they're already filtered out.
47      DCHECK_NE('=', *it);
48      DCHECK_NE('?', *it);
49      DCHECK_NE('_', *it);
50      temp.push_back(*it);
51    } else {
52      return false;
53    }
54  }
55  output->swap(temp);
56  return true;
57}
58
59// Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
60// type is specified in |enc_type|.
61bool DecodeBQEncoding(const std::string& part,
62                      RFC2047EncodingType enc_type,
63                      const std::string& charset,
64                      std::string* output) {
65  std::string decoded;
66  if (!((enc_type == B_ENCODING) ?
67        base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) {
68    return false;
69  }
70
71  if (decoded.empty()) {
72    output->clear();
73    return true;
74  }
75
76  return net::ConvertToUtf8(decoded, charset.c_str(), output);
77}
78
79bool DecodeWord(const std::string& encoded_word,
80                const std::string& referrer_charset,
81                bool* is_rfc2047,
82                std::string* output,
83                int* parse_result_flags) {
84  *is_rfc2047 = false;
85  output->clear();
86  if (encoded_word.empty())
87    return true;
88
89  if (!base::IsStringASCII(encoded_word)) {
90    // Try UTF-8, referrer_charset and the native OS default charset in turn.
91    if (base::IsStringUTF8(encoded_word)) {
92      *output = encoded_word;
93    } else {
94      base::string16 utf16_output;
95      if (!referrer_charset.empty() &&
96          net::ConvertToUTF16(encoded_word, referrer_charset.c_str(),
97                              &utf16_output)) {
98        *output = base::UTF16ToUTF8(utf16_output);
99      } else {
100        *output = base::WideToUTF8(base::SysNativeMBToWide(encoded_word));
101      }
102    }
103
104    *parse_result_flags |= net::HttpContentDisposition::HAS_NON_ASCII_STRINGS;
105    return true;
106  }
107
108  // RFC 2047 : one of encoding methods supported by Firefox and relatively
109  // widely used by web servers.
110  // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
111  // We don't care about the length restriction (72 bytes) because
112  // many web servers generate encoded words longer than the limit.
113  std::string decoded_word;
114  *is_rfc2047 = true;
115  int part_index = 0;
116  std::string charset;
117  base::StringTokenizer t(encoded_word, "?");
118  RFC2047EncodingType enc_type = Q_ENCODING;
119  while (*is_rfc2047 && t.GetNext()) {
120    std::string part = t.token();
121    switch (part_index) {
122      case 0:
123        if (part != "=") {
124          *is_rfc2047 = false;
125          break;
126        }
127        ++part_index;
128        break;
129      case 1:
130        // Do we need charset validity check here?
131        charset = part;
132        ++part_index;
133        break;
134      case 2:
135        if (part.size() > 1 ||
136            part.find_first_of("bBqQ") == std::string::npos) {
137          *is_rfc2047 = false;
138          break;
139        }
140        if (part[0] == 'b' || part[0] == 'B') {
141          enc_type = B_ENCODING;
142        }
143        ++part_index;
144        break;
145      case 3:
146        *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
147        if (!*is_rfc2047) {
148          // Last minute failure. Invalid B/Q encoding. Rather than
149          // passing it through, return now.
150          return false;
151        }
152        ++part_index;
153        break;
154      case 4:
155        if (part != "=") {
156          // Another last minute failure !
157          // Likely to be a case of two encoded-words in a row or
158          // an encoded word followed by a non-encoded word. We can be
159          // generous, but it does not help much in terms of compatibility,
160          // I believe. Return immediately.
161          *is_rfc2047 = false;
162          return false;
163        }
164        ++part_index;
165        break;
166      default:
167        *is_rfc2047 = false;
168        return false;
169    }
170  }
171
172  if (*is_rfc2047) {
173    if (*(encoded_word.end() - 1) == '=') {
174      output->swap(decoded_word);
175      *parse_result_flags |=
176          net::HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
177      return true;
178    }
179    // encoded_word ending prematurelly with '?' or extra '?'
180    *is_rfc2047 = false;
181    return false;
182  }
183
184  // We're not handling 'especial' characters quoted with '\', but
185  // it should be Ok because we're not an email client but a
186  // web browser.
187
188  // What IE6/7 does: %-escaped UTF-8.
189  decoded_word = net::UnescapeURLComponent(encoded_word,
190                                           net::UnescapeRule::SPACES);
191  if (decoded_word != encoded_word)
192    *parse_result_flags |=
193        net::HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
194  if (base::IsStringUTF8(decoded_word)) {
195    output->swap(decoded_word);
196    return true;
197    // We can try either the OS default charset or 'origin charset' here,
198    // As far as I can tell, IE does not support it. However, I've seen
199    // web servers emit %-escaped string in a legacy encoding (usually
200    // origin charset).
201    // TODO(jungshik) : Test IE further and consider adding a fallback here.
202  }
203  return false;
204}
205
206// Decodes the value of a 'filename' or 'name' parameter given as |input|. The
207// value is supposed to be of the form:
208//
209//   value                   = token | quoted-string
210//
211// However we currently also allow RFC 2047 encoding and non-ASCII
212// strings. Non-ASCII strings are interpreted based on |referrer_charset|.
213bool DecodeFilenameValue(const std::string& input,
214                         const std::string& referrer_charset,
215                         std::string* output,
216                         int* parse_result_flags) {
217  int current_parse_result_flags = 0;
218  std::string decoded_value;
219  bool is_previous_token_rfc2047 = true;
220
221  // Tokenize with whitespace characters.
222  base::StringTokenizer t(input, " \t\n\r");
223  t.set_options(base::StringTokenizer::RETURN_DELIMS);
224  while (t.GetNext()) {
225    if (t.token_is_delim()) {
226      // If the previous non-delimeter token is not RFC2047-encoded,
227      // put in a space in its place. Otheriwse, skip over it.
228      if (!is_previous_token_rfc2047)
229        decoded_value.push_back(' ');
230      continue;
231    }
232    // We don't support a single multibyte character split into
233    // adjacent encoded words. Some broken mail clients emit headers
234    // with that problem, but most web servers usually encode a filename
235    // in a single encoded-word. Firefox/Thunderbird do not support
236    // it, either.
237    std::string decoded;
238    if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
239                    &decoded, &current_parse_result_flags))
240      return false;
241    decoded_value.append(decoded);
242  }
243  output->swap(decoded_value);
244  if (parse_result_flags && !output->empty())
245    *parse_result_flags |= current_parse_result_flags;
246  return true;
247}
248
249// Parses the charset and value-chars out of an ext-value string.
250//
251//  ext-value     = charset  "'" [ language ] "'" value-chars
252bool ParseExtValueComponents(const std::string& input,
253                             std::string* charset,
254                             std::string* value_chars) {
255  base::StringTokenizer t(input, "'");
256  t.set_options(base::StringTokenizer::RETURN_DELIMS);
257  std::string temp_charset;
258  std::string temp_value;
259  int numDelimsSeen = 0;
260  while (t.GetNext()) {
261    if (t.token_is_delim()) {
262      ++numDelimsSeen;
263      continue;
264    } else {
265      switch (numDelimsSeen) {
266        case 0:
267          temp_charset = t.token();
268          break;
269        case 1:
270          // Language is ignored.
271          break;
272        case 2:
273          temp_value = t.token();
274          break;
275        default:
276          return false;
277      }
278    }
279  }
280  if (numDelimsSeen != 2)
281    return false;
282  if (temp_charset.empty() || temp_value.empty())
283    return false;
284  charset->swap(temp_charset);
285  value_chars->swap(temp_value);
286  return true;
287}
288
289// http://tools.ietf.org/html/rfc5987#section-3.2
290//
291//  ext-value     = charset  "'" [ language ] "'" value-chars
292//
293//  charset       = "UTF-8" / "ISO-8859-1" / mime-charset
294//
295//  mime-charset  = 1*mime-charsetc
296//  mime-charsetc = ALPHA / DIGIT
297//                 / "!" / "#" / "$" / "%" / "&"
298//                 / "+" / "-" / "^" / "_" / "`"
299//                 / "{" / "}" / "~"
300//
301//  language      = <Language-Tag, defined in [RFC5646], Section 2.1>
302//
303//  value-chars   = *( pct-encoded / attr-char )
304//
305//  pct-encoded   = "%" HEXDIG HEXDIG
306//
307//  attr-char     = ALPHA / DIGIT
308//                 / "!" / "#" / "$" / "&" / "+" / "-" / "."
309//                 / "^" / "_" / "`" / "|" / "~"
310bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
311  if (param_value.find('"') != std::string::npos)
312    return false;
313
314  std::string charset;
315  std::string value;
316  if (!ParseExtValueComponents(param_value, &charset, &value))
317    return false;
318
319  // RFC 5987 value should be ASCII-only.
320  if (!base::IsStringASCII(value)) {
321    decoded->clear();
322    return true;
323  }
324
325  std::string unescaped = net::UnescapeURLComponent(
326      value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
327
328  return net::ConvertToUtf8AndNormalize(unescaped, charset.c_str(), decoded);
329}
330
331} // namespace
332
333namespace net {
334
335HttpContentDisposition::HttpContentDisposition(
336    const std::string& header, const std::string& referrer_charset)
337  : type_(INLINE),
338    parse_result_flags_(INVALID) {
339  Parse(header, referrer_charset);
340}
341
342HttpContentDisposition::~HttpContentDisposition() {
343}
344
345std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
346    std::string::const_iterator begin, std::string::const_iterator end) {
347  DCHECK(type_ == INLINE);
348  std::string::const_iterator delimiter = std::find(begin, end, ';');
349
350  std::string::const_iterator type_begin = begin;
351  std::string::const_iterator type_end = delimiter;
352  HttpUtil::TrimLWS(&type_begin, &type_end);
353
354  // If the disposition-type isn't a valid token the then the
355  // Content-Disposition header is malformed, and we treat the first bytes as
356  // a parameter rather than a disposition-type.
357  if (!HttpUtil::IsToken(type_begin, type_end))
358    return begin;
359
360  parse_result_flags_ |= HAS_DISPOSITION_TYPE;
361
362  DCHECK(std::find(type_begin, type_end, '=') == type_end);
363
364  if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) {
365    type_ = INLINE;
366  } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) {
367    type_ = ATTACHMENT;
368  } else {
369    parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
370    type_ = ATTACHMENT;
371  }
372  return delimiter;
373}
374
375// http://tools.ietf.org/html/rfc6266
376//
377//  content-disposition = "Content-Disposition" ":"
378//                         disposition-type *( ";" disposition-parm )
379//
380//  disposition-type    = "inline" | "attachment" | disp-ext-type
381//                      ; case-insensitive
382//  disp-ext-type       = token
383//
384//  disposition-parm    = filename-parm | disp-ext-parm
385//
386//  filename-parm       = "filename" "=" value
387//                      | "filename*" "=" ext-value
388//
389//  disp-ext-parm       = token "=" value
390//                      | ext-token "=" ext-value
391//  ext-token           = <the characters in token, followed by "*">
392//
393void HttpContentDisposition::Parse(const std::string& header,
394                                   const std::string& referrer_charset) {
395  DCHECK(type_ == INLINE);
396  DCHECK(filename_.empty());
397
398  std::string::const_iterator pos = header.begin();
399  std::string::const_iterator end = header.end();
400  pos = ConsumeDispositionType(pos, end);
401
402  std::string name;
403  std::string filename;
404  std::string ext_filename;
405
406  HttpUtil::NameValuePairsIterator iter(pos, end, ';');
407  while (iter.GetNext()) {
408    if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
409                                                 iter.name_end(),
410                                                 "filename")) {
411      DecodeFilenameValue(iter.value(), referrer_charset, &filename,
412                          &parse_result_flags_);
413      if (!filename.empty())
414        parse_result_flags_ |= HAS_FILENAME;
415    } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),
416                                                    iter.name_end(),
417                                                    "name")) {
418      DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL);
419      if (!name.empty())
420        parse_result_flags_ |= HAS_NAME;
421    } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
422                                                            iter.name_end(),
423                                                            "filename*")) {
424      DecodeExtValue(iter.raw_value(), &ext_filename);
425      if (!ext_filename.empty())
426        parse_result_flags_ |= HAS_EXT_FILENAME;
427    }
428  }
429
430  if (!ext_filename.empty())
431    filename_ = ext_filename;
432  else if (!filename.empty())
433    filename_ = filename;
434  else
435    filename_ = name;
436}
437
438}  // namespace net
439