1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "net/http/http_content_disposition.h"
6
7#include "base/base64.h"
8#include "base/i18n/icu_string_conversions.h"
9#include "base/logging.h"
10#include "base/strings/string_tokenizer.h"
11#include "base/strings/string_util.h"
12#include "base/strings/sys_string_conversions.h"
13#include "base/strings/utf_string_conversions.h"
14#include "net/base/net_util.h"
15#include "net/http/http_util.h"
16#include "third_party/icu/source/common/unicode/ucnv.h"
17
18namespace {
19
// Selects which RFC 2047 payload encoding DecodeBQEncoding() should undo.
enum RFC2047EncodingType {
  // "Q" encoding; handled by DecodeQEncoding().
  Q_ENCODING,
  // "B" encoding; handled by base::Base64Decode().
  B_ENCODING
};
24
25// Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
26// decoding a quoted-printable string.  Returns true if the input was valid.
27bool DecodeQEncoding(const std::string& input, std::string* output) {
28  std::string temp;
29  temp.reserve(input.size());
30  for (std::string::const_iterator it = input.begin(); it != input.end();
31       ++it) {
32    if (*it == '_') {
33      temp.push_back(' ');
34    } else if (*it == '=') {
35      if ((input.end() - it < 3) ||
36          !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
37          !IsHexDigit(static_cast<unsigned char>(*(it + 2))))
38        return false;
39      unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +
40                         HexDigitToInt(*(it + 2));
41      temp.push_back(static_cast<char>(ch));
42      ++it;
43      ++it;
44    } else if (0x20 < *it && *it < 0x7F && *it != '?') {
45      // In a Q-encoded word, only printable ASCII characters
46      // represent themselves. Besides, space, '=', '_' and '?' are
47      // not allowed, but they're already filtered out.
48      DCHECK_NE('=', *it);
49      DCHECK_NE('?', *it);
50      DCHECK_NE('_', *it);
51      temp.push_back(*it);
52    } else {
53      return false;
54    }
55  }
56  output->swap(temp);
57  return true;
58}
59
60// Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
61// type is specified in |enc_type|.
62bool DecodeBQEncoding(const std::string& part,
63                      RFC2047EncodingType enc_type,
64                      const std::string& charset,
65                      std::string* output) {
66  std::string decoded;
67  if (!((enc_type == B_ENCODING) ?
68        base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded)))
69    return false;
70
71  if (decoded.empty()) {
72    output->clear();
73    return true;
74  }
75
76  UErrorCode err = U_ZERO_ERROR;
77  UConverter* converter(ucnv_open(charset.c_str(), &err));
78  if (U_FAILURE(err))
79    return false;
80
81  // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.
82  // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes
83  // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a
84  // trailing '\0'.
85  size_t output_length = decoded.length() * 3 + 1;
86  char* buf = WriteInto(output, output_length);
87  output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length,
88                                     decoded.data(), decoded.length(), &err);
89  ucnv_close(converter);
90  if (U_FAILURE(err))
91    return false;
92  output->resize(output_length);
93  return true;
94}
95
// Decodes one word of a filename parameter value into |output|.
//
// Three interpretations are tried, in this order:
//   1. A word containing non-ASCII bytes is taken literally. It is kept as-is
//      if it is valid UTF-8; otherwise it is converted from
//      |referrer_charset| and, failing that, from the native OS charset.
//      HAS_NON_ASCII_STRINGS is added to |*parse_result_flags|.
//   2. An RFC 2047 encoded-word ("=?charset?<E>?<encoded string>?=") is
//      decoded. HAS_RFC2047_ENCODED_STRINGS is added to
//      |*parse_result_flags| and |*is_rfc2047| is set to true.
//   3. Anything else is %-unescaped and accepted only if the result is valid
//      UTF-8. HAS_PERCENT_ENCODED_STRINGS is added to |*parse_result_flags|
//      whenever unescaping changed the word, even if the word is then
//      rejected.
//
// |*is_rfc2047| is true on return only when interpretation 2 succeeded.
// |output| is cleared on entry, so it is empty whenever false is returned.
// Returns true if the word was decoded, false otherwise. An empty
// |encoded_word| decodes successfully to an empty |output|.
bool DecodeWord(const std::string& encoded_word,
                const std::string& referrer_charset,
                bool* is_rfc2047,
                std::string* output,
                int* parse_result_flags) {
  *is_rfc2047 = false;
  output->clear();
  if (encoded_word.empty())
    return true;

  if (!IsStringASCII(encoded_word)) {
    // Try UTF-8, referrer_charset and the native OS default charset in turn.
    if (IsStringUTF8(encoded_word)) {
      *output = encoded_word;
    } else {
      base::string16 utf16_output;
      if (!referrer_charset.empty() &&
          base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),
                                base::OnStringConversionError::FAIL,
                                &utf16_output)) {
        *output = UTF16ToUTF8(utf16_output);
      } else {
        *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
      }
    }

    *parse_result_flags |= net::HttpContentDisposition::HAS_NON_ASCII_STRINGS;
    return true;
  }

  // RFC 2047 : one of encoding methods supported by Firefox and relatively
  // widely used by web servers.
  // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
  // We don't care about the length restriction (72 bytes) because
  // many web servers generate encoded words longer than the limit.
  //
  // The word is split on '?' and the resulting tokens are matched, by
  // position, against: "=", charset, encoding letter, payload, "=".
  // |*is_rfc2047| starts out true and is reset as soon as a token does not
  // fit, which also terminates the loop.
  std::string decoded_word;
  *is_rfc2047 = true;
  int part_index = 0;
  std::string charset;
  base::StringTokenizer t(encoded_word, "?");
  RFC2047EncodingType enc_type = Q_ENCODING;
  while (*is_rfc2047 && t.GetNext()) {
    std::string part = t.token();
    switch (part_index) {
      case 0:
        // Leading "=". If absent this is not an encoded-word at all; fall
        // through to the %-escape handling after the loop.
        if (part != "=") {
          *is_rfc2047 = false;
          break;
        }
        ++part_index;
        break;
      case 1:
        // Do we need charset validity check here?
        charset = part;
        ++part_index;
        break;
      case 2:
        // The encoding must be exactly one of 'b', 'B', 'q' or 'Q'.
        // |enc_type| already defaults to Q_ENCODING, so only "B" needs
        // handling.
        if (part.size() > 1 ||
            part.find_first_of("bBqQ") == std::string::npos) {
          *is_rfc2047 = false;
          break;
        }
        if (part[0] == 'b' || part[0] == 'B') {
          enc_type = B_ENCODING;
        }
        ++part_index;
        break;
      case 3:
        *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
        if (!*is_rfc2047) {
          // Last minute failure. Invalid B/Q encoding. Rather than
          // passing it through, return now.
          return false;
        }
        ++part_index;
        break;
      case 4:
        if (part != "=") {
          // Another last minute failure !
          // Likely to be a case of two encoded-words in a row or
          // an encoded word followed by a non-encoded word. We can be
          // generous, but it does not help much in terms of compatibility,
          // I believe. Return immediately.
          *is_rfc2047 = false;
          return false;
        }
        ++part_index;
        break;
      default:
        // More tokens than an encoded-word can have.
        *is_rfc2047 = false;
        return false;
    }
  }

  if (*is_rfc2047) {
    // NOTE(review): this only checks the last character, not |part_index|, so
    // a word whose tokens ran out early (e.g. a bare "=") also takes the
    // success branch, with an empty |decoded_word|. Confirm this is intended.
    if (*(encoded_word.end() - 1) == '=') {
      output->swap(decoded_word);
      *parse_result_flags |=
          net::HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
      return true;
    }
    // encoded_word ending prematurely with '?' or extra '?'
    *is_rfc2047 = false;
    return false;
  }

  // We're not handling 'especial' characters quoted with '\', but
  // it should be Ok because we're not an email client but a
  // web browser.

  // What IE6/7 does: %-escaped UTF-8.
  decoded_word = net::UnescapeURLComponent(encoded_word,
                                           net::UnescapeRule::SPACES);
  if (decoded_word != encoded_word)
    *parse_result_flags |=
        net::HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
  if (IsStringUTF8(decoded_word)) {
    output->swap(decoded_word);
    return true;
    // We can try either the OS default charset or 'origin charset' here,
    // As far as I can tell, IE does not support it. However, I've seen
    // web servers emit %-escaped string in a legacy encoding (usually
    // origin charset).
    // TODO(jungshik) : Test IE further and consider adding a fallback here.
  }
  return false;
}
223
224// Decodes the value of a 'filename' or 'name' parameter given as |input|. The
225// value is supposed to be of the form:
226//
227//   value                   = token | quoted-string
228//
229// However we currently also allow RFC 2047 encoding and non-ASCII
230// strings. Non-ASCII strings are interpreted based on |referrer_charset|.
231bool DecodeFilenameValue(const std::string& input,
232                         const std::string& referrer_charset,
233                         std::string* output,
234                         int* parse_result_flags) {
235  int current_parse_result_flags = 0;
236  std::string decoded_value;
237  bool is_previous_token_rfc2047 = true;
238
239  // Tokenize with whitespace characters.
240  base::StringTokenizer t(input, " \t\n\r");
241  t.set_options(base::StringTokenizer::RETURN_DELIMS);
242  while (t.GetNext()) {
243    if (t.token_is_delim()) {
244      // If the previous non-delimeter token is not RFC2047-encoded,
245      // put in a space in its place. Otheriwse, skip over it.
246      if (!is_previous_token_rfc2047)
247        decoded_value.push_back(' ');
248      continue;
249    }
250    // We don't support a single multibyte character split into
251    // adjacent encoded words. Some broken mail clients emit headers
252    // with that problem, but most web servers usually encode a filename
253    // in a single encoded-word. Firefox/Thunderbird do not support
254    // it, either.
255    std::string decoded;
256    if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
257                    &decoded, &current_parse_result_flags))
258      return false;
259    decoded_value.append(decoded);
260  }
261  output->swap(decoded_value);
262  if (parse_result_flags && !output->empty())
263    *parse_result_flags |= current_parse_result_flags;
264  return true;
265}
266
267// Parses the charset and value-chars out of an ext-value string.
268//
269//  ext-value     = charset  "'" [ language ] "'" value-chars
270bool ParseExtValueComponents(const std::string& input,
271                             std::string* charset,
272                             std::string* value_chars) {
273  base::StringTokenizer t(input, "'");
274  t.set_options(base::StringTokenizer::RETURN_DELIMS);
275  std::string temp_charset;
276  std::string temp_value;
277  int numDelimsSeen = 0;
278  while (t.GetNext()) {
279    if (t.token_is_delim()) {
280      ++numDelimsSeen;
281      continue;
282    } else {
283      switch (numDelimsSeen) {
284        case 0:
285          temp_charset = t.token();
286          break;
287        case 1:
288          // Language is ignored.
289          break;
290        case 2:
291          temp_value = t.token();
292          break;
293        default:
294          return false;
295      }
296    }
297  }
298  if (numDelimsSeen != 2)
299    return false;
300  if (temp_charset.empty() || temp_value.empty())
301    return false;
302  charset->swap(temp_charset);
303  value_chars->swap(temp_value);
304  return true;
305}
306
307// http://tools.ietf.org/html/rfc5987#section-3.2
308//
309//  ext-value     = charset  "'" [ language ] "'" value-chars
310//
311//  charset       = "UTF-8" / "ISO-8859-1" / mime-charset
312//
313//  mime-charset  = 1*mime-charsetc
314//  mime-charsetc = ALPHA / DIGIT
315//                 / "!" / "#" / "$" / "%" / "&"
316//                 / "+" / "-" / "^" / "_" / "`"
317//                 / "{" / "}" / "~"
318//
319//  language      = <Language-Tag, defined in [RFC5646], Section 2.1>
320//
321//  value-chars   = *( pct-encoded / attr-char )
322//
323//  pct-encoded   = "%" HEXDIG HEXDIG
324//
325//  attr-char     = ALPHA / DIGIT
326//                 / "!" / "#" / "$" / "&" / "+" / "-" / "."
327//                 / "^" / "_" / "`" / "|" / "~"
328bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
329  if (param_value.find('"') != std::string::npos)
330    return false;
331
332  std::string charset;
333  std::string value;
334  if (!ParseExtValueComponents(param_value, &charset, &value))
335    return false;
336
337  // RFC 5987 value should be ASCII-only.
338  if (!IsStringASCII(value)) {
339    decoded->clear();
340    return true;
341  }
342
343  std::string unescaped = net::UnescapeURLComponent(
344      value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
345
346  return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);
347}
348
349} // namespace
350
351namespace net {
352
353HttpContentDisposition::HttpContentDisposition(
354    const std::string& header, const std::string& referrer_charset)
355  : type_(INLINE),
356    parse_result_flags_(INVALID) {
357  Parse(header, referrer_charset);
358}
359
360HttpContentDisposition::~HttpContentDisposition() {
361}
362
363std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
364    std::string::const_iterator begin, std::string::const_iterator end) {
365  DCHECK(type_ == INLINE);
366  std::string::const_iterator delimiter = std::find(begin, end, ';');
367
368  std::string::const_iterator type_begin = begin;
369  std::string::const_iterator type_end = delimiter;
370  HttpUtil::TrimLWS(&type_begin, &type_end);
371
372  // If the disposition-type isn't a valid token the then the
373  // Content-Disposition header is malformed, and we treat the first bytes as
374  // a parameter rather than a disposition-type.
375  if (!HttpUtil::IsToken(type_begin, type_end))
376    return begin;
377
378  parse_result_flags_ |= HAS_DISPOSITION_TYPE;
379
380  DCHECK(std::find(type_begin, type_end, '=') == type_end);
381
382  if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) {
383    type_ = INLINE;
384  } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) {
385    type_ = ATTACHMENT;
386  } else {
387    parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
388    type_ = ATTACHMENT;
389  }
390  return delimiter;
391}
392
393// http://tools.ietf.org/html/rfc6266
394//
395//  content-disposition = "Content-Disposition" ":"
396//                         disposition-type *( ";" disposition-parm )
397//
398//  disposition-type    = "inline" | "attachment" | disp-ext-type
399//                      ; case-insensitive
400//  disp-ext-type       = token
401//
402//  disposition-parm    = filename-parm | disp-ext-parm
403//
404//  filename-parm       = "filename" "=" value
405//                      | "filename*" "=" ext-value
406//
407//  disp-ext-parm       = token "=" value
408//                      | ext-token "=" ext-value
409//  ext-token           = <the characters in token, followed by "*">
410//
411void HttpContentDisposition::Parse(const std::string& header,
412                                   const std::string& referrer_charset) {
413  DCHECK(type_ == INLINE);
414  DCHECK(filename_.empty());
415
416  std::string::const_iterator pos = header.begin();
417  std::string::const_iterator end = header.end();
418  pos = ConsumeDispositionType(pos, end);
419
420  std::string name;
421  std::string filename;
422  std::string ext_filename;
423
424  HttpUtil::NameValuePairsIterator iter(pos, end, ';');
425  while (iter.GetNext()) {
426    if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
427                                                 iter.name_end(),
428                                                 "filename")) {
429      DecodeFilenameValue(iter.value(), referrer_charset, &filename,
430                          &parse_result_flags_);
431      if (!filename.empty())
432        parse_result_flags_ |= HAS_FILENAME;
433    } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),
434                                                    iter.name_end(),
435                                                    "name")) {
436      DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL);
437      if (!name.empty())
438        parse_result_flags_ |= HAS_NAME;
439    } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
440                                                            iter.name_end(),
441                                                            "filename*")) {
442      DecodeExtValue(iter.raw_value(), &ext_filename);
443      if (!ext_filename.empty())
444        parse_result_flags_ |= HAS_EXT_FILENAME;
445    }
446  }
447
448  if (!ext_filename.empty())
449    filename_ = ext_filename;
450  else if (!filename.empty())
451    filename_ = filename;
452  else
453    filename_ = name;
454}
455
456}  // namespace net
457