1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "net/http/http_content_disposition.h" 6 7#include "base/base64.h" 8#include "base/logging.h" 9#include "base/strings/string_tokenizer.h" 10#include "base/strings/string_util.h" 11#include "base/strings/sys_string_conversions.h" 12#include "base/strings/utf_string_conversions.h" 13#include "net/base/net_string_util.h" 14#include "net/base/net_util.h" 15#include "net/http/http_util.h" 16 17namespace { 18 19enum RFC2047EncodingType { 20 Q_ENCODING, 21 B_ENCODING 22}; 23 24// Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to 25// decoding a quoted-printable string. Returns true if the input was valid. 26bool DecodeQEncoding(const std::string& input, std::string* output) { 27 std::string temp; 28 temp.reserve(input.size()); 29 for (std::string::const_iterator it = input.begin(); it != input.end(); 30 ++it) { 31 if (*it == '_') { 32 temp.push_back(' '); 33 } else if (*it == '=') { 34 if ((input.end() - it < 3) || 35 !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || 36 !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) 37 return false; 38 unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + 39 HexDigitToInt(*(it + 2)); 40 temp.push_back(static_cast<char>(ch)); 41 ++it; 42 ++it; 43 } else if (0x20 < *it && *it < 0x7F && *it != '?') { 44 // In a Q-encoded word, only printable ASCII characters 45 // represent themselves. Besides, space, '=', '_' and '?' are 46 // not allowed, but they're already filtered out. 47 DCHECK_NE('=', *it); 48 DCHECK_NE('?', *it); 49 DCHECK_NE('_', *it); 50 temp.push_back(*it); 51 } else { 52 return false; 53 } 54 } 55 output->swap(temp); 56 return true; 57} 58 59// Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding 60// type is specified in |enc_type|. 61bool DecodeBQEncoding(const std::string& part, 62 RFC2047EncodingType enc_type, 63 const std::string& charset, 64 std::string* output) { 65 std::string decoded; 66 if (!((enc_type == B_ENCODING) ? 67 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) { 68 return false; 69 } 70 71 if (decoded.empty()) { 72 output->clear(); 73 return true; 74 } 75 76 return net::ConvertToUtf8(decoded, charset.c_str(), output); 77} 78 79bool DecodeWord(const std::string& encoded_word, 80 const std::string& referrer_charset, 81 bool* is_rfc2047, 82 std::string* output, 83 int* parse_result_flags) { 84 *is_rfc2047 = false; 85 output->clear(); 86 if (encoded_word.empty()) 87 return true; 88 89 if (!base::IsStringASCII(encoded_word)) { 90 // Try UTF-8, referrer_charset and the native OS default charset in turn. 91 if (base::IsStringUTF8(encoded_word)) { 92 *output = encoded_word; 93 } else { 94 base::string16 utf16_output; 95 if (!referrer_charset.empty() && 96 net::ConvertToUTF16(encoded_word, referrer_charset.c_str(), 97 &utf16_output)) { 98 *output = base::UTF16ToUTF8(utf16_output); 99 } else { 100 *output = base::WideToUTF8(base::SysNativeMBToWide(encoded_word)); 101 } 102 } 103 104 *parse_result_flags |= net::HttpContentDisposition::HAS_NON_ASCII_STRINGS; 105 return true; 106 } 107 108 // RFC 2047 : one of encoding methods supported by Firefox and relatively 109 // widely used by web servers. 110 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. 111 // We don't care about the length restriction (72 bytes) because 112 // many web servers generate encoded words longer than the limit. 113 std::string decoded_word; 114 *is_rfc2047 = true; 115 int part_index = 0; 116 std::string charset; 117 base::StringTokenizer t(encoded_word, "?"); 118 RFC2047EncodingType enc_type = Q_ENCODING; 119 while (*is_rfc2047 && t.GetNext()) { 120 std::string part = t.token(); 121 switch (part_index) { 122 case 0: 123 if (part != "=") { 124 *is_rfc2047 = false; 125 break; 126 } 127 ++part_index; 128 break; 129 case 1: 130 // Do we need charset validity check here? 131 charset = part; 132 ++part_index; 133 break; 134 case 2: 135 if (part.size() > 1 || 136 part.find_first_of("bBqQ") == std::string::npos) { 137 *is_rfc2047 = false; 138 break; 139 } 140 if (part[0] == 'b' || part[0] == 'B') { 141 enc_type = B_ENCODING; 142 } 143 ++part_index; 144 break; 145 case 3: 146 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word); 147 if (!*is_rfc2047) { 148 // Last minute failure. Invalid B/Q encoding. Rather than 149 // passing it through, return now. 150 return false; 151 } 152 ++part_index; 153 break; 154 case 4: 155 if (part != "=") { 156 // Another last minute failure ! 157 // Likely to be a case of two encoded-words in a row or 158 // an encoded word followed by a non-encoded word. We can be 159 // generous, but it does not help much in terms of compatibility, 160 // I believe. Return immediately. 161 *is_rfc2047 = false; 162 return false; 163 } 164 ++part_index; 165 break; 166 default: 167 *is_rfc2047 = false; 168 return false; 169 } 170 } 171 172 if (*is_rfc2047) { 173 if (*(encoded_word.end() - 1) == '=') { 174 output->swap(decoded_word); 175 *parse_result_flags |= 176 net::HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS; 177 return true; 178 } 179 // encoded_word ending prematurelly with '?' or extra '?' 180 *is_rfc2047 = false; 181 return false; 182 } 183 184 // We're not handling 'especial' characters quoted with '\', but 185 // it should be Ok because we're not an email client but a 186 // web browser. 187 188 // What IE6/7 does: %-escaped UTF-8. 189 decoded_word = net::UnescapeURLComponent(encoded_word, 190 net::UnescapeRule::SPACES); 191 if (decoded_word != encoded_word) 192 *parse_result_flags |= 193 net::HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS; 194 if (base::IsStringUTF8(decoded_word)) { 195 output->swap(decoded_word); 196 return true; 197 // We can try either the OS default charset or 'origin charset' here, 198 // As far as I can tell, IE does not support it. However, I've seen 199 // web servers emit %-escaped string in a legacy encoding (usually 200 // origin charset). 201 // TODO(jungshik) : Test IE further and consider adding a fallback here. 202 } 203 return false; 204} 205 206// Decodes the value of a 'filename' or 'name' parameter given as |input|. The 207// value is supposed to be of the form: 208// 209// value = token | quoted-string 210// 211// However we currently also allow RFC 2047 encoding and non-ASCII 212// strings. Non-ASCII strings are interpreted based on |referrer_charset|. 213bool DecodeFilenameValue(const std::string& input, 214 const std::string& referrer_charset, 215 std::string* output, 216 int* parse_result_flags) { 217 int current_parse_result_flags = 0; 218 std::string decoded_value; 219 bool is_previous_token_rfc2047 = true; 220 221 // Tokenize with whitespace characters. 222 base::StringTokenizer t(input, " \t\n\r"); 223 t.set_options(base::StringTokenizer::RETURN_DELIMS); 224 while (t.GetNext()) { 225 if (t.token_is_delim()) { 226 // If the previous non-delimeter token is not RFC2047-encoded, 227 // put in a space in its place. Otheriwse, skip over it. 228 if (!is_previous_token_rfc2047) 229 decoded_value.push_back(' '); 230 continue; 231 } 232 // We don't support a single multibyte character split into 233 // adjacent encoded words. Some broken mail clients emit headers 234 // with that problem, but most web servers usually encode a filename 235 // in a single encoded-word. Firefox/Thunderbird do not support 236 // it, either. 237 std::string decoded; 238 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, 239 &decoded, ¤t_parse_result_flags)) 240 return false; 241 decoded_value.append(decoded); 242 } 243 output->swap(decoded_value); 244 if (parse_result_flags && !output->empty()) 245 *parse_result_flags |= current_parse_result_flags; 246 return true; 247} 248 249// Parses the charset and value-chars out of an ext-value string. 250// 251// ext-value = charset "'" [ language ] "'" value-chars 252bool ParseExtValueComponents(const std::string& input, 253 std::string* charset, 254 std::string* value_chars) { 255 base::StringTokenizer t(input, "'"); 256 t.set_options(base::StringTokenizer::RETURN_DELIMS); 257 std::string temp_charset; 258 std::string temp_value; 259 int numDelimsSeen = 0; 260 while (t.GetNext()) { 261 if (t.token_is_delim()) { 262 ++numDelimsSeen; 263 continue; 264 } else { 265 switch (numDelimsSeen) { 266 case 0: 267 temp_charset = t.token(); 268 break; 269 case 1: 270 // Language is ignored. 271 break; 272 case 2: 273 temp_value = t.token(); 274 break; 275 default: 276 return false; 277 } 278 } 279 } 280 if (numDelimsSeen != 2) 281 return false; 282 if (temp_charset.empty() || temp_value.empty()) 283 return false; 284 charset->swap(temp_charset); 285 value_chars->swap(temp_value); 286 return true; 287} 288 289// http://tools.ietf.org/html/rfc5987#section-3.2 290// 291// ext-value = charset "'" [ language ] "'" value-chars 292// 293// charset = "UTF-8" / "ISO-8859-1" / mime-charset 294// 295// mime-charset = 1*mime-charsetc 296// mime-charsetc = ALPHA / DIGIT 297// / "!" / "#" / "$" / "%" / "&" 298// / "+" / "-" / "^" / "_" / "`" 299// / "{" / "}" / "~" 300// 301// language = <Language-Tag, defined in [RFC5646], Section 2.1> 302// 303// value-chars = *( pct-encoded / attr-char ) 304// 305// pct-encoded = "%" HEXDIG HEXDIG 306// 307// attr-char = ALPHA / DIGIT 308// / "!" / "#" / "$" / "&" / "+" / "-" / "." 309// / "^" / "_" / "`" / "|" / "~" 310bool DecodeExtValue(const std::string& param_value, std::string* decoded) { 311 if (param_value.find('"') != std::string::npos) 312 return false; 313 314 std::string charset; 315 std::string value; 316 if (!ParseExtValueComponents(param_value, &charset, &value)) 317 return false; 318 319 // RFC 5987 value should be ASCII-only. 320 if (!base::IsStringASCII(value)) { 321 decoded->clear(); 322 return true; 323 } 324 325 std::string unescaped = net::UnescapeURLComponent( 326 value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); 327 328 return net::ConvertToUtf8AndNormalize(unescaped, charset.c_str(), decoded); 329} 330 331} // namespace 332 333namespace net { 334 335HttpContentDisposition::HttpContentDisposition( 336 const std::string& header, const std::string& referrer_charset) 337 : type_(INLINE), 338 parse_result_flags_(INVALID) { 339 Parse(header, referrer_charset); 340} 341 342HttpContentDisposition::~HttpContentDisposition() { 343} 344 345std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( 346 std::string::const_iterator begin, std::string::const_iterator end) { 347 DCHECK(type_ == INLINE); 348 std::string::const_iterator delimiter = std::find(begin, end, ';'); 349 350 std::string::const_iterator type_begin = begin; 351 std::string::const_iterator type_end = delimiter; 352 HttpUtil::TrimLWS(&type_begin, &type_end); 353 354 // If the disposition-type isn't a valid token the then the 355 // Content-Disposition header is malformed, and we treat the first bytes as 356 // a parameter rather than a disposition-type. 357 if (!HttpUtil::IsToken(type_begin, type_end)) 358 return begin; 359 360 parse_result_flags_ |= HAS_DISPOSITION_TYPE; 361 362 DCHECK(std::find(type_begin, type_end, '=') == type_end); 363 364 if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) { 365 type_ = INLINE; 366 } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) { 367 type_ = ATTACHMENT; 368 } else { 369 parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE; 370 type_ = ATTACHMENT; 371 } 372 return delimiter; 373} 374 375// http://tools.ietf.org/html/rfc6266 376// 377// content-disposition = "Content-Disposition" ":" 378// disposition-type *( ";" disposition-parm ) 379// 380// disposition-type = "inline" | "attachment" | disp-ext-type 381// ; case-insensitive 382// disp-ext-type = token 383// 384// disposition-parm = filename-parm | disp-ext-parm 385// 386// filename-parm = "filename" "=" value 387// | "filename*" "=" ext-value 388// 389// disp-ext-parm = token "=" value 390// | ext-token "=" ext-value 391// ext-token = <the characters in token, followed by "*"> 392// 393void HttpContentDisposition::Parse(const std::string& header, 394 const std::string& referrer_charset) { 395 DCHECK(type_ == INLINE); 396 DCHECK(filename_.empty()); 397 398 std::string::const_iterator pos = header.begin(); 399 std::string::const_iterator end = header.end(); 400 pos = ConsumeDispositionType(pos, end); 401 402 std::string name; 403 std::string filename; 404 std::string ext_filename; 405 406 HttpUtil::NameValuePairsIterator iter(pos, end, ';'); 407 while (iter.GetNext()) { 408 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), 409 iter.name_end(), 410 "filename")) { 411 DecodeFilenameValue(iter.value(), referrer_charset, &filename, 412 &parse_result_flags_); 413 if (!filename.empty()) 414 parse_result_flags_ |= HAS_FILENAME; 415 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(), 416 iter.name_end(), 417 "name")) { 418 DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL); 419 if (!name.empty()) 420 parse_result_flags_ |= HAS_NAME; 421 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), 422 iter.name_end(), 423 "filename*")) { 424 DecodeExtValue(iter.raw_value(), &ext_filename); 425 if (!ext_filename.empty()) 426 parse_result_flags_ |= HAS_EXT_FILENAME; 427 } 428 } 429 430 if (!ext_filename.empty()) 431 filename_ = ext_filename; 432 else if (!filename.empty()) 433 filename_ = filename; 434 else 435 filename_ = name; 436} 437 438} // namespace net 439