From 80456899565a20e8138450c43a7e09d40d2dd2b5 Mon Sep 17 00:00:00 2001 From: "asanka@chromium.org" Date: Sat, 15 Dec 2012 20:07:31 +0000 Subject: Move DecodeFilenameValue and DecodeExt value into http_content_disposition. BUG=none Review URL: https://chromiumcodereview.appspot.com/11471041 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@173307 0039d316-1c4b-4281-b951-d872f2087c98 --- net/http/http_content_disposition.cc | 326 +++++++++++++++++++++++++++++++++++ 1 file changed, 326 insertions(+) (limited to 'net/http') diff --git a/net/http/http_content_disposition.cc b/net/http/http_content_disposition.cc index 52d9f4f..0726e93 100644 --- a/net/http/http_content_disposition.cc +++ b/net/http/http_content_disposition.cc @@ -4,10 +4,336 @@ #include "net/http/http_content_disposition.h" +#include "base/base64.h" +#include "base/i18n/icu_string_conversions.h" #include "base/logging.h" #include "base/string_util.h" +#include "base/sys_string_conversions.h" +#include "base/utf_string_conversions.h" #include "net/base/net_util.h" #include "net/http/http_util.h" +#include "unicode/ucnv.h" + +namespace { + +enum RFC2047EncodingType { + Q_ENCODING, + B_ENCODING +}; + +// Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to +// decoding a quoted-printable string. Returns true if the input was valid. +bool DecodeQEncoding(const std::string& input, std::string* output) { + std::string temp; + temp.reserve(input.size()); + for (std::string::const_iterator it = input.begin(); it != input.end(); + ++it) { + if (*it == '_') { + temp.push_back(' '); + } else if (*it == '=') { + if ((input.end() - it < 3) || + !IsHexDigit(static_cast(*(it + 1))) || + !IsHexDigit(static_cast(*(it + 2)))) + return false; + unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + + HexDigitToInt(*(it + 2)); + temp.push_back(static_cast(ch)); + ++it; + ++it; + } else if (0x20 < *it && *it < 0x7F && *it != '?') { + // In a Q-encoded word, only printable ASCII characters + // represent themselves. Besides, space, '=', '_' and '?' are + // not allowed, but they're already filtered out. + DCHECK_NE('=', *it); + DCHECK_NE('?', *it); + DCHECK_NE('_', *it); + temp.push_back(*it); + } else { + return false; + } + } + output->swap(temp); + return true; +} + +// Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding +// type is specified in |enc_type|. +bool DecodeBQEncoding(const std::string& part, + RFC2047EncodingType enc_type, + const std::string& charset, + std::string* output) { + std::string decoded; + if (!((enc_type == B_ENCODING) ? + base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) + return false; + + if (decoded.empty()) { + output->clear(); + return true; + } + + UErrorCode err = U_ZERO_ERROR; + UConverter* converter(ucnv_open(charset.c_str(), &err)); + if (U_FAILURE(err)) + return false; + + // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. + // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes + // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a + // trailing '\0'. + size_t output_length = decoded.length() * 3 + 1; + char* buf = WriteInto(output, output_length); + output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length, + decoded.data(), decoded.length(), &err); + ucnv_close(converter); + if (U_FAILURE(err)) + return false; + output->resize(output_length); + return true; +} + +bool DecodeWord(const std::string& encoded_word, + const std::string& referrer_charset, + bool* is_rfc2047, + std::string* output) { + *is_rfc2047 = false; + output->clear(); + if (encoded_word.empty()) + return true; + + if (!IsStringASCII(encoded_word)) { + // Try UTF-8, referrer_charset and the native OS default charset in turn. + if (IsStringUTF8(encoded_word)) { + *output = encoded_word; + } else { + string16 utf16_output; + if (!referrer_charset.empty() && + base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), + base::OnStringConversionError::FAIL, + &utf16_output)) { + *output = UTF16ToUTF8(utf16_output); + } else { + *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); + } + } + + return true; + } + + // RFC 2047 : one of encoding methods supported by Firefox and relatively + // widely used by web servers. + // =?charset???= where '' is either 'B' or 'Q'. + // We don't care about the length restriction (72 bytes) because + // many web servers generate encoded words longer than the limit. + std::string tmp; + *is_rfc2047 = true; + int part_index = 0; + std::string charset; + StringTokenizer t(encoded_word, "?"); + RFC2047EncodingType enc_type = Q_ENCODING; + while (*is_rfc2047 && t.GetNext()) { + std::string part = t.token(); + switch (part_index) { + case 0: + if (part != "=") { + *is_rfc2047 = false; + break; + } + ++part_index; + break; + case 1: + // Do we need charset validity check here? + charset = part; + ++part_index; + break; + case 2: + if (part.size() > 1 || + part.find_first_of("bBqQ") == std::string::npos) { + *is_rfc2047 = false; + break; + } + if (part[0] == 'b' || part[0] == 'B') { + enc_type = B_ENCODING; + } + ++part_index; + break; + case 3: + *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); + if (!*is_rfc2047) { + // Last minute failure. Invalid B/Q encoding. Rather than + // passing it through, return now. + return false; + } + ++part_index; + break; + case 4: + if (part != "=") { + // Another last minute failure ! + // Likely to be a case of two encoded-words in a row or + // an encoded word followed by a non-encoded word. We can be + // generous, but it does not help much in terms of compatibility, + // I believe. Return immediately. + *is_rfc2047 = false; + return false; + } + ++part_index; + break; + default: + *is_rfc2047 = false; + return false; + } + } + + if (*is_rfc2047) { + if (*(encoded_word.end() - 1) == '=') { + output->swap(tmp); + return true; + } + // encoded_word ending prematurelly with '?' or extra '?' + *is_rfc2047 = false; + return false; + } + + // We're not handling 'especial' characters quoted with '\', but + // it should be Ok because we're not an email client but a + // web browser. + + // What IE6/7 does: %-escaped UTF-8. + tmp = net::UnescapeURLComponent(encoded_word, net::UnescapeRule::SPACES); + if (IsStringUTF8(tmp)) { + output->swap(tmp); + return true; + // We can try either the OS default charset or 'origin charset' here, + // As far as I can tell, IE does not support it. However, I've seen + // web servers emit %-escaped string in a legacy encoding (usually + // origin charset). + // TODO(jungshik) : Test IE further and consider adding a fallback here. + } + return false; +} + +// Decodes the value of a 'filename' or 'name' parameter given as |input|. The +// value is supposed to be of the form: +// +// value = token | quoted-string +// +// However we currently also allow RFC 2047 encoding and non-ASCII +// strings. Non-ASCII strings are interpreted based on |referrer_charset|. +bool DecodeFilenameValue(const std::string& input, + const std::string& referrer_charset, + std::string* output) { + std::string tmp; + // Tokenize with whitespace characters. + StringTokenizer t(input, " \t\n\r"); + t.set_options(StringTokenizer::RETURN_DELIMS); + bool is_previous_token_rfc2047 = true; + while (t.GetNext()) { + if (t.token_is_delim()) { + // If the previous non-delimeter token is not RFC2047-encoded, + // put in a space in its place. Otheriwse, skip over it. + if (!is_previous_token_rfc2047) { + tmp.push_back(' '); + } + continue; + } + // We don't support a single multibyte character split into + // adjacent encoded words. Some broken mail clients emit headers + // with that problem, but most web servers usually encode a filename + // in a single encoded-word. Firefox/Thunderbird do not support + // it, either. + std::string decoded; + if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, + &decoded)) + return false; + tmp.append(decoded); + } + output->swap(tmp); + return true; +} + +// Parses the charset and value-chars out of an ext-value string. +// +// ext-value = charset "'" [ language ] "'" value-chars +bool ParseExtValueComponents(const std::string& input, + std::string* charset, + std::string* value_chars) { + StringTokenizer t(input, "'"); + t.set_options(StringTokenizer::RETURN_DELIMS); + std::string temp_charset; + std::string temp_value; + int numDelimsSeen = 0; + while (t.GetNext()) { + if (t.token_is_delim()) { + ++numDelimsSeen; + continue; + } else { + switch (numDelimsSeen) { + case 0: + temp_charset = t.token(); + break; + case 1: + // Language is ignored. + break; + case 2: + temp_value = t.token(); + break; + default: + return false; + } + } + } + if (numDelimsSeen != 2) + return false; + if (temp_charset.empty() || temp_value.empty()) + return false; + charset->swap(temp_charset); + value_chars->swap(temp_value); + return true; +} + +// http://tools.ietf.org/html/rfc5987#section-3.2 +// +// ext-value = charset "'" [ language ] "'" value-chars +// +// charset = "UTF-8" / "ISO-8859-1" / mime-charset +// +// mime-charset = 1*mime-charsetc +// mime-charsetc = ALPHA / DIGIT +// / "!" / "#" / "$" / "%" / "&" +// / "+" / "-" / "^" / "_" / "`" +// / "{" / "}" / "~" +// +// language = +// +// value-chars = *( pct-encoded / attr-char ) +// +// pct-encoded = "%" HEXDIG HEXDIG +// +// attr-char = ALPHA / DIGIT +// / "!" / "#" / "$" / "&" / "+" / "-" / "." +// / "^" / "_" / "`" / "|" / "~" +bool DecodeExtValue(const std::string& param_value, std::string* decoded) { + if (param_value.find('"') != std::string::npos) + return false; + + std::string charset; + std::string value; + if (!ParseExtValueComponents(param_value, &charset, &value)) + return false; + + // RFC 5987 value should be ASCII-only. + if (!IsStringASCII(value)) { + decoded->clear(); + return true; + } + + std::string unescaped = net::UnescapeURLComponent( + value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); + + return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); +} + +} // namespace namespace net { -- cgit v1.1