diff options
-rw-r--r-- | net/base/net_util.cc | 298 | ||||
-rw-r--r-- | net/base/net_util.h | 11 | ||||
-rw-r--r-- | net/base/net_util_unittest.cc | 2 | ||||
-rw-r--r-- | net/http/http_content_disposition.cc | 326 |
4 files changed, 341 insertions, 296 deletions
diff --git a/net/base/net_util.cc b/net/base/net_util.cc index 2b11c4d..5f321c6 100644 --- a/net/base/net_util.cc +++ b/net/base/net_util.cc @@ -25,7 +25,6 @@ #include <netinet/in.h> #endif -#include "base/base64.h" #include "base/basictypes.h" #include "base/file_path.h" #include "base/file_util.h" @@ -71,7 +70,6 @@ #include "net/http/http_content_disposition.h" #include "unicode/datefmt.h" #include "unicode/regex.h" -#include "unicode/ucnv.h" #include "unicode/uidna.h" #include "unicode/ulocdata.h" #include "unicode/uniset.h" @@ -175,196 +173,6 @@ std::string::size_type CountTrailingChars( } #endif -// Similar to Base64Decode. Decodes a Q-encoded string to a sequence -// of bytes. If input is invalid, return false. -bool QPDecode(const std::string& input, std::string* output) { - std::string temp; - temp.reserve(input.size()); - for (std::string::const_iterator it = input.begin(); it != input.end(); - ++it) { - if (*it == '_') { - temp.push_back(' '); - } else if (*it == '=') { - if ((input.end() - it < 3) || - !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || - !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) - return false; - unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + - HexDigitToInt(*(it + 2)); - temp.push_back(static_cast<char>(ch)); - ++it; - ++it; - } else if (0x20 < *it && *it < 0x7F) { - // In a Q-encoded word, only printable ASCII characters - // represent themselves. Besides, space, '=', '_' and '?' are - // not allowed, but they're already filtered out. - DCHECK_NE('=', *it); - DCHECK_NE('?', *it); - DCHECK_NE('_', *it); - temp.push_back(*it); - } else { - return false; - } - } - output->swap(temp); - return true; -} - -enum RFC2047EncodingType {Q_ENCODING, B_ENCODING}; -bool DecodeBQEncoding(const std::string& part, - RFC2047EncodingType enc_type, - const std::string& charset, - std::string* output) { - std::string decoded; - if (!((enc_type == B_ENCODING) ? - base::Base64Decode(part, &decoded) : QPDecode(part, &decoded))) - return false; - - if (decoded.empty()) { - output->clear(); - return true; - } - - UErrorCode err = U_ZERO_ERROR; - UConverter* converter(ucnv_open(charset.c_str(), &err)); - if (U_FAILURE(err)) - return false; - - // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. - // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes - // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a - // trailing '\0'. - size_t output_length = decoded.length() * 3 + 1; - char* buf = WriteInto(output, output_length); - output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length, - decoded.data(), decoded.length(), &err); - ucnv_close(converter); - if (U_FAILURE(err)) - return false; - output->resize(output_length); - return true; -} - -bool DecodeWord(const std::string& encoded_word, - const std::string& referrer_charset, - bool* is_rfc2047, - std::string* output) { - *is_rfc2047 = false; - output->clear(); - if (encoded_word.empty()) - return true; - - if (!IsStringASCII(encoded_word)) { - // Try UTF-8, referrer_charset and the native OS default charset in turn. - if (IsStringUTF8(encoded_word)) { - *output = encoded_word; - } else { - string16 utf16_output; - if (!referrer_charset.empty() && - base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), - base::OnStringConversionError::FAIL, - &utf16_output)) { - *output = UTF16ToUTF8(utf16_output); - } else { - *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); - } - } - - return true; - } - - // RFC 2047 : one of encoding methods supported by Firefox and relatively - // widely used by web servers. - // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. - // We don't care about the length restriction (72 bytes) because - // many web servers generate encoded words longer than the limit. - std::string tmp; - *is_rfc2047 = true; - int part_index = 0; - std::string charset; - StringTokenizer t(encoded_word, "?"); - RFC2047EncodingType enc_type = Q_ENCODING; - while (*is_rfc2047 && t.GetNext()) { - std::string part = t.token(); - switch (part_index) { - case 0: - if (part != "=") { - *is_rfc2047 = false; - break; - } - ++part_index; - break; - case 1: - // Do we need charset validity check here? - charset = part; - ++part_index; - break; - case 2: - if (part.size() > 1 || - part.find_first_of("bBqQ") == std::string::npos) { - *is_rfc2047 = false; - break; - } - if (part[0] == 'b' || part[0] == 'B') { - enc_type = B_ENCODING; - } - ++part_index; - break; - case 3: - *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); - if (!*is_rfc2047) { - // Last minute failure. Invalid B/Q encoding. Rather than - // passing it through, return now. - return false; - } - ++part_index; - break; - case 4: - if (part != "=") { - // Another last minute failure ! - // Likely to be a case of two encoded-words in a row or - // an encoded word followed by a non-encoded word. We can be - // generous, but it does not help much in terms of compatibility, - // I believe. Return immediately. - *is_rfc2047 = false; - return false; - } - ++part_index; - break; - default: - *is_rfc2047 = false; - return false; - } - } - - if (*is_rfc2047) { - if (*(encoded_word.end() - 1) == '=') { - output->swap(tmp); - return true; - } - // encoded_word ending prematurelly with '?' or extra '?' - *is_rfc2047 = false; - return false; - } - - // We're not handling 'especial' characters quoted with '\', but - // it should be Ok because we're not an email client but a - // web browser. - - // What IE6/7 does: %-escaped UTF-8. - tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES); - if (IsStringUTF8(tmp)) { - output->swap(tmp); - return true; - // We can try either the OS default charset or 'origin charset' here, - // As far as I can tell, IE does not support it. However, I've seen - // web servers emit %-escaped string in a legacy encoding (usually - // origin charset). - // TODO(jungshik) : Test IE further and consider adding a fallback here. - } - return false; -} - // Does some simple normalization of scripts so we can allow certain scripts // to exist together. // TODO(brettw) bug 880223: we should allow some other languages to be @@ -939,12 +747,20 @@ std::string GetFileNameFromURL(const GURL& url, // The URL's path should be escaped UTF-8, but may not be. std::string decoded_filename = unescaped_url_filename; - if (!IsStringASCII(decoded_filename)) { - bool ignore; + if (!IsStringUTF8(decoded_filename)) { // TODO(jshin): this is probably not robust enough. To be sure, we need // encoding detection. - DecodeWord(unescaped_url_filename, referrer_charset, &ignore, - &decoded_filename); + string16 utf16_output; + if (!referrer_charset.empty() && + base::CodepageToUTF16(unescaped_url_filename, + referrer_charset.c_str(), + base::OnStringConversionError::FAIL, + &utf16_output)) { + decoded_filename = UTF16ToUTF8(utf16_output); + } else { + decoded_filename = WideToUTF8( + base::SysNativeMBToWide(unescaped_url_filename)); + } } // If the URL contains a (possibly empty) query, assume it is a generator, and // allow the determined extension to be overwritten. @@ -1158,96 +974,6 @@ std::string GetSpecificHeader(const std::string& headers, return ret; } -bool DecodeCharset(const std::string& input, - std::string* decoded_charset, - std::string* value) { - StringTokenizer t(input, "'"); - t.set_options(StringTokenizer::RETURN_DELIMS); - std::string temp_charset; - std::string temp_value; - int numDelimsSeen = 0; - while (t.GetNext()) { - if (t.token_is_delim()) { - ++numDelimsSeen; - continue; - } else { - switch (numDelimsSeen) { - case 0: - temp_charset = t.token(); - break; - case 1: - // Language is ignored. - break; - case 2: - temp_value = t.token(); - break; - default: - return false; - } - } - } - if (numDelimsSeen != 2) - return false; - if (temp_charset.empty() || temp_value.empty()) - return false; - decoded_charset->swap(temp_charset); - value->swap(temp_value); - return true; -} - -bool DecodeFilenameValue(const std::string& input, - const std::string& referrer_charset, - std::string* output) { - std::string tmp; - // Tokenize with whitespace characters. - StringTokenizer t(input, " \t\n\r"); - t.set_options(StringTokenizer::RETURN_DELIMS); - bool is_previous_token_rfc2047 = true; - while (t.GetNext()) { - if (t.token_is_delim()) { - // If the previous non-delimeter token is not RFC2047-encoded, - // put in a space in its place. Otheriwse, skip over it. - if (!is_previous_token_rfc2047) { - tmp.push_back(' '); - } - continue; - } - // We don't support a single multibyte character split into - // adjacent encoded words. Some broken mail clients emit headers - // with that problem, but most web servers usually encode a filename - // in a single encoded-word. Firefox/Thunderbird do not support - // it, either. - std::string decoded; - if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, - &decoded)) - return false; - tmp.append(decoded); - } - output->swap(tmp); - return true; -} - -bool DecodeExtValue(const std::string& param_value, std::string* decoded) { - if (param_value.find('"') != std::string::npos) - return false; - - std::string charset; - std::string value; - if (!DecodeCharset(param_value, &charset, &value)) - return false; - - // RFC 5987 value should be ASCII-only. - if (!IsStringASCII(value)) { - decoded->clear(); - return true; - } - - std::string unescaped = UnescapeURLComponent(value, - UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS); - - return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); -} - string16 IDNToUnicode(const std::string& host, const std::string& languages) { return IDNToUnicodeWithOffsets(host, languages, NULL); diff --git a/net/base/net_util.h b/net/base/net_util.h index 444a547..874f3e2 100644 --- a/net/base/net_util.h +++ b/net/base/net_util.h @@ -173,12 +173,6 @@ NET_EXPORT std::string GetHostOrSpecFromURL(const GURL& url); NET_EXPORT std::string GetSpecificHeader(const std::string& headers, const std::string& name); -// TODO(abarth): Move these functions to http_content_disposition.cc. -bool DecodeFilenameValue(const std::string& input, - const std::string& referrer_charset, - std::string* output); -bool DecodeExtValue(const std::string& value, std::string* output); - // Converts the given host name to unicode characters. This can be called for // any host name, if the input is not IDN or is invalid in some way, we'll just // return the ASCII source so it is still usable. @@ -252,9 +246,8 @@ NET_EXPORT string16 StripWWWFromHost(const GURL& url); // Generates a filename using the first successful method from the following (in // order): // -// 1) The raw Content-Disposition header in |content_disposition| (as read from -// the network. |referrer_charset| is used as described in the comment for -// GetFileNameFromCD(). +// 1) The raw Content-Disposition header in |content_disposition| as read from +// the network. |referrer_charset| is used to decode non-ASCII strings. // 2) |suggested_name| if specified. |suggested_name| is assumed to be in // UTF-8. // 3) The filename extracted from the |url|. |referrer_charset| will be used to diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc index 53e32f3..e4e181b 100644 --- a/net/base/net_util_unittest.cc +++ b/net/base/net_util_unittest.cc @@ -1179,7 +1179,7 @@ TEST(NetUtilTest, GenerateFileName) { L"default", L"default" }, - // Below is a small subset of cases taken from GetFileNameFromCD test above. + // Below is a small subset of cases taken from HttpContentDisposition tests. { "http://www.google.com/", "attachment; filename=\"%EC%98%88%EC%88%A0%20" diff --git a/net/http/http_content_disposition.cc b/net/http/http_content_disposition.cc index 52d9f4f..0726e93 100644 --- a/net/http/http_content_disposition.cc +++ b/net/http/http_content_disposition.cc @@ -4,10 +4,336 @@ #include "net/http/http_content_disposition.h" +#include "base/base64.h" +#include "base/i18n/icu_string_conversions.h" #include "base/logging.h" #include "base/string_util.h" +#include "base/sys_string_conversions.h" +#include "base/utf_string_conversions.h" #include "net/base/net_util.h" #include "net/http/http_util.h" +#include "unicode/ucnv.h" + +namespace { + +enum RFC2047EncodingType { + Q_ENCODING, + B_ENCODING +}; + +// Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to +// decoding a quoted-printable string. Returns true if the input was valid. +bool DecodeQEncoding(const std::string& input, std::string* output) { + std::string temp; + temp.reserve(input.size()); + for (std::string::const_iterator it = input.begin(); it != input.end(); + ++it) { + if (*it == '_') { + temp.push_back(' '); + } else if (*it == '=') { + if ((input.end() - it < 3) || + !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || + !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) + return false; + unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + + HexDigitToInt(*(it + 2)); + temp.push_back(static_cast<char>(ch)); + ++it; + ++it; + } else if (0x20 < *it && *it < 0x7F && *it != '?') { + // In a Q-encoded word, only printable ASCII characters + // represent themselves. Besides, space, '=', '_' and '?' are + // not allowed, but they're already filtered out. + DCHECK_NE('=', *it); + DCHECK_NE('?', *it); + DCHECK_NE('_', *it); + temp.push_back(*it); + } else { + return false; + } + } + output->swap(temp); + return true; +} + +// Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding +// type is specified in |enc_type|. +bool DecodeBQEncoding(const std::string& part, + RFC2047EncodingType enc_type, + const std::string& charset, + std::string* output) { + std::string decoded; + if (!((enc_type == B_ENCODING) ? + base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) + return false; + + if (decoded.empty()) { + output->clear(); + return true; + } + + UErrorCode err = U_ZERO_ERROR; + UConverter* converter(ucnv_open(charset.c_str(), &err)); + if (U_FAILURE(err)) + return false; + + // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. + // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes + // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a + // trailing '\0'. + size_t output_length = decoded.length() * 3 + 1; + char* buf = WriteInto(output, output_length); + output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length, + decoded.data(), decoded.length(), &err); + ucnv_close(converter); + if (U_FAILURE(err)) + return false; + output->resize(output_length); + return true; +} + +bool DecodeWord(const std::string& encoded_word, + const std::string& referrer_charset, + bool* is_rfc2047, + std::string* output) { + *is_rfc2047 = false; + output->clear(); + if (encoded_word.empty()) + return true; + + if (!IsStringASCII(encoded_word)) { + // Try UTF-8, referrer_charset and the native OS default charset in turn. + if (IsStringUTF8(encoded_word)) { + *output = encoded_word; + } else { + string16 utf16_output; + if (!referrer_charset.empty() && + base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), + base::OnStringConversionError::FAIL, + &utf16_output)) { + *output = UTF16ToUTF8(utf16_output); + } else { + *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); + } + } + + return true; + } + + // RFC 2047 : one of encoding methods supported by Firefox and relatively + // widely used by web servers. + // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. + // We don't care about the length restriction (72 bytes) because + // many web servers generate encoded words longer than the limit. + std::string tmp; + *is_rfc2047 = true; + int part_index = 0; + std::string charset; + StringTokenizer t(encoded_word, "?"); + RFC2047EncodingType enc_type = Q_ENCODING; + while (*is_rfc2047 && t.GetNext()) { + std::string part = t.token(); + switch (part_index) { + case 0: + if (part != "=") { + *is_rfc2047 = false; + break; + } + ++part_index; + break; + case 1: + // Do we need charset validity check here? + charset = part; + ++part_index; + break; + case 2: + if (part.size() > 1 || + part.find_first_of("bBqQ") == std::string::npos) { + *is_rfc2047 = false; + break; + } + if (part[0] == 'b' || part[0] == 'B') { + enc_type = B_ENCODING; + } + ++part_index; + break; + case 3: + *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); + if (!*is_rfc2047) { + // Last minute failure. Invalid B/Q encoding. Rather than + // passing it through, return now. + return false; + } + ++part_index; + break; + case 4: + if (part != "=") { + // Another last minute failure ! + // Likely to be a case of two encoded-words in a row or + // an encoded word followed by a non-encoded word. We can be + // generous, but it does not help much in terms of compatibility, + // I believe. Return immediately. + *is_rfc2047 = false; + return false; + } + ++part_index; + break; + default: + *is_rfc2047 = false; + return false; + } + } + + if (*is_rfc2047) { + if (*(encoded_word.end() - 1) == '=') { + output->swap(tmp); + return true; + } + // encoded_word ending prematurelly with '?' or extra '?' + *is_rfc2047 = false; + return false; + } + + // We're not handling 'especial' characters quoted with '\', but + // it should be Ok because we're not an email client but a + // web browser. + + // What IE6/7 does: %-escaped UTF-8. + tmp = net::UnescapeURLComponent(encoded_word, net::UnescapeRule::SPACES); + if (IsStringUTF8(tmp)) { + output->swap(tmp); + return true; + // We can try either the OS default charset or 'origin charset' here, + // As far as I can tell, IE does not support it. However, I've seen + // web servers emit %-escaped string in a legacy encoding (usually + // origin charset). + // TODO(jungshik) : Test IE further and consider adding a fallback here. + } + return false; +} + +// Decodes the value of a 'filename' or 'name' parameter given as |input|. The +// value is supposed to be of the form: +// +// value = token | quoted-string +// +// However we currently also allow RFC 2047 encoding and non-ASCII +// strings. Non-ASCII strings are interpreted based on |referrer_charset|. +bool DecodeFilenameValue(const std::string& input, + const std::string& referrer_charset, + std::string* output) { + std::string tmp; + // Tokenize with whitespace characters. + StringTokenizer t(input, " \t\n\r"); + t.set_options(StringTokenizer::RETURN_DELIMS); + bool is_previous_token_rfc2047 = true; + while (t.GetNext()) { + if (t.token_is_delim()) { + // If the previous non-delimeter token is not RFC2047-encoded, + // put in a space in its place. Otheriwse, skip over it. + if (!is_previous_token_rfc2047) { + tmp.push_back(' '); + } + continue; + } + // We don't support a single multibyte character split into + // adjacent encoded words. Some broken mail clients emit headers + // with that problem, but most web servers usually encode a filename + // in a single encoded-word. Firefox/Thunderbird do not support + // it, either. + std::string decoded; + if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, + &decoded)) + return false; + tmp.append(decoded); + } + output->swap(tmp); + return true; +} + +// Parses the charset and value-chars out of an ext-value string. +// +// ext-value = charset "'" [ language ] "'" value-chars +bool ParseExtValueComponents(const std::string& input, + std::string* charset, + std::string* value_chars) { + StringTokenizer t(input, "'"); + t.set_options(StringTokenizer::RETURN_DELIMS); + std::string temp_charset; + std::string temp_value; + int numDelimsSeen = 0; + while (t.GetNext()) { + if (t.token_is_delim()) { + ++numDelimsSeen; + continue; + } else { + switch (numDelimsSeen) { + case 0: + temp_charset = t.token(); + break; + case 1: + // Language is ignored. + break; + case 2: + temp_value = t.token(); + break; + default: + return false; + } + } + } + if (numDelimsSeen != 2) + return false; + if (temp_charset.empty() || temp_value.empty()) + return false; + charset->swap(temp_charset); + value_chars->swap(temp_value); + return true; +} + +// http://tools.ietf.org/html/rfc5987#section-3.2 +// +// ext-value = charset "'" [ language ] "'" value-chars +// +// charset = "UTF-8" / "ISO-8859-1" / mime-charset +// +// mime-charset = 1*mime-charsetc +// mime-charsetc = ALPHA / DIGIT +// / "!" / "#" / "$" / "%" / "&" +// / "+" / "-" / "^" / "_" / "`" +// / "{" / "}" / "~" +// +// language = <Language-Tag, defined in [RFC5646], Section 2.1> +// +// value-chars = *( pct-encoded / attr-char ) +// +// pct-encoded = "%" HEXDIG HEXDIG +// +// attr-char = ALPHA / DIGIT +// / "!" / "#" / "$" / "&" / "+" / "-" / "." +// / "^" / "_" / "`" / "|" / "~" +bool DecodeExtValue(const std::string& param_value, std::string* decoded) { + if (param_value.find('"') != std::string::npos) + return false; + + std::string charset; + std::string value; + if (!ParseExtValueComponents(param_value, &charset, &value)) + return false; + + // RFC 5987 value should be ASCII-only. + if (!IsStringASCII(value)) { + decoded->clear(); + return true; + } + + std::string unescaped = net::UnescapeURLComponent( + value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); + + return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); +} + +} // namespace namespace net { |