summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- net/base/net_util.cc 298
-rw-r--r-- net/base/net_util.h 11
-rw-r--r-- net/base/net_util_unittest.cc 2
-rw-r--r-- net/http/http_content_disposition.cc 326
4 files changed, 341 insertions, 296 deletions
diff --git a/net/base/net_util.cc b/net/base/net_util.cc
index 2b11c4d..5f321c6 100644
--- a/net/base/net_util.cc
+++ b/net/base/net_util.cc
@@ -25,7 +25,6 @@
#include <netinet/in.h>
#endif
-#include "base/base64.h"
#include "base/basictypes.h"
#include "base/file_path.h"
#include "base/file_util.h"
@@ -71,7 +70,6 @@
#include "net/http/http_content_disposition.h"
#include "unicode/datefmt.h"
#include "unicode/regex.h"
-#include "unicode/ucnv.h"
#include "unicode/uidna.h"
#include "unicode/ulocdata.h"
#include "unicode/uniset.h"
@@ -175,196 +173,6 @@ std::string::size_type CountTrailingChars(
}
#endif
-// Similar to Base64Decode. Decodes a Q-encoded string to a sequence
-// of bytes. If input is invalid, return false.
-bool QPDecode(const std::string& input, std::string* output) {
- std::string temp;
- temp.reserve(input.size());
- for (std::string::const_iterator it = input.begin(); it != input.end();
- ++it) {
- if (*it == '_') {
- temp.push_back(' ');
- } else if (*it == '=') {
- if ((input.end() - it < 3) ||
- !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
- !IsHexDigit(static_cast<unsigned char>(*(it + 2))))
- return false;
- unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +
- HexDigitToInt(*(it + 2));
- temp.push_back(static_cast<char>(ch));
- ++it;
- ++it;
- } else if (0x20 < *it && *it < 0x7F) {
- // In a Q-encoded word, only printable ASCII characters
- // represent themselves. Besides, space, '=', '_' and '?' are
- // not allowed, but they're already filtered out.
- DCHECK_NE('=', *it);
- DCHECK_NE('?', *it);
- DCHECK_NE('_', *it);
- temp.push_back(*it);
- } else {
- return false;
- }
- }
- output->swap(temp);
- return true;
-}
-
-enum RFC2047EncodingType {Q_ENCODING, B_ENCODING};
-bool DecodeBQEncoding(const std::string& part,
- RFC2047EncodingType enc_type,
- const std::string& charset,
- std::string* output) {
- std::string decoded;
- if (!((enc_type == B_ENCODING) ?
- base::Base64Decode(part, &decoded) : QPDecode(part, &decoded)))
- return false;
-
- if (decoded.empty()) {
- output->clear();
- return true;
- }
-
- UErrorCode err = U_ZERO_ERROR;
- UConverter* converter(ucnv_open(charset.c_str(), &err));
- if (U_FAILURE(err))
- return false;
-
- // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.
- // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes
- // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a
- // trailing '\0'.
- size_t output_length = decoded.length() * 3 + 1;
- char* buf = WriteInto(output, output_length);
- output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length,
- decoded.data(), decoded.length(), &err);
- ucnv_close(converter);
- if (U_FAILURE(err))
- return false;
- output->resize(output_length);
- return true;
-}
-
-bool DecodeWord(const std::string& encoded_word,
- const std::string& referrer_charset,
- bool* is_rfc2047,
- std::string* output) {
- *is_rfc2047 = false;
- output->clear();
- if (encoded_word.empty())
- return true;
-
- if (!IsStringASCII(encoded_word)) {
- // Try UTF-8, referrer_charset and the native OS default charset in turn.
- if (IsStringUTF8(encoded_word)) {
- *output = encoded_word;
- } else {
- string16 utf16_output;
- if (!referrer_charset.empty() &&
- base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),
- base::OnStringConversionError::FAIL,
- &utf16_output)) {
- *output = UTF16ToUTF8(utf16_output);
- } else {
- *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
- }
- }
-
- return true;
- }
-
- // RFC 2047 : one of encoding methods supported by Firefox and relatively
- // widely used by web servers.
- // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
- // We don't care about the length restriction (72 bytes) because
- // many web servers generate encoded words longer than the limit.
- std::string tmp;
- *is_rfc2047 = true;
- int part_index = 0;
- std::string charset;
- StringTokenizer t(encoded_word, "?");
- RFC2047EncodingType enc_type = Q_ENCODING;
- while (*is_rfc2047 && t.GetNext()) {
- std::string part = t.token();
- switch (part_index) {
- case 0:
- if (part != "=") {
- *is_rfc2047 = false;
- break;
- }
- ++part_index;
- break;
- case 1:
- // Do we need charset validity check here?
- charset = part;
- ++part_index;
- break;
- case 2:
- if (part.size() > 1 ||
- part.find_first_of("bBqQ") == std::string::npos) {
- *is_rfc2047 = false;
- break;
- }
- if (part[0] == 'b' || part[0] == 'B') {
- enc_type = B_ENCODING;
- }
- ++part_index;
- break;
- case 3:
- *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);
- if (!*is_rfc2047) {
- // Last minute failure. Invalid B/Q encoding. Rather than
- // passing it through, return now.
- return false;
- }
- ++part_index;
- break;
- case 4:
- if (part != "=") {
- // Another last minute failure !
- // Likely to be a case of two encoded-words in a row or
- // an encoded word followed by a non-encoded word. We can be
- // generous, but it does not help much in terms of compatibility,
- // I believe. Return immediately.
- *is_rfc2047 = false;
- return false;
- }
- ++part_index;
- break;
- default:
- *is_rfc2047 = false;
- return false;
- }
- }
-
- if (*is_rfc2047) {
- if (*(encoded_word.end() - 1) == '=') {
- output->swap(tmp);
- return true;
- }
- // encoded_word ending prematurelly with '?' or extra '?'
- *is_rfc2047 = false;
- return false;
- }
-
- // We're not handling 'especial' characters quoted with '\', but
- // it should be Ok because we're not an email client but a
- // web browser.
-
- // What IE6/7 does: %-escaped UTF-8.
- tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);
- if (IsStringUTF8(tmp)) {
- output->swap(tmp);
- return true;
- // We can try either the OS default charset or 'origin charset' here,
- // As far as I can tell, IE does not support it. However, I've seen
- // web servers emit %-escaped string in a legacy encoding (usually
- // origin charset).
- // TODO(jungshik) : Test IE further and consider adding a fallback here.
- }
- return false;
-}
-
// Does some simple normalization of scripts so we can allow certain scripts
// to exist together.
// TODO(brettw) bug 880223: we should allow some other languages to be
@@ -939,12 +747,20 @@ std::string GetFileNameFromURL(const GURL& url,
// The URL's path should be escaped UTF-8, but may not be.
std::string decoded_filename = unescaped_url_filename;
- if (!IsStringASCII(decoded_filename)) {
- bool ignore;
+ if (!IsStringUTF8(decoded_filename)) {
// TODO(jshin): this is probably not robust enough. To be sure, we need
// encoding detection.
- DecodeWord(unescaped_url_filename, referrer_charset, &ignore,
- &decoded_filename);
+ string16 utf16_output;
+ if (!referrer_charset.empty() &&
+ base::CodepageToUTF16(unescaped_url_filename,
+ referrer_charset.c_str(),
+ base::OnStringConversionError::FAIL,
+ &utf16_output)) {
+ decoded_filename = UTF16ToUTF8(utf16_output);
+ } else {
+ decoded_filename = WideToUTF8(
+ base::SysNativeMBToWide(unescaped_url_filename));
+ }
}
// If the URL contains a (possibly empty) query, assume it is a generator, and
// allow the determined extension to be overwritten.
@@ -1158,96 +974,6 @@ std::string GetSpecificHeader(const std::string& headers,
return ret;
}
-bool DecodeCharset(const std::string& input,
- std::string* decoded_charset,
- std::string* value) {
- StringTokenizer t(input, "'");
- t.set_options(StringTokenizer::RETURN_DELIMS);
- std::string temp_charset;
- std::string temp_value;
- int numDelimsSeen = 0;
- while (t.GetNext()) {
- if (t.token_is_delim()) {
- ++numDelimsSeen;
- continue;
- } else {
- switch (numDelimsSeen) {
- case 0:
- temp_charset = t.token();
- break;
- case 1:
- // Language is ignored.
- break;
- case 2:
- temp_value = t.token();
- break;
- default:
- return false;
- }
- }
- }
- if (numDelimsSeen != 2)
- return false;
- if (temp_charset.empty() || temp_value.empty())
- return false;
- decoded_charset->swap(temp_charset);
- value->swap(temp_value);
- return true;
-}
-
-bool DecodeFilenameValue(const std::string& input,
- const std::string& referrer_charset,
- std::string* output) {
- std::string tmp;
- // Tokenize with whitespace characters.
- StringTokenizer t(input, " \t\n\r");
- t.set_options(StringTokenizer::RETURN_DELIMS);
- bool is_previous_token_rfc2047 = true;
- while (t.GetNext()) {
- if (t.token_is_delim()) {
- // If the previous non-delimeter token is not RFC2047-encoded,
- // put in a space in its place. Otheriwse, skip over it.
- if (!is_previous_token_rfc2047) {
- tmp.push_back(' ');
- }
- continue;
- }
- // We don't support a single multibyte character split into
- // adjacent encoded words. Some broken mail clients emit headers
- // with that problem, but most web servers usually encode a filename
- // in a single encoded-word. Firefox/Thunderbird do not support
- // it, either.
- std::string decoded;
- if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
- &decoded))
- return false;
- tmp.append(decoded);
- }
- output->swap(tmp);
- return true;
-}
-
-bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
- if (param_value.find('"') != std::string::npos)
- return false;
-
- std::string charset;
- std::string value;
- if (!DecodeCharset(param_value, &charset, &value))
- return false;
-
- // RFC 5987 value should be ASCII-only.
- if (!IsStringASCII(value)) {
- decoded->clear();
- return true;
- }
-
- std::string unescaped = UnescapeURLComponent(value,
- UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
-
- return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);
-}
-
string16 IDNToUnicode(const std::string& host,
const std::string& languages) {
return IDNToUnicodeWithOffsets(host, languages, NULL);
diff --git a/net/base/net_util.h b/net/base/net_util.h
index 444a547..874f3e2 100644
--- a/net/base/net_util.h
+++ b/net/base/net_util.h
@@ -173,12 +173,6 @@ NET_EXPORT std::string GetHostOrSpecFromURL(const GURL& url);
NET_EXPORT std::string GetSpecificHeader(const std::string& headers,
const std::string& name);
-// TODO(abarth): Move these functions to http_content_disposition.cc.
-bool DecodeFilenameValue(const std::string& input,
- const std::string& referrer_charset,
- std::string* output);
-bool DecodeExtValue(const std::string& value, std::string* output);
-
// Converts the given host name to unicode characters. This can be called for
// any host name, if the input is not IDN or is invalid in some way, we'll just
// return the ASCII source so it is still usable.
@@ -252,9 +246,8 @@ NET_EXPORT string16 StripWWWFromHost(const GURL& url);
// Generates a filename using the first successful method from the following (in
// order):
//
-// 1) The raw Content-Disposition header in |content_disposition| (as read from
-// the network. |referrer_charset| is used as described in the comment for
-// GetFileNameFromCD().
+// 1) The raw Content-Disposition header in |content_disposition| as read from
+// the network. |referrer_charset| is used to decode non-ASCII strings.
// 2) |suggested_name| if specified. |suggested_name| is assumed to be in
// UTF-8.
// 3) The filename extracted from the |url|. |referrer_charset| will be used to
diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc
index 53e32f3..e4e181b 100644
--- a/net/base/net_util_unittest.cc
+++ b/net/base/net_util_unittest.cc
@@ -1179,7 +1179,7 @@ TEST(NetUtilTest, GenerateFileName) {
L"default",
L"default"
},
- // Below is a small subset of cases taken from GetFileNameFromCD test above.
+ // Below is a small subset of cases taken from HttpContentDisposition tests.
{
"http://www.google.com/",
"attachment; filename=\"%EC%98%88%EC%88%A0%20"
diff --git a/net/http/http_content_disposition.cc b/net/http/http_content_disposition.cc
index 52d9f4f..0726e93 100644
--- a/net/http/http_content_disposition.cc
+++ b/net/http/http_content_disposition.cc
@@ -4,10 +4,336 @@
#include "net/http/http_content_disposition.h"
+#include "base/base64.h"
+#include "base/i18n/icu_string_conversions.h"
#include "base/logging.h"
#include "base/string_util.h"
+#include "base/sys_string_conversions.h"
+#include "base/utf_string_conversions.h"
#include "net/base/net_util.h"
#include "net/http/http_util.h"
+#include "unicode/ucnv.h"
+
+namespace {
+
+// Identifies which RFC 2047 encoding the text of an encoded-word uses.
+// DecodeBQEncoding() picks its byte decoder from this value.
+enum RFC2047EncodingType {
+ Q_ENCODING,  // "Q"/"q": decoded with DecodeQEncoding().
+ B_ENCODING  // "B"/"b": decoded with base::Base64Decode().
+};
+
+// Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
+// decoding a quoted-printable string. Returns true if the input was valid.
+//
+// Decoding rules, as implemented below:
+//  - '_' becomes a single space.
+//  - '=' must be followed by two hex digits and becomes the byte they encode.
+//  - Any other printable ASCII character except '?' is copied unchanged.
+// Every other byte (space, control characters, DEL, '?', non-ASCII) and a
+// truncated or non-hex '=' escape make the function return false.
+// |output| is only written on success; on failure it is left untouched.
+bool DecodeQEncoding(const std::string& input, std::string* output) {
+ std::string temp;
+ temp.reserve(input.size());
+ for (std::string::const_iterator it = input.begin(); it != input.end();
+ ++it) {
+ if (*it == '_') {
+ temp.push_back(' ');
+ } else if (*it == '=') {
+ // Need '=' plus two more characters, both of which must be hex digits.
+ if ((input.end() - it < 3) ||
+ !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
+ !IsHexDigit(static_cast<unsigned char>(*(it + 2))))
+ return false;
+ unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +
+ HexDigitToInt(*(it + 2));
+ temp.push_back(static_cast<char>(ch));
+ // Skip the two hex digits; the loop's own ++it steps past the last one.
+ ++it;
+ ++it;
+ } else if (0x20 < *it && *it < 0x7F && *it != '?') {
+ // In a Q-encoded word, only printable ASCII characters
+ // represent themselves. Space, '=', '_' and '?' are not allowed here:
+ // '_' and '=' were handled by the branches above, while space and '?'
+ // are excluded by this branch's condition.
+ DCHECK_NE('=', *it);
+ DCHECK_NE('?', *it);
+ DCHECK_NE('_', *it);
+ temp.push_back(*it);
+ } else {
+ return false;
+ }
+ }
+ output->swap(temp);
+ return true;
+}
+
+// Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
+// type is specified in |enc_type|.
+//
+// |part| is the encoded text only (no "=?charset?E?" framing). It is first
+// turned into raw bytes (Base64 for B_ENCODING, DecodeQEncoding() otherwise),
+// then converted from |charset| to UTF-8 with ICU and stored in |output|.
+// Returns false if the byte decoding fails, if ICU cannot open a converter for
+// |charset|, or if the conversion itself fails.
+bool DecodeBQEncoding(const std::string& part,
+ RFC2047EncodingType enc_type,
+ const std::string& charset,
+ std::string* output) {
+ std::string decoded;
+ if (!((enc_type == B_ENCODING) ?
+ base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded)))
+ return false;
+
+ // Nothing to convert: succeed with an empty result without touching ICU, so
+ // |charset| is not validated on this path.
+ if (decoded.empty()) {
+ output->clear();
+ return true;
+ }
+
+ UErrorCode err = U_ZERO_ERROR;
+ UConverter* converter(ucnv_open(charset.c_str(), &err));
+ if (U_FAILURE(err))
+ return false;
+
+ // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.
+ // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes
+ // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a
+ // trailing '\0'.
+ size_t output_length = decoded.length() * 3 + 1;
+ char* buf = WriteInto(output, output_length);
+ output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length,
+ decoded.data(), decoded.length(), &err);
+ // The converter is closed before the error check so it is released on both
+ // the success and the failure path.
+ ucnv_close(converter);
+ // NOTE(review): WriteInto() has already been applied to |output| at this
+ // point, so on failure its contents are presumably unspecified - confirm
+ // that callers ignore |output| when false is returned.
+ if (U_FAILURE(err))
+ return false;
+ // Trim the over-allocated buffer to the length reported by ICU.
+ output->resize(output_length);
+ return true;
+}
+
+// Decodes a single whitespace-free word of a filename parameter value into
+// UTF-8, storing the result in |output|.
+//
+// |is_rfc2047| is set to true only when the word was successfully decoded as
+// an RFC 2047 encoded-word; it is false for every other outcome.
+//
+// Handling, in order:
+//  1. Empty input: succeeds with empty |output|.
+//  2. Non-ASCII input: taken as UTF-8 if valid; otherwise converted from
+//     |referrer_charset| when that is non-empty and the conversion succeeds,
+//     and from the native OS charset as a last resort. Always returns true.
+//  3. ASCII input that parses as =?charset?<E>?<encoded string>?= : decoded
+//     via DecodeBQEncoding(). Returns false if the encoded text is invalid or
+//     the word is malformed after the encoding letter.
+//  4. Any other ASCII input: %-unescaped; accepted only if the result is valid
+//     UTF-8, otherwise false is returned.
+bool DecodeWord(const std::string& encoded_word,
+ const std::string& referrer_charset,
+ bool* is_rfc2047,
+ std::string* output) {
+ *is_rfc2047 = false;
+ output->clear();
+ if (encoded_word.empty())
+ return true;
+
+ if (!IsStringASCII(encoded_word)) {
+ // Try UTF-8, referrer_charset and the native OS default charset in turn.
+ if (IsStringUTF8(encoded_word)) {
+ *output = encoded_word;
+ } else {
+ string16 utf16_output;
+ if (!referrer_charset.empty() &&
+ base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),
+ base::OnStringConversionError::FAIL,
+ &utf16_output)) {
+ *output = UTF16ToUTF8(utf16_output);
+ } else {
+ *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
+ }
+ }
+
+ return true;
+ }
+
+ // RFC 2047 : one of encoding methods supported by Firefox and relatively
+ // widely used by web servers.
+ // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
+ // We don't care about the length restriction (72 bytes) because
+ // many web servers generate encoded words longer than the limit.
+ std::string tmp;
+ // Assume RFC 2047 until a part proves otherwise; clearing the flag inside
+ // the loop also terminates it.
+ *is_rfc2047 = true;
+ // Index of the next '?'-separated part expected:
+ // 0 "=", 1 charset, 2 encoding letter, 3 encoded text, 4 trailing "=".
+ int part_index = 0;
+ std::string charset;
+ StringTokenizer t(encoded_word, "?");
+ RFC2047EncodingType enc_type = Q_ENCODING;
+ while (*is_rfc2047 && t.GetNext()) {
+ std::string part = t.token();
+ switch (part_index) {
+ case 0:
+ if (part != "=") {
+ *is_rfc2047 = false;
+ break;
+ }
+ ++part_index;
+ break;
+ case 1:
+ // Do we need charset validity check here?
+ charset = part;
+ ++part_index;
+ break;
+ case 2:
+ // The encoding must be exactly one of the letters b, B, q or Q.
+ if (part.size() > 1 ||
+ part.find_first_of("bBqQ") == std::string::npos) {
+ *is_rfc2047 = false;
+ break;
+ }
+ // enc_type was initialised to Q_ENCODING, so only 'B' needs handling.
+ if (part[0] == 'b' || part[0] == 'B') {
+ enc_type = B_ENCODING;
+ }
+ ++part_index;
+ break;
+ case 3:
+ *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);
+ if (!*is_rfc2047) {
+ // Last minute failure. Invalid B/Q encoding. Rather than
+ // passing it through, return now.
+ return false;
+ }
+ ++part_index;
+ break;
+ case 4:
+ if (part != "=") {
+ // Another last minute failure !
+ // Likely to be a case of two encoded-words in a row or
+ // an encoded word followed by a non-encoded word. We can be
+ // generous, but it does not help much in terms of compatibility,
+ // I believe. Return immediately.
+ *is_rfc2047 = false;
+ return false;
+ }
+ ++part_index;
+ break;
+ default:
+ // More parts than an encoded-word can have.
+ *is_rfc2047 = false;
+ return false;
+ }
+ }
+
+ if (*is_rfc2047) {
+ // NOTE(review): nothing here checks that all five parts were consumed
+ // (part_index == 5); a lone "=" looks like it would be accepted as an
+ // empty encoded-word - confirm against StringTokenizer's behaviour.
+ if (*(encoded_word.end() - 1) == '=') {
+ output->swap(tmp);
+ return true;
+ }
+ // encoded_word ending prematurely with '?' or extra '?'
+ *is_rfc2047 = false;
+ return false;
+ }
+
+ // We're not handling 'especial' characters quoted with '\', but
+ // it should be Ok because we're not an email client but a
+ // web browser.
+
+ // What IE6/7 does: %-escaped UTF-8.
+ tmp = net::UnescapeURLComponent(encoded_word, net::UnescapeRule::SPACES);
+ if (IsStringUTF8(tmp)) {
+ output->swap(tmp);
+ return true;
+ // We can try either the OS default charset or 'origin charset' here,
+ // As far as I can tell, IE does not support it. However, I've seen
+ // web servers emit %-escaped string in a legacy encoding (usually
+ // origin charset).
+ // TODO(jungshik) : Test IE further and consider adding a fallback here.
+ }
+ return false;
+}
+
+// Decodes the value of a 'filename' or 'name' parameter given as |input|. The
+// value is supposed to be of the form:
+//
+// value = token | quoted-string
+//
+// However we currently also allow RFC 2047 encoding and non-ASCII
+// strings. Non-ASCII strings are interpreted based on |referrer_charset|.
+//
+// The input is split on whitespace and each word is decoded independently
+// with DecodeWord(). Returns false as soon as any word fails to decode;
+// |output| is only written when every word decoded successfully.
+bool DecodeFilenameValue(const std::string& input,
+ const std::string& referrer_charset,
+ std::string* output) {
+ std::string tmp;
+ // Tokenize with whitespace characters.
+ StringTokenizer t(input, " \t\n\r");
+ t.set_options(StringTokenizer::RETURN_DELIMS);
+ // Starts as true so that a whitespace delimiter seen before the first word
+ // adds nothing to the result.
+ bool is_previous_token_rfc2047 = true;
+ while (t.GetNext()) {
+ if (t.token_is_delim()) {
+ // If the previous non-delimiter token is not RFC2047-encoded,
+ // put in a space in its place. Otherwise, skip over it.
+ if (!is_previous_token_rfc2047) {
+ tmp.push_back(' ');
+ }
+ continue;
+ }
+ // We don't support a single multibyte character split into
+ // adjacent encoded words. Some broken mail clients emit headers
+ // with that problem, but most web servers usually encode a filename
+ // in a single encoded-word. Firefox/Thunderbird do not support
+ // it, either.
+ std::string decoded;
+ if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
+ &decoded))
+ return false;
+ tmp.append(decoded);
+ }
+ output->swap(tmp);
+ return true;
+}
+
+// Parses the charset and value-chars out of an ext-value string.
+//
+// ext-value = charset "'" [ language ] "'" value-chars
+//
+// Returns true only if |input| contains exactly two "'" delimiters and both
+// the charset and the value-chars are non-empty. The language component is
+// parsed over but discarded. |charset| and |value_chars| are only written on
+// success.
+bool ParseExtValueComponents(const std::string& input,
+ std::string* charset,
+ std::string* value_chars) {
+ StringTokenizer t(input, "'");
+ t.set_options(StringTokenizer::RETURN_DELIMS);
+ std::string temp_charset;
+ std::string temp_value;
+ // The number of "'" delimiters seen so far tells which component a
+ // non-delimiter token belongs to.
+ int numDelimsSeen = 0;
+ while (t.GetNext()) {
+ if (t.token_is_delim()) {
+ ++numDelimsSeen;
+ continue;
+ } else {
+ switch (numDelimsSeen) {
+ case 0:
+ temp_charset = t.token();
+ break;
+ case 1:
+ // Language is ignored.
+ break;
+ case 2:
+ temp_value = t.token();
+ break;
+ default:
+ // Text after a third "'" means the input is not a valid ext-value.
+ return false;
+ }
+ }
+ }
+ if (numDelimsSeen != 2)
+ return false;
+ if (temp_charset.empty() || temp_value.empty())
+ return false;
+ charset->swap(temp_charset);
+ value_chars->swap(temp_value);
+ return true;
+}
+
+// Decodes an RFC 5987 ext-value given as |param_value| into UTF-8, storing
+// the result in |decoded|. Returns false if the value contains a double
+// quote, cannot be split into charset/language/value-chars, or cannot be
+// converted from the declared charset.
+//
+// http://tools.ietf.org/html/rfc5987#section-3.2
+//
+// ext-value = charset "'" [ language ] "'" value-chars
+//
+// charset = "UTF-8" / "ISO-8859-1" / mime-charset
+//
+// mime-charset = 1*mime-charsetc
+// mime-charsetc = ALPHA / DIGIT
+// / "!" / "#" / "$" / "%" / "&"
+// / "+" / "-" / "^" / "_" / "`"
+// / "{" / "}" / "~"
+//
+// language = <Language-Tag, defined in [RFC5646], Section 2.1>
+//
+// value-chars = *( pct-encoded / attr-char )
+//
+// pct-encoded = "%" HEXDIG HEXDIG
+//
+// attr-char = ALPHA / DIGIT
+// / "!" / "#" / "$" / "&" / "+" / "-" / "."
+// / "^" / "_" / "`" / "|" / "~"
+bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
+ // A double quote anywhere in the value is rejected outright.
+ if (param_value.find('"') != std::string::npos)
+ return false;
+
+ std::string charset;
+ std::string value;
+ if (!ParseExtValueComponents(param_value, &charset, &value))
+ return false;
+
+ // RFC 5987 value should be ASCII-only.
+ // Note that a non-ASCII value is reported as success with an empty result
+ // rather than as a failure.
+ if (!IsStringASCII(value)) {
+ decoded->clear();
+ return true;
+ }
+
+ // Undo the percent-encoding first, then convert the resulting bytes from
+ // the declared charset.
+ std::string unescaped = net::UnescapeURLComponent(
+ value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
+
+ return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);
+}
+
+} // namespace
namespace net {