4 files changed, 174 insertions, 94 deletions
diff --git a/net/base/net_util.cc b/net/base/net_util.cc
index 9610e40..e5c53eb 100644
--- a/net/base/net_util.cc
+++ b/net/base/net_util.cc
@@ -247,17 +247,22 @@ bool DecodeBQEncoding(const std::string& part, RFC2047EncodingType enc_type,
 }
 
 bool DecodeWord(const std::string& encoded_word,
+                const std::string& referrer_charset,
                 bool *is_rfc2047,
                 std::string* output) {
-  // TODO(jungshik) : Revisit this later. Do we want to pass through non-ASCII
-  // strings which can be mozibake?  WinHTTP converts a raw 8bit string
-  // UTF-16 assuming it's in the OS default encoding.
   if (!IsStringASCII(encoded_word)) {
-    // Try falling back to the NativeMB encoding if the raw input is not UTF-8.
+    // Try UTF-8, referrer_charset and the native OS default charset in turn.
     if (IsStringUTF8(encoded_word)) {
       *output = encoded_word;
     } else {
-      *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
+      std::wstring wide_output;
+      if (!referrer_charset.empty() &&
+          CodepageToWide(encoded_word, referrer_charset.c_str(),
+                         OnStringUtilConversionError::FAIL, &wide_output)) {
+        *output = WideToUTF8(wide_output);
+      } else {
+        *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
+      }
     }
     *is_rfc2047 = false;
     return true;
@@ -357,7 +362,9 @@ bool DecodeWord(const std::string& encoded_word,
   return false;
 }
 
-bool DecodeParamValue(const std::string& input, std::string* output) {
+bool DecodeParamValue(const std::string& input,
+                      const std::string& referrer_charset,
+                      std::string* output) {
   std::string tmp;
   // Tokenize with whitespace characters.
   StringTokenizer t(input, " \t\n\r");
@@ -378,7 +385,8 @@ bool DecodeParamValue(const std::string& input, std::string* output) {
     // in a single encoded-word. Firefox/Thunderbird do not support
     // it, either.
     std::string decoded;
-    if (!DecodeWord(t.token(), &is_previous_token_rfc2047, &decoded))
+    if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
+                    &decoded))
       return false;
     tmp.append(decoded);
   }
@@ -683,7 +691,8 @@ std::string GetSpecificHeader(const std::string& headers,
   return GetSpecificHeaderT(headers, name);
 }
 
-std::wstring GetFileNameFromCD(const std::string& header) {
+std::wstring GetFileNameFromCD(const std::string& header,
+                               const std::string& referrer_charset) {
   std::string param_value = GetHeaderParamValue(header, "filename");
   if (param_value.empty()) {
     // Some servers use 'name' parameter.
@@ -692,7 +701,7 @@ std::wstring GetFileNameFromCD(const std::string& header) {
   if (param_value.empty())
     return std::wstring();
   std::string decoded;
-  if (DecodeParamValue(param_value, &decoded))
+  if (DecodeParamValue(param_value, referrer_charset, &decoded))
     return UTF8ToWide(decoded);
   return std::wstring();
 }
@@ -863,8 +872,10 @@ std::wstring StripWWW(const std::wstring& text) {
 
 std::wstring GetSuggestedFilename(const GURL& url,
                                   const std::string& content_disposition,
+                                  const std::string& referrer_charset,
                                   const std::wstring& default_name) {
-  std::wstring filename = GetFileNameFromCD(content_disposition);
+  std::wstring filename = GetFileNameFromCD(content_disposition,
+                                            referrer_charset);
   if (!filename.empty()) {
     // Remove any path information the server may have sent, take the name
     // only.
@@ -901,13 +912,6 @@ std::wstring GetSuggestedFilename(const GURL& url,
   return filename;
 }
 
-std::wstring GetSuggestedFilename(const GURL& url,
-                                  const std::wstring& content_disposition,
-                                  const std::wstring& default_name) {
-  return GetSuggestedFilename(
-      url, WideToUTF8(content_disposition), default_name);
-}
-
 bool IsPortAllowedByDefault(int port) {
   int array_size = arraysize(kRestrictedPorts);
   for (int i = 0; i < array_size; i++) {
diff --git a/net/base/net_util.h b/net/base/net_util.h
index 2ad81bf..9aba789 100644
--- a/net/base/net_util.h
+++ b/net/base/net_util.h
@@ -71,12 +71,20 @@ std::wstring GetHeaderParamValue(const std::wstring& field,
 std::string GetHeaderParamValue(const std::string& field,
                                 const std::string& param_name);
 
-// Return the filename extracted from Content-Disposition header.  Only two
-// formats are supported: a. %-escaped UTF-8 b. RFC 2047.
+// Return the filename extracted from Content-Disposition header. The following
+// formats are tried in the order listed below:
 //
-// A non-ASCII param value is just returned as it is (assuming a NativeMB
-// encoding). When a param value is ASCII, but is not in one of two forms
-// supported, it is returned as it is unless it's pretty close to two supported
+// 1. RFC 2047
+// 2. Raw-8bit-characters :
+//    a. UTF-8, b. referrer_charset, c. default OS codepage.
+// 3. %-escaped UTF-8.
+//
+// In step 2, if referrer_charset is empty (i.e. unknown), 2b is skipped.
+// In step 3, the fallback charsets tried in step 2 are not tried. We
+// can consider doing that later.
+//
+// When a param value is ASCII, but is not in format #1 or format #3 above,
+// it is returned as it is unless it's pretty close to two supported
 // formats but not well-formed. In that case, an empty string is returned.
 //
 // In any case, a caller must check for the empty return value and resort to
@@ -90,7 +98,8 @@ std::string GetHeaderParamValue(const std::string& field,
 // other caller is a unit test. Need to figure out expose this function only to
 // net_util_unittest.
 //
-std::wstring GetFileNameFromCD(const std::string& header);
+std::wstring GetFileNameFromCD(const std::string& header,
+                               const std::string& referrer_charset);
 
 // Converts the given host name to unicode characters, APPENDING them to the
 // the given output string. This can be called for any host name, if the
@@ -133,14 +142,12 @@ std::wstring StripWWW(const std::wstring& text);
 // Gets the filename from the raw Content-Disposition header (as read from the
 // network).  Otherwise uses the last path component name or hostname from
 // |url|.  Note: it's possible for the suggested filename to be empty (e.g.,
-// file:/// or view-cache:).
+// file:/// or view-cache:). referrer_charset is used as one of the charsets
+// to interpret a raw 8bit string in the C-D header (after interpreting it
+// as UTF-8 fails). See the comment for GetFileNameFromCD for more details.
 std::wstring GetSuggestedFilename(const GURL& url,
                                   const std::string& content_disposition,
-                                  const std::wstring& default_name);
-
-// DEPRECATED: Please use the above version of this method.
-std::wstring GetSuggestedFilename(const GURL& url,
-                                  const std::wstring& content_disposition,
+                                  const std::string& referrer_charset,
                                   const std::wstring& default_name);
 
 // Checks the given port against a list of ports which are restricted by
diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc
index f9bc7f7..1a3bcdc 100644
--- a/net/base/net_util_unittest.cc
+++ b/net/base/net_util_unittest.cc
@@ -39,6 +39,7 @@ struct HeaderParamCase {
 
 struct FileNameCDCase {
   const char* header_field;
+  const char* referrer_charset;
   const wchar_t* expected;
 };
 
@@ -58,7 +59,8 @@ struct IDNTestCase {
 
 struct SuggestedFilenameCase {
   const char* url;
-  const wchar_t* content_disp_header;
+  const char* content_disp_header;
+  const char* referrer_charset;
   const wchar_t* default_filename;
   const wchar_t* expected_filename;
 };
@@ -299,75 +301,96 @@ TEST(NetUtilTest, GetHeaderParamValue) {
 TEST(NetUtilTest, GetFileNameFromCD) {
   const FileNameCDCase tests[] = {
     // Test various forms of C-D header fields emitted by web servers.
-    {"content-disposition: inline; filename=\"abcde.pdf\"", L"abcde.pdf"},
-    {"content-disposition: inline; name=\"abcde.pdf\"", L"abcde.pdf"},
-    {"content-disposition: attachment; filename=abcde.pdf", L"abcde.pdf"},
-    {"content-disposition: attachment; name=abcde.pdf", L"abcde.pdf"},
-    {"content-disposition: attachment; filename=abc,de.pdf", L"abc,de.pdf"},
-    {"content-disposition: filename=abcde.pdf", L"abcde.pdf"},
-    {"content-disposition: filename= abcde.pdf", L"abcde.pdf"},
-    {"content-disposition: filename =abcde.pdf", L"abcde.pdf"},
-    {"content-disposition: filename = abcde.pdf", L"abcde.pdf"},
-    {"content-disposition: filename\t=abcde.pdf", L"abcde.pdf"},
-    {"content-disposition: filename \t\t  =abcde.pdf", L"abcde.pdf"},
-    {"content-disposition: name=abcde.pdf", L"abcde.pdf"},
-    {"content-disposition: inline; filename=\"abc%20de.pdf\"", L"abc de.pdf"},
+    {"content-disposition: inline; filename=\"abcde.pdf\"", "", L"abcde.pdf"},
+    {"content-disposition: inline; name=\"abcde.pdf\"", "", L"abcde.pdf"},
+    {"content-disposition: attachment; filename=abcde.pdf", "", L"abcde.pdf"},
+    {"content-disposition: attachment; name=abcde.pdf", "", L"abcde.pdf"},
+    {"content-disposition: attachment; filename=abc,de.pdf", "", L"abc,de.pdf"},
+    {"content-disposition: filename=abcde.pdf", "", L"abcde.pdf"},
+    {"content-disposition: filename= abcde.pdf", "", L"abcde.pdf"},
+    {"content-disposition: filename =abcde.pdf", "", L"abcde.pdf"},
+    {"content-disposition: filename = abcde.pdf", "", L"abcde.pdf"},
+    {"content-disposition: filename\t=abcde.pdf", "", L"abcde.pdf"},
+    {"content-disposition: filename \t\t  =abcde.pdf", "", L"abcde.pdf"},
+    {"content-disposition: name=abcde.pdf", "", L"abcde.pdf"},
+    {"content-disposition: inline; filename=\"abc%20de.pdf\"", "",
+     L"abc de.pdf"},
     // Whitespaces are converted to a space.
-    {"content-disposition: inline; filename=\"abc  \t\nde.pdf\"",
+    {"content-disposition: inline; filename=\"abc  \t\nde.pdf\"", "",
      L"abc    de.pdf"},
     // %-escaped UTF-8
     {"Content-Disposition: attachment; filename=\"%EC%98%88%EC%88%A0%20"
-     "%EC%98%88%EC%88%A0.jpg\"", L"\xc608\xc220 \xc608\xc220.jpg"},
+     "%EC%98%88%EC%88%A0.jpg\"", "", L"\xc608\xc220 \xc608\xc220.jpg"},
     {"Content-Disposition: attachment; filename=\"%F0%90%8C%B0%F0%90%8C%B1"
-     "abc.jpg\"", L"\U00010330\U00010331abc.jpg"},
+     "abc.jpg\"", "", L"\U00010330\U00010331abc.jpg"},
     {"Content-Disposition: attachment; filename=\"%EC%98%88%EC%88%A0 \n"
-     "%EC%98%88%EC%88%A0.jpg\"", L"\xc608\xc220  \xc608\xc220.jpg"},
+     "%EC%98%88%EC%88%A0.jpg\"", "", L"\xc608\xc220  \xc608\xc220.jpg"},
     // RFC 2047 with various charsets and Q/B encodings
     {"Content-Disposition: attachment; filename=\"=?EUC-JP?Q?=B7=DD=BD="
-     "D13=2Epng?=\"", L"\x82b8\x8853" L"3.png"},
+     "D13=2Epng?=\"", "", L"\x82b8\x8853" L"3.png"},
     {"Content-Disposition: attachment; filename==?eUc-Kr?b?v7m8+iAzLnBuZw==?=",
-     L"\xc608\xc220 3.png"},
+     "", L"\xc608\xc220 3.png"},
     {"Content-Disposition: attachment; filename==?utf-8?Q?=E8=8A=B8=E8"
-     "=A1=93_3=2Epng?=", L"\x82b8\x8853 3.png"},
+     "=A1=93_3=2Epng?=", "", L"\x82b8\x8853 3.png"},
     {"Content-Disposition: attachment; filename==?utf-8?Q?=F0=90=8C=B0"
-     "_3=2Epng?=", L"\U00010330 3.png"},
-    {"Content-Disposition: inline; filename=\"=?iso88591?Q?caf=e3_=2epng?=\"",
-     L"caf\x00e3 .png"},
+     "_3=2Epng?=", "", L"\U00010330 3.png"},
+    {"Content-Disposition: inline; filename=\"=?iso88591?Q?caf=e9_=2epng?=\"",
+     "", L"caf\x00e9 .png"},
     // Space after an encode word should be removed.
-    {"Content-Disposition: inline; filename=\"=?iso88591?Q?caf=E3_?= .png\"",
-     L"caf\x00e3 .png"},
+    {"Content-Disposition: inline; filename=\"=?iso88591?Q?caf=E9_?= .png\"",
+     "", L"caf\x00e9 .png"},
     // Two encoded words with different charsets (not very likely to be emitted
     // by web servers in the wild). Spaces between them are removed.
     {"Content-Disposition: inline; filename=\"=?euc-kr?b?v7m8+iAz?="
-     " =?ksc5601?q?=BF=B9=BC=FA=2Epng?=\"", L"\xc608\xc220 3\xc608\xc220.png"},
-    {"Content-Disposition: attachment; filename=\"=?windows-1252?Q?caf=E3?="
-     "  =?iso-8859-7?b?4eI=?= .png\"", L"caf\x00e3\x03b1\x03b2.png"},
-    // Non-ASCII string is passed through (and treated as UTF-8).
-    {"Content-Disposition: attachment; filename=caf\xc3\xa3.png",
-     L"caf\x00e3.png"},
+     " =?ksc5601?q?=BF=B9=BC=FA=2Epng?=\"", "",
+     L"\xc608\xc220 3\xc608\xc220.png"},
+    {"Content-Disposition: attachment; filename=\"=?windows-1252?Q?caf=E9?="
+     "  =?iso-8859-7?b?4eI=?= .png\"", "", L"caf\x00e9\x03b1\x03b2.png"},
+    // Non-ASCII string is passed through and treated as UTF-8 as long as
+    // it's valid as UTF-8 and regardless of |referrer_charset|.
+    {"Content-Disposition: attachment; filename=caf\xc3\xa9.png",
+     "iso-8859-1", L"caf\x00e9.png"},
+    {"Content-Disposition: attachment; filename=caf\xc3\xa9.png",
+     "", L"caf\x00e9.png"},
+    // Non-ASCII/Non-UTF-8 string. Fall back to the referrer charset.
+    {"Content-Disposition: attachment; filename=caf\xe5.png",
+     "windows-1253", L"caf\x03b5.png"},
+#if 0
+    // Non-ASCII/Non-UTF-8 string. Fall back to the native codepage.
+    // TODO(jungshik): We need to set the OS default codepage
+    // to a specific value before testing. On Windows, we can use
+    // SetThreadLocale().
+    {"Content-Disposition: attachment; filename=\xb0\xa1\xb0\xa2.png",
+     "", L"\xac00\xac01.png"},
+#endif
     // Failure cases
     // Invalid hex-digit "G"
-    {"Content-Disposition: attachment; filename==?iiso88591?Q?caf=EG?=", L""},
+    {"Content-Disposition: attachment; filename==?iiso88591?Q?caf=EG?=", "",
+     L""},
     // Incomplete RFC 2047 encoded-word (missing '='' at the end)
-    {"Content-Disposition: attachment; filename==?iso88591?Q?caf=E3?", L""},
+    {"Content-Disposition: attachment; filename==?iso88591?Q?caf=E3?", "", L""},
     // Extra character at the end of an encoded word
-    {"Content-Disposition: attachment; filename==?iso88591?Q?caf=E3?==", L""},
+    {"Content-Disposition: attachment; filename==?iso88591?Q?caf=E3?==",
+     "", L""},
     // Extra token at the end of an encoded word
-    {"Content-Disposition: attachment; filename==?iso88591?Q?caf=E3?=?", L""},
-    {"Content-Disposition: attachment; filename==?iso88591?Q?caf=E3?=?=", L""},
+    {"Content-Disposition: attachment; filename==?iso88591?Q?caf=E3?=?",
+     "", L""},
+    {"Content-Disposition: attachment; filename==?iso88591?Q?caf=E3?=?=",
+     "",  L""},
     // Incomplete hex-escaped chars
     {"Content-Disposition: attachment; filename==?windows-1252?Q?=63=61=E?=",
-     L""},
-    {"Content-Disposition: attachment; filename=%EC%98%88%EC%88%A", L""},
+     "", L""},
+    {"Content-Disposition: attachment; filename=%EC%98%88%EC%88%A", "", L""},
     // %-escaped non-UTF-8 encoding is an "error"
-    {"Content-Disposition: attachment; filename=%B7%DD%BD%D1.png", L""},
+    {"Content-Disposition: attachment; filename=%B7%DD%BD%D1.png", "", L""},
     // Two RFC 2047 encoded words in a row without a space is an error.
     {"Content-Disposition: attachment; filename==?windows-1252?Q?caf=E3?="
-     "=?iso-8859-7?b?4eIucG5nCg==?=", L""},
+     "=?iso-8859-7?b?4eIucG5nCg==?=", "", L""},
   };
   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(tests); ++i) {
     EXPECT_EQ(tests[i].expected,
-              net::GetFileNameFromCD(tests[i].header_field));
+              net::GetFileNameFromCD(tests[i].header_field,
+                                     tests[i].referrer_charset));
   }
 }
 
@@ -669,97 +692,132 @@ TEST(NetUtilTest, StripWWW) {
 TEST(NetUtilTest, GetSuggestedFilename) {
   const SuggestedFilenameCase test_cases[] = {
     {"http://www.google.com/",
-     L"Content-disposition: attachment; filename=test.html",
+     "Content-disposition: attachment; filename=test.html",
+     "",
      L"",
      L"test.html"},
     {"http://www.google.com/",
-     L"Content-disposition: attachment; filename=\"test.html\"",
+     "Content-disposition: attachment; filename=\"test.html\"",
+     "",
      L"",
      L"test.html"},
     {"http://www.google.com/path/test.html",
-     L"Content-disposition: attachment",
+     "Content-disposition: attachment",
+     "",
      L"",
      L"test.html"},
     {"http://www.google.com/path/test.html",
-     L"Content-disposition: attachment;",
+     "Content-disposition: attachment;",
+     "",
      L"",
      L"test.html"},
     {"http://www.google.com/",
-     L"",
+     "",
+     "",
      L"",
      L"www.google.com"},
     {"http://www.google.com/test.html",
-     L"",
+     "",
+     "",
      L"",
      L"test.html"},
     // Now that we use googleurl's ExtractFileName, this case falls back
     // to the hostname. If this behavior is not desirable, we'd better
     // change ExtractFileName (in url_parse).
     {"http://www.google.com/path/",
-     L"",
+     "",
+     "",
      L"",
      L"www.google.com"},
     {"http://www.google.com/path",
-     L"",
+     "",
+     "",
      L"",
      L"path"},
     {"file:///",
-     L"",
+     "",
+     "",
      L"",
      L"download"},
     {"view-cache:",
-     L"",
+     "",
+     "",
      L"",
      L"download"},
     {"http://www.google.com/",
-     L"Content-disposition: attachment; filename =\"test.html\"",
+     "Content-disposition: attachment; filename =\"test.html\"",
+     "",
      L"download",
      L"test.html"},
     {"http://www.google.com/",
-     L"",
+     "",
+     "",
      L"download",
      L"download"},
     {"http://www.google.com/",
-     L"Content-disposition: attachment; filename=\"../test.html\"",
+     "Content-disposition: attachment; filename=\"../test.html\"",
+     "",
      L"",
      L"test.html"},
     {"http://www.google.com/",
-     L"Content-disposition: attachment; filename=\"..\"",
+     "Content-disposition: attachment; filename=\"..\"",
+     "",
      L"download",
      L"download"},
     {"http://www.google.com/test.html",
-     L"Content-disposition: attachment; filename=\"..\"",
+     "Content-disposition: attachment; filename=\"..\"",
+     "",
      L"download",
      L"test.html"},
     // Below is a small subset of cases taken from GetFileNameFromCD test above.
     {"http://www.google.com/",
-     L"Content-Disposition: attachment; filename=\"%EC%98%88%EC%88%A0%20"
-     L"%EC%98%88%EC%88%A0.jpg\"",
+     "Content-Disposition: attachment; filename=\"%EC%98%88%EC%88%A0%20"
+     "%EC%98%88%EC%88%A0.jpg\"",
+     "",
      L"",
      L"\uc608\uc220 \uc608\uc220.jpg"},
     {"http://www.google.com/%EC%98%88%EC%88%A0%20%EC%98%88%EC%88%A0.jpg",
-     L"",
+     "",
+     "",
      L"download",
      L"\uc608\uc220 \uc608\uc220.jpg"},
     {"http://www.google.com/",
-     L"Content-disposition: attachment;",
+     "Content-disposition: attachment;",
+     "",
      L"\uB2E4\uC6B4\uB85C\uB4DC",
      L"\uB2E4\uC6B4\uB85C\uB4DC"},
     {"http://www.google.com/",
-     L"Content-Disposition: attachment; filename=\"=?EUC-JP?Q?=B7=DD=BD="
-     L"D13=2Epng?=\"",
+     "Content-Disposition: attachment; filename=\"=?EUC-JP?Q?=B7=DD=BD="
+     "D13=2Epng?=\"",
+     "",
      L"download",
      L"\u82b8\u88533.png"},
+    {"http://www.example.com/images?id=3",
+     "Content-Disposition: attachment; filename=caf\xc3\xa9.png",
+     "iso-8859-1",
+     L"",
+     L"caf\u00e9.png"},
+    {"http://www.example.com/images?id=3",
+     "Content-Disposition: attachment; filename=caf\xe5.png",
+     "windows-1253",
+     L"",
+     L"caf\u03b5.png"},
+    {"http://www.example.com/file?id=3",
+     "Content-Disposition: attachment; name=\xcf\xc2\xd4\xd8.zip",
+     "GBK",
+     L"",
+     L"\u4e0b\u8f7d.zip"},
     // Invalid C-D header. Extracts filename from url.
     {"http://www.google.com/test.html",
-     L"Content-Disposition: attachment; filename==?iiso88591?Q?caf=EG?=",
+     "Content-Disposition: attachment; filename==?iiso88591?Q?caf=EG?=",
+     "",
      L"",
      L"test.html"},
   };
   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(test_cases); ++i) {
     std::wstring filename = net::GetSuggestedFilename(
         GURL(test_cases[i].url), test_cases[i].content_disp_header,
-        test_cases[i].default_filename);
+        test_cases[i].referrer_charset, test_cases[i].default_filename);
     EXPECT_EQ(std::wstring(test_cases[i].expected_filename), filename);
   }
 }
diff --git a/net/url_request/url_request_context.h b/net/url_request/url_request_context.h
index 8c09c90..29735ac 100644
--- a/net/url_request/url_request_context.h
+++ b/net/url_request/url_request_context.h
@@ -70,6 +70,13 @@ class URLRequestContext :
     return EmptyString();
   }
 
+  // In general, referrer_charset is not known when URLRequestContext is
+  // constructed. So, we need a setter.
+  const std::string& referrer_charset() const { return referrer_charset_; }
+  void set_referrer_charset(const std::string& charset) {
+    referrer_charset_ = charset;
+  }
+
  protected:
   friend class base::RefCountedThreadSafe<URLRequestContext>;
 
@@ -85,6 +92,10 @@ class URLRequestContext :
   net::FtpAuthCache ftp_auth_cache_;
   std::string accept_language_;
   std::string accept_charset_;
+  // The charset of the referrer where this request comes from. It's not
+  // used in communication with a server but is used to construct a suggested
+  // filename for file download.
+  std::string referrer_charset_;
 
  private:
   DISALLOW_COPY_AND_ASSIGN(URLRequestContext);