author     eroman@chromium.org <eroman@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2009-09-22 03:06:54 +0000
committer  eroman@chromium.org <eroman@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2009-09-22 03:06:54 +0000
commit     8f3c963473091104513c05328fe2fe98989e8339 (patch)
tree       6dcf38d7305ee622a13b56c5f30143eb833148ef /net/proxy
parent     0f3dfb420337c569091ac303081c30c9e060f842 (diff)
Respect the charset specified in PAC file responses.
I have updated the documentation of ProxyResolver and ProxyScriptFetcher to
indicate that the response must always be given as UTF8, so ProxyScriptFetcher
is responsible for any charset conversions internally.

This CL also adds a unit-test to make sure that content-encodings are
respected (like gzip). This was not previously broken, but it is a related
area (and wasn't being tested).

BUG=http://crbug.com/22310
Review URL: http://codereview.chromium.org/210028

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@26790 0039d316-1c4b-4281-b951-d872f2087c98
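For orientation, below is a minimal caller-side sketch of the contract this CL establishes: ProxyScriptFetcher always hands back UTF8, and ProxyResolver expects UTF8 script bytes. It is illustrative only and not part of the patch; the helper name, the raw URLRequestContext* parameter, and the TestCompletionCallback plumbing are assumptions borrowed from the unit test added below.

#include "net/base/net_errors.h"
#include "net/proxy/proxy_resolver_v8.h"
#include "net/proxy/proxy_script_fetcher.h"

namespace net {

// Hypothetical helper (not in this CL): fetch a PAC script and install it on
// a resolver that expects script contents (expects_pac_bytes() == true).
int FetchAndSetPacScript(const GURL& pac_url,
                         URLRequestContext* context,
                         ProxyResolverV8* resolver) {
  scoped_ptr<ProxyScriptFetcher> pac_fetcher(
      ProxyScriptFetcher::Create(context));

  std::string utf8_bytes;            // Always UTF8 after this CL.
  TestCompletionCallback callback;   // Test-style callback, for brevity.
  int rv = pac_fetcher->Fetch(pac_url, &utf8_bytes, &callback);
  if (rv == ERR_IO_PENDING)
    rv = callback.WaitForResult();   // Wait for the asynchronous fetch.
  if (rv != OK)
    return rv;                       // |utf8_bytes| is empty on failure.

  // The fetcher has already decoded any content-encoding (e.g. gzip) and
  // converted the declared charset to UTF8, so the bytes can be handed
  // straight to the resolver. ProxyResolverV8 ignores its callback argument
  // (it completes synchronously), so NULL is safe here.
  return resolver->SetPacScriptByData(utf8_bytes, NULL);
}

}  // namespace net

A resolver that completed SetPacScript asynchronously would need a real CompletionCallback rather than NULL; the V8 resolver in this CL does not.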
Diffstat (limited to 'net/proxy')
-rw-r--r--  net/proxy/proxy_resolver.h                  10
-rw-r--r--  net/proxy/proxy_resolver_v8.cc              14
-rw-r--r--  net/proxy/proxy_resolver_v8.h                2
-rw-r--r--  net/proxy/proxy_script_fetcher.cc           34
-rw-r--r--  net/proxy/proxy_script_fetcher.h             8
-rw-r--r--  net/proxy/proxy_script_fetcher_unittest.cc  35
6 files changed, 84 insertions(+), 19 deletions(-)
diff --git a/net/proxy/proxy_resolver.h b/net/proxy/proxy_resolver.h
index 3811be8..58b09e6 100644
--- a/net/proxy/proxy_resolver.h
+++ b/net/proxy/proxy_resolver.h
@@ -59,10 +59,10 @@ class ProxyResolver {
}
// Sets the PAC script backend to use for this proxy resolver (by contents).
- int SetPacScriptByData(const std::string& bytes,
+ int SetPacScriptByData(const std::string& bytes_utf8,
CompletionCallback* callback) {
DCHECK(expects_pac_bytes());
- return SetPacScript(GURL(), bytes, callback);
+ return SetPacScript(GURL(), bytes_utf8, callback);
}
// TODO(eroman): Make this =0.
@@ -72,12 +72,12 @@ class ProxyResolver {
private:
// Called to set the PAC script backend to use. If |pac_url| is invalid,
- // this is a request to use WPAD (auto detect). |bytes| may be empty if the
- // fetch failed, or if the fetch returned no content.
+ // this is a request to use WPAD (auto detect). |bytes_utf8| may be empty if
+ // the fetch failed, or if the fetch returned no content.
// Returns ERR_IO_PENDING in the case of asynchronous completion, and notifies
// the result through |callback|.
virtual int SetPacScript(const GURL& pac_url,
- const std::string& bytes,
+ const std::string& bytes_utf8,
CompletionCallback* callback) = 0;
const bool expects_pac_bytes_;
diff --git a/net/proxy/proxy_resolver_v8.cc b/net/proxy/proxy_resolver_v8.cc
index cb5eab1..214837d 100644
--- a/net/proxy/proxy_resolver_v8.cc
+++ b/net/proxy/proxy_resolver_v8.cc
@@ -29,7 +29,7 @@ std::string V8StringToStdString(v8::Handle<v8::String> s) {
return result;
}
-// Convert a std::string to a V8 string.
+// Convert a std::string (UTF8) to a V8 string.
v8::Local<v8::String> StdStringToV8String(const std::string& s) {
return v8::String::New(s.data(), s.size());
}
@@ -104,7 +104,7 @@ class ProxyResolverV8::Context {
return OK;
}
- int InitV8(const std::string& pac_data) {
+ int InitV8(const std::string& pac_data_utf8) {
v8::Locker locked;
v8::HandleScope scope;
@@ -133,8 +133,8 @@ class ProxyResolverV8::Context {
v8::TryCatch try_catch;
// Compile the script, including the PAC library functions.
- std::string text_raw = pac_data + PROXY_RESOLVER_SCRIPT;
- v8::Local<v8::String> text = StdStringToV8String(text_raw);
+ std::string text_raw_utf8 = pac_data_utf8 + PROXY_RESOLVER_SCRIPT;
+ v8::Local<v8::String> text = StdStringToV8String(text_raw_utf8);
v8::ScriptOrigin origin = v8::ScriptOrigin(
v8::String::New(kPacResourceName));
v8::Local<v8::Script> code = v8::Script::Compile(text, &origin);
@@ -284,15 +284,15 @@ void ProxyResolverV8::CancelRequest(RequestHandle request) {
}
int ProxyResolverV8::SetPacScript(const GURL& /*url*/,
- const std::string& bytes,
+ const std::string& bytes_utf8,
CompletionCallback* /*callback*/) {
context_.reset();
- if (bytes.empty())
+ if (bytes_utf8.empty())
return ERR_PAC_SCRIPT_FAILED;
// Try parsing the PAC script.
scoped_ptr<Context> context(new Context(js_bindings_.get()));
- int rv = context->InitV8(bytes);
+ int rv = context->InitV8(bytes_utf8);
if (rv == OK)
context_.reset(context.release());
return rv;
diff --git a/net/proxy/proxy_resolver_v8.h b/net/proxy/proxy_resolver_v8.h
index 3f024f2..ad81faa 100644
--- a/net/proxy/proxy_resolver_v8.h
+++ b/net/proxy/proxy_resolver_v8.h
@@ -62,7 +62,7 @@ class ProxyResolverV8 : public ProxyResolver {
// ProxyResolver implementation:
virtual int SetPacScript(const GURL& /*pac_url*/,
- const std::string& bytes,
+ const std::string& bytes_utf8,
CompletionCallback* /*callback*/);
scoped_ptr<Context> context_;
diff --git a/net/proxy/proxy_script_fetcher.cc b/net/proxy/proxy_script_fetcher.cc
index d93c5c2..83189ef 100644
--- a/net/proxy/proxy_script_fetcher.cc
+++ b/net/proxy/proxy_script_fetcher.cc
@@ -43,6 +43,30 @@ bool IsPacMimeType(const std::string& mime_type) {
return false;
}
+// Convert |bytes| (which is encoded by |charset|) in place to UTF8.
+// If |charset| is empty, then we don't know what it was and guess.
+void ConvertResponseToUTF8(const std::string& charset, std::string* bytes) {
+ const char* codepage;
+
+ if (charset.empty()) {
+ // Assume ISO-8859-1 if no charset was specified.
+ codepage = "ISO-8859-1";
+ } else {
+ // Otherwise trust the charset that was provided.
+ codepage = charset.c_str();
+ }
+
+ // We will be generous in the conversion -- if any characters lie
+ // outside of |charset| (i.e. invalid), then substitute them with
+ // U+FFFD rather than failing.
+ std::wstring tmp_wide;
+ CodepageToWide(*bytes, codepage,
+ OnStringUtilConversionError::SUBSTITUTE,
+ &tmp_wide);
+ // TODO(eroman): would be nice to have a CodepageToUTF8() function.
+ *bytes = WideToUTF8(tmp_wide);
+}
+
} // namespace
class ProxyScriptFetcherImpl : public ProxyScriptFetcher,
@@ -273,9 +297,15 @@ void ProxyScriptFetcherImpl::ReadBody(URLRequest* request) {
}
void ProxyScriptFetcherImpl::FetchCompleted() {
- // On error, the caller expects empty string for bytes.
- if (result_code_ != OK)
+ if (result_code_ == OK) {
+ // The caller expects the response to be encoded as UTF8.
+ std::string charset;
+ cur_request_->GetCharset(&charset);
+ ConvertResponseToUTF8(charset, result_bytes_);
+ } else {
+ // On error, the caller expects empty string for bytes.
result_bytes_->clear();
+ }
int result_code = result_code_;
CompletionCallback* callback = callback_;
diff --git a/net/proxy/proxy_script_fetcher.h b/net/proxy/proxy_script_fetcher.h
index e88ab6b..cbdb911 100644
--- a/net/proxy/proxy_script_fetcher.h
+++ b/net/proxy/proxy_script_fetcher.h
@@ -26,9 +26,9 @@ class ProxyScriptFetcher {
// Downloads the given PAC URL, and invokes |callback| on completion.
// On success |callback| is executed with a result code of OK, and a
- // string of the response bytes. On failure, the result bytes is an empty
- // string, and the result code is a network error. Some special network
- // errors that may occur are:
+ // string of the response bytes (as UTF8). On failure, the result bytes is
+ // an empty string, and the result code is a network error. Some special
+ // network errors that may occur are:
//
// ERR_TIMED_OUT -- the fetch took too long to complete.
// ERR_FILE_TOO_BIG -- the response's body was too large.
@@ -39,7 +39,7 @@ class ProxyScriptFetcher {
// deleting |this|), then no callback is invoked.
//
// Only one fetch is allowed to be outstanding at a time.
- virtual int Fetch(const GURL& url, std::string* bytes,
+ virtual int Fetch(const GURL& url, std::string* utf8_bytes,
CompletionCallback* callback) = 0;
// Aborts the in-progress fetch (if any).
diff --git a/net/proxy/proxy_script_fetcher_unittest.cc b/net/proxy/proxy_script_fetcher_unittest.cc
index e94e20b..87f6a9a 100644
--- a/net/proxy/proxy_script_fetcher_unittest.cc
+++ b/net/proxy/proxy_script_fetcher_unittest.cc
@@ -289,4 +289,39 @@ TEST_F(ProxyScriptFetcherTest, Hang) {
}
}
+// The ProxyScriptFetcher should decode any content-codings
+// (like gzip, bzip, etc.), and apply any charset conversions to yield
+// UTF8.
+TEST_F(ProxyScriptFetcherTest, Encodings) {
+ scoped_refptr<HTTPTestServer> server =
+ HTTPTestServer::CreateServer(kDocRoot, NULL);
+ ASSERT_TRUE(NULL != server.get());
+ scoped_refptr<URLRequestContext> context = new RequestContext;
+ scoped_ptr<ProxyScriptFetcher> pac_fetcher(
+ ProxyScriptFetcher::Create(context));
+
+ // Test a response that is gzip-encoded -- should get inflated.
+ {
+ GURL url = server->TestServerPage("files/gzipped_pac");
+ std::string bytes;
+ TestCompletionCallback callback;
+ int result = pac_fetcher->Fetch(url, &bytes, &callback);
+ EXPECT_EQ(ERR_IO_PENDING, result);
+ EXPECT_EQ(OK, callback.WaitForResult());
+ EXPECT_EQ("This data was gzipped.\n", bytes);
+ }
+
+ // Test a response that was served as UTF-16 (BE). It should
+ // be converted to UTF8.
+ {
+ GURL url = server->TestServerPage("files/utf16be_pac");
+ std::string bytes;
+ TestCompletionCallback callback;
+ int result = pac_fetcher->Fetch(url, &bytes, &callback);
+ EXPECT_EQ(ERR_IO_PENDING, result);
+ EXPECT_EQ(OK, callback.WaitForResult());
+ EXPECT_EQ("This was encoded as UTF-16BE.\n", bytes);
+ }
+}
+
} // namespace net