summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-05-11 09:19:30 +0000
committer phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-05-11 09:19:30 +0000
commit 193c35145799c33bde24a470f460587954d70aba (patch)
tree 330c97462719bb266940cec9cd40ea70531d8500
parent 587f4ff6420df90d4e6160aeb78918fcf55e3599 (diff)
download chromium_src-193c35145799c33bde24a470f460587954d70aba.zip
chromium_src-193c35145799c33bde24a470f460587954d70aba.tar.gz
chromium_src-193c35145799c33bde24a470f460587954d70aba.tar.bz2
FTP: fix navigating to files listed under non-ASCII characters
We need to convert the file name back to server encoding.

BUG=38016
TEST=see bug

Review URL: http://codereview.chromium.org/1857002

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@46900 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- base/base.gyp 2
-rw-r--r-- base/i18n/icu_encoding_detection.cc 38
-rw-r--r-- base/i18n/icu_encoding_detection.h 19
-rw-r--r-- net/base/net_error_list.h 3
-rw-r--r-- net/base/net_util.h 10
-rw-r--r-- net/ftp/ftp_directory_listing_buffer.cc 34
-rw-r--r-- net/ftp/ftp_directory_listing_buffer.h 2
-rw-r--r-- webkit/glue/ftp_directory_listing_response_delegate.cc 101
-rw-r--r-- webkit/glue/ftp_directory_listing_response_delegate.h 7
-rw-r--r-- webkit/glue/webkit_glue.gypi 1
10 files changed, 128 insertions, 89 deletions
diff --git a/base/base.gyp b/base/base.gyp
index 88f8702..460319b 100644
--- a/base/base.gyp
+++ b/base/base.gyp
@@ -33,6 +33,8 @@
'sources': [
'i18n/file_util_icu.cc',
'i18n/file_util_icu.h',
+ 'i18n/icu_encoding_detection.cc',
+ 'i18n/icu_encoding_detection.h',
'i18n/icu_string_conversions.cc',
'i18n/icu_string_conversions.h',
'i18n/icu_util.cc',
diff --git a/base/i18n/icu_encoding_detection.cc b/base/i18n/icu_encoding_detection.cc
new file mode 100644
index 0000000..55785c5
--- /dev/null
+++ b/base/i18n/icu_encoding_detection.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/i18n/icu_encoding_detection.h"
+
+#include "base/string_util.h"
+#include "unicode/ucsdet.h"
+
+namespace base {
+
+// Detects the character encoding of |text| with ICU's charset detector.
+// On success, stores the ICU encoding name in |*encoding| (an empty string
+// for pure-ASCII input) and returns true. On failure, returns false and
+// leaves |*encoding| untouched.
+//
+// TODO(jungshik): We can apply more heuristics here (e.g. using various hints
+// like TLD, the UI language/default encoding of a client, etc).
+bool DetectEncoding(const std::string& text, std::string* encoding) {
+  if (IsStringASCII(text)) {
+    *encoding = std::string();
+    return true;
+  }
+
+  UErrorCode status = U_ZERO_ERROR;
+  UCharsetDetector* detector = ucsdet_open(&status);
+  ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()),
+                 &status);
+  // TODO(jungshik): Should we check the quality of the match? A rather
+  // arbitrary number is assigned by ICU and it's hard to come up with
+  // a lower limit.
+  const UCharsetMatch* match = ucsdet_detect(detector, &status);
+  const char* detected_encoding = ucsdet_getName(match, &status);
+
+  // The match and its name are owned by |detector|, so the name has to be
+  // copied into |*encoding| before the detector is closed. Any earlier ICU
+  // failure is carried in |status|, so a single check here covers them all.
+  const bool success = U_SUCCESS(status) && detected_encoding != NULL;
+  if (success)
+    *encoding = detected_encoding;
+
+  ucsdet_close(detector);
+  return success;
+}
+
+} // namespace base
diff --git a/base/i18n/icu_encoding_detection.h b/base/i18n/icu_encoding_detection.h
new file mode 100644
index 0000000..0d8e5d8
--- /dev/null
+++ b/base/i18n/icu_encoding_detection.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_I18N_ICU_ENCODING_DETECTION_H_
+#define BASE_I18N_ICU_ENCODING_DETECTION_H_
+
+#include <string>
+
+namespace base {
+
+// Detects the encoding of |text| and stores its name (as returned by ICU) in
+// |encoding|. For ASCII-only |text|, |encoding| is set to an empty string.
+// Returns true on success; on failure |encoding| is left unmodified.
+bool DetectEncoding(const std::string& text, std::string* encoding);
+
+} // namespace base
+
+#endif // BASE_I18N_ICU_ENCODING_DETECTION_H_
diff --git a/net/base/net_error_list.h b/net/base/net_error_list.h
index c42549d..3338229 100644
--- a/net/base/net_error_list.h
+++ b/net/base/net_error_list.h
@@ -335,6 +335,9 @@ NET_ERROR(INVALID_AUTH_CREDENTIALS, -338)
// machine.
NET_ERROR(UNSUPPORTED_AUTH_SCHEME, -339)
+// Detecting the encoding of the response failed.
+NET_ERROR(ENCODING_DETECTION_FAILED, -340)
+
// The cache does not have the requested entry.
NET_ERROR(CACHE_MISS, -400)
diff --git a/net/base/net_util.h b/net/base/net_util.h
index 77bd69a..5c3e37e 100644
--- a/net/base/net_util.h
+++ b/net/base/net_util.h
@@ -210,10 +210,12 @@ std::string GetDirectoryListingHeader(const string16& title);
// Currently, it's a script tag containing a call to a Javascript function
// |addRow|.
//
-// Its 1st parameter is derived from |name| and is the Javascript-string
-// escaped form of |name| (i.e \uXXXX). The 2nd parameter is the url-escaped
-// |raw_bytes| if it's not empty. If empty, the 2nd parameter is the
-// url-escaped |name| in UTF-8.
+// |name| is the file name to be displayed. |raw_bytes| will be used
+// as the actual target of the link (so for example, ftp links should use
+// server's encoding). If |raw_bytes| is an empty string, UTF-8 encoded |name|
+// will be used.
+//
+// Both |name| and |raw_bytes| are escaped internally.
std::string GetDirectoryListingEntry(const string16& name,
const std::string& raw_bytes,
bool is_dir, int64 size,
diff --git a/net/ftp/ftp_directory_listing_buffer.cc b/net/ftp/ftp_directory_listing_buffer.cc
index bc2db9c..41daeb4 100644
--- a/net/ftp/ftp_directory_listing_buffer.cc
+++ b/net/ftp/ftp_directory_listing_buffer.cc
@@ -4,6 +4,7 @@
#include "net/ftp/ftp_directory_listing_buffer.h"
+#include "base/i18n/icu_encoding_detection.h"
#include "base/i18n/icu_string_conversions.h"
#include "base/stl_util-inl.h"
#include "base/string_util.h"
@@ -13,33 +14,6 @@
#include "net/ftp/ftp_directory_listing_parser_netware.h"
#include "net/ftp/ftp_directory_listing_parser_vms.h"
#include "net/ftp/ftp_directory_listing_parser_windows.h"
-#include "unicode/ucsdet.h"
-
-namespace {
-
-// A very simple-minded character encoding detection.
-// TODO(jungshik): We can apply more heuristics here (e.g. using various hints
-// like TLD, the UI language/default encoding of a client, etc). In that case,
-// this should be pulled out of here and moved somewhere in base because there
-// can be other use cases.
-std::string DetectEncoding(const std::string& text) {
- if (IsStringASCII(text))
- return std::string();
- UErrorCode status = U_ZERO_ERROR;
- UCharsetDetector* detector = ucsdet_open(&status);
- ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()),
- &status);
- const UCharsetMatch* match = ucsdet_detect(detector, &status);
- const char* encoding = ucsdet_getName(match, &status);
- ucsdet_close(detector);
- // Should we check the quality of the match? A rather arbitrary number is
- // assigned by ICU and it's hard to come up with a lower limit.
- if (U_FAILURE(status))
- return std::string();
- return encoding;
-}
-
-} // namespace
namespace net {
@@ -109,8 +83,10 @@ bool FtpDirectoryListingBuffer::ConvertToDetectedEncoding(
}
int FtpDirectoryListingBuffer::ExtractFullLinesFromBuffer() {
- if (encoding_.empty())
- encoding_ = DetectEncoding(buffer_);
+ if (encoding_.empty()) {
+ if (!base::DetectEncoding(buffer_, &encoding_))
+ return ERR_ENCODING_DETECTION_FAILED;
+ }
int cut_pos = 0;
// TODO(phajdan.jr): This code accepts all endlines matching \r*\n. Should it
diff --git a/net/ftp/ftp_directory_listing_buffer.h b/net/ftp/ftp_directory_listing_buffer.h
index 7aead49..4123cf0 100644
--- a/net/ftp/ftp_directory_listing_buffer.h
+++ b/net/ftp/ftp_directory_listing_buffer.h
@@ -45,6 +45,8 @@ class FtpDirectoryListingBuffer {
// time, although it will return SERVER_UNKNOWN if it doesn't know the answer.
FtpServerType GetServerType() const;
+ const std::string& encoding() const { return encoding_; }
+
private:
typedef std::set<FtpDirectoryListingParser*> ParserSet;
diff --git a/webkit/glue/ftp_directory_listing_response_delegate.cc b/webkit/glue/ftp_directory_listing_response_delegate.cc
index d27b7ad..80737db 100644
--- a/webkit/glue/ftp_directory_listing_response_delegate.cc
+++ b/webkit/glue/ftp_directory_listing_response_delegate.cc
@@ -6,6 +6,7 @@
#include <vector>
+#include "base/i18n/icu_encoding_detection.h"
#include "base/i18n/icu_string_conversions.h"
#include "base/logging.h"
#include "base/string_util.h"
@@ -17,7 +18,6 @@
#include "net/base/net_util.h"
#include "net/ftp/ftp_directory_listing_parser.h"
#include "net/ftp/ftp_server_type_histograms.h"
-#include "unicode/ucsdet.h"
#include "third_party/WebKit/WebKit/chromium/public/WebURL.h"
#include "third_party/WebKit/WebKit/chromium/public/WebURLLoaderClient.h"
@@ -29,42 +29,26 @@ using WebKit::WebURLResponse;
namespace {
-// A very simple-minded character encoding detection.
-// TODO(jungshik): We can apply more heuristics here (e.g. using various hints
-// like TLD, the UI language/default encoding of a client, etc). In that case,
-// this should be pulled out of here and moved somewhere in base because there
-// can be other use cases.
-std::string DetectEncoding(const std::string& text) {
- if (IsStringASCII(text))
- return std::string();
- UErrorCode status = U_ZERO_ERROR;
- UCharsetDetector* detector = ucsdet_open(&status);
- ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()),
- &status);
- const UCharsetMatch* match = ucsdet_detect(detector, &status);
- const char* encoding = ucsdet_getName(match, &status);
- ucsdet_close(detector);
- // Should we check the quality of the match? A rather arbitrary number is
- // assigned by ICU and it's hard to come up with a lower limit.
- if (U_FAILURE(status))
- return std::string();
- return encoding;
-}
+string16 ConvertPathToUTF16(const std::string& path) {
+ // Converts |path| to UTF-16. Per RFC 2640, FTP servers should use UTF-8 or
+ // ASCII, but many old servers use legacy encodings, so try UTF-8 first.
+ if (IsStringUTF8(path))
+ return UTF8ToUTF16(path);
+
+ // Not UTF-8: try detecting the encoding. The sample is rather small though,
+ // so detection may fail.
+ std::string encoding;
+ if (base::DetectEncoding(path, &encoding) && !encoding.empty()) {
+ string16 path_utf16;
+ if (base::CodepageToUTF16(path, encoding.c_str(),
+ base::OnStringConversionError::SUBSTITUTE,
+ &path_utf16)) {
+ return path_utf16;
+ }
+ }
-string16 RawByteSequenceToFilename(const char* raw_filename,
- const std::string& encoding) {
- if (encoding.empty())
- return ASCIIToUTF16(raw_filename);
-
- // Try the detected encoding before falling back to the native codepage.
- // Using the native codepage does not make much sense, but we don't have
- // much else to resort to.
- string16 filename;
- if (!base::CodepageToUTF16(raw_filename, encoding.c_str(),
- base::OnStringConversionError::SUBSTITUTE,
- &filename))
- filename = WideToUTF16Hack(base::SysNativeMBToWide(raw_filename));
- return filename;
+ // No usable encoding or conversion failed: fall back to the native encoding.
+ return WideToUTF16Hack(base::SysNativeMBToWide(path));
}
} // namespace
@@ -111,24 +95,8 @@ void FtpDirectoryListingResponseDelegate::Init() {
UnescapeRule::URL_SPECIAL_CHARS;
std::string unescaped_path = UnescapeURLComponent(response_url.path(),
unescape_rules);
- string16 path_utf16;
- // Per RFC 2640, FTP servers should use UTF-8 or its proper subset ASCII,
- // but many old FTP servers use legacy encodings. Try UTF-8 first and
- // detect the encoding.
- if (IsStringUTF8(unescaped_path)) {
- path_utf16 = UTF8ToUTF16(unescaped_path);
- } else {
- std::string encoding = DetectEncoding(unescaped_path);
- // Try the detected encoding. If it fails, resort to the
- // OS native encoding.
- if (encoding.empty() ||
- !base::CodepageToUTF16(unescaped_path, encoding.c_str(),
- base::OnStringConversionError::SUBSTITUTE,
- &path_utf16))
- path_utf16 = WideToUTF16Hack(base::SysNativeMBToWide(unescaped_path));
- }
-
- SendDataToClient(net::GetDirectoryListingHeader(path_utf16));
+ SendDataToClient(net::GetDirectoryListingHeader(
+ ConvertPathToUTF16(unescaped_path)));
// If this isn't top level directory (i.e. the path isn't "/",)
// add a link to the parent directory.
@@ -138,6 +106,18 @@ void FtpDirectoryListingResponseDelegate::Init() {
}
}
+// Re-encodes |filename| into the encoding recorded by |buffer_| and stores
+// the resulting bytes in |raw_bytes|. When |buffer_| reports no encoding,
+// |raw_bytes| is set to an empty string and the call succeeds. Otherwise the
+// result of base::UTF16ToCodepage() (invoked with the FAIL error policy) is
+// returned.
+bool FtpDirectoryListingResponseDelegate::ConvertToServerEncoding(
+    const string16& filename, std::string* raw_bytes) const {
+  const std::string& server_encoding = buffer_.encoding();
+  if (!server_encoding.empty()) {
+    return base::UTF16ToCodepage(filename, server_encoding.c_str(),
+                                 base::OnStringConversionError::FAIL,
+                                 raw_bytes);
+  }
+
+  // Nothing to convert: hand back an empty byte string.
+  raw_bytes->clear();
+  return true;
+}
+
void FtpDirectoryListingResponseDelegate::ProcessReceivedEntries() {
if (!updated_histograms_ && buffer_.EntryAvailable()) {
// Only log the server type if we got enough data to reliably detect it.
@@ -157,8 +137,17 @@ void FtpDirectoryListingResponseDelegate::ProcessReceivedEntries() {
int64 size = entry.size;
if (entry.type != FtpDirectoryListingEntry::FILE)
size = 0;
- SendDataToClient(net::GetDirectoryListingEntry(
- entry.name, std::string(), is_directory, size, entry.last_modified));
+ std::string raw_bytes;
+ if (ConvertToServerEncoding(entry.name, &raw_bytes)) {
+ SendDataToClient(net::GetDirectoryListingEntry(
+ entry.name, raw_bytes, is_directory, size, entry.last_modified));
+ } else {
+ // Consider an encoding problem a non-fatal error. The server's support
+ // for non-ASCII characters might be buggy. Display an error message,
+ // but keep trying to display the rest of the listing (most file names
+ // are ASCII anyway, we could be just unlucky with this one).
+ had_parsing_error_ = true;
+ }
}
}
diff --git a/webkit/glue/ftp_directory_listing_response_delegate.h b/webkit/glue/ftp_directory_listing_response_delegate.h
index 86b5c436..1218da9 100644
--- a/webkit/glue/ftp_directory_listing_response_delegate.h
+++ b/webkit/glue/ftp_directory_listing_response_delegate.h
@@ -33,6 +33,13 @@ class FtpDirectoryListingResponseDelegate {
private:
void Init();
+ // Converts |filename| to detected server encoding and puts the result
+ // in |raw_bytes| (if no conversion is necessary, an empty string is used).
+ // Returns true on success.
+ bool ConvertToServerEncoding(const string16& filename,
+ std::string* raw_bytes) const;
+
+ // Fetches the listing entries from the buffer and sends them to the client.
void ProcessReceivedEntries();
void SendDataToClient(const std::string& data);
diff --git a/webkit/glue/webkit_glue.gypi b/webkit/glue/webkit_glue.gypi
index a0e4437..6cbf8ab 100644
--- a/webkit/glue/webkit_glue.gypi
+++ b/webkit/glue/webkit_glue.gypi
@@ -110,6 +110,7 @@
'msvs_guid': 'C66B126D-0ECE-4CA2-B6DC-FA780AFBBF09',
'dependencies': [
'<(DEPTH)/app/app.gyp:app_base',
+ '<(DEPTH)/base/base.gyp:base_i18n',
'<(DEPTH)/net/net.gyp:net',
'<(DEPTH)/skia/skia.gyp:skia',
'<(DEPTH)/third_party/icu/icu.gyp:icui18n',