diff options
author | phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-05-11 09:19:30 +0000 |
---|---|---|
committer | phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-05-11 09:19:30 +0000 |
commit | 193c35145799c33bde24a470f460587954d70aba (patch) | |
tree | 330c97462719bb266940cec9cd40ea70531d8500 | |
parent | 587f4ff6420df90d4e6160aeb78918fcf55e3599 (diff) | |
download | chromium_src-193c35145799c33bde24a470f460587954d70aba.zip chromium_src-193c35145799c33bde24a470f460587954d70aba.tar.gz chromium_src-193c35145799c33bde24a470f460587954d70aba.tar.bz2 |
FTP: fix navigating to listed files whose names contain non-ASCII characters
We need to convert the file name back to the server's encoding before using it as the link target.
BUG=38016
TEST=see bug
Review URL: http://codereview.chromium.org/1857002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@46900 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | base/base.gyp | 2 | ||||
-rw-r--r-- | base/i18n/icu_encoding_detection.cc | 38 | ||||
-rw-r--r-- | base/i18n/icu_encoding_detection.h | 19 | ||||
-rw-r--r-- | net/base/net_error_list.h | 3 | ||||
-rw-r--r-- | net/base/net_util.h | 10 | ||||
-rw-r--r-- | net/ftp/ftp_directory_listing_buffer.cc | 34 | ||||
-rw-r--r-- | net/ftp/ftp_directory_listing_buffer.h | 2 | ||||
-rw-r--r-- | webkit/glue/ftp_directory_listing_response_delegate.cc | 101 | ||||
-rw-r--r-- | webkit/glue/ftp_directory_listing_response_delegate.h | 7 | ||||
-rw-r--r-- | webkit/glue/webkit_glue.gypi | 1 |
10 files changed, 128 insertions, 89 deletions
diff --git a/base/base.gyp b/base/base.gyp index 88f8702..460319b 100644 --- a/base/base.gyp +++ b/base/base.gyp @@ -33,6 +33,8 @@ 'sources': [ 'i18n/file_util_icu.cc', 'i18n/file_util_icu.h', + 'i18n/icu_encoding_detection.cc', + 'i18n/icu_encoding_detection.h', 'i18n/icu_string_conversions.cc', 'i18n/icu_string_conversions.h', 'i18n/icu_util.cc', diff --git a/base/i18n/icu_encoding_detection.cc b/base/i18n/icu_encoding_detection.cc new file mode 100644 index 0000000..55785c5 --- /dev/null +++ b/base/i18n/icu_encoding_detection.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/i18n/icu_encoding_detection.h" + +#include "base/string_util.h" +#include "unicode/ucsdet.h" + +namespace base { + +// TODO(jungshik): We can apply more heuristics here (e.g. using various hints +// like TLD, the UI language/default encoding of a client, etc). +bool DetectEncoding(const std::string& text, std::string* encoding) { + if (IsStringASCII(text)) { + *encoding = std::string(); + return true; + } + + UErrorCode status = U_ZERO_ERROR; + UCharsetDetector* detector = ucsdet_open(&status); + ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), + &status); + // TODO(jungshik): Should we check the quality of the match? A rather + // arbitrary number is assigned by ICU and it's hard to come up with + // a lower limit. 
+ const UCharsetMatch* match = ucsdet_detect(detector, &status); + const char* detected_encoding = ucsdet_getName(match, &status); + ucsdet_close(detector); + + if (U_FAILURE(status)) + return false; + + *encoding = detected_encoding; + return true; +} + +} // namespace base diff --git a/base/i18n/icu_encoding_detection.h b/base/i18n/icu_encoding_detection.h new file mode 100644 index 0000000..0d8e5d8 --- /dev/null +++ b/base/i18n/icu_encoding_detection.h @@ -0,0 +1,19 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_I18N_ICU_ENCODING_DETECTION_H_ +#define BASE_I18N_ICU_ENCODING_DETECTION_H_ + +#include <string> + +namespace base { + +// Detect encoding of |text| and put the name of encoding (as returned by ICU) +// in |encoding|. For ASCII texts |encoding| will be set to an empty string. +// Returns true on success. +bool DetectEncoding(const std::string& text, std::string* encoding); + +} // namespace base + +#endif // BASE_I18N_ICU_ENCODING_DETECTION_H_ diff --git a/net/base/net_error_list.h b/net/base/net_error_list.h index c42549d..3338229 100644 --- a/net/base/net_error_list.h +++ b/net/base/net_error_list.h @@ -335,6 +335,9 @@ NET_ERROR(INVALID_AUTH_CREDENTIALS, -338) // machine. NET_ERROR(UNSUPPORTED_AUTH_SCHEME, -339) +// Detecting the encoding of the response failed. +NET_ERROR(ENCODING_DETECTION_FAILED, -340) + // The cache does not have the requested entry. NET_ERROR(CACHE_MISS, -400) diff --git a/net/base/net_util.h b/net/base/net_util.h index 77bd69a..5c3e37e 100644 --- a/net/base/net_util.h +++ b/net/base/net_util.h @@ -210,10 +210,12 @@ std::string GetDirectoryListingHeader(const string16& title); // Currently, it's a script tag containing a call to a Javascript function // |addRow|. // -// Its 1st parameter is derived from |name| and is the Javascript-string -// escaped form of |name| (i.e \uXXXX). 
The 2nd parameter is the url-escaped -// |raw_bytes| if it's not empty. If empty, the 2nd parameter is the -// url-escaped |name| in UTF-8. +// |name| is the file name to be displayed. |raw_bytes| will be used +// as the actual target of the link (so for example, ftp links should use +// server's encoding). If |raw_bytes| is an empty string, UTF-8 encoded |name| +// will be used. +// +// Both |name| and |raw_bytes| are escaped internally. std::string GetDirectoryListingEntry(const string16& name, const std::string& raw_bytes, bool is_dir, int64 size, diff --git a/net/ftp/ftp_directory_listing_buffer.cc b/net/ftp/ftp_directory_listing_buffer.cc index bc2db9c..41daeb4 100644 --- a/net/ftp/ftp_directory_listing_buffer.cc +++ b/net/ftp/ftp_directory_listing_buffer.cc @@ -4,6 +4,7 @@ #include "net/ftp/ftp_directory_listing_buffer.h" +#include "base/i18n/icu_encoding_detection.h" #include "base/i18n/icu_string_conversions.h" #include "base/stl_util-inl.h" #include "base/string_util.h" @@ -13,33 +14,6 @@ #include "net/ftp/ftp_directory_listing_parser_netware.h" #include "net/ftp/ftp_directory_listing_parser_vms.h" #include "net/ftp/ftp_directory_listing_parser_windows.h" -#include "unicode/ucsdet.h" - -namespace { - -// A very simple-minded character encoding detection. -// TODO(jungshik): We can apply more heuristics here (e.g. using various hints -// like TLD, the UI language/default encoding of a client, etc). In that case, -// this should be pulled out of here and moved somewhere in base because there -// can be other use cases. 
-std::string DetectEncoding(const std::string& text) { - if (IsStringASCII(text)) - return std::string(); - UErrorCode status = U_ZERO_ERROR; - UCharsetDetector* detector = ucsdet_open(&status); - ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), - &status); - const UCharsetMatch* match = ucsdet_detect(detector, &status); - const char* encoding = ucsdet_getName(match, &status); - ucsdet_close(detector); - // Should we check the quality of the match? A rather arbitrary number is - // assigned by ICU and it's hard to come up with a lower limit. - if (U_FAILURE(status)) - return std::string(); - return encoding; -} - -} // namespace namespace net { @@ -109,8 +83,10 @@ bool FtpDirectoryListingBuffer::ConvertToDetectedEncoding( } int FtpDirectoryListingBuffer::ExtractFullLinesFromBuffer() { - if (encoding_.empty()) - encoding_ = DetectEncoding(buffer_); + if (encoding_.empty()) { + if (!base::DetectEncoding(buffer_, &encoding_)) + return ERR_ENCODING_DETECTION_FAILED; + } int cut_pos = 0; // TODO(phajdan.jr): This code accepts all endlines matching \r*\n. Should it diff --git a/net/ftp/ftp_directory_listing_buffer.h b/net/ftp/ftp_directory_listing_buffer.h index 7aead49..4123cf0 100644 --- a/net/ftp/ftp_directory_listing_buffer.h +++ b/net/ftp/ftp_directory_listing_buffer.h @@ -45,6 +45,8 @@ class FtpDirectoryListingBuffer { // time, although it will return SERVER_UNKNOWN if it doesn't know the answer. 
FtpServerType GetServerType() const; + const std::string& encoding() const { return encoding_; } + private: typedef std::set<FtpDirectoryListingParser*> ParserSet; diff --git a/webkit/glue/ftp_directory_listing_response_delegate.cc b/webkit/glue/ftp_directory_listing_response_delegate.cc index d27b7ad..80737db 100644 --- a/webkit/glue/ftp_directory_listing_response_delegate.cc +++ b/webkit/glue/ftp_directory_listing_response_delegate.cc @@ -6,6 +6,7 @@ #include <vector> +#include "base/i18n/icu_encoding_detection.h" #include "base/i18n/icu_string_conversions.h" #include "base/logging.h" #include "base/string_util.h" @@ -17,7 +18,6 @@ #include "net/base/net_util.h" #include "net/ftp/ftp_directory_listing_parser.h" #include "net/ftp/ftp_server_type_histograms.h" -#include "unicode/ucsdet.h" #include "third_party/WebKit/WebKit/chromium/public/WebURL.h" #include "third_party/WebKit/WebKit/chromium/public/WebURLLoaderClient.h" @@ -29,42 +29,26 @@ using WebKit::WebURLResponse; namespace { -// A very simple-minded character encoding detection. -// TODO(jungshik): We can apply more heuristics here (e.g. using various hints -// like TLD, the UI language/default encoding of a client, etc). In that case, -// this should be pulled out of here and moved somewhere in base because there -// can be other use cases. -std::string DetectEncoding(const std::string& text) { - if (IsStringASCII(text)) - return std::string(); - UErrorCode status = U_ZERO_ERROR; - UCharsetDetector* detector = ucsdet_open(&status); - ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), - &status); - const UCharsetMatch* match = ucsdet_detect(detector, &status); - const char* encoding = ucsdet_getName(match, &status); - ucsdet_close(detector); - // Should we check the quality of the match? A rather arbitrary number is - // assigned by ICU and it's hard to come up with a lower limit. 
- if (U_FAILURE(status)) - return std::string(); - return encoding; -} +string16 ConvertPathToUTF16(const std::string& path) { + // Per RFC 2640, FTP servers should use UTF-8 or its proper subset ASCII, + // but many old FTP servers use legacy encodings. Try UTF-8 first. + if (IsStringUTF8(path)) + return UTF8ToUTF16(path); + + // Try detecting the encoding. The sample is rather small though, so it may + // fail. + std::string encoding; + if (base::DetectEncoding(path, &encoding) && !encoding.empty()) { + string16 path_utf16; + if (base::CodepageToUTF16(path, encoding.c_str(), + base::OnStringConversionError::SUBSTITUTE, + &path_utf16)) { + return path_utf16; + } + } -string16 RawByteSequenceToFilename(const char* raw_filename, - const std::string& encoding) { - if (encoding.empty()) - return ASCIIToUTF16(raw_filename); - - // Try the detected encoding before falling back to the native codepage. - // Using the native codepage does not make much sense, but we don't have - // much else to resort to. - string16 filename; - if (!base::CodepageToUTF16(raw_filename, encoding.c_str(), - base::OnStringConversionError::SUBSTITUTE, - &filename)) - filename = WideToUTF16Hack(base::SysNativeMBToWide(raw_filename)); - return filename; + // Use system native encoding as the last resort. + return WideToUTF16Hack(base::SysNativeMBToWide(path)); } } // namespace @@ -111,24 +95,8 @@ void FtpDirectoryListingResponseDelegate::Init() { UnescapeRule::URL_SPECIAL_CHARS; std::string unescaped_path = UnescapeURLComponent(response_url.path(), unescape_rules); - string16 path_utf16; - // Per RFC 2640, FTP servers should use UTF-8 or its proper subset ASCII, - // but many old FTP servers use legacy encodings. Try UTF-8 first and - // detect the encoding. - if (IsStringUTF8(unescaped_path)) { - path_utf16 = UTF8ToUTF16(unescaped_path); - } else { - std::string encoding = DetectEncoding(unescaped_path); - // Try the detected encoding. If it fails, resort to the - // OS native encoding. 
- if (encoding.empty() || - !base::CodepageToUTF16(unescaped_path, encoding.c_str(), - base::OnStringConversionError::SUBSTITUTE, - &path_utf16)) - path_utf16 = WideToUTF16Hack(base::SysNativeMBToWide(unescaped_path)); - } - - SendDataToClient(net::GetDirectoryListingHeader(path_utf16)); + SendDataToClient(net::GetDirectoryListingHeader( + ConvertPathToUTF16(unescaped_path))); // If this isn't top level directory (i.e. the path isn't "/",) // add a link to the parent directory. @@ -138,6 +106,18 @@ void FtpDirectoryListingResponseDelegate::Init() { } } +bool FtpDirectoryListingResponseDelegate::ConvertToServerEncoding( + const string16& filename, std::string* raw_bytes) const { + if (buffer_.encoding().empty()) { + *raw_bytes = std::string(); + return true; + } + + return base::UTF16ToCodepage(filename, buffer_.encoding().c_str(), + base::OnStringConversionError::FAIL, + raw_bytes); +} + void FtpDirectoryListingResponseDelegate::ProcessReceivedEntries() { if (!updated_histograms_ && buffer_.EntryAvailable()) { // Only log the server type if we got enough data to reliably detect it. @@ -157,8 +137,17 @@ void FtpDirectoryListingResponseDelegate::ProcessReceivedEntries() { int64 size = entry.size; if (entry.type != FtpDirectoryListingEntry::FILE) size = 0; - SendDataToClient(net::GetDirectoryListingEntry( - entry.name, std::string(), is_directory, size, entry.last_modified)); + std::string raw_bytes; + if (ConvertToServerEncoding(entry.name, &raw_bytes)) { + SendDataToClient(net::GetDirectoryListingEntry( + entry.name, raw_bytes, is_directory, size, entry.last_modified)); + } else { + // Consider an encoding problem a non-fatal error. The server's support + // for non-ASCII characters might be buggy. Display an error message, + // but keep trying to display the rest of the listing (most file names + // are ASCII anyway, we could be just unlucky with this one). 
+ had_parsing_error_ = true; + } } } diff --git a/webkit/glue/ftp_directory_listing_response_delegate.h b/webkit/glue/ftp_directory_listing_response_delegate.h index 86b5c436..1218da9 100644 --- a/webkit/glue/ftp_directory_listing_response_delegate.h +++ b/webkit/glue/ftp_directory_listing_response_delegate.h @@ -33,6 +33,13 @@ class FtpDirectoryListingResponseDelegate { private: void Init(); + // Converts |filename| to detected server encoding and puts the result + // in |raw_bytes| (if no conversion is necessary, an empty string is used). + // Returns true on success. + bool ConvertToServerEncoding(const string16& filename, + std::string* raw_bytes) const; + + // Fetches the listing entries from the buffer and sends them to the client. void ProcessReceivedEntries(); void SendDataToClient(const std::string& data); diff --git a/webkit/glue/webkit_glue.gypi b/webkit/glue/webkit_glue.gypi index a0e4437..6cbf8ab 100644 --- a/webkit/glue/webkit_glue.gypi +++ b/webkit/glue/webkit_glue.gypi @@ -110,6 +110,7 @@ 'msvs_guid': 'C66B126D-0ECE-4CA2-B6DC-FA780AFBBF09', 'dependencies': [ '<(DEPTH)/app/app.gyp:app_base', + '<(DEPTH)/base/base.gyp:base_i18n', '<(DEPTH)/net/net.gyp:net', '<(DEPTH)/skia/skia.gyp:skia', '<(DEPTH)/third_party/icu/icu.gyp:icui18n', |