From 1e61db506aa4c0014d384c2d990525be38cdbc60 Mon Sep 17 00:00:00 2001 From: "phajdan.jr@chromium.org" Date: Thu, 18 Nov 2010 19:50:44 +0000 Subject: FTP: improve character encoding detection in cases where ICU's first guess is wrong. Instead of using ICU's first guessed encoding immediately, we ask it for all possible encodings, try them in order, and use the first one that works. For some sites this still results in gibberish being displayed, but at least the links are clickable so site navigation is possible. BUG=61073 TEST=see bug Review URL: http://codereview.chromium.org/4967001 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@66664 0039d316-1c4b-4281-b951-d872f2087c98 --- net/ftp/ftp_directory_listing_buffer.cc | 67 ++++++++++++++++-------- net/ftp/ftp_directory_listing_buffer.h | 26 ++++++--- net/ftp/ftp_directory_listing_buffer_unittest.cc | 3 ++ 3 files changed, 67 insertions(+), 29 deletions(-) (limited to 'net/ftp') diff --git a/net/ftp/ftp_directory_listing_buffer.cc b/net/ftp/ftp_directory_listing_buffer.cc index a173399..f6e8748 100644 --- a/net/ftp/ftp_directory_listing_buffer.cc +++ b/net/ftp/ftp_directory_listing_buffer.cc @@ -37,7 +37,7 @@ int FtpDirectoryListingBuffer::ConsumeData(const char* data, int data_length) { buffer_.append(data, data_length); if (!encoding_.empty() || buffer_.length() > 1024) { - int rv = ExtractFullLinesFromBuffer(); + int rv = ConsumeBuffer(); if (rv != OK) return rv; } @@ -46,11 +46,12 @@ int FtpDirectoryListingBuffer::ConsumeData(const char* data, int data_length) { } int FtpDirectoryListingBuffer::ProcessRemainingData() { - int rv = ExtractFullLinesFromBuffer(); + int rv = ConsumeBuffer(); if (rv != OK) return rv; - if (!buffer_.empty()) + DCHECK(buffer_.empty()); + if (!converted_buffer_.empty()) return ERR_INVALID_RESPONSE; rv = ParseLines(); @@ -77,38 +78,62 @@ FtpServerType FtpDirectoryListingBuffer::GetServerType() const { return (current_parser_ ? 
current_parser_->GetServerType() : SERVER_UNKNOWN); } -bool FtpDirectoryListingBuffer::ConvertToDetectedEncoding( - const std::string& from, string16* to) { - std::string encoding(encoding_.empty() ? "ascii" : encoding_); - return base::CodepageToUTF16(from, encoding.c_str(), - base::OnStringConversionError::FAIL, to); +int FtpDirectoryListingBuffer::DecodeBufferUsingEncoding( + const std::string& encoding) { + string16 converted; + if (!base::CodepageToUTF16(buffer_, + encoding.c_str(), + base::OnStringConversionError::FAIL, + &converted)) + return ERR_ENCODING_CONVERSION_FAILED; + + buffer_.clear(); + converted_buffer_ += converted; + return OK; } -int FtpDirectoryListingBuffer::ExtractFullLinesFromBuffer() { +int FtpDirectoryListingBuffer::ConvertBufferToUTF16() { if (encoding_.empty()) { - if (!base::DetectEncoding(buffer_, &encoding_)) + std::vector<std::string> encodings; + if (!base::DetectAllEncodings(buffer_, &encodings)) return ERR_ENCODING_DETECTION_FAILED; + + // Use first encoding that can be used to decode the buffer. + for (size_t i = 0; i < encodings.size(); i++) { + if (DecodeBufferUsingEncoding(encodings[i]) == OK) { + encoding_ = encodings[i]; + return OK; + } + } + + return ERR_ENCODING_DETECTION_FAILED; } + return DecodeBufferUsingEncoding(encoding_); +} + +void FtpDirectoryListingBuffer::ExtractFullLinesFromBuffer() { int cut_pos = 0; // TODO(phajdan.jr): This code accepts all endlines matching \r*\n. Should it // be more strict, or enforce consistent line endings? 
- for (size_t i = 0; i < buffer_.length(); ++i) { - if (buffer_[i] != '\n') + for (size_t i = 0; i < converted_buffer_.length(); ++i) { + if (converted_buffer_[i] != '\n') continue; int line_length = i - cut_pos; - if (i >= 1 && buffer_[i - 1] == '\r') + if (i >= 1 && converted_buffer_[i - 1] == '\r') line_length--; - std::string line(buffer_.substr(cut_pos, line_length)); + lines_.push_back(converted_buffer_.substr(cut_pos, line_length)); cut_pos = i + 1; - string16 line_converted; - if (!ConvertToDetectedEncoding(line, &line_converted)) { - buffer_.erase(0, cut_pos); - return ERR_ENCODING_CONVERSION_FAILED; - } - lines_.push_back(line_converted); } - buffer_.erase(0, cut_pos); + converted_buffer_.erase(0, cut_pos); +} + +int FtpDirectoryListingBuffer::ConsumeBuffer() { + int rv = ConvertBufferToUTF16(); + if (rv != OK) + return rv; + + ExtractFullLinesFromBuffer(); return OK; } diff --git a/net/ftp/ftp_directory_listing_buffer.h b/net/ftp/ftp_directory_listing_buffer.h index 0a25fff..ea68932 100644 --- a/net/ftp/ftp_directory_listing_buffer.h +++ b/net/ftp/ftp_directory_listing_buffer.h @@ -51,13 +51,20 @@ class FtpDirectoryListingBuffer { private: typedef std::set ParserSet; - // Converts the string |from| to detected encoding and stores it in |to|. - // Returns true on success. - bool ConvertToDetectedEncoding(const std::string& from, string16* to); + // Decodes the raw buffer using specified |encoding|. On success + // clears the raw buffer and appends data to |converted_buffer_|. + // Returns network error code. + int DecodeBufferUsingEncoding(const std::string& encoding); - // Tries to extract full lines from the raw buffer, converting them to the - // detected encoding. Returns network error code. - int ExtractFullLinesFromBuffer(); + // Converts the raw buffer to UTF-16. Returns network error code. + int ConvertBufferToUTF16(); + + // Extracts lines from the converted buffer, and puts them in |lines_|. 
+ void ExtractFullLinesFromBuffer(); + + // Consumes the raw buffer (i.e. does the character set conversion + // and line splitting). Returns network error code. + int ConsumeBuffer(); // Tries to parse full lines stored in |lines_|. Returns network error code. int ParseLines(); @@ -66,12 +73,15 @@ class FtpDirectoryListingBuffer { // parsers. Returns network error code. int OnEndOfInput(); - // Detected encoding of the response (empty if unknown or ASCII). + // Detected encoding of the response (empty if unknown). std::string encoding_; - // Buffer to keep not-yet-split data. + // Buffer to keep data before character set conversion. std::string buffer_; + // Buffer to keep data before line splitting. + string16 converted_buffer_; + // CRLF-delimited lines, without the CRLF, not yet consumed by parser. std::deque<string16> lines_; diff --git a/net/ftp/ftp_directory_listing_buffer_unittest.cc b/net/ftp/ftp_directory_listing_buffer_unittest.cc index 683e2f7..ceddfc4 100644 --- a/net/ftp/ftp_directory_listing_buffer_unittest.cc +++ b/net/ftp/ftp_directory_listing_buffer_unittest.cc @@ -42,6 +42,9 @@ TEST(FtpDirectoryListingBufferTest, Parse) { "dir-listing-ls-17", "dir-listing-ls-18", "dir-listing-ls-19", + "dir-listing-ls-20", // TODO(phajdan.jr): should use windows-1251 encoding. + "dir-listing-ls-21", // TODO(phajdan.jr): should use windows-1251 encoding. + "dir-listing-ls-22", // TODO(phajdan.jr): should use windows-1251 encoding. "dir-listing-mlsd-1", "dir-listing-mlsd-2", "dir-listing-netware-1", -- cgit v1.1