diff options
author | phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-11-18 19:50:44 +0000 |
---|---|---|
committer | phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-11-18 19:50:44 +0000 |
commit | 1e61db506aa4c0014d384c2d990525be38cdbc60 (patch) | |
tree | 05b6c855ad25c7c541423d85427643d9ae87fb47 /net/ftp | |
parent | 6f6b0041b543f97fab16548168010c2ae799c688 (diff) | |
download | chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.zip chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.tar.gz chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.tar.bz2 |
FTP: improve character encoding detection in cases where ICU's first guess is wrong.
Instead of using ICU's first guessed encoding immediately,
we ask it for all possible encodings, try them in order,
and use the first one that works.
For some sites this still results in gibberish being displayed,
but at least the links are clickable, so site navigation
is possible.
BUG=61073
TEST=see bug
Review URL: http://codereview.chromium.org/4967001
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@66664 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'net/ftp')
-rw-r--r-- | net/ftp/ftp_directory_listing_buffer.cc | 67 | ||||
-rw-r--r-- | net/ftp/ftp_directory_listing_buffer.h | 26 | ||||
-rw-r--r-- | net/ftp/ftp_directory_listing_buffer_unittest.cc | 3 |
3 files changed, 67 insertions, 29 deletions
diff --git a/net/ftp/ftp_directory_listing_buffer.cc b/net/ftp/ftp_directory_listing_buffer.cc index a173399..f6e8748 100644 --- a/net/ftp/ftp_directory_listing_buffer.cc +++ b/net/ftp/ftp_directory_listing_buffer.cc @@ -37,7 +37,7 @@ int FtpDirectoryListingBuffer::ConsumeData(const char* data, int data_length) { buffer_.append(data, data_length); if (!encoding_.empty() || buffer_.length() > 1024) { - int rv = ExtractFullLinesFromBuffer(); + int rv = ConsumeBuffer(); if (rv != OK) return rv; } @@ -46,11 +46,12 @@ int FtpDirectoryListingBuffer::ConsumeData(const char* data, int data_length) { } int FtpDirectoryListingBuffer::ProcessRemainingData() { - int rv = ExtractFullLinesFromBuffer(); + int rv = ConsumeBuffer(); if (rv != OK) return rv; - if (!buffer_.empty()) + DCHECK(buffer_.empty()); + if (!converted_buffer_.empty()) return ERR_INVALID_RESPONSE; rv = ParseLines(); @@ -77,38 +78,62 @@ FtpServerType FtpDirectoryListingBuffer::GetServerType() const { return (current_parser_ ? current_parser_->GetServerType() : SERVER_UNKNOWN); } -bool FtpDirectoryListingBuffer::ConvertToDetectedEncoding( - const std::string& from, string16* to) { - std::string encoding(encoding_.empty() ? 
"ascii" : encoding_); - return base::CodepageToUTF16(from, encoding.c_str(), - base::OnStringConversionError::FAIL, to); +int FtpDirectoryListingBuffer::DecodeBufferUsingEncoding( + const std::string& encoding) { + string16 converted; + if (!base::CodepageToUTF16(buffer_, + encoding.c_str(), + base::OnStringConversionError::FAIL, + &converted)) + return ERR_ENCODING_CONVERSION_FAILED; + + buffer_.clear(); + converted_buffer_ += converted; + return OK; } -int FtpDirectoryListingBuffer::ExtractFullLinesFromBuffer() { +int FtpDirectoryListingBuffer::ConvertBufferToUTF16() { if (encoding_.empty()) { - if (!base::DetectEncoding(buffer_, &encoding_)) + std::vector<std::string> encodings; + if (!base::DetectAllEncodings(buffer_, &encodings)) return ERR_ENCODING_DETECTION_FAILED; + + // Use first encoding that can be used to decode the buffer. + for (size_t i = 0; i < encodings.size(); i++) { + if (DecodeBufferUsingEncoding(encodings[i]) == OK) { + encoding_ = encodings[i]; + return OK; + } + } + + return ERR_ENCODING_DETECTION_FAILED; } + return DecodeBufferUsingEncoding(encoding_); +} + +void FtpDirectoryListingBuffer::ExtractFullLinesFromBuffer() { int cut_pos = 0; // TODO(phajdan.jr): This code accepts all endlines matching \r*\n. Should it // be more strict, or enforce consistent line endings? 
- for (size_t i = 0; i < buffer_.length(); ++i) { - if (buffer_[i] != '\n') + for (size_t i = 0; i < converted_buffer_.length(); ++i) { + if (converted_buffer_[i] != '\n') continue; int line_length = i - cut_pos; - if (i >= 1 && buffer_[i - 1] == '\r') + if (i >= 1 && converted_buffer_[i - 1] == '\r') line_length--; - std::string line(buffer_.substr(cut_pos, line_length)); + lines_.push_back(converted_buffer_.substr(cut_pos, line_length)); cut_pos = i + 1; - string16 line_converted; - if (!ConvertToDetectedEncoding(line, &line_converted)) { - buffer_.erase(0, cut_pos); - return ERR_ENCODING_CONVERSION_FAILED; - } - lines_.push_back(line_converted); } - buffer_.erase(0, cut_pos); + converted_buffer_.erase(0, cut_pos); +} + +int FtpDirectoryListingBuffer::ConsumeBuffer() { + int rv = ConvertBufferToUTF16(); + if (rv != OK) + return rv; + + ExtractFullLinesFromBuffer(); return OK; } diff --git a/net/ftp/ftp_directory_listing_buffer.h b/net/ftp/ftp_directory_listing_buffer.h index 0a25fff..ea68932 100644 --- a/net/ftp/ftp_directory_listing_buffer.h +++ b/net/ftp/ftp_directory_listing_buffer.h @@ -51,13 +51,20 @@ class FtpDirectoryListingBuffer { private: typedef std::set<FtpDirectoryListingParser*> ParserSet; - // Converts the string |from| to detected encoding and stores it in |to|. - // Returns true on success. - bool ConvertToDetectedEncoding(const std::string& from, string16* to); + // Decodes the raw buffer using specified |encoding|. On success + // clears the raw buffer and appends data to |converted_buffer_|. + // Returns network error code. + int DecodeBufferUsingEncoding(const std::string& encoding); - // Tries to extract full lines from the raw buffer, converting them to the - // detected encoding. Returns network error code. - int ExtractFullLinesFromBuffer(); + // Converts the raw buffer to UTF-16. Returns network error code. + int ConvertBufferToUTF16(); + + // Extracts lines from the converted buffer, and puts them in |lines_|. 
+ void ExtractFullLinesFromBuffer(); + + // Consumes the raw buffer (i.e. does the character set conversion + // and line splitting). Returns network error code. + int ConsumeBuffer(); // Tries to parse full lines stored in |lines_|. Returns network error code. int ParseLines(); @@ -66,12 +73,15 @@ class FtpDirectoryListingBuffer { // parsers. Returns network error code. int OnEndOfInput(); - // Detected encoding of the response (empty if unknown or ASCII). + // Detected encoding of the response (empty if unknown). std::string encoding_; - // Buffer to keep not-yet-split data. + // Buffer to keep data before character set conversion. std::string buffer_; + // Buffer to keep data before line splitting. + string16 converted_buffer_; + // CRLF-delimited lines, without the CRLF, not yet consumed by parser. std::deque<string16> lines_; diff --git a/net/ftp/ftp_directory_listing_buffer_unittest.cc b/net/ftp/ftp_directory_listing_buffer_unittest.cc index 683e2f7..ceddfc4 100644 --- a/net/ftp/ftp_directory_listing_buffer_unittest.cc +++ b/net/ftp/ftp_directory_listing_buffer_unittest.cc @@ -42,6 +42,9 @@ TEST(FtpDirectoryListingBufferTest, Parse) { "dir-listing-ls-17", "dir-listing-ls-18", "dir-listing-ls-19", + "dir-listing-ls-20", // TODO(phajdan.jr): should use windows-1251 encoding. + "dir-listing-ls-21", // TODO(phajdan.jr): should use windows-1251 encoding. + "dir-listing-ls-22", // TODO(phajdan.jr): should use windows-1251 encoding. "dir-listing-mlsd-1", "dir-listing-mlsd-2", "dir-listing-netware-1", |