diff options
author | phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-04-06 07:26:52 +0000 |
---|---|---|
committer | phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-04-06 07:26:52 +0000 |
commit | da322acaebe30d84d9d8b6931752ab27333c3d36 (patch) | |
tree | c47effc5bec77fd099af537193fd0bff13a9629a /base | |
parent | 8ceea64ea9e45f97f2bac1df84e770ee23cca0e1 (diff) | |
download | chromium_src-da322acaebe30d84d9d8b6931752ab27333c3d36.zip chromium_src-da322acaebe30d84d9d8b6931752ab27333c3d36.tar.gz chromium_src-da322acaebe30d84d9d8b6931752ab27333c3d36.tar.bz2 |
FTP: Multiple fixes for localized directory listings:
- fix detection of KOI8-R and possibly other encodings
- fix parsing Russian month names
When detecting the listing encoding, we need to check not only
whether the data can be converted using the given encoding,
but also whether the result can be parsed as a valid directory listing.
Also, we only need to compare the first three characters of the
month name, because month names are abbreviated to three characters
in FTP directory listings.
Finally, Russian directory listings have the "month" and "day of month" columns swapped.
BUG=65917
Review URL: http://codereview.chromium.org/6718043
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@80587 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r-- | base/i18n/icu_encoding_detection.cc | 40 | ||||
-rw-r--r-- | base/i18n/icu_encoding_detection.h | 2 |
2 files changed, 41 insertions, 1 deletions
diff --git a/base/i18n/icu_encoding_detection.cc b/base/i18n/icu_encoding_detection.cc index d579af2..3583fa9 100644 --- a/base/i18n/icu_encoding_detection.cc +++ b/base/i18n/icu_encoding_detection.cc @@ -1,9 +1,11 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "base/i18n/icu_encoding_detection.h" +#include <set> + #include "base/string_util.h" #include "unicode/ucsdet.h" @@ -45,6 +47,13 @@ bool DetectAllEncodings(const std::string& text, return false; } + // ICU has some heuristics for encoding detection, such that the more likely + // encodings should be returned first. However, it doesn't always return + // all encodings that properly decode |text|, so we'll append more encodings + // later. To make that efficient, keep track of encodings sniffed in this + // first phase. + std::set<std::string> sniffed_encodings; + encodings->clear(); for (int i = 0; i < matches_count; i++) { UErrorCode get_name_status = U_ZERO_ERROR; @@ -54,8 +63,37 @@ bool DetectAllEncodings(const std::string& text, if (U_FAILURE(get_name_status)) continue; + int32_t confidence = ucsdet_getConfidence(matches[i], &get_name_status); + + // We also treat this error as non-fatal. + if (U_FAILURE(get_name_status)) + continue; + + // A confidence level >= 10 means that the encoding is expected to properly + // decode the text. Drop all encodings with lower confidence level. + if (confidence < 10) + continue; + encodings->push_back(encoding_name); + sniffed_encodings.insert(encoding_name); + } + + // Append all encodings not included earlier, in arbitrary order. + // TODO(jshin): This shouldn't be necessary, possible ICU bug. + // See also http://crbug.com/65917. 
+ UEnumeration* detectable_encodings = ucsdet_getAllDetectableCharsets(detector, + &status); + int detectable_count = uenum_count(detectable_encodings, &status); + for (int i = 0; i < detectable_count; i++) { + int name_length; + const char* name_raw = uenum_next(detectable_encodings, + &name_length, + &status); + std::string name(name_raw, name_length); + if (sniffed_encodings.find(name) == sniffed_encodings.end()) + encodings->push_back(name); } + uenum_close(detectable_encodings); ucsdet_close(detector); return !encodings->empty(); diff --git a/base/i18n/icu_encoding_detection.h b/base/i18n/icu_encoding_detection.h index cdc4cb7..552eb3d 100644 --- a/base/i18n/icu_encoding_detection.h +++ b/base/i18n/icu_encoding_detection.h @@ -18,6 +18,8 @@ bool DetectEncoding(const std::string& text, std::string* encoding); // Detect all possible encodings of |text| and put their names // (as returned by ICU) in |encodings|. Returns true on success. +// Note: this function may return encodings that may fail to decode |text|, +// the caller is responsible for handling that. bool DetectAllEncodings(const std::string& text, std::vector<std::string>* encodings); |