diff options
author | phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-11-18 19:50:44 +0000 |
---|---|---|
committer | phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-11-18 19:50:44 +0000 |
commit | 1e61db506aa4c0014d384c2d990525be38cdbc60 (patch) | |
tree | 05b6c855ad25c7c541423d85427643d9ae87fb47 /base | |
parent | 6f6b0041b543f97fab16548168010c2ae799c688 (diff) | |
download | chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.zip chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.tar.gz chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.tar.bz2 |
FTP: improve character encoding detection in cases where ICU's first guess is wrong.
Instead of using ICU's first guessed encoding immediately,
we ask it for all possible encodings, try them in order,
and use the first one that works.
For some sites this still results in gibberish being displayed,
but at least the links are clickable, so site navigation
is possible.
BUG=61073
TEST=see bug
Review URL: http://codereview.chromium.org/4967001
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@66664 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r-- | base/i18n/icu_encoding_detection.cc | 36 | ||||
-rw-r--r-- | base/i18n/icu_encoding_detection.h | 6 |
2 files changed, 37 insertions, 5 deletions
diff --git a/base/i18n/icu_encoding_detection.cc b/base/i18n/icu_encoding_detection.cc index 55785c5..d579af2 100644 --- a/base/i18n/icu_encoding_detection.cc +++ b/base/i18n/icu_encoding_detection.cc @@ -9,8 +9,6 @@ namespace base { -// TODO(jungshik): We can apply more heuristics here (e.g. using various hints -// like TLD, the UI language/default encoding of a client, etc). bool DetectEncoding(const std::string& text, std::string* encoding) { if (IsStringASCII(text)) { *encoding = std::string(); @@ -21,9 +19,6 @@ bool DetectEncoding(const std::string& text, std::string* encoding) { UCharsetDetector* detector = ucsdet_open(&status); ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), &status); - // TODO(jungshik): Should we check the quality of the match? A rather - // arbitrary number is assigned by ICU and it's hard to come up with - // a lower limit. const UCharsetMatch* match = ucsdet_detect(detector, &status); const char* detected_encoding = ucsdet_getName(match, &status); ucsdet_close(detector); @@ -35,4 +30,35 @@ bool DetectEncoding(const std::string& text, std::string* encoding) { return true; } +bool DetectAllEncodings(const std::string& text, + std::vector<std::string>* encodings) { + UErrorCode status = U_ZERO_ERROR; + UCharsetDetector* detector = ucsdet_open(&status); + ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), + &status); + int matches_count = 0; + const UCharsetMatch** matches = ucsdet_detectAll(detector, + &matches_count, + &status); + if (U_FAILURE(status)) { + ucsdet_close(detector); + return false; + } + + encodings->clear(); + for (int i = 0; i < matches_count; i++) { + UErrorCode get_name_status = U_ZERO_ERROR; + const char* encoding_name = ucsdet_getName(matches[i], &get_name_status); + + // If we failed to get the encoding's name, ignore the error. 
+ if (U_FAILURE(get_name_status)) + continue; + + encodings->push_back(encoding_name); + } + + ucsdet_close(detector); + return !encodings->empty(); +} + } // namespace base diff --git a/base/i18n/icu_encoding_detection.h b/base/i18n/icu_encoding_detection.h index e7e6253..cdc4cb7 100644 --- a/base/i18n/icu_encoding_detection.h +++ b/base/i18n/icu_encoding_detection.h @@ -7,6 +7,7 @@ #pragma once #include <string> +#include <vector> namespace base { @@ -15,6 +16,11 @@ namespace base { // Returns true on success. bool DetectEncoding(const std::string& text, std::string* encoding); +// Detect all possible encodings of |text| and put their names +// (as returned by ICU) in |encodings|. Returns true on success. +bool DetectAllEncodings(const std::string& text, + std::vector<std::string>* encodings); + } // namespace base #endif // BASE_I18N_ICU_ENCODING_DETECTION_H_ |