summaryrefslogtreecommitdiffstats
path: root/base
diff options
context:
space:
mode:
authorphajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-11-18 19:50:44 +0000
committerphajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-11-18 19:50:44 +0000
commit1e61db506aa4c0014d384c2d990525be38cdbc60 (patch)
tree05b6c855ad25c7c541423d85427643d9ae87fb47 /base
parent6f6b0041b543f97fab16548168010c2ae799c688 (diff)
downloadchromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.zip
chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.tar.gz
chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.tar.bz2
FTP: improve character encoding detection in cases where ICU's first guess is wrong.
Instead of using ICU's first guessed encoding immediately, we ask it for all possible encodings, try them in order, and use the first one that works. For some sites this still results in a gibberish being displayed, but at least the links are clickable so the site navigation is possible. BUG=61073 TEST=see bug Review URL: http://codereview.chromium.org/4967001 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@66664 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r--base/i18n/icu_encoding_detection.cc36
-rw-r--r--base/i18n/icu_encoding_detection.h6
2 files changed, 37 insertions, 5 deletions
diff --git a/base/i18n/icu_encoding_detection.cc b/base/i18n/icu_encoding_detection.cc
index 55785c5..d579af2 100644
--- a/base/i18n/icu_encoding_detection.cc
+++ b/base/i18n/icu_encoding_detection.cc
@@ -9,8 +9,6 @@
namespace base {
-// TODO(jungshik): We can apply more heuristics here (e.g. using various hints
-// like TLD, the UI language/default encoding of a client, etc).
bool DetectEncoding(const std::string& text, std::string* encoding) {
if (IsStringASCII(text)) {
*encoding = std::string();
@@ -21,9 +19,6 @@ bool DetectEncoding(const std::string& text, std::string* encoding) {
UCharsetDetector* detector = ucsdet_open(&status);
ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()),
&status);
- // TODO(jungshik): Should we check the quality of the match? A rather
- // arbitrary number is assigned by ICU and it's hard to come up with
- // a lower limit.
const UCharsetMatch* match = ucsdet_detect(detector, &status);
const char* detected_encoding = ucsdet_getName(match, &status);
ucsdet_close(detector);
@@ -35,4 +30,35 @@ bool DetectEncoding(const std::string& text, std::string* encoding) {
return true;
}
+bool DetectAllEncodings(const std::string& text,
+ std::vector<std::string>* encodings) {
+ UErrorCode status = U_ZERO_ERROR;
+ UCharsetDetector* detector = ucsdet_open(&status);
+ ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()),
+ &status);
+ int matches_count = 0;
+ const UCharsetMatch** matches = ucsdet_detectAll(detector,
+ &matches_count,
+ &status);
+ if (U_FAILURE(status)) {
+ ucsdet_close(detector);
+ return false;
+ }
+
+ encodings->clear();
+ for (int i = 0; i < matches_count; i++) {
+ UErrorCode get_name_status = U_ZERO_ERROR;
+ const char* encoding_name = ucsdet_getName(matches[i], &get_name_status);
+
+ // If we failed to get the encoding's name, ignore the error.
+ if (U_FAILURE(get_name_status))
+ continue;
+
+ encodings->push_back(encoding_name);
+ }
+
+ ucsdet_close(detector);
+ return !encodings->empty();
+}
+
} // namespace base
diff --git a/base/i18n/icu_encoding_detection.h b/base/i18n/icu_encoding_detection.h
index e7e6253..cdc4cb7 100644
--- a/base/i18n/icu_encoding_detection.h
+++ b/base/i18n/icu_encoding_detection.h
@@ -7,6 +7,7 @@
#pragma once
#include <string>
+#include <vector>
namespace base {
@@ -15,6 +16,11 @@ namespace base {
// Returns true on success.
bool DetectEncoding(const std::string& text, std::string* encoding);
+// Detect all possible encodings of |text| and put their names
+// (as returned by ICU) in |encodings|. Returns true on success.
+bool DetectAllEncodings(const std::string& text,
+ std::vector<std::string>* encodings);
+
} // namespace base
#endif // BASE_I18N_ICU_ENCODING_DETECTION_H_