FTP: improve character encoding detection in cases where ICU's first guess is wrong.

Instead of immediately using ICU's first guessed encoding, we ask it for all possible encodings, try them in order, and use the first one that works. For some sites this still results in gibberish being displayed, but at least the links are clickable, so site navigation is possible. BUG=61073 TEST=see bug Review URL: http://codereview.chromium.org/4967001 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@66664 0039d316-1c4b-4281-b951-d872f2087c98
author: phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-11-18 19:50:44 +0000
committer: phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-11-18 19:50:44 +0000
commit: 1e61db506aa4c0014d384c2d990525be38cdbc60 (patch)
tree: 05b6c855ad25c7c541423d85427643d9ae87fb47 /base
parent: 6f6b0041b543f97fab16548168010c2ae799c688 (diff)
download: chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.zip
chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.tar.gz
chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.tar.bz2
2 files changed, 37 insertions, 5 deletions
diff --git a/base/i18n/icu_encoding_detection.cc b/base/i18n/icu_encoding_detection.cc
index 55785c5..d579af2 100644
--- a/base/i18n/icu_encoding_detection.cc
+++ b/base/i18n/icu_encoding_detection.cc
@@ -9,8 +9,6 @@
 
 namespace base {
 
-// TODO(jungshik): We can apply more heuristics here (e.g. using various hints
-// like TLD, the UI language/default encoding of a client, etc).
 bool DetectEncoding(const std::string& text, std::string* encoding) {
   if (IsStringASCII(text)) {
     *encoding = std::string();
@@ -21,9 +19,6 @@ bool DetectEncoding(const std::string& text, std::string* encoding) {
   UCharsetDetector* detector = ucsdet_open(&status);
   ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()),
                  &status);
-  // TODO(jungshik): Should we check the quality of the match? A rather
-  // arbitrary number is assigned by ICU and it's hard to come up with
-  // a lower limit.
   const UCharsetMatch* match = ucsdet_detect(detector, &status);
   const char* detected_encoding = ucsdet_getName(match, &status);
   ucsdet_close(detector);
@@ -35,4 +30,35 @@ bool DetectEncoding(const std::string& text, std::string* encoding) {
   return true;
 }
 
+bool DetectAllEncodings(const std::string& text,
+                        std::vector<std::string>* encodings) {
+  UErrorCode status = U_ZERO_ERROR;
+  UCharsetDetector* detector = ucsdet_open(&status);
+  ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()),
+                 &status);
+  int matches_count = 0;
+  const UCharsetMatch** matches = ucsdet_detectAll(detector,
+                                                   &matches_count,
+                                                   &status);
+  if (U_FAILURE(status)) {
+    ucsdet_close(detector);
+    return false;
+  }
+
+  encodings->clear();
+  for (int i = 0; i < matches_count; i++) {
+    UErrorCode get_name_status = U_ZERO_ERROR;
+    const char* encoding_name = ucsdet_getName(matches[i], &get_name_status);
+
+    // If we failed to get the encoding's name, ignore the error.
+    if (U_FAILURE(get_name_status))
+      continue;
+
+    encodings->push_back(encoding_name);
+  }
+
+  ucsdet_close(detector);
+  return !encodings->empty();
+}
+
 }  // namespace base
diff --git a/base/i18n/icu_encoding_detection.h b/base/i18n/icu_encoding_detection.h
index e7e6253..cdc4cb7 100644
--- a/base/i18n/icu_encoding_detection.h
+++ b/base/i18n/icu_encoding_detection.h
@@ -7,6 +7,7 @@
 #pragma once
 
 #include <string>
+#include <vector>
 
 namespace base {
 
@@ -15,6 +16,11 @@ namespace base {
 // Returns true on success.
 bool DetectEncoding(const std::string& text, std::string* encoding);
 
+// Detect all possible encodings of |text| and put their names
+// (as returned by ICU) in |encodings|. Returns true on success.
+bool DetectAllEncodings(const std::string& text,
+                        std::vector<std::string>* encodings);
+
 }  // namespace base
 
 #endif  // BASE_I18N_ICU_ENCODING_DETECTION_H_
author	phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-11-18 19:50:44 +0000
committer	phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-11-18 19:50:44 +0000
commit	1e61db506aa4c0014d384c2d990525be38cdbc60 (patch)
tree	05b6c855ad25c7c541423d85427643d9ae87fb47 /base
parent	6f6b0041b543f97fab16548168010c2ae799c688 (diff)
download	chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.zip chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.tar.gz chromium_src-1e61db506aa4c0014d384c2d990525be38cdbc60.tar.bz2