FTP: Multiple fixes for localized directory listings:

- fix detection of KOI8-R and possibly other encodings
- fix parsing Russian month names

When detecting the listing encoding, we need to check not only whether the data can be converted using the given encoding, but also whether the result can be parsed as a valid directory listing.

Also, we only need to compare the first three characters of the abbreviated month name, because that's how they're abbreviated in FTP directory listings.

Finally, the Russian directory listings have the "month" and "day of month" columns swapped.

BUG=65917
Review URL: http://codereview.chromium.org/6718043

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@80587 0039d316-1c4b-4281-b951-d872f2087c98
author: phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-04-06 07:26:52 +0000
committer: phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-04-06 07:26:52 +0000
commit: da322acaebe30d84d9d8b6931752ab27333c3d36 (patch)
tree: c47effc5bec77fd099af537193fd0bff13a9629a /base
parent: 8ceea64ea9e45f97f2bac1df84e770ee23cca0e1 (diff)
download: chromium_src-da322acaebe30d84d9d8b6931752ab27333c3d36.zip
chromium_src-da322acaebe30d84d9d8b6931752ab27333c3d36.tar.gz
chromium_src-da322acaebe30d84d9d8b6931752ab27333c3d36.tar.bz2
2 files changed, 41 insertions, 1 deletions
diff --git a/base/i18n/icu_encoding_detection.cc b/base/i18n/icu_encoding_detection.cc
index d579af2..3583fa9 100644
--- a/base/i18n/icu_encoding_detection.cc
+++ b/base/i18n/icu_encoding_detection.cc
@@ -1,9 +1,11 @@
-// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
 #include "base/i18n/icu_encoding_detection.h"
 
+#include <set>
+
 #include "base/string_util.h"
 #include "unicode/ucsdet.h"
 
@@ -45,6 +47,13 @@ bool DetectAllEncodings(const std::string& text,
     return false;
   }
 
+  // ICU has some heuristics for encoding detection, such that the more likely
+  // encodings should be returned first. However, it doesn't always return
+  // all encodings that properly decode |text|, so we'll append more encodings
+  // later. To make that efficient, keep track of encodings sniffed in this
+  // first phase.
+  std::set<std::string> sniffed_encodings;
+
   encodings->clear();
   for (int i = 0; i < matches_count; i++) {
     UErrorCode get_name_status = U_ZERO_ERROR;
@@ -54,8 +63,37 @@ bool DetectAllEncodings(const std::string& text,
     if (U_FAILURE(get_name_status))
       continue;
 
+    int32_t confidence = ucsdet_getConfidence(matches[i], &get_name_status);
+
+    // We also treat this error as non-fatal.
+    if (U_FAILURE(get_name_status))
+      continue;
+
+    // A confidence level >= 10 means that the encoding is expected to properly
+    // decode the text. Drop all encodings with lower confidence level.
+    if (confidence < 10)
+      continue;
+
     encodings->push_back(encoding_name);
+    sniffed_encodings.insert(encoding_name);
+  }
+
+  // Append all encodings not included earlier, in arbitrary order.
+  // TODO(jshin): This shouldn't be necessary, possible ICU bug.
+  // See also http://crbug.com/65917.
+  UEnumeration* detectable_encodings = ucsdet_getAllDetectableCharsets(detector,
+                                                                       &status);
+  int detectable_count = uenum_count(detectable_encodings, &status);
+  for (int i = 0; i < detectable_count; i++) {
+    int name_length;
+    const char* name_raw = uenum_next(detectable_encodings,
+                                      &name_length,
+                                      &status);
+    std::string name(name_raw, name_length);
+    if (sniffed_encodings.find(name) == sniffed_encodings.end())
+      encodings->push_back(name);
   }
+  uenum_close(detectable_encodings);
 
   ucsdet_close(detector);
   return !encodings->empty();
diff --git a/base/i18n/icu_encoding_detection.h b/base/i18n/icu_encoding_detection.h
index cdc4cb7..552eb3d 100644
--- a/base/i18n/icu_encoding_detection.h
+++ b/base/i18n/icu_encoding_detection.h
@@ -18,6 +18,8 @@ bool DetectEncoding(const std::string& text, std::string* encoding);
 
 // Detect all possible encodings of |text| and put their names
 // (as returned by ICU) in |encodings|. Returns true on success.
+// Note: this function may return encodings that fail to decode |text|;
+// the caller is responsible for handling that.
 bool DetectAllEncodings(const std::string& text,
                         std::vector<std::string>* encodings);
author	phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-04-06 07:26:52 +0000
committer	phajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-04-06 07:26:52 +0000
commit	da322acaebe30d84d9d8b6931752ab27333c3d36 (patch)
tree	c47effc5bec77fd099af537193fd0bff13a9629a /base
parent	8ceea64ea9e45f97f2bac1df84e770ee23cca0e1 (diff)
download	chromium_src-da322acaebe30d84d9d8b6931752ab27333c3d36.zip chromium_src-da322acaebe30d84d9d8b6931752ab27333c3d36.tar.gz chromium_src-da322acaebe30d84d9d8b6931752ab27333c3d36.tar.bz2