summaryrefslogtreecommitdiffstats
path: root/base
diff options
context:
space:
mode:
authorphajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-04-06 07:26:52 +0000
committerphajdan.jr@chromium.org <phajdan.jr@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-04-06 07:26:52 +0000
commitda322acaebe30d84d9d8b6931752ab27333c3d36 (patch)
treec47effc5bec77fd099af537193fd0bff13a9629a /base
parent8ceea64ea9e45f97f2bac1df84e770ee23cca0e1 (diff)
downloadchromium_src-da322acaebe30d84d9d8b6931752ab27333c3d36.zip
chromium_src-da322acaebe30d84d9d8b6931752ab27333c3d36.tar.gz
chromium_src-da322acaebe30d84d9d8b6931752ab27333c3d36.tar.bz2
FTP: Multiple fixes for localized directory listings:
- fix detection of KOI8-R and possibly other encodings
- fix parsing Russian month names

When detecting the listing encoding, we need to check not only whether the data can be converted using the given encoding, but also whether the result can be parsed as a valid directory listing. Also, we only need to compare the first three characters of the abbreviated month name, because that's how they're abbreviated in FTP directory listings. Finally, Russian directory listings have the "month" and "day of month" columns swapped.

BUG=65917
Review URL: http://codereview.chromium.org/6718043
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@80587 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r--base/i18n/icu_encoding_detection.cc40
-rw-r--r--base/i18n/icu_encoding_detection.h2
2 files changed, 41 insertions, 1 deletions
diff --git a/base/i18n/icu_encoding_detection.cc b/base/i18n/icu_encoding_detection.cc
index d579af2..3583fa9 100644
--- a/base/i18n/icu_encoding_detection.cc
+++ b/base/i18n/icu_encoding_detection.cc
@@ -1,9 +1,11 @@
-// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/icu_encoding_detection.h"
+#include <set>
+
#include "base/string_util.h"
#include "unicode/ucsdet.h"
@@ -45,6 +47,13 @@ bool DetectAllEncodings(const std::string& text,
return false;
}
+ // ICU has some heuristics for encoding detection, such that the more likely
+ // encodings should be returned first. However, it doesn't always return
+ // all encodings that properly decode |text|, so we'll append more encodings
+ // later. To make that efficient, keep track of encodings sniffed in this
+ // first phase.
+ std::set<std::string> sniffed_encodings;
+
encodings->clear();
for (int i = 0; i < matches_count; i++) {
UErrorCode get_name_status = U_ZERO_ERROR;
@@ -54,8 +63,37 @@ bool DetectAllEncodings(const std::string& text,
if (U_FAILURE(get_name_status))
continue;
+ int32_t confidence = ucsdet_getConfidence(matches[i], &get_name_status);
+
+ // We also treat this error as non-fatal.
+ if (U_FAILURE(get_name_status))
+ continue;
+
+ // A confidence level >= 10 means that the encoding is expected to properly
+ // decode the text. Drop all encodings with lower confidence level.
+ if (confidence < 10)
+ continue;
+
encodings->push_back(encoding_name);
+ sniffed_encodings.insert(encoding_name);
+ }
+
+ // Append all encodings not included earlier, in arbitrary order.
+ // TODO(jshin): This shouldn't be necessary, possible ICU bug.
+ // See also http://crbug.com/65917.
+ UEnumeration* detectable_encodings = ucsdet_getAllDetectableCharsets(detector,
+ &status);
+ int detectable_count = uenum_count(detectable_encodings, &status);
+ for (int i = 0; i < detectable_count; i++) {
+ int name_length;
+ const char* name_raw = uenum_next(detectable_encodings,
+ &name_length,
+ &status);
+ std::string name(name_raw, name_length);
+ if (sniffed_encodings.find(name) == sniffed_encodings.end())
+ encodings->push_back(name);
}
+ uenum_close(detectable_encodings);
ucsdet_close(detector);
return !encodings->empty();
diff --git a/base/i18n/icu_encoding_detection.h b/base/i18n/icu_encoding_detection.h
index cdc4cb7..552eb3d 100644
--- a/base/i18n/icu_encoding_detection.h
+++ b/base/i18n/icu_encoding_detection.h
@@ -18,6 +18,8 @@ bool DetectEncoding(const std::string& text, std::string* encoding);
// Detect all possible encodings of |text| and put their names
// (as returned by ICU) in |encodings|. Returns true on success.
+// Note: this function may return encodings that may fail to decode |text|,
+// the caller is responsible for handling that.
bool DetectAllEncodings(const std::string& text,
std::vector<std::string>* encodings);