diff options
-rw-r--r-- | base/i18n/icu_encoding_detection.cc | 40 | ||||
-rw-r--r-- | base/i18n/icu_encoding_detection.h | 2 | ||||
-rw-r--r-- | net/data/ftp/dir-listing-ls-25 | 6 | ||||
-rw-r--r-- | net/data/ftp/dir-listing-ls-25.expected | 53 | ||||
-rw-r--r-- | net/data/ftp/dir-listing-ls-26 | 6 | ||||
-rw-r--r-- | net/data/ftp/dir-listing-ls-26.expected | 53 | ||||
-rw-r--r-- | net/data/ftp/dir-listing-ls-27 | 6 | ||||
-rw-r--r-- | net/data/ftp/dir-listing-ls-27.expected | 53 | ||||
-rw-r--r-- | net/ftp/ftp_directory_listing_parser.cc | 114 | ||||
-rw-r--r-- | net/ftp/ftp_directory_listing_parser_ls.cc | 14 | ||||
-rw-r--r-- | net/ftp/ftp_directory_listing_parser_ls_unittest.cc | 13 | ||||
-rw-r--r-- | net/ftp/ftp_directory_listing_parser_unittest.cc | 7 | ||||
-rw-r--r-- | net/ftp/ftp_util.cc | 9 | ||||
-rw-r--r-- | net/ftp/ftp_util_unittest.cc | 19 |
14 files changed, 339 insertions, 56 deletions
diff --git a/base/i18n/icu_encoding_detection.cc b/base/i18n/icu_encoding_detection.cc index d579af2..3583fa9 100644 --- a/base/i18n/icu_encoding_detection.cc +++ b/base/i18n/icu_encoding_detection.cc @@ -1,9 +1,11 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "base/i18n/icu_encoding_detection.h" +#include <set> + #include "base/string_util.h" #include "unicode/ucsdet.h" @@ -45,6 +47,13 @@ bool DetectAllEncodings(const std::string& text, return false; } + // ICU has some heuristics for encoding detection, such that the more likely + // encodings should be returned first. However, it doesn't always return + // all encodings that properly decode |text|, so we'll append more encodings + // later. To make that efficient, keep track of encodings sniffed in this + // first phase. + std::set<std::string> sniffed_encodings; + encodings->clear(); for (int i = 0; i < matches_count; i++) { UErrorCode get_name_status = U_ZERO_ERROR; @@ -54,8 +63,37 @@ bool DetectAllEncodings(const std::string& text, if (U_FAILURE(get_name_status)) continue; + int32_t confidence = ucsdet_getConfidence(matches[i], &get_name_status); + + // We also treat this error as non-fatal. + if (U_FAILURE(get_name_status)) + continue; + + // A confidence level >= 10 means that the encoding is expected to properly + // decode the text. Drop all encodings with lower confidence level. + if (confidence < 10) + continue; + encodings->push_back(encoding_name); + sniffed_encodings.insert(encoding_name); + } + + // Append all encodings not included earlier, in arbitrary order. + // TODO(jshin): This shouldn't be necessary, possible ICU bug. + // See also http://crbug.com/65917. + UEnumeration* detectable_encodings = ucsdet_getAllDetectableCharsets(detector, + &status); + int detectable_count = uenum_count(detectable_encodings, &status); + for (int i = 0; i < detectable_count; i++) { + int name_length; + const char* name_raw = uenum_next(detectable_encodings, + &name_length, + &status); + std::string name(name_raw, name_length); + if (sniffed_encodings.find(name) == sniffed_encodings.end()) + encodings->push_back(name); } + uenum_close(detectable_encodings); ucsdet_close(detector); return !encodings->empty(); diff --git a/base/i18n/icu_encoding_detection.h b/base/i18n/icu_encoding_detection.h index cdc4cb7..552eb3d 100644 --- a/base/i18n/icu_encoding_detection.h +++ b/base/i18n/icu_encoding_detection.h @@ -18,6 +18,8 @@ bool DetectEncoding(const std::string& text, std::string* encoding); // Detect all possible encodings of |text| and put their names // (as returned by ICU) in |encodings|. Returns true on success. +// Note: this function may return encodings that may fail to decode |text|, +// the caller is responsible for handling that. bool DetectAllEncodings(const std::string& text, std::vector<std::string>* encodings); diff --git a/net/data/ftp/dir-listing-ls-25 b/net/data/ftp/dir-listing-ls-25 new file mode 100644 index 0000000..7f36b14 --- /dev/null +++ b/net/data/ftp/dir-listing-ls-25 @@ -0,0 +1,6 @@ +drwxr-xr-x 3 ftp ftp 4096 15 апр 18:11 .
+drwxr-xr-x 3 ftp ftp 4096 15 июл 18:11 ..
+-rw-r--r-- 1 ftp ftp 528 01 май 2007 .message
+-rw-r--r-- 1 ftp ftp 528 01 ноя 2007 README
+-rw-r--r-- 1 ftp ftp 560 28 сен 2007 index.html
+drwxr-xr-x 33 ftp ftp 4096 12 фев 2008 pub
diff --git a/net/data/ftp/dir-listing-ls-25.expected b/net/data/ftp/dir-listing-ls-25.expected new file mode 100644 index 0000000..3405f86 --- /dev/null +++ b/net/data/ftp/dir-listing-ls-25.expected @@ -0,0 +1,53 @@ +d +. +-1 +1994 +4 +15 +18 +11 + +d +.. +-1 +1994 +7 +15 +18 +11 + +- +.message +528 +2007 +5 +1 +0 +0 + +- +README +528 +2007 +11 +1 +0 +0 + +- +index.html +560 +2007 +9 +28 +0 +0 + +d +pub +-1 +2008 +2 +12 +0 +0 diff --git a/net/data/ftp/dir-listing-ls-26 b/net/data/ftp/dir-listing-ls-26 new file mode 100644 index 0000000..73161af --- /dev/null +++ b/net/data/ftp/dir-listing-ls-26 @@ -0,0 +1,6 @@ +drwxr-xr-x 3 ftp ftp 4096 15 18:11 .
+drwxr-xr-x 3 ftp ftp 4096 15 18:11 ..
+-rw-r--r-- 1 ftp ftp 528 01 2007 .message
+-rw-r--r-- 1 ftp ftp 528 01 2007 README
+-rw-r--r-- 1 ftp ftp 560 28 2007 index.html
+drwxr-xr-x 33 ftp ftp 4096 12 2008 pub
diff --git a/net/data/ftp/dir-listing-ls-26.expected b/net/data/ftp/dir-listing-ls-26.expected new file mode 100644 index 0000000..3405f86 --- /dev/null +++ b/net/data/ftp/dir-listing-ls-26.expected @@ -0,0 +1,53 @@ +d +. +-1 +1994 +4 +15 +18 +11 + +d +.. +-1 +1994 +7 +15 +18 +11 + +- +.message +528 +2007 +5 +1 +0 +0 + +- +README +528 +2007 +11 +1 +0 +0 + +- +index.html +560 +2007 +9 +28 +0 +0 + +d +pub +-1 +2008 +2 +12 +0 +0 diff --git a/net/data/ftp/dir-listing-ls-27 b/net/data/ftp/dir-listing-ls-27 new file mode 100644 index 0000000..eec958e --- /dev/null +++ b/net/data/ftp/dir-listing-ls-27 @@ -0,0 +1,6 @@ +drwxr-xr-x 3 ftp ftp 4096 15 18:11 .
+drwxr-xr-x 3 ftp ftp 4096 15 18:11 ..
+-rw-r--r-- 1 ftp ftp 528 01 2007 .message
+-rw-r--r-- 1 ftp ftp 528 01 2007 README
+-rw-r--r-- 1 ftp ftp 560 28 2007 index.html
+drwxr-xr-x 33 ftp ftp 4096 12 2008 pub
diff --git a/net/data/ftp/dir-listing-ls-27.expected b/net/data/ftp/dir-listing-ls-27.expected new file mode 100644 index 0000000..3405f86 --- /dev/null +++ b/net/data/ftp/dir-listing-ls-27.expected @@ -0,0 +1,53 @@ +d +. +-1 +1994 +4 +15 +18 +11 + +d +.. +-1 +1994 +7 +15 +18 +11 + +- +.message +528 +2007 +5 +1 +0 +0 + +- +README +528 +2007 +11 +1 +0 +0 + +- +index.html +560 +2007 +9 +28 +0 +0 + +d +pub +-1 +2008 +2 +12 +0 +0 diff --git a/net/ftp/ftp_directory_listing_parser.cc b/net/ftp/ftp_directory_listing_parser.cc index 8c36bb6..7d47725 100644 --- a/net/ftp/ftp_directory_listing_parser.cc +++ b/net/ftp/ftp_directory_listing_parser.cc @@ -16,93 +16,109 @@ #include "net/ftp/ftp_directory_listing_parser_windows.h" #include "net/ftp/ftp_server_type_histograms.h" -namespace { - -// Converts a string with unknown character encoding to UTF-16. On success -// fills in |converted_text| and |encoding|. Returns network error code. -int ConvertStringToUTF16(const std::string& text, - string16* converted_text, - std::string* encoding) { - std::vector<std::string> encodings; - if (!base::DetectAllEncodings(text, &encodings)) - return net::ERR_ENCODING_DETECTION_FAILED; - - // Use first encoding that can be used to decode the text. - for (size_t i = 0; i < encodings.size(); i++) { - if (base::CodepageToUTF16(text, - encodings[i].c_str(), - base::OnStringConversionError::FAIL, - converted_text)) { - *encoding = encodings[i]; - return net::OK; - } - } +namespace net { - return net::ERR_ENCODING_DETECTION_FAILED; -} +namespace { +// Fills in |raw_name| for all |entries| using |encoding|. Returns network +// error code. int FillInRawName(const std::string& encoding, - std::vector<net::FtpDirectoryListingEntry>* entries) { + std::vector<FtpDirectoryListingEntry>* entries) { for (size_t i = 0; i < entries->size(); i++) { if (!base::UTF16ToCodepage(entries->at(i).name, encoding.c_str(), base::OnStringConversionError::FAIL, &entries->at(i).raw_name)) { - return net::ERR_ENCODING_CONVERSION_FAILED; + return ERR_ENCODING_CONVERSION_FAILED; } } - return net::OK; + return OK; } -} // namespace - -namespace net { - -FtpDirectoryListingEntry::FtpDirectoryListingEntry() { -} - -int ParseFtpDirectoryListing(const std::string& text, - const base::Time& current_time, - std::vector<FtpDirectoryListingEntry>* entries) { - std::string encoding; - - string16 converted_text; - int rv = ConvertStringToUTF16(text, &converted_text, &encoding); - if (rv != OK) - return rv; - +// Parses |text| as an FTP directory listing. Fills in |entries| +// and |server_type| and returns network error code. +int ParseListing(const string16& text, + const std::string& encoding, + const base::Time& current_time, + std::vector<FtpDirectoryListingEntry>* entries, + FtpServerType* server_type) { std::vector<string16> lines; - base::SplitString(converted_text, '\n', &lines); + base::SplitString(text, '\n', &lines); // TODO(phajdan.jr): Use a table of callbacks instead of repeating code. entries->clear(); if (ParseFtpDirectoryListingLs(lines, current_time, entries)) { - UpdateFtpServerTypeHistograms(SERVER_LS); + *server_type = SERVER_LS; return FillInRawName(encoding, entries); } entries->clear(); if (ParseFtpDirectoryListingWindows(lines, entries)) { - UpdateFtpServerTypeHistograms(SERVER_WINDOWS); + *server_type = SERVER_WINDOWS; return FillInRawName(encoding, entries); } entries->clear(); if (ParseFtpDirectoryListingVms(lines, entries)) { - UpdateFtpServerTypeHistograms(SERVER_VMS); + *server_type = SERVER_VMS; return FillInRawName(encoding, entries); } entries->clear(); if (ParseFtpDirectoryListingNetware(lines, current_time, entries)) { - UpdateFtpServerTypeHistograms(SERVER_NETWARE); + *server_type = SERVER_NETWARE; return FillInRawName(encoding, entries); } entries->clear(); - UpdateFtpServerTypeHistograms(SERVER_UNKNOWN); + return ERR_UNRECOGNIZED_FTP_DIRECTORY_LISTING_FORMAT; +} + +// Detects encoding of |text| and parses it as an FTP directory listing. +// Fills in |entries| and |server_type| and returns network error code. +int DecodeAndParse(const std::string& text, + const base::Time& current_time, + std::vector<FtpDirectoryListingEntry>* entries, + FtpServerType* server_type) { + std::vector<std::string> encodings; + if (!base::DetectAllEncodings(text, &encodings)) + return ERR_ENCODING_DETECTION_FAILED; + + // Use first encoding that can be used to decode the text. + for (size_t i = 0; i < encodings.size(); i++) { + string16 converted_text; + if (base::CodepageToUTF16(text, + encodings[i].c_str(), + base::OnStringConversionError::FAIL, + &converted_text)) { + int rv = ParseListing(converted_text, + encodings[i], + current_time, + entries, + server_type); + if (rv == OK) + return rv; + } + } + + entries->clear(); + *server_type = SERVER_UNKNOWN; return ERR_UNRECOGNIZED_FTP_DIRECTORY_LISTING_FORMAT; } } // namespace + +FtpDirectoryListingEntry::FtpDirectoryListingEntry() { +} + +int ParseFtpDirectoryListing(const std::string& text, + const base::Time& current_time, + std::vector<FtpDirectoryListingEntry>* entries) { + FtpServerType server_type = SERVER_UNKNOWN; + int rv = DecodeAndParse(text, current_time, entries, &server_type); + UpdateFtpServerTypeHistograms(server_type); + return rv; +} + +} // namespace net diff --git a/net/ftp/ftp_directory_listing_parser_ls.cc b/net/ftp/ftp_directory_listing_parser_ls.cc index 9d637a8..f7ad6ac 100644 --- a/net/ftp/ftp_directory_listing_parser_ls.cc +++ b/net/ftp/ftp_directory_listing_parser_ls.cc @@ -98,6 +98,20 @@ bool DetectColumnOffsetAndModificationTime(const std::vector<string16>& columns, } } + // Some FTP listings have swapped the "month" and "day of month" columns + // (for example Russian listings). We try to recognize them only after making + // sure no column offset works above (this is a more strict way). + for (size_t i = 5U; i < columns.size(); i++) { + if (net::FtpUtil::LsDateListingToTime(columns[i - 1], + columns[i - 2], + columns[i], + current_time, + modification_time)) { + *offset = i; + return true; + } + } + return false; } diff --git a/net/ftp/ftp_directory_listing_parser_ls_unittest.cc b/net/ftp/ftp_directory_listing_parser_ls_unittest.cc index 0414eb9..c05e691 100644 --- a/net/ftp/ftp_directory_listing_parser_ls_unittest.cc +++ b/net/ftp/ftp_directory_listing_parser_ls_unittest.cc @@ -44,7 +44,7 @@ TEST_F(FtpDirectoryListingParserLsTest, Good) { { "d-wx-wx-wt+ 4 ftp 989 512 Dec 8 15:54 incoming", FtpDirectoryListingEntry::DIRECTORY, "incoming", -1, 1993, 12, 8, 15, 54 }, - { "drwxrwxrwx 1 owner group 0 Sep 13 0:30 audio", + { "drwxrwxrwx 1 owner group 1024 Sep 13 0:30 audio", FtpDirectoryListingEntry::DIRECTORY, "audio", -1, 1994, 9, 13, 0, 30 }, { "lrwxrwxrwx 1 0 0 26 Sep 18 2008 pub", @@ -94,6 +94,15 @@ TEST_F(FtpDirectoryListingParserLsTest, Good) { { "drwxrwxr-x 3 %%%% Domain Users 4096 Dec 9 2009 %%%%%", net::FtpDirectoryListingEntry::DIRECTORY, "%%%%%", -1, 2009, 12, 9, 0, 0 }, + + // Tests for "ls -l" style listing in Russian locale (note the swapped + // parts order: the day of month is the first, before month). + { "-rwxrwxr-x 1 ftp ftp 123 23 \xd0\xbc\xd0\xb0\xd0\xb9 2011 test", + net::FtpDirectoryListingEntry::FILE, "test", 123, + 2011, 5, 23, 0, 0 }, + { "drwxrwxr-x 1 ftp ftp 4096 19 \xd0\xbe\xd0\xba\xd1\x82 2011 dir", + net::FtpDirectoryListingEntry::DIRECTORY, "dir", -1, + 2011, 10, 19, 0, 0 }, }; for (size_t i = 0; i < arraysize(good_cases); i++) { SCOPED_TRACE(base::StringPrintf("Test[%" PRIuS "]: %s", i, @@ -150,7 +159,7 @@ TEST_F(FtpDirectoryListingParserLsTest, Bad) { "qrwwr--r-- 1 ftp ftp 528 Nov 01 2007 README", "-rw-r--r-- 1 ftp ftp -528 Nov 01 2007 README", "-rw-r--r-- 1 ftp ftp 528 Foo 01 2007 README", - "drwxrwxrwx 1 owner group 0 Sep 13 0:3 audio", + "drwxrwxrwx 1 owner group 1024 Sep 13 0:3 audio", "-qqqqqqqqq+ 2 sys 512 Mar 27 2009 pub", }; diff --git a/net/ftp/ftp_directory_listing_parser_unittest.cc b/net/ftp/ftp_directory_listing_parser_unittest.cc index b8f0851..6664a89 100644 --- a/net/ftp/ftp_directory_listing_parser_unittest.cc +++ b/net/ftp/ftp_directory_listing_parser_unittest.cc @@ -46,6 +46,13 @@ TEST(FtpDirectoryListingBufferTest, Parse) { "dir-listing-ls-22", // TODO(phajdan.jr): should use windows-1251 encoding. "dir-listing-ls-23", "dir-listing-ls-24", + + // Tests for Russian listings. The only difference between those + // files is character encoding: + "dir-listing-ls-25", // UTF-8 + "dir-listing-ls-26", // KOI8-R + "dir-listing-ls-27", // windows-1251 + "dir-listing-netware-1", "dir-listing-netware-2", "dir-listing-vms-1", diff --git a/net/ftp/ftp_util.cc b/net/ftp/ftp_util.cc index 6c7959f..f96fab5 100644 --- a/net/ftp/ftp_util.cc +++ b/net/ftp/ftp_util.cc @@ -137,7 +137,12 @@ bool FtpUtil::AbbreviatedMonthToNumber(const string16& text, int* number) { // An alternative solution (to parse |text| in given locale) is more // lenient, and may accept more than we want even with setLenient(false). for (int32_t month = 0; month < months_count; month++) { - if (months[month].caseCompare(unicode_text, 0) == 0) { + // Compare (case-insensitive), but just first three characters. Sometimes + // ICU returns longer strings (for example for Russian locale), and in FTP + // listings they are abbreviated to just three characters. + // Note: ICU may also return strings shorter than three characters, + // and those also should be accepted. + if (months[month].caseCompare(0, 3, unicode_text, 0) == 0) { *number = month + 1; return true; } @@ -159,6 +164,8 @@ bool FtpUtil::LsDateListingToTime(const string16& month, const string16& day, if (!base::StringToInt(day, &time_exploded.day_of_month)) return false; + if (time_exploded.day_of_month > 31) + return false; if (!base::StringToInt(rest, &time_exploded.year)) { // Maybe it's time. Does it look like time (HH:MM)? diff --git a/net/ftp/ftp_util_unittest.cc b/net/ftp/ftp_util_unittest.cc index 98ae975..4f26817 100644 --- a/net/ftp/ftp_util_unittest.cc +++ b/net/ftp/ftp_util_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -125,13 +125,26 @@ TEST(FtpUtilTest, LsDateListingToTime) { { "Nov", "01", "2007", 2007, 11, 1, 0, 0 }, { "Jul", "25", "13:37", 1994, 7, 25, 13, 37 }, - // Test date listings in German, we should support them for FTP servers - // giving localized listings. + // Test date listings in German. { "M\xc3\xa4r", "13", "2009", 2009, 3, 13, 0, 0 }, { "Mai", "1", "10:10", 1994, 5, 1, 10, 10 }, { "Okt", "14", "21:18", 1994, 10, 14, 21, 18 }, { "Dez", "25", "2008", 2008, 12, 25, 0, 0 }, + // Test date listings in Russian. + { "\xd1\x8f\xd0\xbd\xd0\xb2", "1", "2011", 2011, 1, 1, 0, 0 }, + { "\xd1\x84\xd0\xb5\xd0\xb2", "1", "2011", 2011, 2, 1, 0, 0 }, + { "\xd0\xbc\xd0\xb0\xd1\x80", "1", "2011", 2011, 3, 1, 0, 0 }, + { "\xd0\xb0\xd0\xbf\xd1\x80", "1", "2011", 2011, 4, 1, 0, 0 }, + { "\xd0\xbc\xd0\xb0\xd0\xb9", "1", "2011", 2011, 5, 1, 0, 0 }, + { "\xd0\xb8\xd1\x8e\xd0\xbd", "1", "2011", 2011, 6, 1, 0, 0 }, + { "\xd0\xb8\xd1\x8e\xd0\xbb", "1", "2011", 2011, 7, 1, 0, 0 }, + { "\xd0\xb0\xd0\xb2\xd0\xb3", "1", "2011", 2011, 8, 1, 0, 0 }, + { "\xd1\x81\xd0\xb5\xd0\xbd", "1", "2011", 2011, 9, 1, 0, 0 }, + { "\xd0\xbe\xd0\xba\xd1\x82", "1", "2011", 2011, 10, 1, 0, 0 }, + { "\xd0\xbd\xd0\xbe\xd1\x8f", "1", "2011", 2011, 11, 1, 0, 0 }, + { "\xd0\xb4\xd0\xb5\xd0\xba", "1", "2011", 2011, 12, 1, 0, 0 }, + // Test current year detection. { "Nov", "01", "12:00", 1994, 11, 1, 12, 0 }, { "Nov", "15", "12:00", 1994, 11, 15, 12, 0 }, |