summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--base/i18n/icu_encoding_detection.cc40
-rw-r--r--base/i18n/icu_encoding_detection.h2
-rw-r--r--net/data/ftp/dir-listing-ls-256
-rw-r--r--net/data/ftp/dir-listing-ls-25.expected53
-rw-r--r--net/data/ftp/dir-listing-ls-266
-rw-r--r--net/data/ftp/dir-listing-ls-26.expected53
-rw-r--r--net/data/ftp/dir-listing-ls-276
-rw-r--r--net/data/ftp/dir-listing-ls-27.expected53
-rw-r--r--net/ftp/ftp_directory_listing_parser.cc114
-rw-r--r--net/ftp/ftp_directory_listing_parser_ls.cc14
-rw-r--r--net/ftp/ftp_directory_listing_parser_ls_unittest.cc13
-rw-r--r--net/ftp/ftp_directory_listing_parser_unittest.cc7
-rw-r--r--net/ftp/ftp_util.cc9
-rw-r--r--net/ftp/ftp_util_unittest.cc19
14 files changed, 339 insertions, 56 deletions
diff --git a/base/i18n/icu_encoding_detection.cc b/base/i18n/icu_encoding_detection.cc
index d579af2..3583fa9 100644
--- a/base/i18n/icu_encoding_detection.cc
+++ b/base/i18n/icu_encoding_detection.cc
@@ -1,9 +1,11 @@
-// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/icu_encoding_detection.h"
+#include <set>
+
#include "base/string_util.h"
#include "unicode/ucsdet.h"
@@ -45,6 +47,13 @@ bool DetectAllEncodings(const std::string& text,
return false;
}
+ // ICU has some heuristics for encoding detection, such that the more likely
+ // encodings should be returned first. However, it doesn't always return
+ // all encodings that properly decode |text|, so we'll append more encodings
+ // later. To make that efficient, keep track of encodings sniffed in this
+ // first phase.
+ std::set<std::string> sniffed_encodings;
+
encodings->clear();
for (int i = 0; i < matches_count; i++) {
UErrorCode get_name_status = U_ZERO_ERROR;
@@ -54,8 +63,37 @@ bool DetectAllEncodings(const std::string& text,
if (U_FAILURE(get_name_status))
continue;
+ int32_t confidence = ucsdet_getConfidence(matches[i], &get_name_status);
+
+ // We also treat this error as non-fatal.
+ if (U_FAILURE(get_name_status))
+ continue;
+
+ // A confidence level >= 10 means that the encoding is expected to properly
+ // decode the text. Drop all encodings with lower confidence level.
+ if (confidence < 10)
+ continue;
+
encodings->push_back(encoding_name);
+ sniffed_encodings.insert(encoding_name);
+ }
+
+ // Append all encodings not included earlier, in arbitrary order.
+ // TODO(jshin): This shouldn't be necessary, possible ICU bug.
+ // See also http://crbug.com/65917.
+ UEnumeration* detectable_encodings = ucsdet_getAllDetectableCharsets(detector,
+ &status);
+ int detectable_count = uenum_count(detectable_encodings, &status);
+ for (int i = 0; i < detectable_count; i++) {
+ int name_length;
+ const char* name_raw = uenum_next(detectable_encodings,
+ &name_length,
+ &status);
+ std::string name(name_raw, name_length);
+ if (sniffed_encodings.find(name) == sniffed_encodings.end())
+ encodings->push_back(name);
}
+ uenum_close(detectable_encodings);
ucsdet_close(detector);
return !encodings->empty();
diff --git a/base/i18n/icu_encoding_detection.h b/base/i18n/icu_encoding_detection.h
index cdc4cb7..552eb3d 100644
--- a/base/i18n/icu_encoding_detection.h
+++ b/base/i18n/icu_encoding_detection.h
@@ -18,6 +18,8 @@ bool DetectEncoding(const std::string& text, std::string* encoding);
// Detect all possible encodings of |text| and put their names
// (as returned by ICU) in |encodings|. Returns true on success.
+// Note: some of the returned encodings may fail to decode |text|;
+// the caller is responsible for handling that.
bool DetectAllEncodings(const std::string& text,
std::vector<std::string>* encodings);
diff --git a/net/data/ftp/dir-listing-ls-25 b/net/data/ftp/dir-listing-ls-25
new file mode 100644
index 0000000..7f36b14
--- /dev/null
+++ b/net/data/ftp/dir-listing-ls-25
@@ -0,0 +1,6 @@
+drwxr-xr-x 3 ftp ftp 4096 15 апр 18:11 .
+drwxr-xr-x 3 ftp ftp 4096 15 июл 18:11 ..
+-rw-r--r-- 1 ftp ftp 528 01 май 2007 .message
+-rw-r--r-- 1 ftp ftp 528 01 ноя 2007 README
+-rw-r--r-- 1 ftp ftp 560 28 сен 2007 index.html
+drwxr-xr-x 33 ftp ftp 4096 12 фев 2008 pub
diff --git a/net/data/ftp/dir-listing-ls-25.expected b/net/data/ftp/dir-listing-ls-25.expected
new file mode 100644
index 0000000..3405f86
--- /dev/null
+++ b/net/data/ftp/dir-listing-ls-25.expected
@@ -0,0 +1,53 @@
+d
+.
+-1
+1994
+4
+15
+18
+11
+
+d
+..
+-1
+1994
+7
+15
+18
+11
+
+-
+.message
+528
+2007
+5
+1
+0
+0
+
+-
+README
+528
+2007
+11
+1
+0
+0
+
+-
+index.html
+560
+2007
+9
+28
+0
+0
+
+d
+pub
+-1
+2008
+2
+12
+0
+0
diff --git a/net/data/ftp/dir-listing-ls-26 b/net/data/ftp/dir-listing-ls-26
new file mode 100644
index 0000000..73161af
--- /dev/null
+++ b/net/data/ftp/dir-listing-ls-26
@@ -0,0 +1,6 @@
+drwxr-xr-x 3 ftp ftp 4096 15 18:11 .
+drwxr-xr-x 3 ftp ftp 4096 15 18:11 ..
+-rw-r--r-- 1 ftp ftp 528 01 2007 .message
+-rw-r--r-- 1 ftp ftp 528 01 2007 README
+-rw-r--r-- 1 ftp ftp 560 28 2007 index.html
+drwxr-xr-x 33 ftp ftp 4096 12 2008 pub
diff --git a/net/data/ftp/dir-listing-ls-26.expected b/net/data/ftp/dir-listing-ls-26.expected
new file mode 100644
index 0000000..3405f86
--- /dev/null
+++ b/net/data/ftp/dir-listing-ls-26.expected
@@ -0,0 +1,53 @@
+d
+.
+-1
+1994
+4
+15
+18
+11
+
+d
+..
+-1
+1994
+7
+15
+18
+11
+
+-
+.message
+528
+2007
+5
+1
+0
+0
+
+-
+README
+528
+2007
+11
+1
+0
+0
+
+-
+index.html
+560
+2007
+9
+28
+0
+0
+
+d
+pub
+-1
+2008
+2
+12
+0
+0
diff --git a/net/data/ftp/dir-listing-ls-27 b/net/data/ftp/dir-listing-ls-27
new file mode 100644
index 0000000..eec958e
--- /dev/null
+++ b/net/data/ftp/dir-listing-ls-27
@@ -0,0 +1,6 @@
+drwxr-xr-x 3 ftp ftp 4096 15 18:11 .
+drwxr-xr-x 3 ftp ftp 4096 15 18:11 ..
+-rw-r--r-- 1 ftp ftp 528 01 2007 .message
+-rw-r--r-- 1 ftp ftp 528 01 2007 README
+-rw-r--r-- 1 ftp ftp 560 28 2007 index.html
+drwxr-xr-x 33 ftp ftp 4096 12 2008 pub
diff --git a/net/data/ftp/dir-listing-ls-27.expected b/net/data/ftp/dir-listing-ls-27.expected
new file mode 100644
index 0000000..3405f86
--- /dev/null
+++ b/net/data/ftp/dir-listing-ls-27.expected
@@ -0,0 +1,53 @@
+d
+.
+-1
+1994
+4
+15
+18
+11
+
+d
+..
+-1
+1994
+7
+15
+18
+11
+
+-
+.message
+528
+2007
+5
+1
+0
+0
+
+-
+README
+528
+2007
+11
+1
+0
+0
+
+-
+index.html
+560
+2007
+9
+28
+0
+0
+
+d
+pub
+-1
+2008
+2
+12
+0
+0
diff --git a/net/ftp/ftp_directory_listing_parser.cc b/net/ftp/ftp_directory_listing_parser.cc
index 8c36bb6..7d47725 100644
--- a/net/ftp/ftp_directory_listing_parser.cc
+++ b/net/ftp/ftp_directory_listing_parser.cc
@@ -16,93 +16,109 @@
#include "net/ftp/ftp_directory_listing_parser_windows.h"
#include "net/ftp/ftp_server_type_histograms.h"
-namespace {
-
-// Converts a string with unknown character encoding to UTF-16. On success
-// fills in |converted_text| and |encoding|. Returns network error code.
-int ConvertStringToUTF16(const std::string& text,
- string16* converted_text,
- std::string* encoding) {
- std::vector<std::string> encodings;
- if (!base::DetectAllEncodings(text, &encodings))
- return net::ERR_ENCODING_DETECTION_FAILED;
-
- // Use first encoding that can be used to decode the text.
- for (size_t i = 0; i < encodings.size(); i++) {
- if (base::CodepageToUTF16(text,
- encodings[i].c_str(),
- base::OnStringConversionError::FAIL,
- converted_text)) {
- *encoding = encodings[i];
- return net::OK;
- }
- }
+namespace net {
- return net::ERR_ENCODING_DETECTION_FAILED;
-}
+namespace {
+// Fills in |raw_name| for all |entries| using |encoding|. Returns network
+// error code.
int FillInRawName(const std::string& encoding,
- std::vector<net::FtpDirectoryListingEntry>* entries) {
+ std::vector<FtpDirectoryListingEntry>* entries) {
for (size_t i = 0; i < entries->size(); i++) {
if (!base::UTF16ToCodepage(entries->at(i).name, encoding.c_str(),
base::OnStringConversionError::FAIL,
&entries->at(i).raw_name)) {
- return net::ERR_ENCODING_CONVERSION_FAILED;
+ return ERR_ENCODING_CONVERSION_FAILED;
}
}
- return net::OK;
+ return OK;
}
-} // namespace
-
-namespace net {
-
-FtpDirectoryListingEntry::FtpDirectoryListingEntry() {
-}
-
-int ParseFtpDirectoryListing(const std::string& text,
- const base::Time& current_time,
- std::vector<FtpDirectoryListingEntry>* entries) {
- std::string encoding;
-
- string16 converted_text;
- int rv = ConvertStringToUTF16(text, &converted_text, &encoding);
- if (rv != OK)
- return rv;
-
+// Parses |text| as an FTP directory listing. Fills in |entries|
+// and |server_type| and returns network error code.
+int ParseListing(const string16& text,
+ const std::string& encoding,
+ const base::Time& current_time,
+ std::vector<FtpDirectoryListingEntry>* entries,
+ FtpServerType* server_type) {
std::vector<string16> lines;
- base::SplitString(converted_text, '\n', &lines);
+ base::SplitString(text, '\n', &lines);
// TODO(phajdan.jr): Use a table of callbacks instead of repeating code.
entries->clear();
if (ParseFtpDirectoryListingLs(lines, current_time, entries)) {
- UpdateFtpServerTypeHistograms(SERVER_LS);
+ *server_type = SERVER_LS;
return FillInRawName(encoding, entries);
}
entries->clear();
if (ParseFtpDirectoryListingWindows(lines, entries)) {
- UpdateFtpServerTypeHistograms(SERVER_WINDOWS);
+ *server_type = SERVER_WINDOWS;
return FillInRawName(encoding, entries);
}
entries->clear();
if (ParseFtpDirectoryListingVms(lines, entries)) {
- UpdateFtpServerTypeHistograms(SERVER_VMS);
+ *server_type = SERVER_VMS;
return FillInRawName(encoding, entries);
}
entries->clear();
if (ParseFtpDirectoryListingNetware(lines, current_time, entries)) {
- UpdateFtpServerTypeHistograms(SERVER_NETWARE);
+ *server_type = SERVER_NETWARE;
return FillInRawName(encoding, entries);
}
entries->clear();
- UpdateFtpServerTypeHistograms(SERVER_UNKNOWN);
+ return ERR_UNRECOGNIZED_FTP_DIRECTORY_LISTING_FORMAT;
+}
+
+// Detects encoding of |text| and parses it as an FTP directory listing.
+// Fills in |entries| and |server_type| and returns network error code.
+int DecodeAndParse(const std::string& text,
+ const base::Time& current_time,
+ std::vector<FtpDirectoryListingEntry>* entries,
+ FtpServerType* server_type) {
+ std::vector<std::string> encodings;
+ if (!base::DetectAllEncodings(text, &encodings))
+ return ERR_ENCODING_DETECTION_FAILED;
+
+ // Use the first encoding that decodes |text| into a parseable listing.
+ for (size_t i = 0; i < encodings.size(); i++) {
+ string16 converted_text;
+ if (base::CodepageToUTF16(text,
+ encodings[i].c_str(),
+ base::OnStringConversionError::FAIL,
+ &converted_text)) {
+ int rv = ParseListing(converted_text,
+ encodings[i],
+ current_time,
+ entries,
+ server_type);
+ if (rv == OK)
+ return rv;
+ }
+ }
+
+ entries->clear();
+ *server_type = SERVER_UNKNOWN;
return ERR_UNRECOGNIZED_FTP_DIRECTORY_LISTING_FORMAT;
}
} // namespace
+
+FtpDirectoryListingEntry::FtpDirectoryListingEntry() {
+}
+
+int ParseFtpDirectoryListing(const std::string& text,
+ const base::Time& current_time,
+ std::vector<FtpDirectoryListingEntry>* entries) {
+ FtpServerType server_type = SERVER_UNKNOWN;
+ int rv = DecodeAndParse(text, current_time, entries, &server_type);
+ UpdateFtpServerTypeHistograms(server_type);
+ return rv;
+}
+
+} // namespace net
diff --git a/net/ftp/ftp_directory_listing_parser_ls.cc b/net/ftp/ftp_directory_listing_parser_ls.cc
index 9d637a8..f7ad6ac 100644
--- a/net/ftp/ftp_directory_listing_parser_ls.cc
+++ b/net/ftp/ftp_directory_listing_parser_ls.cc
@@ -98,6 +98,20 @@ bool DetectColumnOffsetAndModificationTime(const std::vector<string16>& columns,
}
}
+ // Some FTP listings swap the "month" and "day of month" columns (for
+ // example Russian listings). Only try the swapped order after every column
+ // offset has failed in the regular order above, to keep parsing strict.
+ for (size_t i = 5U; i < columns.size(); i++) {
+ if (net::FtpUtil::LsDateListingToTime(columns[i - 1],
+ columns[i - 2],
+ columns[i],
+ current_time,
+ modification_time)) {
+ *offset = i;
+ return true;
+ }
+ }
+
return false;
}
diff --git a/net/ftp/ftp_directory_listing_parser_ls_unittest.cc b/net/ftp/ftp_directory_listing_parser_ls_unittest.cc
index 0414eb9..c05e691 100644
--- a/net/ftp/ftp_directory_listing_parser_ls_unittest.cc
+++ b/net/ftp/ftp_directory_listing_parser_ls_unittest.cc
@@ -44,7 +44,7 @@ TEST_F(FtpDirectoryListingParserLsTest, Good) {
{ "d-wx-wx-wt+ 4 ftp 989 512 Dec 8 15:54 incoming",
FtpDirectoryListingEntry::DIRECTORY, "incoming", -1,
1993, 12, 8, 15, 54 },
- { "drwxrwxrwx 1 owner group 0 Sep 13 0:30 audio",
+ { "drwxrwxrwx 1 owner group 1024 Sep 13 0:30 audio",
FtpDirectoryListingEntry::DIRECTORY, "audio", -1,
1994, 9, 13, 0, 30 },
{ "lrwxrwxrwx 1 0 0 26 Sep 18 2008 pub",
@@ -94,6 +94,15 @@ TEST_F(FtpDirectoryListingParserLsTest, Good) {
{ "drwxrwxr-x 3 %%%% Domain Users 4096 Dec 9 2009 %%%%%",
net::FtpDirectoryListingEntry::DIRECTORY, "%%%%%", -1,
2009, 12, 9, 0, 0 },
+
+ // Tests for "ls -l" style listing in Russian locale (note the swapped
+ // column order: the day of month comes first, before the month).
+ { "-rwxrwxr-x 1 ftp ftp 123 23 \xd0\xbc\xd0\xb0\xd0\xb9 2011 test",
+ net::FtpDirectoryListingEntry::FILE, "test", 123,
+ 2011, 5, 23, 0, 0 },
+ { "drwxrwxr-x 1 ftp ftp 4096 19 \xd0\xbe\xd0\xba\xd1\x82 2011 dir",
+ net::FtpDirectoryListingEntry::DIRECTORY, "dir", -1,
+ 2011, 10, 19, 0, 0 },
};
for (size_t i = 0; i < arraysize(good_cases); i++) {
SCOPED_TRACE(base::StringPrintf("Test[%" PRIuS "]: %s", i,
@@ -150,7 +159,7 @@ TEST_F(FtpDirectoryListingParserLsTest, Bad) {
"qrwwr--r-- 1 ftp ftp 528 Nov 01 2007 README",
"-rw-r--r-- 1 ftp ftp -528 Nov 01 2007 README",
"-rw-r--r-- 1 ftp ftp 528 Foo 01 2007 README",
- "drwxrwxrwx 1 owner group 0 Sep 13 0:3 audio",
+ "drwxrwxrwx 1 owner group 1024 Sep 13 0:3 audio",
"-qqqqqqqqq+ 2 sys 512 Mar 27 2009 pub",
};
diff --git a/net/ftp/ftp_directory_listing_parser_unittest.cc b/net/ftp/ftp_directory_listing_parser_unittest.cc
index b8f0851..6664a89 100644
--- a/net/ftp/ftp_directory_listing_parser_unittest.cc
+++ b/net/ftp/ftp_directory_listing_parser_unittest.cc
@@ -46,6 +46,13 @@ TEST(FtpDirectoryListingBufferTest, Parse) {
"dir-listing-ls-22", // TODO(phajdan.jr): should use windows-1251 encoding.
"dir-listing-ls-23",
"dir-listing-ls-24",
+
+ // Tests for Russian listings. The only difference between those
+ // files is character encoding:
+ "dir-listing-ls-25", // UTF-8
+ "dir-listing-ls-26", // KOI8-R
+ "dir-listing-ls-27", // windows-1251
+
"dir-listing-netware-1",
"dir-listing-netware-2",
"dir-listing-vms-1",
diff --git a/net/ftp/ftp_util.cc b/net/ftp/ftp_util.cc
index 6c7959f..f96fab5 100644
--- a/net/ftp/ftp_util.cc
+++ b/net/ftp/ftp_util.cc
@@ -137,7 +137,12 @@ bool FtpUtil::AbbreviatedMonthToNumber(const string16& text, int* number) {
// An alternative solution (to parse |text| in given locale) is more
// lenient, and may accept more than we want even with setLenient(false).
for (int32_t month = 0; month < months_count; month++) {
- if (months[month].caseCompare(unicode_text, 0) == 0) {
+ // Compare case-insensitively against only the first three characters of
+ // ICU's month name. ICU sometimes returns longer names (for example in
+ // the Russian locale), while FTP listings abbreviate them to three.
+ // Note: ICU may also return names shorter than three characters,
+ // and those should be accepted as well.
+ if (months[month].caseCompare(0, 3, unicode_text, 0) == 0) {
*number = month + 1;
return true;
}
@@ -159,6 +164,8 @@ bool FtpUtil::LsDateListingToTime(const string16& month, const string16& day,
if (!base::StringToInt(day, &time_exploded.day_of_month))
return false;
+ if (time_exploded.day_of_month > 31)
+ return false;
if (!base::StringToInt(rest, &time_exploded.year)) {
// Maybe it's time. Does it look like time (HH:MM)?
diff --git a/net/ftp/ftp_util_unittest.cc b/net/ftp/ftp_util_unittest.cc
index 98ae975..4f26817 100644
--- a/net/ftp/ftp_util_unittest.cc
+++ b/net/ftp/ftp_util_unittest.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -125,13 +125,26 @@ TEST(FtpUtilTest, LsDateListingToTime) {
{ "Nov", "01", "2007", 2007, 11, 1, 0, 0 },
{ "Jul", "25", "13:37", 1994, 7, 25, 13, 37 },
- // Test date listings in German, we should support them for FTP servers
- // giving localized listings.
+ // Test date listings in German.
{ "M\xc3\xa4r", "13", "2009", 2009, 3, 13, 0, 0 },
{ "Mai", "1", "10:10", 1994, 5, 1, 10, 10 },
{ "Okt", "14", "21:18", 1994, 10, 14, 21, 18 },
{ "Dez", "25", "2008", 2008, 12, 25, 0, 0 },
+ // Test date listings in Russian.
+ { "\xd1\x8f\xd0\xbd\xd0\xb2", "1", "2011", 2011, 1, 1, 0, 0 },
+ { "\xd1\x84\xd0\xb5\xd0\xb2", "1", "2011", 2011, 2, 1, 0, 0 },
+ { "\xd0\xbc\xd0\xb0\xd1\x80", "1", "2011", 2011, 3, 1, 0, 0 },
+ { "\xd0\xb0\xd0\xbf\xd1\x80", "1", "2011", 2011, 4, 1, 0, 0 },
+ { "\xd0\xbc\xd0\xb0\xd0\xb9", "1", "2011", 2011, 5, 1, 0, 0 },
+ { "\xd0\xb8\xd1\x8e\xd0\xbd", "1", "2011", 2011, 6, 1, 0, 0 },
+ { "\xd0\xb8\xd1\x8e\xd0\xbb", "1", "2011", 2011, 7, 1, 0, 0 },
+ { "\xd0\xb0\xd0\xb2\xd0\xb3", "1", "2011", 2011, 8, 1, 0, 0 },
+ { "\xd1\x81\xd0\xb5\xd0\xbd", "1", "2011", 2011, 9, 1, 0, 0 },
+ { "\xd0\xbe\xd0\xba\xd1\x82", "1", "2011", 2011, 10, 1, 0, 0 },
+ { "\xd0\xbd\xd0\xbe\xd1\x8f", "1", "2011", 2011, 11, 1, 0, 0 },
+ { "\xd0\xb4\xd0\xb5\xd0\xba", "1", "2011", 2011, 12, 1, 0, 0 },
+
// Test current year detection.
{ "Nov", "01", "12:00", 1994, 11, 1, 12, 0 },
{ "Nov", "15", "12:00", 1994, 11, 15, 12, 0 },