diff options
-rw-r--r-- | chrome/browser/autocomplete/autocomplete.cc | 38 | ||||
-rw-r--r-- | chrome/browser/autocomplete/autocomplete_unittest.cc | 4 | ||||
-rw-r--r-- | net/base/net_util.cc | 48 | ||||
-rw-r--r-- | net/base/net_util.h | 11 | ||||
-rw-r--r-- | net/base/net_util_unittest.cc | 31 |
5 files changed, 117 insertions, 15 deletions
diff --git a/chrome/browser/autocomplete/autocomplete.cc b/chrome/browser/autocomplete/autocomplete.cc index 45b9285..492ce3c 100644 --- a/chrome/browser/autocomplete/autocomplete.cc +++ b/chrome/browser/autocomplete/autocomplete.cc @@ -168,22 +168,33 @@ AutocompleteInput::Type AutocompleteInput::Parse( if (registry_length == std::wstring::npos) return QUERY; // Could be a broken IP address, etc. - // A space in the "host" means this is a query. (Technically, IE and GURL - // allow hostnames with spaces for wierd intranet machines, but it's supposed - // to be illegal and I'm not worried about users trying to type these in.) - if (host.find(' ') != std::wstring::npos) + // See if the hostname is valid per RFC 1738. While IE and GURL allow + // hostnames to contain many other characters (perhaps for weird intranet + // machines), it's extremely unlikely that a user would be trying to type + // those in for anything other than a search query. + url_canon::CanonHostInfo host_info; + const std::string canonicalized_host(net::CanonicalizeHost(host, &host_info)); + if ((host_info.family == url_canon::CanonHostInfo::NEUTRAL) && + !net::IsCanonicalizedHostRFC1738Compliant(canonicalized_host)) return QUERY; - // Presence of a password/port mean this is almost certainly a URL. We don't - // treat usernames (without passwords) as indicating a URL, because this could - // be an email address like "user@mail.com" which is more likely a search than - // an HTTP auth login attempt. - if (parts->password.is_nonempty() || parts->port.is_nonempty()) + // Presence of a port means this is likely a URL, if the port is really a port + // number. If it's just garbage after a colon, this is a query. + if (parts->port.is_nonempty()) { + int port; + return (StringToInt(WideToUTF16( + text.substr(parts->port.begin, parts->port.len)), &port) && + (port >= 0) && (port <= 65535)) ? URL : QUERY; + } + + // Presence of a password means this is likely a URL. 
We don't treat + // usernames (without passwords) as indicating a URL, because this could be an + // email address like "user@mail.com" which is more likely a search than an + // HTTP auth login attempt. + if (parts->password.is_nonempty()) return URL; // See if the host is an IP address. - url_canon::CanonHostInfo host_info; - net::CanonicalizeHost(host, &host_info); if (host_info.family == url_canon::CanonHostInfo::IPV4) { // If the user originally typed a host that looks like an IP address (a // dotted quad), they probably want to open it. If the original input was @@ -194,11 +205,8 @@ AutocompleteInput::Type AutocompleteInput::Parse( return URL; return desired_tld.empty() ? UNKNOWN : REQUESTED_URL; } - - if (host_info.family == url_canon::CanonHostInfo::IPV6) { - // If the user typed a valid bracketed IPv6 address, treat it as a URL. + if (host_info.family == url_canon::CanonHostInfo::IPV6) return URL; - } // The host doesn't look like a number, so see if the user's given us a path. 
if (parts->path.is_nonempty()) { diff --git a/chrome/browser/autocomplete/autocomplete_unittest.cc b/chrome/browser/autocomplete/autocomplete_unittest.cc index 40e6ecf..4ab6ea4 100644 --- a/chrome/browser/autocomplete/autocomplete_unittest.cc +++ b/chrome/browser/autocomplete/autocomplete_unittest.cc @@ -211,13 +211,17 @@ TEST(AutocompleteTest, InputType) { { L"?http://foo.com/bar", AutocompleteInput::FORCED_QUERY }, { L"foo", AutocompleteInput::UNKNOWN }, { L"foo.com", AutocompleteInput::URL }, + { L"-.com", AutocompleteInput::QUERY }, { L"foo/bar", AutocompleteInput::URL }, { L"foo/bar baz", AutocompleteInput::UNKNOWN }, { L"http://foo/bar baz", AutocompleteInput::URL }, { L"foo bar", AutocompleteInput::QUERY }, + { L"\"foo:bar\"", AutocompleteInput::QUERY }, { L"link:foo.com", AutocompleteInput::UNKNOWN }, { L"www.foo.com:81", AutocompleteInput::URL }, { L"localhost:8080", AutocompleteInput::URL }, + { L"foo.com:123456", AutocompleteInput::QUERY }, + { L"foo.com:abc", AutocompleteInput::QUERY }, { L"en.wikipedia.org/wiki/James Bond", AutocompleteInput::URL }, // In Chrome itself, mailto: will get handled by ShellExecute, but in // unittest mode, we don't have the data loaded in the external protocol diff --git a/net/base/net_util.cc b/net/base/net_util.cc index 1d7d558..05f5841 100644 --- a/net/base/net_util.cc +++ b/net/base/net_util.cc @@ -939,6 +939,54 @@ std::string GetDirectoryListingHeader(const string16& title) { return result; } +inline bool IsHostCharAlpha(char c) { + // We can just check lowercase because uppercase characters have already been + // normalized. 
+  return (c >= 'a') && (c <= 'z');
+}
+
+inline bool IsHostCharDigit(char c) {
+  return (c >= '0') && (c <= '9');
+}
+
+bool IsCanonicalizedHostRFC1738Compliant(const std::string& host) {
+  if (host.empty())
+    return false;
+
+  enum State {
+    NOT_IN_COMPONENT,
+    IN_COMPONENT_STARTED_DIGIT,
+    IN_COMPONENT_STARTED_ALPHA
+  } state = NOT_IN_COMPONENT;
+  bool last_char_was_hyphen = false;
+  // Single pass over |host|; |state| records how the current component began.
+  for (std::string::const_iterator i(host.begin()); i != host.end(); ++i) {
+    const char c = *i;
+    if (state == NOT_IN_COMPONENT) {
+      if (IsHostCharDigit(c))
+        state = IN_COMPONENT_STARTED_DIGIT;
+      else if (IsHostCharAlpha(c))
+        state = IN_COMPONENT_STARTED_ALPHA;
+      else
+        return false;
+    } else {
+      if (c == '.') {
+        if (last_char_was_hyphen)
+          return false;
+        state = NOT_IN_COMPONENT;
+      } else if (IsHostCharAlpha(c) || IsHostCharDigit(c)) {
+        last_char_was_hyphen = false;
+      } else if (c == '-') {
+        last_char_was_hyphen = true;
+      } else {
+        return false;
+      }
+    }
+  }
+  // The last component gets no '.', so reject its trailing '-' here ("a-").
+  return (state == IN_COMPONENT_STARTED_ALPHA) && !last_char_was_hyphen;
+}
+
 std::string GetDirectoryListingEntry(const string16& name,
                                      const std::string& raw_bytes,
                                      bool is_dir,
diff --git a/net/base/net_util.h b/net/base/net_util.h
index 4d7e0aa..302a55f 100644
--- a/net/base/net_util.h
+++ b/net/base/net_util.h
@@ -158,6 +158,17 @@ std::string CanonicalizeHost(const std::string& host,
 std::string CanonicalizeHost(const std::wstring& host,
                              url_canon::CanonHostInfo* host_info);
 
+// Returns true if |host| is RFC 1738-compliant (and not an IP address).  The
+// rules are:
+//   * One or more components separated by '.'
+//   * Each component begins and ends with an alphanumeric character
+//   * Each component contains only alphanumeric characters and '-'
+//   * The last component does not begin with a digit
+//
+// NOTE: You should only pass in hosts that have been returned from
+// CanonicalizeHost(), or you may not get accurate results.
+bool IsCanonicalizedHostRFC1738Compliant(const std::string& host); + // Call these functions to get the html snippet for a directory listing. // The return values of both functions are in UTF-8. std::string GetDirectoryListingHeader(const string16& title); diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc index 6a01ec9..22d467c 100644 --- a/net/base/net_util_unittest.cc +++ b/net/base/net_util_unittest.cc @@ -344,6 +344,11 @@ const IDNTestCase idn_cases[] = { #endif }; +struct RFC1738Case { + const char* host; + bool expected_output; +}; + struct SuggestedFilenameCase { const char* url; const char* content_disp_header; @@ -810,6 +815,32 @@ TEST(NetUtilTest, IDNToUnicodeSlow) { } } +TEST(NetUtilTest, RFC1738) { + const RFC1738Case rfc1738_cases[] = { + {"", false}, + {"a", true}, + {"-", false}, + {".", false}, + {"a.", false}, + {"a.a", true}, + {"9.a", true}, + {"a.9", false}, + {"a.a9", true}, + {"a.9a", false}, + {"a+9a", false}, + {"1-.a-b", false}, + {"1-2.a-b", true}, + {"a.b.c.d.e", true}, + {"1.2.3.4.e", true}, + {"a.b.c.d.5", false}, + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(rfc1738_cases); ++i) { + EXPECT_EQ(rfc1738_cases[i].expected_output, + net::IsCanonicalizedHostRFC1738Compliant(rfc1738_cases[i].host)); + } +} + TEST(NetUtilTest, StripWWW) { EXPECT_EQ(L"", net::StripWWW(L"")); EXPECT_EQ(L"", net::StripWWW(L"www.")); |