5 files changed, 117 insertions, 15 deletions
diff --git a/chrome/browser/autocomplete/autocomplete.cc b/chrome/browser/autocomplete/autocomplete.cc
index 45b9285..492ce3c 100644
--- a/chrome/browser/autocomplete/autocomplete.cc
+++ b/chrome/browser/autocomplete/autocomplete.cc
@@ -168,22 +168,33 @@ AutocompleteInput::Type AutocompleteInput::Parse(
   if (registry_length == std::wstring::npos)
     return QUERY;  // Could be a broken IP address, etc.
 
-  // A space in the "host" means this is a query.  (Technically, IE and GURL
-  // allow hostnames with spaces for wierd intranet machines, but it's supposed
-  // to be illegal and I'm not worried about users trying to type these in.)
-  if (host.find(' ') != std::wstring::npos)
+  // See if the hostname is valid per RFC 1738.  While IE and GURL allow
+  // hostnames to contain many other characters (perhaps for weird intranet
+  // machines), it's extremely unlikely that a user would be trying to type
+  // those in for anything other than a search query.
+  url_canon::CanonHostInfo host_info;
+  const std::string canonicalized_host(net::CanonicalizeHost(host, &host_info));
+  if ((host_info.family == url_canon::CanonHostInfo::NEUTRAL) &&
+      !net::IsCanonicalizedHostRFC1738Compliant(canonicalized_host))
     return QUERY;
 
-  // Presence of a password/port mean this is almost certainly a URL.  We don't
-  // treat usernames (without passwords) as indicating a URL, because this could
-  // be an email address like "user@mail.com" which is more likely a search than
-  // an HTTP auth login attempt.
-  if (parts->password.is_nonempty() || parts->port.is_nonempty())
+  // Presence of a port means this is likely a URL, if the port is really a port
+  // number.  If it's just garbage after a colon, this is a query.
+  if (parts->port.is_nonempty()) {
+    int port;
+    return (StringToInt(WideToUTF16(
+                text.substr(parts->port.begin, parts->port.len)), &port) &&
+            (port >= 0) && (port <= 65535)) ? URL : QUERY;
+  }
+
+  // Presence of a password means this is likely a URL.  We don't treat
+  // usernames (without passwords) as indicating a URL, because this could be an
+  // email address like "user@mail.com" which is more likely a search than an
+  // HTTP auth login attempt.
+  if (parts->password.is_nonempty())
     return URL;
 
   // See if the host is an IP address.
-  url_canon::CanonHostInfo host_info;
-  net::CanonicalizeHost(host, &host_info);
   if (host_info.family == url_canon::CanonHostInfo::IPV4) {
     // If the user originally typed a host that looks like an IP address (a
     // dotted quad), they probably want to open it.  If the original input was
@@ -194,11 +205,8 @@ AutocompleteInput::Type AutocompleteInput::Parse(
       return URL;
     return desired_tld.empty() ? UNKNOWN : REQUESTED_URL;
   }
-
-  if (host_info.family == url_canon::CanonHostInfo::IPV6) {
-    // If the user typed a valid bracketed IPv6 address, treat it as a URL.
+  if (host_info.family == url_canon::CanonHostInfo::IPV6)
     return URL;
-  }
 
   // The host doesn't look like a number, so see if the user's given us a path.
   if (parts->path.is_nonempty()) {
diff --git a/chrome/browser/autocomplete/autocomplete_unittest.cc b/chrome/browser/autocomplete/autocomplete_unittest.cc
index 40e6ecf..4ab6ea4 100644
--- a/chrome/browser/autocomplete/autocomplete_unittest.cc
+++ b/chrome/browser/autocomplete/autocomplete_unittest.cc
@@ -211,13 +211,17 @@ TEST(AutocompleteTest, InputType) {
     { L"?http://foo.com/bar", AutocompleteInput::FORCED_QUERY },
     { L"foo", AutocompleteInput::UNKNOWN },
     { L"foo.com", AutocompleteInput::URL },
+    { L"-.com", AutocompleteInput::QUERY },
     { L"foo/bar", AutocompleteInput::URL },
     { L"foo/bar baz", AutocompleteInput::UNKNOWN },
     { L"http://foo/bar baz", AutocompleteInput::URL },
     { L"foo bar", AutocompleteInput::QUERY },
+    { L"\"foo:bar\"", AutocompleteInput::QUERY },
     { L"link:foo.com", AutocompleteInput::UNKNOWN },
     { L"www.foo.com:81", AutocompleteInput::URL },
     { L"localhost:8080", AutocompleteInput::URL },
+    { L"foo.com:123456", AutocompleteInput::QUERY },
+    { L"foo.com:abc", AutocompleteInput::QUERY },
     { L"en.wikipedia.org/wiki/James Bond", AutocompleteInput::URL },
     // In Chrome itself, mailto: will get handled by ShellExecute, but in
     // unittest mode, we don't have the data loaded in the external protocol
diff --git a/net/base/net_util.cc b/net/base/net_util.cc
index 1d7d558..05f5841 100644
--- a/net/base/net_util.cc
+++ b/net/base/net_util.cc
@@ -939,6 +939,54 @@ std::string GetDirectoryListingHeader(const string16& title) {
   return result;
 }
 
+// Lowercase only; uppercase characters have already been normalized.
+inline bool IsHostCharAlpha(char c) {
+  return (c >= 'a') && (c <= 'z');
+}
+
+// ASCII decimal digits only.
+inline bool IsHostCharDigit(char c) {
+  return (c >= '0') && (c <= '9');
+}
+
+// Scans |host| once, tracking how the current '.'-separated component began.
+bool IsCanonicalizedHostRFC1738Compliant(const std::string& host) {
+  if (host.empty())
+    return false;
+
+  enum State {
+    NOT_IN_COMPONENT,
+    IN_COMPONENT_STARTED_DIGIT,
+    IN_COMPONENT_STARTED_ALPHA
+  } state = NOT_IN_COMPONENT;
+  bool last_char_was_hyphen = false;
+  for (std::string::const_iterator i(host.begin()); i != host.end(); ++i) {
+    const char c = *i;
+    if (state == NOT_IN_COMPONENT) {
+      if (IsHostCharDigit(c))
+        state = IN_COMPONENT_STARTED_DIGIT;
+      else if (IsHostCharAlpha(c))
+        state = IN_COMPONENT_STARTED_ALPHA;
+      else
+        return false;  // Components must begin with an alphanumeric.
+    } else {
+      if (c == '.') {
+        if (last_char_was_hyphen)
+          return false;  // Components must not end with '-'.
+        state = NOT_IN_COMPONENT;
+      } else if (IsHostCharAlpha(c) || IsHostCharDigit(c)) {
+        last_char_was_hyphen = false;
+      } else if (c == '-') {
+        last_char_was_hyphen = true;
+      } else {
+        return false;
+      }
+    }
+  }
+  // The final component must start with a letter and must not end with '-'.
+  return (state == IN_COMPONENT_STARTED_ALPHA) && !last_char_was_hyphen;
+}
+
 std::string GetDirectoryListingEntry(const string16& name,
                                      const std::string& raw_bytes,
                                      bool is_dir,
diff --git a/net/base/net_util.h b/net/base/net_util.h
index 4d7e0aa..302a55f 100644
--- a/net/base/net_util.h
+++ b/net/base/net_util.h
@@ -158,6 +158,17 @@ std::string CanonicalizeHost(const std::string& host,
 std::string CanonicalizeHost(const std::wstring& host,
                              url_canon::CanonHostInfo* host_info);
 
+// Returns true if |host| is RFC 1738-compliant (and not an IP address).  The
+// rules are:
+//   * One or more components separated by '.'
+//   * Each component begins and ends with an alphanumeric character
+//   * Each component contains only alphanumeric characters and '-'
+//   * The last component does not begin with a digit
+// An empty |host| is not compliant.
+// NOTE: You should only pass in hosts that have been returned from
+// CanonicalizeHost(); uppercase letters, for example, are treated as invalid.
+bool IsCanonicalizedHostRFC1738Compliant(const std::string& host);
+
 // Call these functions to get the html snippet for a directory listing.
 // The return values of both functions are in UTF-8.
 std::string GetDirectoryListingHeader(const string16& title);
diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc
index 6a01ec9..22d467c 100644
--- a/net/base/net_util_unittest.cc
+++ b/net/base/net_util_unittest.cc
@@ -344,6 +344,11 @@ const IDNTestCase idn_cases[] = {
 #endif
 };
 
+struct RFC1738Case {
+  const char* host;  // Input passed to IsCanonicalizedHostRFC1738Compliant().
+  bool expected_output;  // Expected return value for |host|.
+};
+
 struct SuggestedFilenameCase {
   const char* url;
   const char* content_disp_header;
@@ -810,6 +815,32 @@ TEST(NetUtilTest, IDNToUnicodeSlow) {
   }
 }
 
+TEST(NetUtilTest, RFC1738) {
+  const RFC1738Case rfc1738_cases[] = {
+    {"", false},  // Empty host.
+    {"a", true},
+    {"-", false},  // Component must begin with an alphanumeric.
+    {".", false},
+    {"a.", false},  // Trailing '.' leaves an empty last component.
+    {"a.a", true},
+    {"9.a", true},  // A leading digit is fine outside the last component.
+    {"a.9", false},  // Last component begins with a digit.
+    {"a.a9", true},
+    {"a.9a", false},
+    {"a+9a", false},  // '+' is not a permitted host character.
+    {"1-.a-b", false},  // Component ends with '-'.
+    {"1-2.a-b", true},
+    {"a.b.c.d.e", true},
+    {"1.2.3.4.e", true},
+    {"a.b.c.d.5", false},
+  };
+
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(rfc1738_cases); ++i) {
+    EXPECT_EQ(rfc1738_cases[i].expected_output,
+              net::IsCanonicalizedHostRFC1738Compliant(rfc1738_cases[i].host));
+  }
+}
+
 TEST(NetUtilTest, StripWWW) {
   EXPECT_EQ(L"", net::StripWWW(L""));
   EXPECT_EQ(L"", net::StripWWW(L"www."));