Introduce UrlPattern.

This is basically me resuming work on issue 14106, but as it is a complete rewrite, I have started a new issue. I also added supporting JoinString() and ReplaceAll() utility functions.

Review URL: http://codereview.chromium.org/19704

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@9031 0039d316-1c4b-4281-b951-d872f2087c98
author: aa@chromium.org <aa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-02-02 04:09:58 +0000
committer: aa@chromium.org <aa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-02-02 04:09:58 +0000
commit: 0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6 (patch)
tree: 53272fae82210fbf553bb3a1a0256fba5b851368 /chrome/common/extensions
parent: b112a4cc460212188d353b995a055f6e14029ba3 (diff)
download: chromium_src-0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6.zip
chromium_src-0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6.tar.gz
chromium_src-0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6.tar.bz2
3 files changed, 382 insertions, 0 deletions
diff --git a/chrome/common/extensions/url_pattern.cc b/chrome/common/extensions/url_pattern.cc
new file mode 100644
index 0000000..bb32cb5
--- /dev/null
+++ b/chrome/common/extensions/url_pattern.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/string_piece.h"
+#include "base/string_util.h"
+#include "chrome/common/extensions/url_pattern.h"
+
+// TODO(aa): Consider adding chrome-extension? What about more obscure ones
+// like data: and javascript: ?
+static const char* kValidSchemes[] = {
+  "http",
+  "https",
+  "file",
+  "ftp",
+  "chrome-ui"
+};
+
+static const char kSchemeSeparator[] = "://";
+static const char kPathSeparator[] = "/";
+
+static bool IsValidScheme(const std::string& scheme) {
+  for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
+    if (scheme == kValidSchemes[i])
+      return true;
+  }
+
+  return false;
+}
+
+bool URLPattern::Parse(const std::string& pattern) {
+  size_t scheme_end_pos = pattern.find(kSchemeSeparator);
+  if (scheme_end_pos == std::string::npos)
+    return false;
+
+  scheme_ = pattern.substr(0, scheme_end_pos);
+  if (!IsValidScheme(scheme_))
+    return false;
+
+  size_t host_start_pos = scheme_end_pos + strlen(kSchemeSeparator);
+  if (host_start_pos >= pattern.length())
+    return false;
+
+  // Parse out the host and path.
+  size_t path_start_pos = 0;
+
+  // File URLs are special because they have no host. There are other schemes
+  // with the same structure, but we don't support them (yet).
+  if (scheme_ == "file") {
+    path_start_pos = host_start_pos;
+  } else {
+    size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
+    if (host_end_pos == std::string::npos)
+      return false;
+
+    host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
+
+    // The first component can optionally be '*' to match all subdomains.
+    std::vector<std::string> host_components;
+    SplitString(host_, '.', &host_components);
+    if (host_components[0] == "*") {
+      match_subdomains_ = true;
+      host_components.erase(host_components.begin(),
+                            host_components.begin() + 1);
+    }
+    host_ = JoinString(host_components, '.');
+
+    // No other '*' can occur in the host, though. This isn't necessary, but is
+    // done as a convenience to developers who might otherwise be confused and
+    // think '*' works as a glob in the host.
+    if (host_.find('*') != std::string::npos)
+      return false;
+
+    path_start_pos = host_end_pos;
+  }
+  
+  path_ = pattern.substr(path_start_pos);
+  return true;
+}
+
+bool URLPattern::MatchesUrl(const GURL &test) {
+  if (test.scheme() != scheme_)
+    return false;
+
+  if (!MatchesHost(test))
+    return false;
+
+  if (!MatchesPath(test))
+    return false;
+
+  return true;
+}
+
+bool URLPattern::MatchesHost(const GURL& test) {
+  if (test.host() == host_)
+    return true;
+
+  if (!match_subdomains_ || test.HostIsIPAddress())
+    return false;
+
+  // If we're matching subdomains, and we have no host, that means the pattern
+  // was <scheme>://*/<whatever>, so we match anything.
+  if (host_.empty())
+    return true;
+
+  // Check if the test host is a subdomain of our host.
+  if (test.host().length() <= (host_.length() + 1))
+    return false;
+
+  if (test.host().compare(test.host().length() - host_.length(),
+                          host_.length(), host_) != 0)
+    return false;
+
+  return test.host()[test.host().length() - host_.length() - 1] == '.';
+}
+
+bool URLPattern::MatchesPath(const GURL& test) {
+  if (path_escaped_.empty()) {
+    path_escaped_ = path_;
+    ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
+    ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
+  }
+
+  if (!MatchPattern(test.PathForRequest(), path_escaped_))
+    return false;
+
+  return true;
+}
diff --git a/chrome/common/extensions/url_pattern.h b/chrome/common/extensions/url_pattern.h
new file mode 100644
index 0000000..0be9660
--- /dev/null
+++ b/chrome/common/extensions/url_pattern.h
@@ -0,0 +1,122 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef CHROME_BROWSER_EXTENSIONS_MATCH_PATTERN_H_
+#define CHROME_BROWSER_EXTENSIONS_MATCH_PATTERN_H_
+
+#include "googleurl/src/gurl.h"
+
+// A pattern that can be used to match URLs. A URLPattern is a very restricted
+// subset of URL syntax:
+//
+// <url-pattern> := <scheme>://<host><path>
+// <scheme> := 'http' | 'https' | 'file' | 'ftp' | 'chrome-ui'
+// <host> := '*' | ['*.'] <anychar except '/' and '*'>+
+// <path> := '/' <any chars>
+//
+// * Host is not used when the scheme is 'file'.
+// * The path can have embedded '*' characters which act as glob wildcards.
+//
+// Examples of valid patterns:
+// - http://*/*
+// - http://*/foo*
+// - https://*.google.com/foo*bar
+// - chrome-ui://foo/bar
+// - file:///monkey*
+// - http://127.0.0.1/*
+//
+// Examples of invalid patterns:
+// - http://* -- path not specified
+// - http://*foo/bar -- * not allowed as substring of host component
+// - http://foo.*.bar/baz -- * must be first component
+// - http:/bar -- scheme separator not found
+// - foo://* -- invalid scheme
+//
+// Design rationale:
+// * We need to be able to tell users what 'sites' a given URLPattern will
+//   affect. For example "This extension will interact with the site
+//   'www.google.com'."
+// * We'd like to be able to convert as many existing Greasemonkey @include
+//   patterns to URLPatterns as possible. Greasemonkey @include patterns are
+//   simple globs, so this won't be perfect.
+// * Although we would like to support any scheme, it isn't clear what to tell
+//   users about URLPatterns that affect data or javascript URLs, and saying
+//   something useful about chrome-extension URLs is more work, so those are
+//   left out for now.
+//
+// From a 2008-ish crawl of userscripts.org, the following patterns were found
+// in @include lines:
+// - total lines                    : 24271
+// - @include *                     :   919
+// - @include http://[^\*]+?/       : 11128 (no star in host)
+// - @include http://\*\.[^\*]+?/   :  2325 (host prefixed by *.)
+// - @include http://\*[^\.][^\*]+?/:  1524 (host prefixed by *, no dot -- many
+//                                           appear to only need subdomain
+//                                           matching, not real prefix matching)
+// - @include http://[^\*/]+\*/     :   320 (host suffixed by *)
+// - @include contains .tld         :   297 (host suffixed by .tld -- a special
+//                                           Greasemonkey domain component that
+//                                           tries to match all valid registry-
+//                                           controlled suffixes)
+// - @include http://\*/            :   228 (host is * exactly, but there is
+//                                           more to the pattern)
+//
+// So, we can support at least half of current @include lines without supporting
+// subdomain matching. We can pick up at least another 10% by supporting
+// subdomain matching. It is probably possible to coerce more of the existing
+// patterns to URLPattern, but the resulting pattern will be more restrictive
+// than the original glob, which is probably better than nothing.
+class URLPattern {
+ public:
+  URLPattern() : match_subdomains_(false) {}
+
+  // Initializes this instance by parsing the provided string. On failure, the
+  // instance will have some intermediate values and is in an invalid state.
+  bool Parse(const std::string& pattern_str);
+
+  // Returns true if this instance matches the specified URL.
+  bool MatchesUrl(const GURL& url);
+
+  // Gets the scheme the pattern matches. This will always return a valid
+  // scheme if Parse() returned true.
+  std::string scheme() const { return scheme_; }
+
+  // Gets the host the pattern matches. This can be an empty string if the
+  // pattern matches all hosts (the input was <scheme>://*/<whatever>).
+  std::string host() const { return host_; }
+
+  // Gets whether to match subdomains of host().
+  bool match_subdomains() const { return match_subdomains_; }
+
+  // Gets the path the pattern matches with the leading slash. This can have
+  // embedded asterisks which are interpreted using glob rules.
+  std::string path() const { return path_; }
+
+ private:
+  // Returns true if |test| matches our host.
+  bool MatchesHost(const GURL& test);
+
+  // Returns true if |test| matches our path.
+  bool MatchesPath(const GURL& test);
+
+  // The scheme for the pattern.
+  std::string scheme_;
+
+  // The host without any leading "*" components.
+  std::string host_;
+
+  // Whether we should match subdomains of the host. This is true if the first
+  // component of the pattern's host was "*".
+  bool match_subdomains_;
+
+  // The path to match. This is everything after the host of the URL, or
+  // everything after the scheme in the case of file:// URLs.
+  std::string path_;
+
+  // The path with "?" and "\" characters escaped for use with the
+  // MatchPattern() function. This is populated lazily, the first time it is
+  // needed.
+  std::string path_escaped_;
+};
+
+#endif  // CHROME_BROWSER_EXTENSIONS_MATCH_PATTERN_H_
diff --git a/chrome/common/extensions/url_pattern_unittest.cc b/chrome/common/extensions/url_pattern_unittest.cc
new file mode 100644
index 0000000..b53fc07
--- /dev/null
+++ b/chrome/common/extensions/url_pattern_unittest.cc
@@ -0,0 +1,132 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/common/extensions/url_pattern.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+// See url_pattern.h for examples of valid and invalid patterns.
+
+TEST(URLPatternTest, ParseInvalid) {
+  const char* kInvalidPatterns[] = {
+    "http",  // no scheme separator
+    "http://",  // no path separator
+    "http://foo",  // no path separator
+    "http://*foo/bar",  // * not allowed as substring of host component
+    "http://foo.*.bar/baz",  // * must be first component
+    "http:/bar",  // scheme separator not found
+    "foo://*",  // invalid scheme
+  };
+
+  for (size_t i = 0; i < arraysize(kInvalidPatterns); ++i) {
+    URLPattern pattern;
+    EXPECT_FALSE(pattern.Parse(kInvalidPatterns[i]));
+  }
+};
+
+// all pages for a given scheme
+TEST(URLPatternTest, Match1) {
+  URLPattern pattern;
+  EXPECT_TRUE(pattern.Parse("http://*/*"));
+  EXPECT_EQ("http", pattern.scheme());
+  EXPECT_EQ("", pattern.host());
+  EXPECT_TRUE(pattern.match_subdomains());
+  EXPECT_EQ("/*", pattern.path());
+  EXPECT_TRUE(pattern.MatchesUrl(GURL("http://google.com")));
+  EXPECT_TRUE(pattern.MatchesUrl(GURL("http://yahoo.com")));
+  EXPECT_TRUE(pattern.MatchesUrl(GURL("http://google.com/foo")));
+  EXPECT_FALSE(pattern.MatchesUrl(GURL("https://google.com")));
+}
+
+// all domains
+TEST(URLPatternTest, Match2) {
+  URLPattern pattern;
+  EXPECT_TRUE(pattern.Parse("https://*/foo*"));
+  EXPECT_EQ("https", pattern.scheme());
+  EXPECT_EQ("", pattern.host());
+  EXPECT_TRUE(pattern.match_subdomains());
+  EXPECT_EQ("/foo*", pattern.path());
+  EXPECT_TRUE(pattern.MatchesUrl(GURL("https://www.google.com/foo")));
+  EXPECT_TRUE(pattern.MatchesUrl(GURL("https://www.google.com/foobar")));
+  EXPECT_FALSE(pattern.MatchesUrl(GURL("http://www.google.com/foo")));
+  EXPECT_FALSE(pattern.MatchesUrl(GURL("https://www.google.com/")));
+}
+
+// subdomains
+TEST(URLPatternTest, Match3) {
+  URLPattern pattern;
+  EXPECT_TRUE(pattern.Parse("http://*.google.com/foo*bar"));
+  EXPECT_EQ("http", pattern.scheme());
+  EXPECT_EQ("google.com", pattern.host());
+  EXPECT_TRUE(pattern.match_subdomains());
+  EXPECT_EQ("/foo*bar", pattern.path());
+  EXPECT_TRUE(pattern.MatchesUrl(GURL("http://google.com/foobar")));
+  EXPECT_TRUE(pattern.MatchesUrl(GURL("http://www.google.com/foo?bar")));
+  EXPECT_TRUE(pattern.MatchesUrl(
+      GURL("http://monkey.images.google.com/foooobar")));
+  EXPECT_FALSE(pattern.MatchesUrl(GURL("http://yahoo.com/foobar")));
+}
+
+// odd schemes and normalization
+TEST(URLPatternTest, Match4) {
+  URLPattern pattern;
+  EXPECT_TRUE(pattern.Parse("chrome-ui://thinger/*"));
+  EXPECT_EQ("chrome-ui", pattern.scheme());
+  EXPECT_EQ("thinger", pattern.host());
+  EXPECT_FALSE(pattern.match_subdomains());
+  EXPECT_EQ("/*", pattern.path());
+  EXPECT_TRUE(pattern.MatchesUrl(GURL("chrome-ui://thinger/foobar")));
+  EXPECT_TRUE(pattern.MatchesUrl(GURL("CHROME-UI://thinger/")));
+  EXPECT_FALSE(pattern.MatchesUrl(GURL("http://thinger/")));
+}
+
+// glob escaping
+TEST(URLPatternTest, Match5) {
+  URLPattern pattern;
+  EXPECT_TRUE(pattern.Parse("file:///foo?bar\\*baz"));
+  EXPECT_EQ("file", pattern.scheme());
+  EXPECT_EQ("", pattern.host());
+  EXPECT_FALSE(pattern.match_subdomains());
+  EXPECT_EQ("/foo?bar\\*baz", pattern.path());
+  EXPECT_TRUE(pattern.MatchesUrl(GURL("file:///foo?bar\\hellobaz")));
+  EXPECT_FALSE(pattern.MatchesUrl(GURL("file:///fooXbar\\hellobaz")));
+}
+
+// ip addresses
+TEST(URLPatternTest, Match6) {
+  URLPattern pattern;
+  EXPECT_TRUE(pattern.Parse("http://127.0.0.1/*"));
+  EXPECT_EQ("http", pattern.scheme());
+  EXPECT_EQ("127.0.0.1", pattern.host());
+  EXPECT_FALSE(pattern.match_subdomains());
+  EXPECT_EQ("/*", pattern.path());
+  EXPECT_TRUE(pattern.MatchesUrl(GURL("http://127.0.0.1")));
+}
+
+// subdomain matching with ip addresses
+TEST(URLPatternTest, Match7) {
+  URLPattern pattern;
+  EXPECT_TRUE(pattern.Parse("http://*.0.0.1/*")); // allowed, but useless
+  EXPECT_EQ("http", pattern.scheme());
+  EXPECT_EQ("0.0.1", pattern.host());
+  EXPECT_TRUE(pattern.match_subdomains());
+  EXPECT_EQ("/*", pattern.path());
+  // Subdomain matching is never done if the argument has an IP address host.
+  EXPECT_FALSE(pattern.MatchesUrl(GURL("http://127.0.0.1")));
+};
+
+// unicode
+TEST(URLPatternTest, Match8) {
+  URLPattern pattern;
+  // The below is the ASCII encoding of the following URL:
+  // http://*.\xe1\x80\xbf/a\xc2\x81\xe1*
+  EXPECT_TRUE(pattern.Parse("http://*.xn--gkd/a%C2%81%E1*"));
+  EXPECT_EQ("http", pattern.scheme());
+  EXPECT_EQ("xn--gkd", pattern.host());
+  EXPECT_TRUE(pattern.match_subdomains());
+  EXPECT_EQ("/a%C2%81%E1*", pattern.path());
+  EXPECT_TRUE(pattern.MatchesUrl(
+      GURL("http://abc.\xe1\x80\xbf/a\xc2\x81\xe1xyz")));
+  EXPECT_TRUE(pattern.MatchesUrl(
+      GURL("http://\xe1\x80\xbf/a\xc2\x81\xe1\xe1")));
+};
author	aa@chromium.org <aa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-02-02 04:09:58 +0000
committer	aa@chromium.org <aa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-02-02 04:09:58 +0000
commit	0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6 (patch)
tree	53272fae82210fbf553bb3a1a0256fba5b851368 /chrome/common/extensions
parent	b112a4cc460212188d353b995a055f6e14029ba3 (diff)
download	chromium_src-0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6.zip chromium_src-0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6.tar.gz chromium_src-0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6.tar.bz2