summaryrefslogtreecommitdiffstats
path: root/chrome/common/extensions
diff options
context:
space:
mode:
authoraa@chromium.org <aa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-02-02 04:09:58 +0000
committeraa@chromium.org <aa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-02-02 04:09:58 +0000
commit0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6 (patch)
tree53272fae82210fbf553bb3a1a0256fba5b851368 /chrome/common/extensions
parentb112a4cc460212188d353b995a055f6e14029ba3 (diff)
downloadchromium_src-0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6.zip
chromium_src-0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6.tar.gz
chromium_src-0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6.tar.bz2
Introduce UrlPattern. This is basically me resuming work on
issue 14106, but as it is a complete rewrite, I have started a new issue. I also added supporting JoinString() and ReplaceAll() utility functions. Review URL: http://codereview.chromium.org/19704 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@9031 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/common/extensions')
-rw-r--r--chrome/common/extensions/url_pattern.cc128
-rw-r--r--chrome/common/extensions/url_pattern.h122
-rw-r--r--chrome/common/extensions/url_pattern_unittest.cc132
3 files changed, 382 insertions, 0 deletions
diff --git a/chrome/common/extensions/url_pattern.cc b/chrome/common/extensions/url_pattern.cc
new file mode 100644
index 0000000..bb32cb5
--- /dev/null
+++ b/chrome/common/extensions/url_pattern.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/string_piece.h"
+#include "base/string_util.h"
+#include "chrome/common/extensions/url_pattern.h"
+
+// Schemes a pattern may use; IsValidScheme() compares against these exactly.
+// TODO(aa): Consider adding chrome-extension? What about data: and javascript:?
+static const char* const kValidSchemes[] = {
+  "http",
+  "https",
+  "file",
+  "ftp",
+  "chrome-ui"
+};
+
+static const char kSchemeSeparator[] = "://";  // Ends the scheme in a pattern.
+static const char kPathSeparator[] = "/";  // Ends the host; begins the path.
+
+// Returns true if |scheme| exactly equals one of the kValidSchemes entries.
+static bool IsValidScheme(const std::string& scheme) {
+  size_t index = 0;
+  while (index < arraysize(kValidSchemes) && scheme != kValidSchemes[index])
+    ++index;
+  // Stopping before the end of the table means an entry compared equal.
+  return index < arraysize(kValidSchemes);
+}
+
+// Parses |pattern| into scheme_, host_, match_subdomains_ and path_, first
+// dropping state left by an earlier Parse(). Returns false if it is invalid.
+bool URLPattern::Parse(const std::string& pattern) {
+  host_.clear();
+  match_subdomains_ = false;
+  path_escaped_.clear();  // MatchesPath() rebuilds this lazily from path_.
+
+  size_t scheme_end_pos = pattern.find(kSchemeSeparator);
+  if (scheme_end_pos == std::string::npos)
+    return false;
+
+  scheme_ = pattern.substr(0, scheme_end_pos);
+  if (!IsValidScheme(scheme_))
+    return false;
+
+  size_t host_start_pos = scheme_end_pos + strlen(kSchemeSeparator);
+  if (host_start_pos >= pattern.length())
+    return false;
+
+  // File URLs are special because they have no host. There are other schemes
+  // with the same structure, but we don't support them (yet).
+  size_t path_start_pos = host_start_pos;
+  if (scheme_ != "file") {
+    size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
+    if (host_end_pos == std::string::npos)
+      return false;
+
+    // The first component can optionally be '*' to match all subdomains.
+    host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
+    std::vector<std::string> host_components;
+    SplitString(host_, '.', &host_components);
+    if (!host_components.empty() && host_components[0] == "*") {
+      match_subdomains_ = true;
+      host_components.erase(host_components.begin());
+    }
+    host_ = JoinString(host_components, '.');
+
+    // No other '*' can occur in the host, though. This isn't necessary, but is
+    // done as a convenience to developers who might otherwise be confused and
+    // think '*' works as a glob in the host.
+    if (host_.find('*') != std::string::npos)
+      return false;
+
+    path_start_pos = host_end_pos;
+  }
+
+  path_ = pattern.substr(path_start_pos);
+  return true;
+}
+
+// Returns true only if |test| has our scheme and also passes both the host
+// and the path checks.
+bool URLPattern::MatchesUrl(const GURL& test) {
+  // A URL with a different scheme can never match, whatever its host or path.
+  if (test.scheme() != scheme_)
+    return false;
+
+  // && short-circuits, so the path is only examined once the host has matched,
+  // exactly as a chain of early returns would do.
+  const bool host_and_path_match = MatchesHost(test) && MatchesPath(test);
+  return host_and_path_match;
+}
+
+// Returns true if |test|'s host is host_ itself or, when match_subdomains_ is
+// set, a subdomain of it. IP-address hosts are only ever matched exactly.
+bool URLPattern::MatchesHost(const GURL& test) {
+  const std::string candidate(test.host());
+  if (candidate == host_)
+    return true;
+
+  if (!match_subdomains_ || test.HostIsIPAddress())
+    return false;
+
+  // If we're matching subdomains, and we have no host, that means the pattern
+  // was <scheme>://*/<whatever>, so we match anything.
+  if (host_.empty())
+    return true;
+
+  // A subdomain is at least one character, then a '.', then host_ as suffix.
+  if (candidate.length() <= host_.length() + 1)
+    return false;
+  const size_t suffix_pos = candidate.length() - host_.length();
+  return candidate[suffix_pos - 1] == '.' &&
+         candidate.compare(suffix_pos, host_.length(), host_) == 0;
+}
+
+// Returns true if |test|'s PathForRequest() matches the pattern's path glob.
+bool URLPattern::MatchesPath(const GURL& test) {
+  // Build the escaped pattern on first use: every '\' and '?' in path_ gets a
+  // '\' put in front of it before the result is handed to MatchPattern().
+  if (path_escaped_.empty()) {
+    path_escaped_ = path_;
+    ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
+    ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
+  }
+
+  return MatchPattern(test.PathForRequest(), path_escaped_);
+}
diff --git a/chrome/common/extensions/url_pattern.h b/chrome/common/extensions/url_pattern.h
new file mode 100644
index 0000000..0be9660
--- /dev/null
+++ b/chrome/common/extensions/url_pattern.h
@@ -0,0 +1,122 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
+#define CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
+
+#include "googleurl/src/gurl.h"
+
+// A pattern that can be used to match URLs. A URLPattern is a very restricted
+// subset of URL syntax:
+//
+// <url-pattern> := <scheme>://<host><path>
+// <scheme> := 'http' | 'https' | 'file' | 'ftp' | 'chrome-ui'
+// <host> := '*' | ('*.')? <anychar except '/' and '*'>+
+// <path> := '/' <any chars>
+//
+// * Host is not used when the scheme is 'file'.
+// * The path can have embedded '*' characters which act as glob wildcards.
+//
+// Examples of valid patterns:
+// - http://*/*
+// - http://*/foo*
+// - https://*.google.com/foo*bar
+// - chrome-ui://foo/bar
+// - file://monkey*
+// - http://127.0.0.1/*
+//
+// Examples of invalid patterns:
+// - http://* -- path not specified
+// - http://*foo/bar -- * not allowed as substring of host component
+// - http://foo.*.bar/baz -- * must be first component
+// - http:/bar -- scheme separator not found
+// - foo://* -- invalid scheme
+//
+// Design rationale:
+// * We need to be able to tell users what 'sites' a given URLPattern will
+// affect. For example "This extension will interact with the site
+// 'www.google.com'."
+// * We'd like to be able to convert as many existing Greasemonkey @include
+// patterns to URLPatterns as possible. Greasemonkey @include patterns are
+// simple globs, so this won't be perfect.
+// * Although we would like to support any scheme, it isn't clear what to tell
+// users about URLPatterns that affect data or javascript URLs, and saying
+// something useful about chrome-extension URLs is more work, so those are
+// left out for now.
+//
+// From a 2008-ish crawl of userscripts.org, the following patterns were found
+// in @include lines:
+// - total lines : 24271
+// - @include * : 919
+// - @include http://[^\*]+?/ : 11128 (no star in host)
+// - @include http://\*\.[^\*]+?/ : 2325 (host prefixed by *.)
+// - @include http://\*[^\.][^\*]+?/: 1524 (host prefixed by *, no dot -- many
+// appear to only need subdomain
+// matching, not real prefix matching)
+// - @include http://[^\*/]+\*/ : 320 (host suffixed by *)
+// - @include contains .tld : 297 (host suffixed by .tld -- a special
+// Greasemonkey domain component that
+// tries to match all valid registry-
+// controlled suffixes)
+// - @include http://\*/ : 228 (host is * exactly, but there is
+// more to the pattern)
+//
+// So, we can support at least half of current @include lines without supporting
+// subdomain matching. We can pick up at least another 10% by supporting
+// subdomain matching. It is probably possible to coerce more of the existing
+// patterns to URLPattern, but the resulting pattern will be more restrictive
+// than the original glob, which is probably better than nothing.
+class URLPattern {
+ public:
+  URLPattern() : match_subdomains_(false) {}
+
+  // Initializes this instance by parsing the provided string. On failure, the
+  // instance will have some intermediate values and is in an invalid state.
+  bool Parse(const std::string& pattern_str);
+
+  // Returns true if this instance matches the specified URL.
+  bool MatchesUrl(const GURL& url);
+
+  // Get the scheme the pattern matches. This will always return a valid scheme
+  // if Parse() returned true.
+  std::string scheme() const { return scheme_; }
+
+  // Gets the host the pattern matches. This can be an empty string if the
+  // pattern matches all hosts (the input was <scheme>://*/<whatever>).
+  std::string host() const { return host_; }
+
+  // Gets whether to match subdomains of host().
+  bool match_subdomains() const { return match_subdomains_; }
+
+  // Gets the path the pattern matches with the leading slash. This can have
+  // embedded asterisks which are interpreted using glob rules.
+  std::string path() const { return path_; }
+
+ private:
+  // Returns true if |test| matches our host.
+  bool MatchesHost(const GURL& test);
+
+  // Returns true if |test| matches our path.
+  bool MatchesPath(const GURL& test);
+
+  // The scheme for the pattern.
+  std::string scheme_;
+
+  // The host without any leading "*" components.
+  std::string host_;
+
+  // Whether we should match subdomains of the host. This is true if the first
+  // component of the pattern's host was "*".
+  bool match_subdomains_;
+
+  // The path to match. This is everything after the host of the URL, or
+  // everything after the scheme in the case of file:// URLs.
+  std::string path_;
+
+  // The path with "?" and "\" characters escaped for use with the
+  // MatchPattern() function. This is populated lazily, the first time it is
+  // needed.
+  std::string path_escaped_;
+};
+
+#endif  // CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
diff --git a/chrome/common/extensions/url_pattern_unittest.cc b/chrome/common/extensions/url_pattern_unittest.cc
new file mode 100644
index 0000000..b53fc07
--- /dev/null
+++ b/chrome/common/extensions/url_pattern_unittest.cc
@@ -0,0 +1,132 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/common/extensions/url_pattern.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+// See url_pattern.h for examples of valid and invalid patterns.
+
+// Every pattern listed here is malformed, so Parse() must reject each one.
+TEST(URLPatternTest, ParseInvalid) {
+  const char* kInvalidPatterns[] = {
+    "http",  // no scheme separator
+    "http://",  // nothing after the scheme separator
+    "http://foo",  // no path separator
+    "http://*foo/bar",  // '*' not allowed as substring of host component
+    "http://foo.*.bar/baz",  // '*' must be first component
+    "http:/bar",  // scheme separator not found
+    "foo://*",  // invalid scheme
+  };
+  for (size_t i = 0; i < arraysize(kInvalidPatterns); ++i) {
+    URLPattern pattern;
+    EXPECT_FALSE(pattern.Parse(kInvalidPatterns[i]));
+  }
+}
+
+// A bare '*' host together with a "/*" path covers every page of the scheme.
+TEST(URLPatternTest, Match1) {
+  URLPattern all_http;
+  EXPECT_TRUE(all_http.Parse("http://*/*"));
+  EXPECT_EQ("http", all_http.scheme());
+  EXPECT_EQ("", all_http.host());  // The lone '*' is not kept in host().
+  EXPECT_TRUE(all_http.match_subdomains());
+  EXPECT_EQ("/*", all_http.path());
+  EXPECT_TRUE(all_http.MatchesUrl(GURL("http://google.com")));
+  EXPECT_TRUE(all_http.MatchesUrl(GURL("http://yahoo.com")));
+  EXPECT_TRUE(all_http.MatchesUrl(GURL("http://google.com/foo")));
+  EXPECT_FALSE(all_http.MatchesUrl(GURL("https://google.com")));  // scheme
+}
+
+// A '*' host matches every domain, but the path must still begin with "/foo".
+TEST(URLPatternTest, Match2) {
+  URLPattern any_host;
+  EXPECT_TRUE(any_host.Parse("https://*/foo*"));
+  EXPECT_EQ("https", any_host.scheme());
+  EXPECT_EQ("", any_host.host());
+  EXPECT_TRUE(any_host.match_subdomains());
+  EXPECT_EQ("/foo*", any_host.path());
+  EXPECT_TRUE(any_host.MatchesUrl(GURL("https://www.google.com/foo")));
+  EXPECT_TRUE(any_host.MatchesUrl(GURL("https://www.google.com/foobar")));
+  EXPECT_FALSE(any_host.MatchesUrl(GURL("http://www.google.com/foo")));
+  EXPECT_FALSE(any_host.MatchesUrl(GURL("https://www.google.com/")));
+}
+
+// A leading "*." matches the bare domain as well as any depth of subdomain.
+TEST(URLPatternTest, Match3) {
+  URLPattern subdomains;
+  EXPECT_TRUE(subdomains.Parse("http://*.google.com/foo*bar"));
+  EXPECT_EQ("http", subdomains.scheme());
+  EXPECT_EQ("google.com", subdomains.host());  // Stored without the "*.".
+  EXPECT_TRUE(subdomains.match_subdomains());
+  EXPECT_EQ("/foo*bar", subdomains.path());
+  EXPECT_TRUE(subdomains.MatchesUrl(GURL("http://google.com/foobar")));
+  EXPECT_TRUE(subdomains.MatchesUrl(GURL("http://www.google.com/foo?bar")));
+  EXPECT_TRUE(subdomains.MatchesUrl(
+      GURL("http://monkey.images.google.com/foooobar")));
+  EXPECT_FALSE(subdomains.MatchesUrl(GURL("http://yahoo.com/foobar")));
+}
+
+// A less common scheme; the upper-case URL is expected to match as well.
+TEST(URLPatternTest, Match4) {
+  URLPattern chrome_ui;
+  EXPECT_TRUE(chrome_ui.Parse("chrome-ui://thinger/*"));
+  EXPECT_EQ("chrome-ui", chrome_ui.scheme());
+  EXPECT_EQ("thinger", chrome_ui.host());
+  EXPECT_FALSE(chrome_ui.match_subdomains());  // No leading '*' in the host.
+  EXPECT_EQ("/*", chrome_ui.path());
+  EXPECT_TRUE(chrome_ui.MatchesUrl(GURL("chrome-ui://thinger/foobar")));
+  EXPECT_TRUE(chrome_ui.MatchesUrl(GURL("CHROME-UI://thinger/")));
+  EXPECT_FALSE(chrome_ui.MatchesUrl(GURL("http://thinger/")));
+}
+
+// '?' and '\' in the pattern's path are literal; only '*' acts as a wildcard.
+TEST(URLPatternTest, Match5) {
+  URLPattern escaped;
+  EXPECT_TRUE(escaped.Parse("file:///foo?bar\\*baz"));
+  EXPECT_EQ("file", escaped.scheme());
+  EXPECT_EQ("", escaped.host());  // file patterns carry no host.
+  EXPECT_FALSE(escaped.match_subdomains());
+  EXPECT_EQ("/foo?bar\\*baz", escaped.path());
+  EXPECT_TRUE(escaped.MatchesUrl(GURL("file:///foo?bar\\hellobaz")));
+  EXPECT_FALSE(escaped.MatchesUrl(GURL("file:///fooXbar\\hellobaz")));
+}
+
+// An IP-address host is accepted by Parse() and matches the same address.
+TEST(URLPatternTest, Match6) {
+  URLPattern loopback;
+  EXPECT_TRUE(loopback.Parse("http://127.0.0.1/*"));
+  EXPECT_EQ("http", loopback.scheme());
+  EXPECT_EQ("127.0.0.1", loopback.host());
+  EXPECT_FALSE(loopback.match_subdomains());
+  EXPECT_EQ("/*", loopback.path());
+  EXPECT_TRUE(loopback.MatchesUrl(GURL("http://127.0.0.1")));
+}
+
+// A "*." prefix in front of a partial IP address parses but never matches.
+TEST(URLPatternTest, Match7) {
+  URLPattern pattern;
+  EXPECT_TRUE(pattern.Parse("http://*.0.0.1/*"));  // allowed, but useless
+  EXPECT_EQ("http", pattern.scheme());
+  EXPECT_EQ("0.0.1", pattern.host());
+  EXPECT_TRUE(pattern.match_subdomains());
+  EXPECT_EQ("/*", pattern.path());
+  // Subdomain matching is never done if the argument has an IP address host.
+  EXPECT_FALSE(pattern.MatchesUrl(GURL("http://127.0.0.1")));
+}
+
+// Non-ASCII hosts and paths: the pattern uses their ASCII-encoded forms.
+TEST(URLPatternTest, Match8) {
+  URLPattern pattern;
+  // The below is the ASCII encoding of the following URL:
+  // http://*.\xe1\x80\xbf/a\xc2\x81\xe1*
+  EXPECT_TRUE(pattern.Parse("http://*.xn--gkd/a%C2%81%E1*"));
+  EXPECT_EQ("http", pattern.scheme());
+  EXPECT_EQ("xn--gkd", pattern.host());
+  EXPECT_TRUE(pattern.match_subdomains());
+  EXPECT_EQ("/a%C2%81%E1*", pattern.path());
+  EXPECT_TRUE(pattern.MatchesUrl(
+      GURL("http://abc.\xe1\x80\xbf/a\xc2\x81\xe1xyz")));
+  EXPECT_TRUE(pattern.MatchesUrl(
+      GURL("http://\xe1\x80\xbf/a\xc2\x81\xe1\xe1")));
+}