diff options
author | aa@chromium.org <aa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-02-02 04:09:58 +0000 |
---|---|---|
committer | aa@chromium.org <aa@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-02-02 04:09:58 +0000 |
commit | 0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6 (patch) | |
tree | 53272fae82210fbf553bb3a1a0256fba5b851368 /chrome/common/extensions | |
parent | b112a4cc460212188d353b995a055f6e14029ba3 (diff) | |
download | chromium_src-0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6.zip chromium_src-0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6.tar.gz chromium_src-0b7c092f7f5b196fd9085f1ab796a0c9ac9473a6.tar.bz2 |
Introduce UrlPattern. This is basically me resuming work on
issue 14106, but as it is a complete rewrite, I have started
a new issue.
I also added supporting JoinString() and ReplaceAll()
utility functions.
Review URL: http://codereview.chromium.org/19704
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@9031 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/common/extensions')
-rw-r--r-- | chrome/common/extensions/url_pattern.cc | 128 | ||||
-rw-r--r-- | chrome/common/extensions/url_pattern.h | 122 | ||||
-rw-r--r-- | chrome/common/extensions/url_pattern_unittest.cc | 132 |
3 files changed, 382 insertions, 0 deletions
diff --git a/chrome/common/extensions/url_pattern.cc b/chrome/common/extensions/url_pattern.cc new file mode 100644 index 0000000..bb32cb5 --- /dev/null +++ b/chrome/common/extensions/url_pattern.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/string_piece.h" +#include "base/string_util.h" +#include "chrome/common/extensions/url_pattern.h" + +// TODO(aa): Consider adding chrome-extension? What about more obscure ones +// like data: and javascript: ? +static const char* kValidSchemes[] = { + "http", + "https", + "file", + "ftp", + "chrome-ui" +}; + +static const char kSchemeSeparator[] = "://"; +static const char kPathSeparator[] = "/"; + +static bool IsValidScheme(const std::string& scheme) { + for (size_t i = 0; i < arraysize(kValidSchemes); ++i) { + if (scheme == kValidSchemes[i]) + return true; + } + + return false; +} + +bool URLPattern::Parse(const std::string& pattern) { + size_t scheme_end_pos = pattern.find(kSchemeSeparator); + if (scheme_end_pos == std::string::npos) + return false; + + scheme_ = pattern.substr(0, scheme_end_pos); + if (!IsValidScheme(scheme_)) + return false; + + size_t host_start_pos = scheme_end_pos + strlen(kSchemeSeparator); + if (host_start_pos >= pattern.length()) + return false; + + // Parse out the host and path. + size_t path_start_pos = 0; + + // File URLs are special because they have no host. There are other schemes + // with the same structure, but we don't support them (yet). + if (scheme_ == "file") { + path_start_pos = host_start_pos; + } else { + size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos); + if (host_end_pos == std::string::npos) + return false; + + host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos); + + // The first component can optionally be '*' to match all subdomains. 
+ std::vector<std::string> host_components; + SplitString(host_, '.', &host_components); + if (host_components[0] == "*") { + match_subdomains_ = true; + host_components.erase(host_components.begin(), + host_components.begin() + 1); + } + host_ = JoinString(host_components, '.'); + + // No other '*' can occur in the host, though. This isn't necessary, but is + // done as a convenience to developers who might otherwise be confused and + // think '*' works as a glob in the host. + if (host_.find('*') != std::string::npos) + return false; + + path_start_pos = host_end_pos; + } + + path_ = pattern.substr(path_start_pos); + return true; +} + +bool URLPattern::MatchesUrl(const GURL &test) { + if (test.scheme() != scheme_) + return false; + + if (!MatchesHost(test)) + return false; + + if (!MatchesPath(test)) + return false; + + return true; +} + +bool URLPattern::MatchesHost(const GURL& test) { + if (test.host() == host_) + return true; + + if (!match_subdomains_ || test.HostIsIPAddress()) + return false; + + // If we're matching subdomains, and we have no host, that means the pattern + // was <scheme>://*/<whatever>, so we match anything. + if (host_.empty()) + return true; + + // Check if the test host is a subdomain of our host. 
+ if (test.host().length() <= (host_.length() + 1)) + return false; + + if (test.host().compare(test.host().length() - host_.length(), + host_.length(), host_) != 0) + return false; + + return test.host()[test.host().length() - host_.length() - 1] == '.'; +} + +bool URLPattern::MatchesPath(const GURL& test) { + if (path_escaped_.empty()) { + path_escaped_ = path_; + ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\"); + ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?"); + } + + if (!MatchPattern(test.PathForRequest(), path_escaped_)) + return false; + + return true; +} diff --git a/chrome/common/extensions/url_pattern.h b/chrome/common/extensions/url_pattern.h new file mode 100644 index 0000000..0be9660 --- /dev/null +++ b/chrome/common/extensions/url_pattern.h @@ -0,0 +1,122 @@ +// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +#ifndef CHROME_BROWSER_EXTENSIONS_MATCH_PATTERN_H_ +#define CHROME_BROWSER_EXTENSIONS_MATCH_PATTERN_H_ + +#include "googleurl/src/gurl.h" + +// A pattern that can be used to match URLs. A URLPattern is a very restricted +// subset of URL syntax: +// +// <url-pattern> := <scheme>://<host><path> +// <scheme> := 'http' | 'https' | 'file' | 'ftp' | 'chrome-ui' +// <host> := '*' | '*.' <anychar except '/' and '*'>+ +// <path> := '/' <any chars> +// +// * Host is not used when the scheme is 'file'. +// * The path can have embedded '*' characters which act as glob wildcards. 
+// +// Examples of valid patterns: +// - http://*/* +// - http://*/foo* +// - https://*.google.com/foo*bar +// - chrome-ui://foo/bar +// - file://monkey* +// - http://127.0.0.1/* +// +// Examples of invalid patterns: +// - http://* -- path not specified +// - http://*foo/bar -- * not allowed as substring of host component +// - http://foo.*.bar/baz -- * must be first component +// - http:/bar -- scheme separator not found +// - foo://* -- invalid scheme +// +// Design rationale: +// * We need to be able to tell users what 'sites' a given URLPattern will +// affect. For example "This extension will interact with the site +// 'www.google.com'." +// * We'd like to be able to convert as many existing Greasemonkey @include +// patterns to URLPatterns as possible. Greasemonkey @include patterns are +// simple globs, so this won't be perfect. +// * Although we would like to support any scheme, it isn't clear what to tell +// users about URLPatterns that affect data or javascript URLs, and saying +// something useful about chrome-extension URLs is more work, so those are +// left out for now. +// +// From a 2008-ish crawl of userscripts.org, the following patterns were found +// in @include lines: +// - total lines : 24271 +// - @include * : 919 +// - @include http://[^\*]+?/ : 11128 (no star in host) +// - @include http://\*\.[^\*]+?/ : 2325 (host prefixed by *.) +// - @include http://\*[^\.][^\*]+?/: 1524 (host prefixed by *, no dot -- many +// appear to only need subdomain +// matching, not real prefix matching) +// - @include http://[^\*/]+\*/ : 320 (host suffixed by *) +// - @include contains .tld : 297 (host suffixed by .tld -- a special +// Greasemonkey domain component that +// tries to match all valid registry- +// controlled suffixes) +// - @include http://\*/ : 228 (host is * exactly, but there is +// more to the pattern) +// +// So, we can support at least half of current @include lines without supporting +// subdomain matching. 
We can pick up at least another 10% by supporting +// subdomain matching. It is probably possible to coerce more of the existing +// patterns to URLPattern, but the resulting pattern will be more restrictive +// than the original glob, which is probably better than nothing. +class URLPattern { + public: + URLPattern() : match_subdomains_(false) {} + + // Initializes this instance by parsing the provided string. On failure, the + // instance will have some intermediate values and is in an invalid state. + bool Parse(const std::string& pattern_str); + + // Returns true if this instance matches the specified URL. + bool MatchesUrl(const GURL& url); + + // Get the scheme the pattern matches. This will always return a valid scheme + // if is_valid() returns true. + std::string scheme() const { return scheme_; } + + // Gets the host the pattern matches. This can be an empty string if the + // pattern matches all hosts (the input was <scheme>://*/<whatever>). + std::string host() const { return host_; } + + // Gets whether to match subdomains of host(). + bool match_subdomains() const { return match_subdomains_; } + + // Gets the path the pattern matches with the leading slash. This can have + // embedded asterisks which are interpreted using glob rules. + std::string path() const { return path_; } + + private: + // Returns true if |test| matches our host. + bool MatchesHost(const GURL& test); + + // Returns true if |test| matches our path. + bool MatchesPath(const GURL& test); + + // The scheme for the pattern. + std::string scheme_; + + // The host without any leading "*" components. + std::string host_; + + // Whether we should match subdomains of the host. This is true if the first + // component of the pattern's host was "*". + bool match_subdomains_; + + // The path to match. This is everything after the host of the URL, or + // everything after the scheme in the case of file:// URLs. + std::string path_; + + // The path with "?" 
and "\" characters escaped for use with the + // MatchPattern() function. This is populated lazily, the first time it is + // needed. + std::string path_escaped_; +}; + +#endif // CHROME_BROWSER_EXTENSIONS_MATCH_PATTERN_H_ diff --git a/chrome/common/extensions/url_pattern_unittest.cc b/chrome/common/extensions/url_pattern_unittest.cc new file mode 100644 index 0000000..b53fc07 --- /dev/null +++ b/chrome/common/extensions/url_pattern_unittest.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "chrome/common/extensions/url_pattern.h" +#include "testing/gtest/include/gtest/gtest.h" + +// See url_pattern.h for examples of valid and invalid patterns. + +TEST(URLPatternTest, ParseInvalid) { + const char* kInvalidPatterns[] = { + "http", // no scheme + "http://", // no path separator + "http://foo", // no path separator + "http://*foo/bar", // not allowed as substring of host component + "http://foo.*.bar/baz", // must be first component + "http:/bar", // scheme separator not found + "foo://*", // invalid scheme + }; + + for (size_t i = 0; i < arraysize(kInvalidPatterns); ++i) { + URLPattern pattern; + EXPECT_FALSE(pattern.Parse(kInvalidPatterns[i])); + } +}; + +// all pages for a given scheme +TEST(URLPatternTest, Match1) { + URLPattern pattern; + EXPECT_TRUE(pattern.Parse("http://*/*")); + EXPECT_EQ("http", pattern.scheme()); + EXPECT_EQ("", pattern.host()); + EXPECT_TRUE(pattern.match_subdomains()); + EXPECT_EQ("/*", pattern.path()); + EXPECT_TRUE(pattern.MatchesUrl(GURL("http://google.com"))); + EXPECT_TRUE(pattern.MatchesUrl(GURL("http://yahoo.com"))); + EXPECT_TRUE(pattern.MatchesUrl(GURL("http://google.com/foo"))); + EXPECT_FALSE(pattern.MatchesUrl(GURL("https://google.com"))); +} + +// all domains +TEST(URLPatternTest, Match2) { + URLPattern pattern; + EXPECT_TRUE(pattern.Parse("https://*/foo*")); + 
EXPECT_EQ("https", pattern.scheme()); + EXPECT_EQ("", pattern.host()); + EXPECT_TRUE(pattern.match_subdomains()); + EXPECT_EQ("/foo*", pattern.path()); + EXPECT_TRUE(pattern.MatchesUrl(GURL("https://www.google.com/foo"))); + EXPECT_TRUE(pattern.MatchesUrl(GURL("https://www.google.com/foobar"))); + EXPECT_FALSE(pattern.MatchesUrl(GURL("http://www.google.com/foo"))); + EXPECT_FALSE(pattern.MatchesUrl(GURL("https://www.google.com/"))); +} + +// subdomains +TEST(URLPatternTest, Match3) { + URLPattern pattern; + EXPECT_TRUE(pattern.Parse("http://*.google.com/foo*bar")); + EXPECT_EQ("http", pattern.scheme()); + EXPECT_EQ("google.com", pattern.host()); + EXPECT_TRUE(pattern.match_subdomains()); + EXPECT_EQ("/foo*bar", pattern.path()); + EXPECT_TRUE(pattern.MatchesUrl(GURL("http://google.com/foobar"))); + EXPECT_TRUE(pattern.MatchesUrl(GURL("http://www.google.com/foo?bar"))); + EXPECT_TRUE(pattern.MatchesUrl( + GURL("http://monkey.images.google.com/foooobar"))); + EXPECT_FALSE(pattern.MatchesUrl(GURL("http://yahoo.com/foobar"))); +} + +// odd schemes and normalization +TEST(URLPatternTest, Match4) { + URLPattern pattern; + EXPECT_TRUE(pattern.Parse("chrome-ui://thinger/*")); + EXPECT_EQ("chrome-ui", pattern.scheme()); + EXPECT_EQ("thinger", pattern.host()); + EXPECT_FALSE(pattern.match_subdomains()); + EXPECT_EQ("/*", pattern.path()); + EXPECT_TRUE(pattern.MatchesUrl(GURL("chrome-ui://thinger/foobar"))); + EXPECT_TRUE(pattern.MatchesUrl(GURL("CHROME-UI://thinger/"))); + EXPECT_FALSE(pattern.MatchesUrl(GURL("http://thinger/"))); +} + +// glob escaping +TEST(URLPatternTest, Match5) { + URLPattern pattern; + EXPECT_TRUE(pattern.Parse("file:///foo?bar\\*baz")); + EXPECT_EQ("file", pattern.scheme()); + EXPECT_EQ("", pattern.host()); + EXPECT_FALSE(pattern.match_subdomains()); + EXPECT_EQ("/foo?bar\\*baz", pattern.path()); + EXPECT_TRUE(pattern.MatchesUrl(GURL("file:///foo?bar\\hellobaz"))); + EXPECT_FALSE(pattern.MatchesUrl(GURL("file:///fooXbar\\hellobaz"))); +} + +// ip 
addresses +TEST(URLPatternTest, Match6) { + URLPattern pattern; + EXPECT_TRUE(pattern.Parse("http://127.0.0.1/*")); + EXPECT_EQ("http", pattern.scheme()); + EXPECT_EQ("127.0.0.1", pattern.host()); + EXPECT_FALSE(pattern.match_subdomains()); + EXPECT_EQ("/*", pattern.path()); + EXPECT_TRUE(pattern.MatchesUrl(GURL("http://127.0.0.1"))); +} + +// subdomain matching with ip addresses +TEST(URLPatternTest, Match7) { + URLPattern pattern; + EXPECT_TRUE(pattern.Parse("http://*.0.0.1/*")); // allowed, but useless + EXPECT_EQ("http", pattern.scheme()); + EXPECT_EQ("0.0.1", pattern.host()); + EXPECT_TRUE(pattern.match_subdomains()); + EXPECT_EQ("/*", pattern.path()); + // Subdomain matching is never done if the argument has an IP address host. + EXPECT_FALSE(pattern.MatchesUrl(GURL("http://127.0.0.1"))); +}; + +// unicode +TEST(URLPatternTest, Match8) { + URLPattern pattern; + // The below is the ASCII encoding of the following URL: + // http://*.\xe1\x80\xbf/a\xc2\x81\xe1* + EXPECT_TRUE(pattern.Parse("http://*.xn--gkd/a%C2%81%E1*")); + EXPECT_EQ("http", pattern.scheme()); + EXPECT_EQ("xn--gkd", pattern.host()); + EXPECT_TRUE(pattern.match_subdomains()); + EXPECT_EQ("/a%C2%81%E1*", pattern.path()); + EXPECT_TRUE(pattern.MatchesUrl( + GURL("http://abc.\xe1\x80\xbf/a\xc2\x81\xe1xyz"))); + EXPECT_TRUE(pattern.MatchesUrl( + GURL("http://\xe1\x80\xbf/a\xc2\x81\xe1\xe1"))); +}; |