summaryrefslogtreecommitdiffstats
path: root/chrome/common/extensions/url_pattern.h
diff options
context:
space:
mode:
Diffstat (limited to 'chrome/common/extensions/url_pattern.h')
-rw-r--r--chrome/common/extensions/url_pattern.h217
1 files changed, 217 insertions, 0 deletions
diff --git a/chrome/common/extensions/url_pattern.h b/chrome/common/extensions/url_pattern.h
new file mode 100644
index 0000000..3fe6eb6
--- /dev/null
+++ b/chrome/common/extensions/url_pattern.h
@@ -0,0 +1,217 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#ifndef CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
+#define CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
+#pragma once
+
+#include <functional>
+#include <string>
+#include <vector>
+
+class GURL;
+
+// A pattern that can be used to match URLs. A URLPattern is a very restricted
+// subset of URL syntax:
+//
+// <url-pattern> := <scheme>://<host><path> | '<all_urls>'
+// <scheme> := '*' | 'http' | 'https' | 'file' | 'ftp' | 'chrome'
+// <host> := '*' | '*.' <anychar except '/' and '*'>+
+// <path> := '/' <any chars>
+//
+// * Host is not used when the scheme is 'file'.
+// * The path can have embedded '*' characters which act as glob wildcards.
+// * '<all_urls>' is a special pattern that matches any URL that contains a
+// valid scheme (as specified by valid_schemes_).
+// * The '*' scheme pattern excludes file URLs.
+//
+// Examples of valid patterns:
+// - http://*/*
+// - http://*/foo*
+// - https://*.google.com/foo*bar
+// - file://monkey*
+// - http://127.0.0.1/*
+//
+// Examples of invalid patterns:
+// - http://* -- path not specified
+// - http://*foo/bar -- * not allowed as substring of host component
+// - http://foo.*.bar/baz -- * must be first component
+// - http:/bar -- scheme separator not found
+// - foo://* -- invalid scheme
+// - chrome:// -- we don't support chrome internal URLs
+//
+// Design rationale:
+// * We need to be able to tell users what 'sites' a given URLPattern will
+// affect. For example: "This extension will interact with the site
+// 'www.google.com'."
+// * We'd like to be able to convert as many existing Greasemonkey @include
+// patterns to URLPatterns as possible. Greasemonkey @include patterns are
+// simple globs, so this won't be perfect.
+// * Although we would like to support any scheme, it isn't clear what to tell
+// users about URLPatterns that affect data or javascript URLs, so those are
+// left out for now.
+//
+// From a 2008-ish crawl of userscripts.org, the following patterns were found
+// in @include lines:
+// - total lines : 24471
+// - @include * : 919
+// - @include http://[^\*]+?/ : 11128 (no star in host)
+// - @include http://\*\.[^\*]+?/ : 2325 (host prefixed by *.)
+// - @include http://\*[^\.][^\*]+?/: 1524 (host prefixed by *, no dot -- many
+// appear to only need subdomain
+// matching, not real prefix matching)
+// - @include http://[^\*/]+\*/ : 320 (host suffixed by *)
+// - @include contains .tld : 297 (host suffixed by .tld -- a special
+// Greasemonkey domain component that
+// tries to match all valid registry-
+// controlled suffixes)
+// - @include http://\*/ : 228 (host is * exactly, but there is
+// more to the pattern)
+//
+// So, we can support at least half of current @include lines without supporting
+// subdomain matching. We can pick up at least another 10% by supporting
+// subdomain matching. It is probably possible to coerce more of the existing
+// patterns to URLPattern, but the resulting pattern will be more restrictive
+// than the original glob, which is probably better than nothing.
+class URLPattern {
+ public:
+ // Bit flags, one per scheme, that are combined to form valid_schemes.
+ enum SchemeMasks {
+ SCHEME_HTTP = 1<<0,
+ SCHEME_HTTPS = 1<<1,
+ SCHEME_FILE = 1<<2,
+ SCHEME_FTP = 1<<3,
+ SCHEME_CHROMEUI = 1<<4,
+ };
+
+ // Note: don't use this directly. This exists so URLPattern can be used
+ // with STL containers.
+ URLPattern();
+
+ // Construct a URLPattern with the given set of allowable schemes. See
+ // valid_schemes_ for more info.
+ explicit URLPattern(int valid_schemes);
+
+ // Convenience to construct a URLPattern from a string. The string is expected
+ // to be a valid pattern. If the string is not known ahead of time, use
+ // Parse() instead, which returns success or failure.
+ URLPattern(int valid_schemes, const std::string& pattern);
+
+ ~URLPattern();
+
+ // Gets or replaces the bitmask of valid schemes (see valid_schemes_).
+ int valid_schemes() const { return valid_schemes_; }
+ void set_valid_schemes(int valid_schemes) { valid_schemes_ = valid_schemes; }
+
+ // Gets the host the pattern matches. This can be an empty string if the
+ // pattern matches all hosts (the input was <scheme>://*/<whatever>).
+ const std::string& host() const { return host_; }
+ void set_host(const std::string& host) { host_ = host; }
+
+ // Gets whether to match subdomains of host().
+ bool match_subdomains() const { return match_subdomains_; }
+ void set_match_subdomains(bool val) { match_subdomains_ = val; }
+
+ // Gets the path the pattern matches with the leading slash. This can have
+ // embedded asterisks which are interpreted using glob rules.
+ const std::string& path() const { return path_; }
+ void set_path(const std::string& path) {
+ path_ = path;
+ path_escaped_ = "";  // Reset the cached escaped copy (see path_escaped_).
+ }
+
+ // Returns true if this pattern matches all urls.
+ bool match_all_urls() const { return match_all_urls_; }
+ void set_match_all_urls(bool val) { match_all_urls_ = val; }
+
+ // Initializes this instance by parsing the provided string. On failure, the
+ // instance will have some intermediate values and is in an invalid state.
+ bool Parse(const std::string& pattern_str);
+
+ // Sets the scheme for pattern matches. This can be a single '*' if the
+ // pattern matches all valid schemes (as defined by the valid_schemes_
+ // property). Returns false on failure (if the scheme is not valid).
+ bool SetScheme(const std::string& scheme);
+ // Note: You should use MatchesScheme() instead of this getter unless you
+ // absolutely need the exact scheme. This is exposed for testing.
+ const std::string& scheme() const { return scheme_; }
+
+ // Returns true if the specified scheme can be used in this URL pattern, and
+ // false otherwise. Uses valid_schemes_ to determine validity.
+ bool IsValidScheme(const std::string& scheme) const;
+
+ // Returns true if this instance matches the specified URL.
+ bool MatchesUrl(const GURL& url) const;
+
+ // Returns true if |test| matches our scheme.
+ bool MatchesScheme(const std::string& test) const;
+
+ // Returns true if |test| matches our host.
+ bool MatchesHost(const std::string& test) const;
+ bool MatchesHost(const GURL& test) const;
+
+ // Returns true if |test| matches our path.
+ bool MatchesPath(const std::string& test) const;
+
+ // Returns a string representing this instance.
+ std::string GetAsString() const;
+
+ // Determine whether there is a URL that would match this instance and another
+ // instance. This method is symmetrical: Calling other.OverlapsWith(*this)
+ // would result in the same answer.
+ bool OverlapsWith(const URLPattern& other) const;
+
+ // Convert this URLPattern into an equivalent set of URLPatterns with no
+ // wildcard in the scheme component. If this URLPattern has no wildcard
+ // scheme, the returned set has one element equivalent to this instance.
+ std::vector<URLPattern> ConvertToExplicitSchemes() const;
+
+ // Less-than by host_; two match-all-URLs patterns are treated as equivalent.
+ static bool EffectiveHostCompare(const URLPattern& a, const URLPattern& b) {
+ if (a.match_all_urls_ && b.match_all_urls_)
+ return false;
+ return a.host_.compare(b.host_) < 0;
+ };
+
+ // Used for origin comparisons in a std::set. NOTE(review): the base class
+ // std::binary_function was deprecated in C++11 and removed in C++17.
+ class EffectiveHostCompareFunctor :
+ public std::binary_function<URLPattern, URLPattern, bool> {
+ public:
+ bool operator()(const URLPattern& a, const URLPattern& b) const {
+ return EffectiveHostCompare(a, b);
+ };
+ };
+
+ private:
+ // A bitmask containing the schemes which are considered valid for this
+ // pattern. Parse() uses this to decide whether a pattern contains a valid
+ // scheme. MatchesScheme uses this to decide whether a wildcard scheme_
+ // matches a given test scheme.
+ int valid_schemes_;
+
+ // True if this is a special-case "<all_urls>" pattern.
+ bool match_all_urls_;
+
+ // The scheme for the pattern.
+ std::string scheme_;
+
+ // The host without any leading "*" components.
+ std::string host_;
+
+ // Whether we should match subdomains of the host. This is true if the first
+ // component of the pattern's host was "*".
+ bool match_subdomains_;
+
+ // The path to match. This is everything after the host of the URL, or
+ // everything after the scheme in the case of file:// URLs.
+ std::string path_;
+
+ // The path with "?" and "\" characters escaped for use with MatchPattern().
+ // Populated lazily, the first time it is needed.
+ mutable std::string path_escaped_;
+};
+
+// Convenience alias for a sequence of URLPatterns.
+typedef std::vector<URLPattern> URLPatternList;
+#endif // CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_