// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_ #define CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_ #pragma once #include #include #include "googleurl/src/gurl.h" // A pattern that can be used to match URLs. A URLPattern is a very restricted // subset of URL syntax: // // := :// | '' // := '*' | 'http' | 'https' | 'file' | 'ftp' | 'chrome' // := '*' | '*.' + // := '/' // // * Host is not used when the scheme is 'file'. // * The path can have embedded '*' characters which act as glob wildcards. // * '' is a special pattern that matches any URL that contains a // valid scheme (as specified by valid_schemes_). // * The '*' scheme pattern excludes file URLs. // // Examples of valid patterns: // - http://*/* // - http://*/foo* // - https://*.google.com/foo*bar // - file://monkey* // - http://127.0.0.1/* // // Examples of invalid patterns: // - http://* -- path not specified // - http://*foo/bar -- * not allowed as substring of host component // - http://foo.*.bar/baz -- * must be first component // - http:/bar -- scheme separator not found // - foo://* -- invalid scheme // - chrome:// -- we don't support chrome internal URLs // // Design rationale: // * We need to be able to tell users what 'sites' a given URLPattern will // affect. For example "This extension will interact with the site // 'www.google.com'. // * We'd like to be able to convert as many existing Greasemonkey @include // patterns to URLPatterns as possible. Greasemonkey @include patterns are // simple globs, so this won't be perfect. // * Although we would like to support any scheme, it isn't clear what to tell // users about URLPatterns that affect data or javascript URLs, so those are // left out for now. // // From a 2008-ish crawl of userscripts.org, the following patterns were found // in @include lines: // - total lines : 24471 // - @include * : 919 // - @include http://[^\*]+?/ : 11128 (no star in host) // - @include http://\*\.[^\*]+?/ : 2325 (host prefixed by *.) // - @include http://\*[^\.][^\*]+?/: 1524 (host prefixed by *, no dot -- many // appear to only need subdomain // matching, not real prefix matching) // - @include http://[^\*/]+\*/ : 320 (host suffixed by *) // - @include contains .tld : 297 (host suffixed by .tld -- a special // Greasemonkey domain component that // tries to match all valid registry- // controlled suffixes) // - @include http://\*/ : 228 (host is * exactly, but there is // more to the pattern) // // So, we can support at least half of current @include lines without supporting // subdomain matching. We can pick up at least another 10% by supporting // subdomain matching. It is probably possible to coerce more of the existing // patterns to URLPattern, but the resulting pattern will be more restrictive // than the original glob, which is probably better than nothing. class URLPattern { public: // A collection of scheme bitmasks for use with valid_schemes. enum SchemeMasks { SCHEME_HTTP = 1<<0, SCHEME_HTTPS = 1<<1, SCHEME_FILE = 1<<2, SCHEME_FTP = 1<<3, SCHEME_CHROMEUI = 1<<4, }; // Note: don't use this directly. This exists so URLPattern can be used // with STL containers. URLPattern(); // Construct an URLPattern with the given set of allowable schemes. See // valid_schemes_ for more info. explicit URLPattern(int valid_schemes); // Convenience to construct a URLPattern from a string. The string is expected // to be a valid pattern. If the string is not known ahead of time, use // Parse() instead, which returns success or failure. URLPattern(int valid_schemes, const std::string& pattern); // Gets the bitmask of valid schemes. int valid_schemes() const { return valid_schemes_; } void set_valid_schemes(int valid_schemes) { valid_schemes_ = valid_schemes; } // Gets the host the pattern matches. This can be an empty string if the // pattern matches all hosts (the input was ://*/). const std::string& host() const { return host_; } void set_host(const std::string& host) { host_ = host; } // Gets whether to match subdomains of host(). bool match_subdomains() const { return match_subdomains_; } void set_match_subdomains(bool val) { match_subdomains_ = val; } // Gets the path the pattern matches with the leading slash. This can have // embedded asterisks which are interpreted using glob rules. const std::string& path() const { return path_; } void set_path(const std::string& path) { path_ = path; path_escaped_ = ""; } // Returns true if this pattern matches all urls. bool match_all_urls() const { return match_all_urls_; } void set_match_all_urls(bool val) { match_all_urls_ = val; } // Initializes this instance by parsing the provided string. On failure, the // instance will have some intermediate values and is in an invalid state. bool Parse(const std::string& pattern_str); // Sets the scheme for pattern matches. This can be a single '*' if the // pattern matches all valid schemes (as defined by the valid_schemes_ // property). Returns false on failure (if the scheme is not valid). bool SetScheme(const std::string& scheme); // Note: You should use MatchesScheme() instead of this getter unless you // absolutely need the exact scheme. This is exposed for testing. const std::string& scheme() const { return scheme_; } // Returns true if the specified scheme can be used in this URL pattern, and // false otherwise. Uses valid_schemes_ to determine validity. bool IsValidScheme(const std::string& scheme) const; // Returns true if this instance matches the specified URL. bool MatchesUrl(const GURL& url) const; // Returns true if |test| matches our scheme. bool MatchesScheme(const std::string& test) const; // Returns true if |test| matches our host. bool MatchesHost(const std::string& test) const; bool MatchesHost(const GURL& test) const; // Returns true if |test| matches our path. bool MatchesPath(const std::string& test) const; // Returns a string representing this instance. std::string GetAsString() const; // Determine whether there is a URL that would match this instance and another // instance. This method is symmetrical: Calling other.OverlapsWith(this) // would result in the same answer. bool OverlapsWith(const URLPattern& other) const; // Conver this URLPattern into an equivalent set of URLPatterns that don't use // a wildcard in the scheme component. If this URLPattern doesn't use a // wildcard scheme, then the returned set will contain one element that is // equivalent to this instance. std::vector ConvertToExplicitSchemes() const; private: // A bitmask containing the schemes which are considered valid for this // pattern. Parse() uses this to decide whether a pattern contains a valid // scheme. MatchesScheme uses this to decide whether a wildcard scheme_ // matches a given test scheme. int valid_schemes_; // True if this is a special-case "" pattern. bool match_all_urls_; // The scheme for the pattern. std::string scheme_; // The host without any leading "*" components. std::string host_; // Whether we should match subdomains of the host. This is true if the first // component of the pattern's host was "*". bool match_subdomains_; // The path to match. This is everything after the host of the URL, or // everything after the scheme in the case of file:// URLs. std::string path_; // The path with "?" and "\" characters escaped for use with the // MatchPatternASCII() function. This is populated lazily, the first time it // is needed. mutable std::string path_escaped_; }; #endif // CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_