summaryrefslogtreecommitdiffstats
path: root/chrome/common/extensions/url_pattern.h
blob: f6f51ec92a1fe9d2beec37bb2d744e5209cc8311 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
#define CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_

#include <string>
#include <vector>

#include "googleurl/src/gurl.h"

// A pattern that can be used to match URLs. A URLPattern is a very restricted
// subset of URL syntax:
//
// <url-pattern> := <scheme>://<host><path> | '<all_urls>'
// <scheme> := '*' | 'http' | 'https' | 'file' | 'ftp' | 'chrome'
// <host> := '*' | '*.' <anychar except '/' and '*'>+
// <path> := '/' <any chars>
//
// * Host is not used when the scheme is 'file'.
// * The path can have embedded '*' characters which act as glob wildcards.
// * '<all_urls>' is a special pattern that matches any URL that contains a
//   valid scheme (as specified by valid_schemes_).
// * The '*' scheme pattern excludes file URLs.
//
// Examples of valid patterns:
// - http://*/*
// - http://*/foo*
// - https://*.google.com/foo*bar
// - file://monkey*
// - http://127.0.0.1/*
//
// Examples of invalid patterns:
// - http://* -- path not specified
// - http://*foo/bar -- * not allowed as substring of host component
// - http://foo.*.bar/baz -- * must be first component
// - http:/bar -- scheme separator not found
// - foo://* -- invalid scheme
// - chrome:// -- we don't support chrome internal URLs
//
// Design rationale:
// * We need to be able to tell users what 'sites' a given URLPattern will
//   affect. For example "This extension will interact with the site
//   'www.google.com'.
// * We'd like to be able to convert as many existing Greasemonkey @include
//   patterns to URLPatterns as possible. Greasemonkey @include patterns are
//   simple globs, so this won't be perfect.
// * Although we would like to support any scheme, it isn't clear what to tell
//   users about URLPatterns that affect data or javascript URLs, so those are
//   left out for now.
//
// From a 2008-ish crawl of userscripts.org, the following patterns were found
// in @include lines:
// - total lines                    : 24471
// - @include *                     :   919
// - @include http://[^\*]+?/       : 11128 (no star in host)
// - @include http://\*\.[^\*]+?/   :  2325 (host prefixed by *.)
// - @include http://\*[^\.][^\*]+?/:  1524 (host prefixed by *, no dot -- many
//                                           appear to only need subdomain
//                                           matching, not real prefix matching)
// - @include http://[^\*/]+\*/     :   320 (host suffixed by *)
// - @include contains .tld         :   297 (host suffixed by .tld -- a special
//                                           Greasemonkey domain component that
//                                           tries to match all valid registry-
//                                           controlled suffixes)
// - @include http://\*/            :   228 (host is * exactly, but there is
//                                           more to the pattern)
//
// So, we can support at least half of current @include lines without supporting
// subdomain matching. We can pick up at least another 10% by supporting
// subdomain matching. It is probably possible to coerce more of the existing
// patterns to URLPattern, but the resulting pattern will be more restrictive
// than the original glob, which is probably better than nothing.
class URLPattern {
 public:
  // A collection of scheme bitmasks for use with valid_schemes.
  enum SchemeMasks {
    SCHEME_HTTP = 1<<0,
    SCHEME_HTTPS = 1<<1,
    SCHEME_FILE = 1<<2,
    SCHEME_FTP = 1<<3,
    SCHEME_CHROMEUI = 1<<4,
  };

  // Note: don't use this directly. This exists so URLPattern can be used
  // with STL containers.
  URLPattern();

  // Construct an URLPattern with the given set of allowable schemes. See
  // valid_schemes_ for more info.
  explicit URLPattern(int valid_schemes);

  // Convenience to construct a URLPattern from a string. The string is expected
  // to be a valid pattern. If the string is not known ahead of time, use
  // Parse() instead, which returns success or failure.
  URLPattern(int valid_schemes, const std::string& pattern);

  // Gets the bitmask of valid schemes.
  int valid_schemes() const { return valid_schemes_; }
  void set_valid_schemes(int valid_schemes) { valid_schemes_ = valid_schemes; }

  // Gets the host the pattern matches. This can be an empty string if the
  // pattern matches all hosts (the input was <scheme>://*/<whatever>).
  const std::string& host() const { return host_; }
  void set_host(const std::string& host) { host_ = host; }

  // Gets whether to match subdomains of host().
  bool match_subdomains() const { return match_subdomains_; }
  void set_match_subdomains(bool val) { match_subdomains_ = val; }

  // Gets the path the pattern matches with the leading slash. This can have
  // embedded asterisks which are interpreted using glob rules.
  const std::string& path() const { return path_; }
  void set_path(const std::string& path) {
    path_ = path;
    path_escaped_ = "";
  }

  // Returns true if this pattern matches all urls.
  bool match_all_urls() const { return match_all_urls_; }
  void set_match_all_urls(bool val) { match_all_urls_ = val; }

  // Initializes this instance by parsing the provided string. On failure, the
  // instance will have some intermediate values and is in an invalid state.
  bool Parse(const std::string& pattern_str);

  // Sets the scheme for pattern matches. This can be a single '*' if the
  // pattern matches all valid schemes (as defined by the valid_schemes_
  // property). Returns false on failure (if the scheme is not valid).
  bool SetScheme(const std::string& scheme);
  // Note: You should use MatchesScheme() instead of this getter unless you
  // absolutely need the exact scheme. This is exposed for testing.
  const std::string& scheme() const { return scheme_; }

  // Returns true if the specified scheme can be used in this URL pattern, and
  // false otherwise. Uses valid_schemes_ to determine validity.
  bool IsValidScheme(const std::string& scheme) const;

  // Returns true if this instance matches the specified URL.
  bool MatchesUrl(const GURL& url) const;

  // Returns true if |test| matches our scheme.
  bool MatchesScheme(const std::string& test) const;

  // Returns true if |test| matches our host.
  bool MatchesHost(const std::string& test) const;
  bool MatchesHost(const GURL& test) const;

  // Returns true if |test| matches our path.
  bool MatchesPath(const std::string& test) const;

  // Returns a string representing this instance.
  std::string GetAsString() const;

  // Determine whether there is a URL that would match this instance and another
  // instance. This method is symmetrical: Calling other.OverlapsWith(this)
  // would result in the same answer.
  bool OverlapsWith(const URLPattern& other) const;

  // Conver this URLPattern into an equivalent set of URLPatterns that don't use
  // a wildcard in the scheme component. If this URLPattern doesn't use a
  // wildcard scheme, then the returned set will contain one element that is
  // equivalent to this instance.
  std::vector<URLPattern> ConvertToExplicitSchemes() const;

 private:
  // A bitmask containing the schemes which are considered valid for this
  // pattern. Parse() uses this to decide whether a pattern contains a valid
  // scheme. MatchesScheme uses this to decide whether a wildcard scheme_
  // matches a given test scheme.
  int valid_schemes_;

  // True if this is a special-case "<all_urls>" pattern.
  bool match_all_urls_;

  // The scheme for the pattern.
  std::string scheme_;

  // The host without any leading "*" components.
  std::string host_;

  // Whether we should match subdomains of the host. This is true if the first
  // component of the pattern's host was "*".
  bool match_subdomains_;

  // The path to match. This is everything after the host of the URL, or
  // everything after the scheme in the case of file:// URLs.
  std::string path_;

  // The path with "?" and "\" characters escaped for use with the
  // MatchPatternASCII() function. This is populated lazily, the first time it
  // is needed.
  mutable std::string path_escaped_;
};

#endif  // CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_