summaryrefslogtreecommitdiffstats
path: root/chrome/common/extensions/url_pattern.h
blob: f69cd7ab22ee64d236a5814bd3f895ac4300098a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
#define CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_

#include <string>

#include "googleurl/src/gurl.h"

// A pattern that can be used to match URLs. A URLPattern is a very restricted
// subset of URL syntax:
//
// <url-pattern> := <scheme>://<host><path>
// <scheme> := 'http' | 'https' | 'file' | 'ftp' | 'chrome'
// <host> := '*' | '*.' <anychar except '/' and '*'>+
//           | <anychar except '/' and '*'>+
// <path> := '/' <any chars>
//
// * Host is not used when the scheme is 'file'.
// * The path can have embedded '*' characters which act as glob wildcards.
//
// Examples of valid patterns:
// - http://*/*
// - http://*/foo*
// - https://*.google.com/foo*bar
// - file://monkey*
// - http://127.0.0.1/*
//
// Examples of invalid patterns:
// - http://* -- path not specified
// - http://*foo/bar -- * not allowed as substring of host component
// - http://foo.*.bar/baz -- * must be first component
// - http:/bar -- scheme separator not found
// - foo://* -- invalid scheme
// - chrome:// -- we don't support chrome internal URLs
//
// Design rationale:
// * We need to be able to tell users what 'sites' a given URLPattern will
//   affect. For example "This extension will interact with the site
//   'www.google.com'."
// * We'd like to be able to convert as many existing Greasemonkey @include
//   patterns to URLPatterns as possible. Greasemonkey @include patterns are
//   simple globs, so this won't be perfect.
// * Although we would like to support any scheme, it isn't clear what to tell
//   users about URLPatterns that affect data or javascript URLs, so those are
//   left out for now.
//
// From a 2008-ish crawl of userscripts.org, the following patterns were found
// in @include lines:
// - total lines                    : 24471
// - @include *                     :   919
// - @include http://[^\*]+?/       : 11128 (no star in host)
// - @include http://\*\.[^\*]+?/   :  2325 (host prefixed by *.)
// - @include http://\*[^\.][^\*]+?/:  1524 (host prefixed by *, no dot -- many
//                                           appear to only need subdomain
//                                           matching, not real prefix matching)
// - @include http://[^\*/]+\*/     :   320 (host suffixed by *)
// - @include contains .tld         :   297 (host suffixed by .tld -- a special
//                                           Greasemonkey domain component that
//                                           tries to match all valid registry-
//                                           controlled suffixes)
// - @include http://\*/            :   228 (host is * exactly, but there is
//                                           more to the pattern)
//
// So, we can support at least half of current @include lines without supporting
// subdomain matching. We can pick up at least another 10% by supporting
// subdomain matching. It is probably possible to coerce more of the existing
// patterns to URLPattern, but the resulting pattern will be more restrictive
// than the original glob, which is probably better than nothing.
class URLPattern {
 public:
  // Returns true if the specified scheme can be used in URL patterns, and false
  // otherwise.
  static bool IsValidScheme(const std::string& scheme);

  // Creates an empty pattern: all string components are empty and subdomain
  // matching is off. Call Parse() to populate it.
  URLPattern() : match_subdomains_(false) {}

  // Initializes this instance by parsing the provided string. On failure, the
  // instance will have some intermediate values and is in an invalid state.
  // NOTE(review): presumably returns true on success and false on failure;
  // confirm against the implementation file.
  bool Parse(const std::string& pattern_str);

  // Returns true if this instance matches the specified URL.
  bool MatchesUrl(const GURL& url) const;

  // Returns this pattern as a string.
  // NOTE(review): presumably the result can be fed back into Parse(); confirm
  // against the implementation file.
  std::string GetAsString() const;

  // The accessors below return references to members of this object. The
  // references stay valid only while this URLPattern is alive, and the values
  // they refer to may change if Parse() is called again.

  // Gets the scheme the pattern matches.
  // NOTE(review): an earlier comment tied scheme validity to an is_valid()
  // method that this class does not declare; presumably the scheme satisfies
  // IsValidScheme() whenever Parse() succeeded -- confirm.
  const std::string& scheme() const { return scheme_; }

  // Gets the host the pattern matches. This can be an empty string if the
  // pattern matches all hosts (the input was <scheme>://*/<whatever>).
  const std::string& host() const { return host_; }

  // Gets whether to match subdomains of host().
  bool match_subdomains() const { return match_subdomains_; }

  // Gets the path the pattern matches with the leading slash. This can have
  // embedded asterisks which are interpreted using glob rules.
  const std::string& path() const { return path_; }

 private:
  // Returns true if |test| matches our host.
  bool MatchesHost(const GURL& test) const;

  // Returns true if |test| matches our path.
  bool MatchesPath(const GURL& test) const;

  // The scheme for the pattern.
  std::string scheme_;

  // The host without any leading "*" components.
  std::string host_;

  // Whether we should match subdomains of the host. This is true if the first
  // component of the pattern's host was "*".
  bool match_subdomains_;

  // The path to match. This is everything after the host of the URL, or
  // everything after the scheme in the case of file:// URLs.
  std::string path_;

  // The path with "?" and "\" characters escaped for use with the
  // MatchPattern() function. This is populated lazily, the first time it is
  // needed; it is declared mutable so that const methods can fill it in.
  mutable std::string path_escaped_;
};

#endif  // CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_