summaryrefslogtreecommitdiffstats
path: root/chrome/common/extensions/url_pattern.h
blob: d952216429f123742a63527b78f337a496b2f946 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
#define CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_

#include <string>

#include "googleurl/src/gurl.h"

// A pattern that can be used to match URLs. A URLPattern is a very restricted
// subset of URL syntax:
//
// <url-pattern> := <scheme>://<host><path>
// <scheme> := 'http' | 'https' | 'file' | 'ftp' | 'chrome'
// <host> := '*' | '*.' <hostchars> | <hostchars>
// <hostchars> := <anychar except '/' and '*'>+
// <path> := '/' <any chars>
//
// * Host is not used when the scheme is 'file'.
// * The path can have embedded '*' characters which act as glob wildcards.
//
// Examples of valid patterns:
// - http://*/*
// - http://*/foo*
// - https://*.google.com/foo*bar
// - file://monkey*
// - http://127.0.0.1/*
//
// Examples of invalid patterns:
// - http://* -- path not specified
// - http://*foo/bar -- * not allowed as substring of host component
// - http://foo.*.bar/baz -- * must be first component
// - http:/bar -- scheme separator not found
// - foo://* -- invalid scheme
// - chrome:// -- we don't support chrome internal URLs
//
// Design rationale:
// * We need to be able to tell users what 'sites' a given URLPattern will
//   affect. For example "This extension will interact with the site
//   'www.google.com'."
// * We'd like to be able to convert as many existing Greasemonkey @include
//   patterns to URLPatterns as possible. Greasemonkey @include patterns are
//   simple globs, so this won't be perfect.
// * Although we would like to support any scheme, it isn't clear what to tell
//   users about URLPatterns that affect data or javascript URLs, so those are
//   left out for now.
//
// From a 2008-ish crawl of userscripts.org, the following patterns were found
// in @include lines:
// - total lines                    : 24471
// - @include *                     :   919
// - @include http://[^\*]+?/       : 11128 (no star in host)
// - @include http://\*\.[^\*]+?/   :  2325 (host prefixed by *.)
// - @include http://\*[^\.][^\*]+?/:  1524 (host prefixed by *, no dot -- many
//                                           appear to only need subdomain
//                                           matching, not real prefix matching)
// - @include http://[^\*/]+\*/     :   320 (host suffixed by *)
// - @include contains .tld         :   297 (host suffixed by .tld -- a special
//                                           Greasemonkey domain component that
//                                           tries to match all valid registry-
//                                           controlled suffixes)
// - @include http://\*/            :   228 (host is * exactly, but there is
//                                           more to the pattern)
//
// So, we can support at least half of current @include lines without supporting
// subdomain matching. We can pick up at least another 10% by supporting
// subdomain matching. It is probably possible to coerce more of the existing
// patterns to URLPattern, but the resulting pattern will be more restrictive
// than the original glob, which is probably better than nothing.
class URLPattern {
 public:
  // Reports whether |scheme| is acceptable as the scheme of a URL pattern.
  static bool IsValidScheme(const std::string& scheme);

  // Builds an unparsed pattern: subdomain matching is off and every string
  // member starts out empty.
  URLPattern() : match_subdomains_(false) {}

  // Parses |pattern_str| into this instance. The return value signals whether
  // parsing succeeded. If it did not, the instance is left holding
  // intermediate values and must be treated as invalid.
  bool Parse(const std::string& pattern_str);

  // Tests |url| against this pattern; the result is true when it matches.
  bool MatchesUrl(const GURL& url) const;

  // Produces a string form of this pattern.
  // NOTE(review): presumably the same textual form that Parse() accepts --
  // confirm against the implementation.
  std::string GetAsString() const;

  // The scheme this pattern matches. It is only guaranteed to be a valid
  // scheme when the pattern has been parsed successfully.
  std::string scheme() const { return scheme_; }

  // The host this pattern matches. An empty string means the pattern matches
  // every host, i.e. the input had the form <scheme>://*/<whatever>.
  std::string host() const { return host_; }

  // Whether subdomains of host() are matched as well.
  bool match_subdomains() const { return match_subdomains_; }

  // The path this pattern matches, leading slash included. Asterisks embedded
  // in the path are interpreted using glob rules.
  std::string path() const { return path_; }

  // Replaces the path. The cached escaped copy is emptied as well, so the
  // escaped form of the previous path is not kept around.
  void set_path(const std::string& path) {
    path_ = path;
    path_escaped_.clear();
  }

 private:
  // Checks |test| against the host part of this pattern; true on a match.
  bool MatchesHost(const GURL& test) const;

  // Checks |test| against the path part of this pattern; true on a match.
  bool MatchesPath(const GURL& test) const;

  // Scheme component of the pattern.
  std::string scheme_;

  // Host component, stored with any leading "*" components removed.
  std::string host_;

  // Set when the first component of the pattern's host was "*", meaning that
  // subdomains of |host_| should match too.
  bool match_subdomains_;

  // Path component: everything following the host, or, for file:// URLs,
  // everything following the scheme.
  std::string path_;

  // Copy of |path_| in which "?" and "\" are escaped so that it can be handed
  // to the MatchPattern() function. Filled in lazily on first use, which is
  // why it is mutable.
  mutable std::string path_escaped_;
};

#endif  // CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_