1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
#define CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
#include <string>
#include "googleurl/src/gurl.h"
// A pattern that can be used to match URLs. A URLPattern is a very restricted
// subset of URL syntax:
//
// <url-pattern> := <scheme>://<host><path>
// <scheme> := 'http' | 'https' | 'file' | 'ftp' | 'chrome'
// <host> := '*' | '*.' <anychar except '/' and '*'>+ |
// <anychar except '/' and '*'>+
// <path> := '/' <any chars>
//
// * Host is not used when the scheme is 'file'.
// * The path can have embedded '*' characters which act as glob wildcards.
//
// Examples of valid patterns:
// - http://*/*
// - http://*/foo*
// - https://*.google.com/foo*bar
// - chrome://foo/bar
// - file://monkey*
// - http://127.0.0.1/*
//
// Examples of invalid patterns:
// - http://* -- path not specified
// - http://*foo/bar -- * not allowed as substring of host component
// - http://foo.*.bar/baz -- * must be first component
// - http:/bar -- scheme separator not found
// - foo://* -- invalid scheme
//
// Design rationale:
// * We need to be able to tell users what 'sites' a given URLPattern will
// affect. For example "This extension will interact with the site
// 'www.google.com'."
// * We'd like to be able to convert as many existing Greasemonkey @include
// patterns to URLPatterns as possible. Greasemonkey @include patterns are
// simple globs, so this won't be perfect.
// * Although we would like to support any scheme, it isn't clear what to tell
// users about URLPatterns that affect data or javascript URLs, and saying
// something useful about chrome-extension URLs is more work, so those are
// left out for now.
//
// From a 2008-ish crawl of userscripts.org, the following patterns were found
// in @include lines:
// - total lines : 24471
// - @include * : 919
// - @include http://[^\*]+?/ : 11128 (no star in host)
// - @include http://\*\.[^\*]+?/ : 2325 (host prefixed by *.)
// - @include http://\*[^\.][^\*]+?/: 1524 (host prefixed by *, no dot -- many
// appear to only need subdomain
// matching, not real prefix matching)
// - @include http://[^\*/]+\*/ : 320 (host suffixed by *)
// - @include contains .tld : 297 (host suffixed by .tld -- a special
// Greasemonkey domain component that
// tries to match all valid registry-
// controlled suffixes)
// - @include http://\*/ : 228 (host is * exactly, but there is
// more to the pattern)
//
// So, we can support at least half of current @include lines without supporting
// subdomain matching. We can pick up at least another 10% by supporting
// subdomain matching. It is probably possible to coerce more of the existing
// patterns to URLPattern, but the resulting pattern will be more restrictive
// than the original glob, which is probably better than nothing.
// Holds the components of one parsed URL pattern -- scheme, host, subdomain
// flag and path -- and tests URLs against them.
class URLPattern {
 public:
  // Creates an empty pattern: every string component is empty and subdomain
  // matching is off. Call Parse() to populate it.
  URLPattern() : match_subdomains_(false) {}

  // Initializes this instance by parsing |pattern_str|. On failure, the
  // instance will have some intermediate values and is in an invalid state,
  // so callers should check the result before using the pattern.
  bool Parse(const std::string& pattern_str);

  // Returns true if this instance matches the specified URL.
  bool MatchesUrl(const GURL& url) const;

  // Returns this pattern as a string.
  // NOTE(review): presumably the result can be fed back into Parse(); the
  // definition is not in this header -- confirm.
  std::string GetAsString() const;

  // The string accessors below hand out references to the stored members
  // instead of copies, so a call does not allocate. A returned reference is
  // only usable while this object is alive, and what it refers to changes if
  // the object is re-parsed or assigned to. Copy the result if it has to
  // outlive the pattern.

  // Gets the scheme the pattern matches. This is a valid scheme once Parse()
  // has succeeded on this instance.
  const std::string& scheme() const { return scheme_; }

  // Gets the host the pattern matches. This can be an empty string if the
  // pattern matches all hosts (the input was <scheme>://*/<whatever>).
  const std::string& host() const { return host_; }

  // Gets whether to match subdomains of host().
  bool match_subdomains() const { return match_subdomains_; }

  // Gets the path the pattern matches with the leading slash. This can have
  // embedded asterisks which are interpreted using glob rules.
  const std::string& path() const { return path_; }

 private:
  // Returns true if |test| matches our host.
  bool MatchesHost(const GURL& test) const;

  // Returns true if |test| matches our path.
  bool MatchesPath(const GURL& test) const;

  // The scheme for the pattern.
  std::string scheme_;

  // The host without any leading "*" components.
  std::string host_;

  // Whether we should match subdomains of the host. This is true if the first
  // component of the pattern's host was "*".
  bool match_subdomains_;

  // The path to match. This is everything after the host of the URL, or
  // everything after the scheme in the case of file:// URLs.
  std::string path_;

  // The path with "?" and "\" characters escaped for use with the
  // MatchPattern() function. This is populated lazily, the first time it is
  // needed, which is why it is mutable: const matching code may fill it in.
  mutable std::string path_escaped_;
};
#endif // CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
|