summaryrefslogtreecommitdiffstats
path: root/url/gurl.h
blob: 8c274eaedbd9ed3247f1ea0365630067e8e47071 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef URL_GURL_H_
#define URL_GURL_H_

#include <iosfwd>
#include <string>

#include "base/memory/scoped_ptr.h"
#include "base/strings/string16.h"
#include "base/strings/string_piece.h"
#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon.h"
#include "url/url_canon_stdstring.h"
#include "url/url_constants.h"
#include "url/url_export.h"

// Represents a URL.
//
// A parsed canonicalized URL will be guaranteed UTF-8. Only the ref (if
// specified) can be non-ASCII, the host, path, etc. will be guaranteed ASCII
// and any non-ASCII characters will be encoded and % escaped.
//
// The string representation of a URL is called the spec(). Getting the
// spec will assert if the URL is invalid to help protect against malicious
// URLs. If you want the "best effort" canonicalization of an invalid URL, you
// can use possibly_invalid_spec(). Test validity with is_valid(). Data and
// javascript URLs use GetContent() to extract the data.
//
// This class has existence checkers and getters for the various components of
// a URL. Existence is different than being nonempty. "http://www.google.com/?"
// has a query that just happens to be empty, and has_query() will return true
// while the query getters will return the empty string.
//
// Prefer not to modify a URL using string operations (though sometimes this is
// unavoidable). Instead, use ReplaceComponents which can replace or delete
// multiple parts of a URL in one step, doesn't re-canonicalize unchanged
// sections, and avoids some screw-ups. An example is creating a URL with a
// path that contains a literal '#'. Using string concatenation will generate a
// URL with a truncated path and a reference fragment, while ReplaceComponents
// will know to escape this and produce the desired result.
class URL_EXPORT GURL {
 public:
  typedef url::StringPieceReplacements<std::string> Replacements;
  typedef url::StringPieceReplacements<base::string16> ReplacementsW;

  // Creates an empty, invalid URL.
  GURL();

  // Copy construction is relatively inexpensive, with most of the time going
  // to reallocating the string. It does not re-parse.
  GURL(const GURL& other);

  // The strings to this contructor should be UTF-8 / UTF-16.
  explicit GURL(const std::string& url_string);
  explicit GURL(const base::string16& url_string);

  // Constructor for URLs that have already been parsed and canonicalized. This
  // is used for conversions from KURL, for example. The caller must supply all
  // information associated with the URL, which must be correct and consistent.
  GURL(const char* canonical_spec,
       size_t canonical_spec_len,
       const url::Parsed& parsed,
       bool is_valid);
  // Notice that we take the canonical_spec by value so that we can convert
  // from WebURL without copying the string. When we call this constructor
  // we pass in a temporary std::string, which lets the compiler skip the
  // copy and just move the std::string into the function argument. In the
  // implementation, we use swap to move the data into the GURL itself,
  // which means we end up with zero copies.
  GURL(std::string canonical_spec, const url::Parsed& parsed, bool is_valid);

  ~GURL();

  GURL& operator=(GURL other);

  // Returns true when this object represents a valid parsed URL. When not
  // valid, other functions will still succeed, but you will not get canonical
  // data out in the format you may be expecting. Instead, we keep something
  // "reasonable looking" so that the user can see how it's busted if
  // displayed to them.
  bool is_valid() const {
    return is_valid_;
  }

  // Returns true if the URL is zero-length. Note that empty URLs are also
  // invalid, and is_valid() will return false for them. This is provided
  // because some users may want to treat the empty case differently.
  bool is_empty() const {
    return spec_.empty();
  }

  // Returns the raw spec, i.e., the full text of the URL, in canonical UTF-8,
  // if the URL is valid. If the URL is not valid, this will assert and return
  // the empty string (for safety in release builds, to keep them from being
  // misused which might be a security problem).
  //
  // The URL will be ASCII except the reference fragment, which may be UTF-8.
  // It is guaranteed to be valid UTF-8.
  //
  // The exception is for empty() URLs (which are !is_valid()) but this will
  // return the empty string without asserting.
  //
  // Used invalid_spec() below to get the unusable spec of an invalid URL. This
  // separation is designed to prevent errors that may cause security problems
  // that could result from the mistaken use of an invalid URL.
  const std::string& spec() const;

  // Returns the potentially invalid spec for a the URL. This spec MUST NOT be
  // modified or sent over the network. It is designed to be displayed in error
  // messages to the user, as the appearance of the spec may explain the error.
  // If the spec is valid, the valid spec will be returned.
  //
  // The returned string is guaranteed to be valid UTF-8.
  const std::string& possibly_invalid_spec() const {
    return spec_;
  }

  // Getter for the raw parsed structure. This allows callers to locate parts
  // of the URL within the spec themselves. Most callers should consider using
  // the individual component getters below.
  //
  // The returned parsed structure will reference into the raw spec, which may
  // or may not be valid. If you are using this to index into the spec, BE
  // SURE YOU ARE USING possibly_invalid_spec() to get the spec, and that you
  // don't do anything "important" with invalid specs.
  const url::Parsed& parsed_for_possibly_invalid_spec() const {
    return parsed_;
  }

  // Defiant equality operator!
  bool operator==(const GURL& other) const;
  bool operator!=(const GURL& other) const;

  // Allows GURL to used as a key in STL (for example, a std::set or std::map).
  bool operator<(const GURL& other) const;
  bool operator>(const GURL& other) const;

  // Resolves a URL that's possibly relative to this object's URL, and returns
  // it. Absolute URLs are also handled according to the rules of URLs on web
  // pages.
  //
  // It may be impossible to resolve the URLs properly. If the input is not
  // "standard" (IsStandard() == false) and the input looks relative, we can't
  // resolve it. In these cases, the result will be an empty, invalid GURL.
  //
  // The result may also be a nonempty, invalid URL if the input has some kind
  // of encoding error. In these cases, we will try to construct a "good" URL
  // that may have meaning to the user, but it will be marked invalid.
  //
  // It is an error to resolve a URL relative to an invalid URL. The result
  // will be the empty URL.
  GURL Resolve(const std::string& relative) const;
  GURL Resolve(const base::string16& relative) const;

  // Creates a new GURL by replacing the current URL's components with the
  // supplied versions. See the Replacements class in url_canon.h for more.
  //
  // These are not particularly quick, so avoid doing mutations when possible.
  // Prefer the 8-bit version when possible.
  //
  // It is an error to replace components of an invalid URL. The result will
  // be the empty URL.
  //
  // Note that we use the more general url::Replacements type to give
  // callers extra flexibility rather than our override.
  GURL ReplaceComponents(const url::Replacements<char>& replacements) const;
  GURL ReplaceComponents(
      const url::Replacements<base::char16>& replacements) const;

  // A helper function that is equivalent to replacing the path with a slash
  // and clearing out everything after that. We sometimes need to know just the
  // scheme and the authority. If this URL is not a standard URL (it doesn't
  // have the regular authority and path sections), then the result will be
  // an empty, invalid GURL. Note that this *does* work for file: URLs, which
  // some callers may want to filter out before calling this.
  //
  // It is an error to get an empty path on an invalid URL. The result
  // will be the empty URL.
  GURL GetWithEmptyPath() const;

  // A helper function to return a GURL containing just the scheme, host,
  // and port from a URL. Equivalent to clearing any username and password,
  // replacing the path with a slash, and clearing everything after that. If
  // this URL is not a standard URL, then the result will be an empty,
  // invalid GURL. If the URL has neither username nor password, this
  // degenerates to GetWithEmptyPath().
  //
  // It is an error to get the origin of an invalid URL. The result
  // will be the empty URL.
  GURL GetOrigin() const;

  // A helper function to return a GURL stripped from the elements that are not
  // supposed to be sent as HTTP referrer: username, password and ref fragment.
  // For invalid URLs or URLs that no valid referrers, an empty URL will be
  // returned.
  GURL GetAsReferrer() const;

  // Returns true if the scheme for the current URL is a known "standard-format"
  // scheme. A standard-format scheme adheres to what RFC 3986 calls "generic
  // URI syntax" (https://tools.ietf.org/html/rfc3986#section-3). This includes
  // file: and filesystem:, which some callers may want to filter out explicitly
  // by calling SchemeIsFile[System].
  bool IsStandard() const;

  // Returns true if the given parameter (should be lower-case ASCII to match
  // the canonicalized scheme) is the scheme for this URL. Do not include a
  // colon.
  bool SchemeIs(base::StringPiece lower_ascii_scheme) const;

  // Returns true if the scheme is "http" or "https".
  bool SchemeIsHTTPOrHTTPS() const;

  // Returns true is the scheme is "ws" or "wss".
  bool SchemeIsWSOrWSS() const;

  // We often need to know if this is a file URL. File URLs are "standard", but
  // are often treated separately by some programs.
  bool SchemeIsFile() const {
    return SchemeIs(url::kFileScheme);
  }

  // FileSystem URLs need to be treated differently in some cases.
  bool SchemeIsFileSystem() const {
    return SchemeIs(url::kFileSystemScheme);
  }

  // Returns true if the scheme indicates a network connection that uses TLS or
  // some other cryptographic protocol (e.g. QUIC) for security.
  //
  // This function is a not a complete test of whether or not an origin's code
  // is minimally trustworthy. For that, see Chromium's |IsOriginSecure| for a
  // higher-level and more complete semantics. See that function's documentation
  // for more detail.
  bool SchemeIsCryptographic() const {
    return SchemeIs(url::kHttpsScheme) || SchemeIs(url::kWssScheme);
  }

  // Returns true if the scheme is "blob".
  bool SchemeIsBlob() const {
    return SchemeIs(url::kBlobScheme);
  }

  // The "content" of the URL is everything after the scheme (skipping the
  // scheme delimiting colon). It is an error to get the content of an invalid
  // URL: the result will be an empty string.
  std::string GetContent() const;

  // Returns true if the hostname is an IP address. Note: this function isn't
  // as cheap as a simple getter because it re-parses the hostname to verify.
  bool HostIsIPAddress() const;

  // Not including the colon. If you are comparing schemes, prefer SchemeIs.
  bool has_scheme() const {
    return parsed_.scheme.len >= 0;
  }
  std::string scheme() const {
    return ComponentString(parsed_.scheme);
  }
  base::StringPiece scheme_piece() const {
    return ComponentStringPiece(parsed_.scheme);
  }

  bool has_username() const {
    return parsed_.username.len >= 0;
  }
  std::string username() const {
    return ComponentString(parsed_.username);
  }
  base::StringPiece username_piece() const {
    return ComponentStringPiece(parsed_.username);
  }

  bool has_password() const {
    return parsed_.password.len >= 0;
  }
  std::string password() const {
    return ComponentString(parsed_.password);
  }
  base::StringPiece password_piece() const {
    return ComponentStringPiece(parsed_.password);
  }

  // The host may be a hostname, an IPv4 address, or an IPv6 literal surrounded
  // by square brackets, like "[2001:db8::1]". To exclude these brackets, use
  // HostNoBrackets() below.
  bool has_host() const {
    // Note that hosts are special, absence of host means length 0.
    return parsed_.host.len > 0;
  }
  std::string host() const {
    return ComponentString(parsed_.host);
  }
  base::StringPiece host_piece() const {
    return ComponentStringPiece(parsed_.host);
  }

  // The port if one is explicitly specified. Most callers will want IntPort()
  // or EffectiveIntPort() instead of these. The getters will not include the
  // ':'.
  bool has_port() const {
    return parsed_.port.len >= 0;
  }
  std::string port() const {
    return ComponentString(parsed_.port);
  }
  base::StringPiece port_piece() const {
    return ComponentStringPiece(parsed_.port);
  }

  // Including first slash following host, up to the query. The URL
  // "http://www.google.com/" has a path of "/".
  bool has_path() const {
    return parsed_.path.len >= 0;
  }
  std::string path() const {
    return ComponentString(parsed_.path);
  }
  base::StringPiece path_piece() const {
    return ComponentStringPiece(parsed_.path);
  }

  // Stuff following '?' up to the ref. The getters will not include the '?'.
  bool has_query() const {
    return parsed_.query.len >= 0;
  }
  std::string query() const {
    return ComponentString(parsed_.query);
  }
  base::StringPiece query_piece() const {
    return ComponentStringPiece(parsed_.query);
  }

  // Stuff following '#' to the end of the string. This will be UTF-8 encoded
  // (not necessarily ASCII). The getters will not include the '#'.
  bool has_ref() const {
    return parsed_.ref.len >= 0;
  }
  std::string ref() const {
    return ComponentString(parsed_.ref);
  }
  base::StringPiece ref_piece() const {
    return ComponentStringPiece(parsed_.ref);
  }

  // Returns a parsed version of the port. Can also be any of the special
  // values defined in Parsed for ExtractPort.
  int IntPort() const;

  // Returns the port number of the URL, or the default port number.
  // If the scheme has no concept of port (or unknown default) returns
  // PORT_UNSPECIFIED.
  int EffectiveIntPort() const;

  // Extracts the filename portion of the path and returns it. The filename
  // is everything after the last slash in the path. This may be empty.
  std::string ExtractFileName() const;

  // Returns the path that should be sent to the server. This is the path,
  // parameter, and query portions of the URL. It is guaranteed to be ASCII.
  std::string PathForRequest() const;

  // Returns the host, excluding the square brackets surrounding IPv6 address
  // literals. This can be useful for passing to getaddrinfo().
  std::string HostNoBrackets() const;

  // Returns true if this URL's host matches or is in the same domain as
  // the given input string. For example, if the hostname of the URL is
  // "www.google.com", this will return true for "com", "google.com", and
  // "www.google.com".
  //
  // The input domain should be lower-case ASCII to match the canonicalized
  // scheme. This call is more efficient than getting the host and check
  // whether host has the specific domain or not because no copies or
  // object constructions are done.
  bool DomainIs(base::StringPiece lower_ascii_domain) const;

  // Swaps the contents of this GURL object with |other|, without doing
  // any memory allocations.
  void Swap(GURL* other);

  // Returns a reference to a singleton empty GURL. This object is for callers
  // who return references but don't have anything to return in some cases.
  // If you just want an empty URL for normal use, prefer GURL(). This function
  // may be called from any thread.
  static const GURL& EmptyGURL();

  // Returns the inner URL of a nested URL (currently only non-null for
  // filesystem URLs).
  const GURL* inner_url() const {
    return inner_url_.get();
  }

 private:
  // Variant of the string parsing constructor that allows the caller to elect
  // retain trailing whitespace, if any, on the passed URL spec, but only if
  // the scheme is one that allows trailing whitespace. The primary use-case is
  // for data: URLs. In most cases, you want to use the single parameter
  // constructor above.
  enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE };
  GURL(const std::string& url_string, RetainWhiteSpaceSelector);

  template<typename STR>
  void InitCanonical(const STR& input_spec, bool trim_path_end);

  void InitializeFromCanonicalSpec();

  // Returns the substring of the input identified by the given component.
  std::string ComponentString(const url::Component& comp) const {
    if (comp.len <= 0)
      return std::string();
    return std::string(spec_, comp.begin, comp.len);
  }
  base::StringPiece ComponentStringPiece(const url::Component& comp) const {
    if (comp.len <= 0)
      return base::StringPiece();
    return base::StringPiece(&spec_[comp.begin], comp.len);
  }

  // The actual text of the URL, in canonical ASCII form.
  std::string spec_;

  // Set when the given URL is valid. Otherwise, we may still have a spec and
  // components, but they may not identify valid resources (for example, an
  // invalid port number, invalid characters in the scheme, etc.).
  bool is_valid_;

  // Identified components of the canonical spec.
  url::Parsed parsed_;

  // Used for nested schemes [currently only filesystem:].
  scoped_ptr<GURL> inner_url_;
};

// Stream operator so GURL can be used in assertion statements.
URL_EXPORT std::ostream& operator<<(std::ostream& out, const GURL& url);

#endif  // URL_GURL_H_