summaryrefslogtreecommitdiffstats
path: root/googleurl
diff options
context:
space:
mode:
Diffstat (limited to 'googleurl')
-rw-r--r--googleurl/src/gurl.cc3
-rw-r--r--googleurl/src/gurl.h61
-rw-r--r--googleurl/src/gurl_unittest.cc59
-rw-r--r--googleurl/src/url_canon.h435
-rw-r--r--googleurl/src/url_canon_etc.cc5
-rw-r--r--googleurl/src/url_canon_icu.h10
-rw-r--r--googleurl/src/url_canon_ip.cc13
-rw-r--r--googleurl/src/url_canon_ip.h43
-rw-r--r--googleurl/src/url_canon_path.cc2
-rw-r--r--googleurl/src/url_canon_relative.cc7
-rw-r--r--googleurl/src/url_canon_stdstring.h11
-rw-r--r--googleurl/src/url_canon_stdurl.cc9
-rw-r--r--googleurl/src/url_canon_unittest.cc25
-rw-r--r--googleurl/src/url_common.h48
-rw-r--r--googleurl/src/url_parse.cc98
-rw-r--r--googleurl/src/url_parse.h84
-rw-r--r--googleurl/src/url_util.cc232
-rw-r--r--googleurl/src/url_util.h186
-rw-r--r--googleurl/src/url_util_unittest.cc120
19 files changed, 908 insertions, 543 deletions
diff --git a/googleurl/src/gurl.cc b/googleurl/src/gurl.cc
index 2dab0b2..a0bfd26 100644
--- a/googleurl/src/gurl.cc
+++ b/googleurl/src/gurl.cc
@@ -304,8 +304,7 @@ GURL GURL::GetWithEmptyPath() const {
}
bool GURL::IsStandard() const {
- return url_util::IsStandard(spec_.data(), static_cast<int>(spec_.length()),
- parsed_.scheme);
+ return url_util::IsStandard(spec_.data(), parsed_.scheme);
}
bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
diff --git a/googleurl/src/gurl.h b/googleurl/src/gurl.h
index 36cd14c..29fea81 100644
--- a/googleurl/src/gurl.h
+++ b/googleurl/src/gurl.h
@@ -36,6 +36,7 @@
#include "base/string16.h"
#include "googleurl/src/url_canon.h"
#include "googleurl/src/url_canon_stdstring.h"
+#include "googleurl/src/url_common.h"
#include "googleurl/src/url_parse.h"
class GURL {
@@ -44,11 +45,11 @@ class GURL {
typedef url_canon::StdStringReplacements<string16> ReplacementsW;
// Creates an empty, invalid URL.
- GURL();
+ GURL_API GURL();
// Copy construction is relatively inexpensive, with most of the time going
// to reallocating the string. It does not re-parse.
- GURL(const GURL& other);
+ GURL_API GURL(const GURL& other);
// The narrow version requires the input be UTF-8. Invalid UTF-8 input will
// result in an invalid URL.
@@ -57,14 +58,16 @@ class GURL {
// encode the query parameters. It is probably sufficient for the narrow
// version to assume the query parameter encoding should be the same as the
// input encoding.
- explicit GURL(const std::string& url_string /*, output_param_encoding*/);
- explicit GURL(const string16& url_string /*, output_param_encoding*/);
+ GURL_API explicit GURL(const std::string& url_string
+ /*, output_param_encoding*/);
+ GURL_API explicit GURL(const string16& url_string
+ /*, output_param_encoding*/);
// Constructor for URLs that have already been parsed and canonicalized. This
// is used for conversions from KURL, for example. The caller must supply all
// information associated with the URL, which must be correct and consistent.
- GURL(const char* canonical_spec, size_t canonical_spec_len,
- const url_parse::Parsed& parsed, bool is_valid);
+ GURL_API GURL(const char* canonical_spec, size_t canonical_spec_len,
+ const url_parse::Parsed& parsed, bool is_valid);
// Returns true when this object represents a valid parsed URL. When not
// valid, other functions will still succeed, but you will not get canonical
@@ -96,7 +99,7 @@ class GURL {
// Use invalid_spec() below to get the unusable spec of an invalid URL. This
// separation is designed to prevent errors that may cause security problems
// that could result from the mistaken use of an invalid URL.
- const std::string& spec() const;
+ GURL_API const std::string& spec() const;
// Returns the potentially invalid spec for the URL. This spec MUST NOT be
// modified or sent over the network. It is designed to be displayed in error
@@ -148,8 +151,8 @@ class GURL {
//
// It is an error to resolve a URL relative to an invalid URL. The result
// will be the empty URL.
- GURL Resolve(const std::string& relative) const;
- GURL Resolve(const string16& relative) const;
+ GURL_API GURL Resolve(const std::string& relative) const;
+ GURL_API GURL Resolve(const string16& relative) const;
// Like Resolve() above but takes a character set encoder which will be used
// for any query text specified in the input. The charset converter parameter
@@ -158,10 +161,10 @@ class GURL {
// TODO(brettw): These should be replaced with versions that take something
// more friendly than a raw CharsetConverter (maybe like an ICU character set
// name).
- GURL ResolveWithCharsetConverter(
+ GURL_API GURL ResolveWithCharsetConverter(
const std::string& relative,
url_canon::CharsetConverter* charset_converter) const;
- GURL ResolveWithCharsetConverter(
+ GURL_API GURL ResolveWithCharsetConverter(
const string16& relative,
url_canon::CharsetConverter* charset_converter) const;
@@ -176,9 +179,9 @@ class GURL {
//
// Note that we use the more general url_canon::Replacements type to give
// callers extra flexibility rather than our override.
- GURL ReplaceComponents(
+ GURL_API GURL ReplaceComponents(
const url_canon::Replacements<char>& replacements) const;
- GURL ReplaceComponents(
+ GURL_API GURL ReplaceComponents(
const url_canon::Replacements<char16>& replacements) const;
// A helper function that is equivalent to replacing the path with a slash
@@ -190,7 +193,7 @@ class GURL {
//
// It is an error to get an empty path on an invalid URL. The result
// will be the empty URL.
- GURL GetWithEmptyPath() const;
+ GURL_API GURL GetWithEmptyPath() const;
// A helper function to return a GURL containing just the scheme, host,
// and port from a URL. Equivalent to clearing any username and password,
@@ -201,19 +204,19 @@ class GURL {
//
// It is an error to get the origin of an invalid URL. The result
// will be the empty URL.
- GURL GetOrigin() const;
+ GURL_API GURL GetOrigin() const;
// Returns true if the scheme for the current URL is a known "standard"
- // scheme or there is a "://" after it. Standard schemes have an authority
- // and a path section. This includes file:, which some callers may want to
- // filter out explicitly by calling SchemeIsFile.
- bool IsStandard() const;
+ // scheme. Standard schemes have an authority and a path section. This
+ // includes file:, which some callers may want to filter out explicitly by
+ // calling SchemeIsFile.
+ GURL_API bool IsStandard() const;
// Returns true if the given parameter (should be lower-case ASCII to match
// the canonicalized scheme) is the scheme for this URL. This call is more
// efficient than getting the scheme and comparing it because no copies or
// object constructions are done.
- bool SchemeIs(const char* lower_ascii_scheme) const;
+ GURL_API bool SchemeIs(const char* lower_ascii_scheme) const;
// We often need to know if this is a file URL. File URLs are "standard", but
// are often treated separately by some programs.
@@ -229,7 +232,7 @@ class GURL {
// Returns true if the hostname is an IP address. Note: this function isn't
// as cheap as a simple getter because it re-parses the hostname to verify.
// This currently identifies only IPv4 addresses (bug 822685).
- bool HostIsIPAddress() const;
+ GURL_API bool HostIsIPAddress() const;
// Getters for various components of the URL. The returned string will be
// empty if the component is empty or is not present.
@@ -295,24 +298,24 @@ class GURL {
// Returns a parsed version of the port. Can also be any of the special
// values defined in Parsed for ExtractPort.
- int IntPort() const;
+ GURL_API int IntPort() const;
// Returns the port number of the url, or the default port number.
// If the scheme has no concept of port (or unknown default) returns
// PORT_UNSPECIFIED.
- int EffectiveIntPort() const;
+ GURL_API int EffectiveIntPort() const;
// Extracts the filename portion of the path and returns it. The filename
// is everything after the last slash in the path. This may be empty.
- std::string ExtractFileName() const;
+ GURL_API std::string ExtractFileName() const;
// Returns the path that should be sent to the server. This is the path,
// parameter, and query portions of the URL. It is guaranteed to be ASCII.
- std::string PathForRequest() const;
+ GURL_API std::string PathForRequest() const;
// Returns the host, excluding the square brackets surrounding IPv6 address
// literals. This can be useful for passing to getaddrinfo().
- std::string HostNoBrackets() const;
+ GURL_API std::string HostNoBrackets() const;
// Returns true if this URL's host matches or is in the same domain as
// the given input string. For example if this URL was "www.google.com",
@@ -324,7 +327,7 @@ class GURL {
//
// When DomainIs is called with the domain_len parameter, lower_ascii_domain
// is not guaranteed to be terminated with a NULL character.
- bool DomainIs(const char* lower_ascii_domain, int domain_len) const;
+ GURL_API bool DomainIs(const char* lower_ascii_domain, int domain_len) const;
// When DomainIs is called with only the lower_ascii_domain parameter, the
// domain string must be terminated with a NULL character.
@@ -335,12 +338,12 @@ class GURL {
// Swaps the contents of this GURL object with the argument without doing
// any memory allocations.
- void Swap(GURL* other);
+ GURL_API void Swap(GURL* other);
// Returns a reference to a singleton empty GURL. This object is for callers
// who return references but don't have anything to return in some cases.
// This function may be called from any thread.
- static const GURL& EmptyGURL();
+ GURL_API static const GURL& EmptyGURL();
private:
// Returns the substring of the input identified by the given component.
diff --git a/googleurl/src/gurl_unittest.cc b/googleurl/src/gurl_unittest.cc
index 4e81de6..079e1ea 100644
--- a/googleurl/src/gurl_unittest.cc
+++ b/googleurl/src/gurl_unittest.cc
@@ -31,35 +31,36 @@ void SetupReplacement(void (url_canon::Replacements<CHAR>::*func)(const CHAR*,
}
}
+// Returns the canonicalized string for the given URL string for the
+// GURLTest.Types test.
+std::string TypesTestCase(const char* src) {
+ GURL gurl(src);
+ return gurl.possibly_invalid_spec();
+}
+
} // namespace
// Different types of URLs should be handled differently by url_util, and
// handed off to different canonicalizers.
TEST(GURLTest, Types) {
- struct TypeTest {
- const char* src;
- const char* expected;
- } type_cases[] = {
- // URLs with "://" should be treated as standard and have a hostname, even
- // when the scheme is unknown.
- {"something:///HOSTNAME.com/", "something://hostname.com/"},
- // In the reverse, lacking a "://" means a path URL so no canonicalization
- // should happen.
- {"something:HOSTNAME.com/", "something:HOSTNAME.com/"},
- {"something:/HOSTNAME.com/", "something:/HOSTNAME.com/"},
+ // URLs with unknown schemes should be treated as path URLs, even when they
+ // have things like "://".
+ EXPECT_EQ("something:///HOSTNAME.com/",
+ TypesTestCase("something:///HOSTNAME.com/"));
+
+ // In the reverse, known schemes should always trigger standard URL handling.
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http:HOSTNAME.com"));
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http:/HOSTNAME.com"));
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http://HOSTNAME.com"));
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http:///HOSTNAME.com"));
+
#ifdef WIN32
- // URLs that look like absolute Windows drive specs.
- {"c:\\foo.txt", "file:///C:/foo.txt"},
- {"Z|foo.txt", "file:///Z:/foo.txt"},
- {"\\\\server\\foo.txt", "file://server/foo.txt"},
- {"//server/foo.txt", "file://server/foo.txt"},
+ // URLs that look like absolute Windows drive specs.
+ EXPECT_EQ("file:///C:/foo.txt", TypesTestCase("c:\\foo.txt"));
+ EXPECT_EQ("file:///Z:/foo.txt", TypesTestCase("Z|foo.txt"));
+ EXPECT_EQ("file://server/foo.txt", TypesTestCase("\\\\server\\foo.txt"));
+ EXPECT_EQ("file://server/foo.txt", TypesTestCase("//server/foo.txt"));
#endif
- };
-
- for (size_t i = 0; i < ARRAYSIZE(type_cases); i++) {
- GURL gurl(type_cases[i].src);
- EXPECT_STREQ(type_cases[i].expected, gurl.spec().c_str());
- }
}
// Test the basic creation and querying of components in a GURL. We assume
@@ -166,9 +167,7 @@ TEST(GURLTest, Resolve) {
{"http://www.google.com/blah/bloo?c#d", "../../../hello/./world.html?a#b", true, "http://www.google.com/hello/world.html?a#b"},
{"http://www.google.com/foo#bar", "#com", true, "http://www.google.com/foo#com"},
{"http://www.google.com/", "Https:images.google.com", true, "https://images.google.com/"},
- // Unknown schemes with a "://" should be treated as standard.
- {"somescheme://foo/", "bar", true, "somescheme://foo/bar"},
- // Unknown schemes with no "://" are not standard.
+ // Unknown schemes are not standard.
{"data:blahblah", "http://google.com/", true, "http://google.com/"},
{"data:blahblah", "http:google.com", true, "http://google.com/"},
{"data:/blahblah", "file.html", false, ""},
@@ -178,15 +177,15 @@ TEST(GURLTest, Resolve) {
// 8-bit code path.
GURL input(resolve_cases[i].base);
GURL output = input.Resolve(resolve_cases[i].relative);
- EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid());
- EXPECT_EQ(resolve_cases[i].expected, output.spec());
+ EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid()) << i;
+ EXPECT_EQ(resolve_cases[i].expected, output.spec()) << i;
// Wide code path.
GURL inputw(ConvertUTF8ToUTF16(resolve_cases[i].base));
GURL outputw =
input.Resolve(ConvertUTF8ToUTF16(resolve_cases[i].relative));
- EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid());
- EXPECT_EQ(resolve_cases[i].expected, outputw.spec());
+ EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid()) << i;
+ EXPECT_EQ(resolve_cases[i].expected, outputw.spec()) << i;
}
}
@@ -429,5 +428,5 @@ TEST(GURLTest, IsStandard) {
EXPECT_FALSE(b.IsStandard());
GURL c("foo://bar/baz");
- EXPECT_TRUE(c.IsStandard());
+ EXPECT_FALSE(c.IsStandard());
}
diff --git a/googleurl/src/url_canon.h b/googleurl/src/url_canon.h
index 143574d..e2cfb55 100644
--- a/googleurl/src/url_canon.h
+++ b/googleurl/src/url_canon.h
@@ -33,6 +33,7 @@
#include <stdlib.h>
#include "base/string16.h"
+#include "googleurl/src/url_common.h"
#include "googleurl/src/url_parse.h"
namespace url_canon {
@@ -248,12 +249,12 @@ class CharsetConverter {
//
// Therefore, callers should not use the buffer, since it may actually be empty,
// use the computed pointer and |*output_len| instead.
-const char* RemoveURLWhitespace(const char* input, int input_len,
- CanonOutputT<char>* buffer,
- int* output_len);
-const char16* RemoveURLWhitespace(const char16* input, int input_len,
- CanonOutputT<char16>* buffer,
- int* output_len);
+GURL_API const char* RemoveURLWhitespace(const char* input, int input_len,
+ CanonOutputT<char>* buffer,
+ int* output_len);
+GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len,
+ CanonOutputT<char16>* buffer,
+ int* output_len);
// IDN ------------------------------------------------------------------------
@@ -266,7 +267,7 @@ const char16* RemoveURLWhitespace(const char16* input, int input_len,
// the length of the output will be set to the length of the new host name.
//
// On error, returns false. The output in this case is undefined.
-bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output);
+GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output);
// Piece-by-piece canonicalizers ----------------------------------------------
//
@@ -292,14 +293,14 @@ bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output);
// URLs.
//
// The 8-bit version requires UTF-8 encoding.
-bool CanonicalizeScheme(const char* spec,
- const url_parse::Component& scheme,
- CanonOutput* output,
- url_parse::Component* out_scheme);
-bool CanonicalizeScheme(const char16* spec,
- const url_parse::Component& scheme,
- CanonOutput* output,
- url_parse::Component* out_scheme);
+GURL_API bool CanonicalizeScheme(const char* spec,
+ const url_parse::Component& scheme,
+ CanonOutput* output,
+ url_parse::Component* out_scheme);
+GURL_API bool CanonicalizeScheme(const char16* spec,
+ const url_parse::Component& scheme,
+ CanonOutput* output,
+ url_parse::Component* out_scheme);
// User info: username/password. If present, this will add the delimiters so
// the output will be "<username>:<password>@" or "<username>@". Empty
@@ -311,20 +312,20 @@ bool CanonicalizeScheme(const char16* spec,
// is legal as long as the two components don't overlap.
//
// The 8-bit version requires UTF-8 encoding.
-bool CanonicalizeUserInfo(const char* username_source,
- const url_parse::Component& username,
- const char* password_source,
- const url_parse::Component& password,
- CanonOutput* output,
- url_parse::Component* out_username,
- url_parse::Component* out_password);
-bool CanonicalizeUserInfo(const char16* username_source,
- const url_parse::Component& username,
- const char16* password_source,
- const url_parse::Component& password,
- CanonOutput* output,
- url_parse::Component* out_username,
- url_parse::Component* out_password);
+GURL_API bool CanonicalizeUserInfo(const char* username_source,
+ const url_parse::Component& username,
+ const char* password_source,
+ const url_parse::Component& password,
+ CanonOutput* output,
+ url_parse::Component* out_username,
+ url_parse::Component* out_password);
+GURL_API bool CanonicalizeUserInfo(const char16* username_source,
+ const url_parse::Component& username,
+ const char16* password_source,
+ const url_parse::Component& password,
+ CanonOutput* output,
+ url_parse::Component* out_username,
+ url_parse::Component* out_password);
// This structure holds detailed state exported from the IP/Host canonicalizers.
@@ -366,27 +367,27 @@ struct CanonHostInfo {
//
// The 8-bit version requires UTF-8 encoding. Use this version when you only
// need to know whether canonicalization succeeded.
-bool CanonicalizeHost(const char* spec,
- const url_parse::Component& host,
- CanonOutput* output,
- url_parse::Component* out_host);
-bool CanonicalizeHost(const char16* spec,
- const url_parse::Component& host,
- CanonOutput* output,
- url_parse::Component* out_host);
+GURL_API bool CanonicalizeHost(const char* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ url_parse::Component* out_host);
+GURL_API bool CanonicalizeHost(const char16* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ url_parse::Component* out_host);
// Extended version of CanonicalizeHost, which returns additional information.
// Use this when you need to know whether the hostname was an IP address.
// A successful return is indicated by host_info->family != BROKEN. See the
// definition of CanonHostInfo above for details.
-void CanonicalizeHostVerbose(const char* spec,
- const url_parse::Component& host,
- CanonOutput* output,
- CanonHostInfo* host_info);
-void CanonicalizeHostVerbose(const char16* spec,
- const url_parse::Component& host,
- CanonOutput* output,
- CanonHostInfo* host_info);
+GURL_API void CanonicalizeHostVerbose(const char* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
+GURL_API void CanonicalizeHostVerbose(const char16* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
// IP addresses.
@@ -399,34 +400,34 @@ void CanonicalizeHostVerbose(const char16* spec,
// This is called AUTOMATICALLY from the host canonicalizer, which ensures that
// the input is unescaped and name-prepped, etc. It should not normally be
// necessary or wise to call this directly.
-void CanonicalizeIPAddress(const char* spec,
- const url_parse::Component& host,
- CanonOutput* output,
- CanonHostInfo* host_info);
-void CanonicalizeIPAddress(const char16* spec,
- const url_parse::Component& host,
- CanonOutput* output,
- CanonHostInfo* host_info);
+GURL_API void CanonicalizeIPAddress(const char* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
+GURL_API void CanonicalizeIPAddress(const char16* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
// Port: this function will add the colon for the port if a port is present.
// The caller can pass url_parse::PORT_UNSPECIFIED as the
// default_port_for_scheme argument if there is no default port.
//
// The 8-bit version requires UTF-8 encoding.
-bool CanonicalizePort(const char* spec,
- const url_parse::Component& port,
- int default_port_for_scheme,
- CanonOutput* output,
- url_parse::Component* out_port);
-bool CanonicalizePort(const char16* spec,
- const url_parse::Component& port,
- int default_port_for_scheme,
- CanonOutput* output,
- url_parse::Component* out_port);
+GURL_API bool CanonicalizePort(const char* spec,
+ const url_parse::Component& port,
+ int default_port_for_scheme,
+ CanonOutput* output,
+ url_parse::Component* out_port);
+GURL_API bool CanonicalizePort(const char16* spec,
+ const url_parse::Component& port,
+ int default_port_for_scheme,
+ CanonOutput* output,
+ url_parse::Component* out_port);
// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
// if the scheme is unknown.
-int DefaultPortForScheme(const char* scheme, int scheme_len);
+GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len);
// Path. If the input does not begin in a slash (including if the input is
// empty), we'll prepend a slash to the path to make it canonical.
@@ -437,14 +438,14 @@ int DefaultPortForScheme(const char* scheme, int scheme_len);
// an issue. Somebody giving us an 8-bit path is responsible for generating
// the path that the server expects (we'll escape high-bit characters), so
// if something is invalid, it's their problem.
-bool CanonicalizePath(const char* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path);
-bool CanonicalizePath(const char16* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path);
+GURL_API bool CanonicalizePath(const char* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path);
+GURL_API bool CanonicalizePath(const char16* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path);
// Canonicalizes the input as a file path. This is like CanonicalizePath except
// that it also handles Windows drive specs. For example, the path can begin
@@ -452,14 +453,14 @@ bool CanonicalizePath(const char16* spec,
// The string will be appended to |*output| and |*out_path| will be updated.
//
// The 8-bit version requires UTF-8 encoding.
-bool FileCanonicalizePath(const char* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path);
-bool FileCanonicalizePath(const char16* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path);
+GURL_API bool FileCanonicalizePath(const char* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path);
+GURL_API bool FileCanonicalizePath(const char16* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path);
// Query: Prepends the ? if needed.
//
@@ -473,16 +474,16 @@ bool FileCanonicalizePath(const char16* spec,
// if necessary, for ASCII input, no conversions are necessary.
//
// The converter can be NULL. In this case, the output encoding will be UTF-8.
-void CanonicalizeQuery(const char* spec,
- const url_parse::Component& query,
- CharsetConverter* converter,
- CanonOutput* output,
- url_parse::Component* out_query);
-void CanonicalizeQuery(const char16* spec,
- const url_parse::Component& query,
- CharsetConverter* converter,
- CanonOutput* output,
- url_parse::Component* out_query);
+GURL_API void CanonicalizeQuery(const char* spec,
+ const url_parse::Component& query,
+ CharsetConverter* converter,
+ CanonOutput* output,
+ url_parse::Component* out_query);
+GURL_API void CanonicalizeQuery(const char16* spec,
+ const url_parse::Component& query,
+ CharsetConverter* converter,
+ CanonOutput* output,
+ url_parse::Component* out_query);
// Ref: Prepends the # if needed. The output will be UTF-8 (this is the only
// canonicalizer that does not produce ASCII output). The output is
@@ -490,14 +491,14 @@ void CanonicalizeQuery(const char16* spec,
//
// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use
// the "Unicode replacement character" for the confusing bits and copy the rest.
-void CanonicalizeRef(const char* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path);
-void CanonicalizeRef(const char16* spec,
- const url_parse::Component& path,
- CanonOutput* output,
- url_parse::Component* out_path);
+GURL_API void CanonicalizeRef(const char* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path);
+GURL_API void CanonicalizeRef(const char16* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path);
// Full canonicalizer ---------------------------------------------------------
//
@@ -510,61 +511,61 @@ void CanonicalizeRef(const char16* spec,
// The 8-bit versions require UTF-8 encoding.
// Use for standard URLs with authorities and paths.
-bool CanonicalizeStandardURL(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-bool CanonicalizeStandardURL(const char16* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeStandardURL(const char* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeStandardURL(const char16* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
// Use for file URLs.
-bool CanonicalizeFileURL(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-bool CanonicalizeFileURL(const char16* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeFileURL(const char* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeFileURL(const char16* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
// Use for path URLs such as javascript. This does not modify the path in any
// way, for example, by escaping it.
-bool CanonicalizePathURL(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-bool CanonicalizePathURL(const char16* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizePathURL(const char* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizePathURL(const char16* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
// Use for mailto URLs. This "canonicalizes" the url into a path and query
// component. It does not attempt to merge "to" fields. It uses UTF-8 for
// the query encoding if there is a query. This is because a mailto URL is
// really intended for an external mail program, and the encoding of a page,
// etc. which would influence a query encoding normally are irrelevant.
-bool CanonicalizeMailtoURL(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-bool CanonicalizeMailtoURL(const char16* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeMailtoURL(const char* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeMailtoURL(const char16* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
// Part replacer --------------------------------------------------------------
@@ -585,7 +586,7 @@ template<typename CHAR>
struct URLComponentSource {
// Constructor normally used by callers wishing to replace components. This
// will make them all NULL, which is no replacement. The caller would then
- // override the compoents they want to replace.
+ // override the components they want to replace.
URLComponentSource()
: scheme(NULL),
username(NULL),
@@ -749,59 +750,59 @@ class Replacements {
};
// The base must be an 8-bit canonical URL.
-bool ReplaceStandardURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char>& replacements,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-bool ReplaceStandardURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char16>& replacements,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceStandardURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceStandardURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char16>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
// Replacing some parts of a file URL is not permitted. Everything except
// the host, path, query, and ref will be ignored.
-bool ReplaceFileURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char>& replacements,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-bool ReplaceFileURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char16>& replacements,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceFileURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceFileURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char16>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
// Path URLs can only have the scheme and path replaced. All other components
// will be ignored.
-bool ReplacePathURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char>& replacements,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-bool ReplacePathURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char16>& replacements,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+GURL_API bool ReplacePathURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool ReplacePathURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char16>& replacements,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
// Mailto URLs can only have the scheme, path, and query replaced.
// All other components will be ignored.
-bool ReplaceMailtoURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char>& replacements,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
-bool ReplaceMailtoURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const Replacements<char16>& replacements,
- CanonOutput* output,
- url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceMailtoURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceMailtoURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char16>& replacements,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
// Relative URL ---------------------------------------------------------------
@@ -816,20 +817,20 @@ bool ReplaceMailtoURL(const char* base,
// not). Failure means that the combination of URLs doesn't make any sense.
//
// The base URL should always be canonical, therefore is ASCII.
-bool IsRelativeURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const char* fragment,
- int fragment_len,
- bool is_base_hierarchical,
- bool* is_relative,
- url_parse::Component* relative_component);
-bool IsRelativeURL(const char* base,
- const url_parse::Parsed& base_parsed,
- const char16* fragment,
- int fragment_len,
- bool is_base_hierarchical,
- bool* is_relative,
- url_parse::Component* relative_component);
+GURL_API bool IsRelativeURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const char* fragment,
+ int fragment_len,
+ bool is_base_hierarchical,
+ bool* is_relative,
+ url_parse::Component* relative_component);
+GURL_API bool IsRelativeURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const char16* fragment,
+ int fragment_len,
+ bool is_base_hierarchical,
+ bool* is_relative,
+ url_parse::Component* relative_component);
// Given a canonical parsed source URL, a URL fragment known to be relative,
// and the identified relevant portion of the relative URL (computed by
@@ -849,22 +850,22 @@ bool IsRelativeURL(const char* base,
// Returns true on success. On failure, the output will be "something
// reasonable" that will be consistent and valid, just probably not what
// was intended by the web page author or caller.
-bool ResolveRelativeURL(const char* base_url,
- const url_parse::Parsed& base_parsed,
- bool base_is_file,
- const char* relative_url,
- const url_parse::Component& relative_component,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* out_parsed);
-bool ResolveRelativeURL(const char* base_url,
- const url_parse::Parsed& base_parsed,
- bool base_is_file,
- const char16* relative_url,
- const url_parse::Component& relative_component,
- CharsetConverter* query_converter,
- CanonOutput* output,
- url_parse::Parsed* out_parsed);
+GURL_API bool ResolveRelativeURL(const char* base_url,
+ const url_parse::Parsed& base_parsed,
+ bool base_is_file,
+ const char* relative_url,
+ const url_parse::Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* out_parsed);
+GURL_API bool ResolveRelativeURL(const char* base_url,
+ const url_parse::Parsed& base_parsed,
+ bool base_is_file,
+ const char16* relative_url,
+ const url_parse::Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* out_parsed);
} // namespace url_canon
diff --git a/googleurl/src/url_canon_etc.cc b/googleurl/src/url_canon_etc.cc
index 672b187..aea181a 100644
--- a/googleurl/src/url_canon_etc.cc
+++ b/googleurl/src/url_canon_etc.cc
@@ -120,6 +120,11 @@ bool DoScheme(const CHAR* spec,
// The output scheme starts from the current position.
out_scheme->begin = output->length();
+ // Danger: it's important that this code does not strip any characters: it
+ // only emits the canonical version (be it valid or escaped) of each of
+ // the input characters. Stripping would put it out of sync with
+ // url_util::FindAndCompareScheme, which could cause some security checks on
+ // schemes to be incorrect.
bool success = true;
int end = scheme.end();
for (int i = scheme.begin; i < end; i++) {
diff --git a/googleurl/src/url_canon_icu.h b/googleurl/src/url_canon_icu.h
index 3980663..6bc52c3 100644
--- a/googleurl/src/url_canon_icu.h
+++ b/googleurl/src/url_canon_icu.h
@@ -45,13 +45,13 @@ class ICUCharsetConverter : public CharsetConverter {
// Constructs a converter using an already-existing ICU character set
// converter. This converter is NOT owned by this object; the lifetime must
// be managed by the creator such that it is alive as long as this is.
- ICUCharsetConverter(UConverter* converter);
+ GURL_API ICUCharsetConverter(UConverter* converter);
- virtual ~ICUCharsetConverter() {}
+ GURL_API virtual ~ICUCharsetConverter() {}
- virtual void ConvertFromUTF16(const char16* input,
- int input_len,
- CanonOutput* output);
+ GURL_API virtual void ConvertFromUTF16(const char16* input,
+ int input_len,
+ CanonOutput* output);
private:
// The ICU converter, not owned by this class.
diff --git a/googleurl/src/url_canon_ip.cc b/googleurl/src/url_canon_ip.cc
index d84ff7d..86f7c9c 100644
--- a/googleurl/src/url_canon_ip.cc
+++ b/googleurl/src/url_canon_ip.cc
@@ -58,11 +58,14 @@ template<typename CHAR, typename UCHAR>
bool DoFindIPv4Components(const CHAR* spec,
const url_parse::Component& host,
url_parse::Component components[4]) {
+ if (!host.is_nonempty())
+ return false;
+
int cur_component = 0; // Index of the component we're working on.
int cur_component_begin = host.begin; // Start of the current component.
int end = host.end();
for (int i = host.begin; /* nothing */; i++) {
- if (i == end || spec[i] == '.') {
+ if (i >= end || spec[i] == '.') {
// Found the end of the current component.
int component_len = i - cur_component_begin;
components[cur_component] =
@@ -76,10 +79,10 @@ bool DoFindIPv4Components(const CHAR* spec,
// allow an empty component at the end (this would indicate that the
// input ends in a dot). We also want to error if the component is
// empty and it's the only component (cur_component == 1).
- if (component_len == 0 && (i != end || cur_component == 1))
+ if (component_len == 0 && (i < end || cur_component == 1))
return false;
- if (i == end)
+ if (i >= end)
break; // End of the input.
if (cur_component == 4) {
@@ -537,8 +540,8 @@ bool DoIPv6AddressToNumber(const CHAR* spec,
if (ipv6_parsed.ipv4_component.is_valid()) {
// We only allow the embedded IPv4 syntax to be used for "compat" and
// "mapped" formats:
- // "compat" ==> 0:0:0:0:0:ffff:<IPv4-literal>
- // "mapped" ==> 0:0:0:0:0:0000:<IPv4-literal>
+ // "mapped" ==> 0:0:0:0:0:ffff:<IPv4-literal>
+ // "compat" ==> 0:0:0:0:0:0000:<IPv4-literal>
for (int j = 0; j < 10; ++j) {
if (address[j] != 0)
return false;
diff --git a/googleurl/src/url_canon_ip.h b/googleurl/src/url_canon_ip.h
index 6ce069d..0a01c9f 100644
--- a/googleurl/src/url_canon_ip.h
+++ b/googleurl/src/url_canon_ip.h
@@ -32,6 +32,7 @@
#include "base/string16.h"
#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_common.h"
#include "googleurl/src/url_parse.h"
namespace url_canon {
@@ -54,12 +55,12 @@ namespace url_canon {
// Mozilla), so this code path never gets hit. Our host canonicalization will
// notice these spaces and escape them, which will make IP address finding
// fail. This seems like better behavior than stripping after a space.
-bool FindIPv4Components(const char* spec,
- const url_parse::Component& host,
- url_parse::Component components[4]);
-bool FindIPv4Components(const char16* spec,
- const url_parse::Component& host,
- url_parse::Component components[4]);
+GURL_API bool FindIPv4Components(const char* spec,
+ const url_parse::Component& host,
+ url_parse::Component components[4]);
+GURL_API bool FindIPv4Components(const char16* spec,
+ const url_parse::Component& host,
+ url_parse::Component components[4]);
// Converts an IPv4 address to a 32-bit number (network byte order).
//
@@ -72,26 +73,28 @@ bool FindIPv4Components(const char16* spec,
//
// On success, |num_ipv4_components| will be populated with the number of
// components in the IPv4 address.
-CanonHostInfo::Family IPv4AddressToNumber(const char* spec,
- const url_parse::Component& host,
- unsigned char address[4],
- int* num_ipv4_components);
-CanonHostInfo::Family IPv4AddressToNumber(const char16* spec,
- const url_parse::Component& host,
- unsigned char address[4],
- int* num_ipv4_components);
+GURL_API CanonHostInfo::Family IPv4AddressToNumber(
+ const char* spec,
+ const url_parse::Component& host,
+ unsigned char address[4],
+ int* num_ipv4_components);
+GURL_API CanonHostInfo::Family IPv4AddressToNumber(
+ const char16* spec,
+ const url_parse::Component& host,
+ unsigned char address[4],
+ int* num_ipv4_components);
// Converts an IPv6 address to a 128-bit number (network byte order), returning
// true on success. False means that the input was not a valid IPv6 address.
//
// NOTE that |host| is expected to be surrounded by square brackets.
// i.e. "[::1]" rather than "::1".
-bool IPv6AddressToNumber(const char* spec,
- const url_parse::Component& host,
- unsigned char address[16]);
-bool IPv6AddressToNumber(const char16* spec,
- const url_parse::Component& host,
- unsigned char address[16]);
+GURL_API bool IPv6AddressToNumber(const char* spec,
+ const url_parse::Component& host,
+ unsigned char address[16]);
+GURL_API bool IPv6AddressToNumber(const char16* spec,
+ const url_parse::Component& host,
+ unsigned char address[16]);
} // namespace url_canon
diff --git a/googleurl/src/url_canon_path.cc b/googleurl/src/url_canon_path.cc
index 98ca40b..df97aad 100644
--- a/googleurl/src/url_canon_path.cc
+++ b/googleurl/src/url_canon_path.cc
@@ -84,7 +84,7 @@ const unsigned char kPathCharLookup[0x100] = {
// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, PASS, ESCAPE, PASS, ESCAPE, ESCAPE,
// @ A B C D E F G H I J K L M N O
- UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
+ PASS, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
// P Q R S T U V W X Y Z [ \ ] ^ _
UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, ESCAPE, PASS, ESCAPE, UNESCAPE,
// ` a b c d e f g h i j k l m n o
diff --git a/googleurl/src/url_canon_relative.cc b/googleurl/src/url_canon_relative.cc
index 446b951..6bcc72f 100644
--- a/googleurl/src/url_canon_relative.cc
+++ b/googleurl/src/url_canon_relative.cc
@@ -457,10 +457,11 @@ bool DoResolveRelativeURL(const char* base_url,
}
if (relative_component.len <= 0) {
- // Empty relative URL, make no changes.
+ // Empty relative URL, leave unchanged, only removing the ref component.
int base_len = base_parsed.Length();
- for (int i = 0; i < base_len; i++)
- output->push_back(base_url[i]);
+ base_len -= base_parsed.ref.len + 1;
+ out_parsed->ref.reset();
+ output->Append(base_url, base_len);
return true;
}
diff --git a/googleurl/src/url_canon_stdstring.h b/googleurl/src/url_canon_stdstring.h
index 2241eb1..c43b777 100644
--- a/googleurl/src/url_canon_stdstring.h
+++ b/googleurl/src/url_canon_stdstring.h
@@ -31,15 +31,15 @@
// strings. Because the canonicalizer tries not to be dependent on the STL,
// we have segregated it here.
-#ifndef GOOGLEURL_SRC_URL_CANON_STRING_H__
-#define GOOGLEURL_SRC_URL_CANON_STRING_H__
+#ifndef GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
+#define GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
#include <string>
#include "googleurl/src/url_canon.h"
namespace url_canon {
-// Write into a std::string given in the constructor. This object odes not own
+// Write into a std::string given in the constructor. This object does not own
// the string itself, and the user must ensure that the string stays alive
// throughout the lifetime of this object.
//
@@ -82,7 +82,7 @@ class StdStringCanonOutput : public CanonOutput {
}
protected:
- std::string* str_;
+ std::string* str_;
};
// An extension of the Replacements class that allows the setters to use
@@ -130,4 +130,5 @@ class StdStringReplacements :
} // namespace url_canon
-#endif // GOOGLEURL_SRC_URL_CANON_STRING_H__
+#endif // GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
+
diff --git a/googleurl/src/url_canon_stdurl.cc b/googleurl/src/url_canon_stdurl.cc
index 41a8fa9..1e21a14 100644
--- a/googleurl/src/url_canon_stdurl.cc
+++ b/googleurl/src/url_canon_stdurl.cc
@@ -170,6 +170,15 @@ bool CanonicalizeStandardURL(const char16* spec,
output, new_parsed);
}
+// It might be nice in the future to optimize this so unchanged components don't
+// need to be recanonicalized. This is especially true since the common case for
+// ReplaceComponents is removing things we don't want, like reference fragments
+// and usernames. These cases can become more efficient if we can assume the
+// rest of the URL is OK with these removed (or only the modified parts
+// recanonicalized). This would be much more complex to implement, however.
+//
+// You would also need to update DoReplaceComponents in url_util.cc which
+// relies on this re-checking everything (see the comment there for why).
bool ReplaceStandardURL(const char* base,
const url_parse::Parsed& base_parsed,
const Replacements<char>& replacements,
diff --git a/googleurl/src/url_canon_unittest.cc b/googleurl/src/url_canon_unittest.cc
index c5be423..a3e43e2 100644
--- a/googleurl/src/url_canon_unittest.cc
+++ b/googleurl/src/url_canon_unittest.cc
@@ -766,6 +766,22 @@ TEST(URLCanonTest, IPv6) {
}
}
+TEST(URLCanonTest, IPEmpty) {
+ std::string out_str1;
+ url_canon::StdStringCanonOutput output1(&out_str1);
+ url_canon::CanonHostInfo host_info;
+
+ // Invalid and empty host components must not be detected as IP addresses.
+ const char spec[] = "192.168.0.1";
+ url_canon::CanonicalizeIPAddress(spec, url_parse::Component(),
+ &output1, &host_info);
+ EXPECT_FALSE(host_info.IsIPAddress());
+
+ url_canon::CanonicalizeIPAddress(spec, url_parse::Component(0, 0),
+ &output1, &host_info);
+ EXPECT_FALSE(host_info.IsIPAddress());
+}
+
TEST(URLCanonTest, UserInfo) {
// Note that the canonicalizer should escape and treat empty components as
// not being there.
@@ -950,8 +966,8 @@ TEST(URLCanonTest, Path) {
// %7f should be allowed and %3D should not be unescaped (these were wrong
// in a previous version).
{"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd", "/%7Ffp3%3Eju%3Dduvgw%3Dd", url_parse::Component(0, 24), true},
- // @ should be unescaped.
- {"/@asdf%40", L"/@asdf%40", "/@asdf@", url_parse::Component(0, 7), true},
+ // @ should be passed through unchanged (escaped or unescaped).
+ {"/@asdf%40", L"/@asdf%40", "/@asdf%40", url_parse::Component(0, 9), true},
// ----- encoding tests -----
// Basic conversions
@@ -1736,8 +1752,11 @@ TEST(URLCanonTest, ResolveRelativeURL) {
// Basic absolute input.
{"http://host/a", true, false, "http://another/", true, false, false, NULL},
{"http://host/a", true, false, "http:////another/", true, false, false, NULL},
- // Empty relative URLs shouldn't change the input.
+ // Empty relative URLs should only remove the ref part of the URL,
+ // leaving the rest unchanged.
{"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"},
+ {"http://foo/bar#ref", true, false, "", true, true, true, "http://foo/bar"},
+ {"http://foo/bar#", true, false, "", true, true, true, "http://foo/bar"},
// Spaces at the ends of the relative path should be ignored.
{"http://foo/bar", true, false, " another ", true, true, true, "http://foo/another"},
{"http://foo/bar", true, false, " . ", true, true, true, "http://foo/"},
diff --git a/googleurl/src/url_common.h b/googleurl/src/url_common.h
new file mode 100644
index 0000000..7e7e27a
--- /dev/null
+++ b/googleurl/src/url_common.h
@@ -0,0 +1,48 @@
+// Copyright 2010, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_URL_COMMON_H__
+#define GOOGLEURL_SRC_URL_COMMON_H__
+
+#if !defined(GURL_IMPLEMENTATION)
+#define GURL_IMPLEMENTATION 0
+#endif
+
+#if defined(WIN32) && defined(GURL_DLL)
+#if GURL_IMPLEMENTATION
+#define GURL_API __declspec(dllexport)
+#else
+#define GURL_API __declspec(dllimport)
+#endif
+#else
+#define GURL_API
+#endif
+
+#endif // GOOGLEURL_SRC_URL_COMMON_H__
+
diff --git a/googleurl/src/url_parse.cc b/googleurl/src/url_parse.cc
index 7c37f13..a08c4da 100644
--- a/googleurl/src/url_parse.cc
+++ b/googleurl/src/url_parse.cc
@@ -64,54 +64,6 @@ int FindNextAuthorityTerminator(const CHAR* spec,
return spec_len; // Not found.
}
-// Fills in all members of the Parsed structure except for the scheme.
-//
-// |spec| is the full spec being parsed, of length |spec_len|.
-// |after_scheme| is the character immediately following the scheme (after the
-// colon) where we'll begin parsing.
-//
-// Compatability data points. I list "host", "path" extracted:
-// Input IE6 Firefox Us
-// ----- -------------- -------------- --------------
-// http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
-// http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
-// http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/"
-// http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/"
-// http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
-//
-// (*) Interestingly, although IE fails to load these URLs, its history
-// canonicalizer handles them, meaning if you've been to the corresponding
-// "http://foo.com/" link, it will be colored.
-template <typename CHAR>
-void DoParseAfterScheme(const CHAR* spec,
- int spec_len,
- int after_scheme,
- Parsed* parsed) {
- int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
- int after_slashes = after_scheme + num_slashes;
-
- // First split into two main parts, the authority (username, password, host,
- // and port) and the full path (path, query, and reference).
- Component authority;
- Component full_path;
-
- // Found "//<some data>", looks like an authority section. Treat everything
- // from there to the next slash (or end of spec) to be the authority. Note
- // that we ignore the number of slashes and treat it as the authority.
- int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len);
- authority = Component(after_slashes, end_auth - after_slashes);
-
- if (end_auth == spec_len) // No beginning of path found.
- full_path = Component();
- else // Everything starting from the slash to the end is the path.
- full_path = Component(end_auth, spec_len - end_auth);
-
- // Now parse those two sub-parts.
- DoParseAuthority(spec, authority, &parsed->username, &parsed->password,
- &parsed->host, &parsed->port);
- ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref);
-}
-
template<typename CHAR>
void ParseUserInfo(const CHAR* spec,
const Component& user,
@@ -310,6 +262,54 @@ bool DoExtractScheme(const CHAR* url,
return false; // No colon found: no scheme
}
+// Fills in all members of the Parsed structure except for the scheme.
+//
+// |spec| is the full spec being parsed, of length |spec_len|.
+// |after_scheme| is the character immediately following the scheme (after the
+// colon) where we'll begin parsing.
+//
+// Compatibility data points. I list "host", "path" extracted:
+// Input IE6 Firefox Us
+// ----- -------------- -------------- --------------
+// http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
+// http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
+// http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/"
+// http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/"
+// http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
+//
+// (*) Interestingly, although IE fails to load these URLs, its history
+// canonicalizer handles them, meaning if you've been to the corresponding
+// "http://foo.com/" link, it will be colored.
+template <typename CHAR>
+void DoParseAfterScheme(const CHAR* spec,
+ int spec_len,
+ int after_scheme,
+ Parsed* parsed) {
+ int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
+ int after_slashes = after_scheme + num_slashes;
+
+ // First split into two main parts, the authority (username, password, host,
+ // and port) and the full path (path, query, and reference).
+ Component authority;
+ Component full_path;
+
+ // Found "//<some data>", looks like an authority section. Treat everything
+ // from there to the next slash (or end of spec) to be the authority. Note
+ // that we ignore the number of slashes and treat it as the authority.
+ int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len);
+ authority = Component(after_slashes, end_auth - after_slashes);
+
+ if (end_auth == spec_len) // No beginning of path found.
+ full_path = Component();
+ else // Everything starting from the slash to the end is the path.
+ full_path = Component(end_auth, spec_len - end_auth);
+
+ // Now parse those two sub-parts.
+ DoParseAuthority(spec, authority, &parsed->username, &parsed->password,
+ &parsed->host, &parsed->port);
+ ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref);
+}
+
// The main parsing function for standard URLs. Standard URLs have a scheme,
// host, path, etc.
template<typename CHAR>
@@ -683,7 +683,7 @@ void ParseAuthority(const char* spec,
DoParseAuthority(spec, auth, username, password, hostname, port_num);
}
-void ParseAuthority(char16* spec,
+void ParseAuthority(const char16* spec,
const Component& auth,
Component* username,
Component* password,
diff --git a/googleurl/src/url_parse.h b/googleurl/src/url_parse.h
index bea2766..134b445 100644
--- a/googleurl/src/url_parse.h
+++ b/googleurl/src/url_parse.h
@@ -34,6 +34,7 @@
#include "base/basictypes.h"
#include "base/string16.h"
+#include "googleurl/src/url_common.h"
namespace url_parse {
@@ -127,7 +128,7 @@ struct Parsed {
// of the string. For example "http://": the parsed structure will only
// contain an entry for the four-character scheme, and it doesn't know about
// the "://". For all other last-components, it will return the real length.
- int Length() const;
+ GURL_API int Length() const;
// Returns the number of characters before the given component if it exists,
// or where the component would be if it did exist. This will return the
@@ -155,7 +156,8 @@ struct Parsed {
// *QUERY: 14 15 <-
// *REF: 20 20
//
- int CountCharactersBefore(ComponentType type, bool include_delimiter) const;
+ GURL_API int CountCharactersBefore(ComponentType type,
+ bool include_delimiter) const;
// Scheme without the colon: "http://foo"/ would have a scheme of "http".
// The length will be -1 if no scheme is specified ("foo.com"), or 0 if there
@@ -215,24 +217,24 @@ struct Parsed {
// StandardURL is for when the scheme is known to be one that has an
// authority (host) like "http". This function will not handle weird ones
// like "about:" and "javascript:", or do the right thing for "file:" URLs.
-void ParseStandardURL(const char* url, int url_len, Parsed* parsed);
-void ParseStandardURL(const char16* url, int url_len, Parsed* parsed);
+GURL_API void ParseStandardURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParseStandardURL(const char16* url, int url_len, Parsed* parsed);
// PathURL is for when the scheme is known not to have an authority (host)
// section but that aren't file URLs either. The scheme is parsed, and
// everything after the scheme is considered as the path. This is used for
// things like "about:" and "javascript:"
-void ParsePathURL(const char* url, int url_len, Parsed* parsed);
-void ParsePathURL(const char16* url, int url_len, Parsed* parsed);
+GURL_API void ParsePathURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParsePathURL(const char16* url, int url_len, Parsed* parsed);
// FileURL is for file URLs. There are some special rules for interpreting
// these.
-void ParseFileURL(const char* url, int url_len, Parsed* parsed);
-void ParseFileURL(const char16* url, int url_len, Parsed* parsed);
+GURL_API void ParseFileURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParseFileURL(const char16* url, int url_len, Parsed* parsed);
// MailtoURL is for mailto: urls. They are made up scheme,path,query
-void ParseMailtoURL(const char* url, int url_len, Parsed* parsed);
-void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed);
+GURL_API void ParseMailtoURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed);
// Helper functions -----------------------------------------------------------
@@ -256,27 +258,27 @@ void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed);
// end of the string).
//
// The 8-bit version requires UTF-8 encoding.
-bool ExtractScheme(const char* url, int url_len, Component* scheme);
-bool ExtractScheme(const char16* url, int url_len, Component* scheme);
+GURL_API bool ExtractScheme(const char* url, int url_len, Component* scheme);
+GURL_API bool ExtractScheme(const char16* url, int url_len, Component* scheme);
// Returns true if ch is a character that terminates the authority segment
// of a URL.
-bool IsAuthorityTerminator(char16 ch);
+GURL_API bool IsAuthorityTerminator(char16 ch);
// Does a best effort parse of input |spec|, in range |auth|. If a particular
// component is not found, it will be set to invalid.
-void ParseAuthority(const char* spec,
- const Component& auth,
- Component* username,
- Component* password,
- Component* hostname,
- Component* port_num);
-void ParseAuthority(char16* spec,
- const Component& auth,
- Component* username,
- Component* password,
- Component* hostname,
- Component* port_num);
+GURL_API void ParseAuthority(const char* spec,
+ const Component& auth,
+ Component* username,
+ Component* password,
+ Component* hostname,
+ Component* port_num);
+GURL_API void ParseAuthority(const char16* spec,
+ const Component& auth,
+ Component* username,
+ Component* password,
+ Component* hostname,
+ Component* port_num);
// Computes the integer port value from the given port component. The port
// component should have been identified by one of the init functions on
@@ -285,8 +287,8 @@ void ParseAuthority(char16* spec,
// The return value will be a positive integer between 0 and 64K, or one of
// the two special values below.
enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 };
-int ParsePort(const char* url, const Component& port);
-int ParsePort(const char16* url, const Component& port);
+GURL_API int ParsePort(const char* url, const Component& port);
+GURL_API int ParsePort(const char16* url, const Component& port);
// Extracts the range of the file name in the given url. The path must
// already have been computed by the parse function, and the matching URL
@@ -298,12 +300,12 @@ int ParsePort(const char16* url, const Component& port);
// following the last slash.
//
// The 8-bit version requires UTF-8 encoding.
-void ExtractFileName(const char* url,
- const Component& path,
- Component* file_name);
-void ExtractFileName(const char16* url,
- const Component& path,
- Component* file_name);
+GURL_API void ExtractFileName(const char* url,
+ const Component& path,
+ Component* file_name);
+GURL_API void ExtractFileName(const char16* url,
+ const Component& path,
+ Component* file_name);
// Extract the first key/value from the range defined by |*query|. Updates
// |*query| to start at the end of the extracted key/value pair. This is
@@ -320,14 +322,14 @@ void ExtractFileName(const char16* url,
//
// If no key/value are found |*key| and |*value| will be unchanged and it will
// return false.
-bool ExtractQueryKeyValue(const char* url,
- Component* query,
- Component* key,
- Component* value);
-bool ExtractQueryKeyValue(const char16* url,
- Component* query,
- Component* key,
- Component* value);
+GURL_API bool ExtractQueryKeyValue(const char* url,
+ Component* query,
+ Component* key,
+ Component* value);
+GURL_API bool ExtractQueryKeyValue(const char16* url,
+ Component* query,
+ Component* key,
+ Component* value);
} // namespace url_parse
diff --git a/googleurl/src/url_util.cc b/googleurl/src/url_util.cc
index d623b45..7e100aa 100644
--- a/googleurl/src/url_util.cc
+++ b/googleurl/src/url_util.cc
@@ -33,6 +33,7 @@
#include "googleurl/src/url_util.h"
#include "base/logging.h"
+#include "googleurl/src/url_canon_internal.h"
#include "googleurl/src/url_file.h"
namespace url_util {
@@ -58,13 +59,15 @@ inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) {
const char kFileScheme[] = "file"; // Used in a number of places.
const char kMailtoScheme[] = "mailto";
-const int kNumStandardURLSchemes = 5;
+const int kNumStandardURLSchemes = 7;
const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
"http",
"https",
kFileScheme, // Yes, file urls can have a hostname!
"ftp",
"gopher",
+ "ws", // WebSocket.
+ "wss", // WebSocket secure.
};
// List of the currently installed standard schemes. This list is lazily
@@ -72,6 +75,9 @@ const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
// any destructors from being called that will slow us down or cause problems.
std::vector<const char*>* standard_schemes = NULL;
+// See the LockStandardSchemes declaration in the header.
+bool standard_schemes_locked = false;
+
// Ensures that the standard_schemes list is initialized, does nothing if it
// already has values.
void InitStandardSchemes() {
@@ -96,10 +102,9 @@ inline bool CompareSchemeComponent(const CHAR* spec,
}
// Returns true if the given scheme identified by |scheme| within |spec| is one
-// of the registered "standard" schemes. Note that this does not check for
-// "://", use IsStandard for that.
+// of the registered "standard" schemes.
template<typename CHAR>
-bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) {
+bool DoIsStandard(const CHAR* spec, const url_parse::Component& scheme) {
if (!scheme.is_nonempty())
return false; // Empty or invalid schemes are non-standard.
@@ -112,34 +117,20 @@ bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) {
return false;
}
-// Returns true if the stuff following the scheme in the given spec indicates
-// a "standard" URL. The presence of "://" after the scheme indicates that
-// there is a hostname, etc. which we call a standard URL.
-template<typename CHAR>
-bool HasStandardSchemeSeparator(const CHAR* spec, int spec_len,
- const url_parse::Component& scheme) {
- int after_scheme = scheme.end();
- if (spec_len < after_scheme + 3)
- return false;
- return spec[after_scheme] == ':' &&
- spec[after_scheme + 1] == '/' &&
- spec[after_scheme + 2] == '/';
-}
-
-template<typename CHAR>
-bool DoIsStandard(const CHAR* spec, int spec_len,
- const url_parse::Component& scheme) {
- return HasStandardSchemeSeparator(spec, spec_len, scheme) ||
- IsStandardScheme(spec, scheme);
-}
-
template<typename CHAR>
bool DoFindAndCompareScheme(const CHAR* str,
int str_len,
const char* compare,
url_parse::Component* found_scheme) {
+ // Before extracting scheme, canonicalize the URL to remove any whitespace.
+ // This matches the canonicalization done in DoCanonicalize function.
+ url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
+ int spec_len;
+ const CHAR* spec = RemoveURLWhitespace(str, str_len,
+ &whitespace_buffer, &spec_len);
+
url_parse::Component our_scheme;
- if (!url_parse::ExtractScheme(str, str_len, &our_scheme)) {
+ if (!url_parse::ExtractScheme(spec, spec_len, &our_scheme)) {
// No scheme.
if (found_scheme)
*found_scheme = url_parse::Component();
@@ -147,7 +138,7 @@ bool DoFindAndCompareScheme(const CHAR* str,
}
if (found_scheme)
*found_scheme = our_scheme;
- return CompareSchemeComponent(str, our_scheme, compare);
+ return CompareSchemeComponent(spec, our_scheme, compare);
}
template<typename CHAR>
@@ -184,7 +175,7 @@ bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
#endif
url_parse::Component scheme;
- if(!url_parse::ExtractScheme(spec, spec_len, &scheme))
+ if (!url_parse::ExtractScheme(spec, spec_len, &scheme))
return false;
// This is the parsed version of the input URL, we have to canonicalize it
@@ -197,7 +188,7 @@ bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
charset_converter,
output, output_parsed);
- } else if (IsStandard(spec, spec_len, scheme)) {
+ } else if (DoIsStandard(spec, scheme)) {
// All "normal" URLs.
url_parse::ParseStandardURL(spec, spec_len, &parsed_input);
success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input,
@@ -239,7 +230,7 @@ bool DoResolveRelative(const char* base_spec,
// See if our base URL should be treated as "standard".
bool standard_base_scheme =
base_parsed.scheme.is_nonempty() &&
- IsStandard(base_spec, base_spec_len, base_parsed.scheme);
+ DoIsStandard(base_spec, base_parsed.scheme);
bool is_relative;
url_parse::Component relative_component;
@@ -275,53 +266,111 @@ bool DoReplaceComponents(const char* spec,
url_canon::CharsetConverter* charset_converter,
url_canon::CanonOutput* output,
url_parse::Parsed* out_parsed) {
- // Note that we dispatch to the parser according the the scheme type of
- // the OUTPUT URL. Normally, this is the same as our scheme, but if the
- // scheme is being overridden, we need to test that.
-
- if (// Either the scheme is not replaced and the old one is a file,
- (!replacements.IsSchemeOverridden() &&
- CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) ||
- // ...or it is being replaced and the new one is a file.
- (replacements.IsSchemeOverridden() &&
- CompareSchemeComponent(replacements.sources().scheme,
- replacements.components().scheme,
- kFileScheme))) {
+ // If the scheme is overridden, just do a simple string substitution and
+ // reparse the whole thing. There are lots of edge cases that we really don't
+ // want to deal with. Like what happens if I replace "http://e:8080/foo"
+ // with a file. Does it become "file:///E:/8080/foo" where the port number
+ // becomes part of the path? Parsing that string as a file URL says "yes"
+ // but almost no sane rule for dealing with the components individually would
+ // come up with that.
+ //
+ // Why allow these crazy cases at all? Programmatically, there is almost no
+ // case for replacing the scheme. The most common case for hitting this is
+ // in JS when building up a URL using the location object. In this case, the
+ // JS code expects the string substitution behavior:
+ // http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
+ if (replacements.IsSchemeOverridden()) {
+ // Canonicalize the new scheme so it is 8-bit and can be concatenated with
+ // the existing spec.
+ url_canon::RawCanonOutput<128> scheme_replaced;
+ url_parse::Component scheme_replaced_parsed;
+ url_canon::CanonicalizeScheme(
+ replacements.sources().scheme,
+ replacements.components().scheme,
+ &scheme_replaced, &scheme_replaced_parsed);
+
+ // We can assume that the input is canonicalized, which means it always has
+ // a colon after the scheme (or where the scheme would be).
+ int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1
+ : 1;
+ if (spec_len - spec_after_colon > 0) {
+ scheme_replaced.Append(&spec[spec_after_colon],
+ spec_len - spec_after_colon);
+ }
+
+ // We now need to completely re-parse the resulting string since its meaning
+ // may have changed with the different scheme.
+ url_canon::RawCanonOutput<128> recanonicalized;
+ url_parse::Parsed recanonicalized_parsed;
+ DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(),
+ charset_converter,
+ &recanonicalized, &recanonicalized_parsed);
+
+ // Recurse using the version with the scheme already replaced. This will now
+ // use the replacement rules for the new scheme.
+ //
+ // Warning: this code assumes that ReplaceComponents will re-check all
+ // components for validity. This is because we can't fail if DoCanonicalize
+ // failed above since theoretically the thing making it fail could be
+ // getting replaced here. If ReplaceComponents didn't re-check everything,
+ // we wouldn't know if something *not* getting replaced is a problem.
+ // If the scheme-specific replacers are made more intelligent so they don't
+ // re-check everything, we should instead recanonicalize the whole thing
+ // after this call to check validity (this assumes replacing the scheme is
+ // much much less common than other types of replacements, like clearing the
+ // ref).
+ url_canon::Replacements<CHAR> replacements_no_scheme = replacements;
+ replacements_no_scheme.SetScheme(NULL, url_parse::Component());
+ return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),
+ recanonicalized_parsed, replacements_no_scheme,
+ charset_converter, output, out_parsed);
+ }
+
+ // If we get here, then we know the scheme doesn't need to be replaced, so can
+ // just key off the scheme in the spec to know how to do the replacements.
+ if (CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) {
return url_canon::ReplaceFileURL(spec, parsed, replacements,
charset_converter, output, out_parsed);
}
-
- if (// Either the scheme is not replaced and the old one is standard,
- (!replacements.IsSchemeOverridden() &&
- IsStandard(spec, spec_len, parsed.scheme)) ||
- // ...or it is being replaced and the new one is standard.
- (replacements.IsSchemeOverridden() &&
- IsStandardScheme(replacements.sources().scheme,
- replacements.components().scheme))) {
- // Standard URL with all parts.
+ if (DoIsStandard(spec, parsed.scheme)) {
return url_canon::ReplaceStandardURL(spec, parsed, replacements,
charset_converter, output, out_parsed);
}
-
- if (// Either the scheme is not replaced and the old one is mailto,
- (!replacements.IsSchemeOverridden() &&
- CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) ||
- // ...or it is being replaced and the new one is a mailto.
- (replacements.IsSchemeOverridden() &&
- CompareSchemeComponent(replacements.sources().scheme,
- replacements.components().scheme,
- kMailtoScheme))) {
+ if (CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) {
return url_canon::ReplaceMailtoURL(spec, parsed, replacements,
output, out_parsed);
}
+ // Default is a path URL.
return url_canon::ReplacePathURL(spec, parsed, replacements,
output, out_parsed);
}
} // namespace
+void Initialize() {
+ InitStandardSchemes();
+}
+
+void Shutdown() {
+ if (standard_schemes) {
+ delete standard_schemes;
+ standard_schemes = NULL;
+ }
+}
+
void AddStandardScheme(const char* new_scheme) {
+ // If this assert triggers, it means you've called AddStandardScheme after
+ // LockStandardSchemes has been called (see the header file for
+ // LockStandardSchemes for more).
+ //
+ // This normally means you're trying to set up a new standard scheme too late
+ // in your application's init process. Locate where your app does this
+ // initialization and calls LockStandardSchemes, and add your new standard
+ // scheme there.
+ DCHECK(!standard_schemes_locked) <<
+ "Trying to add a standard scheme after the list has been locked.";
+
size_t scheme_len = strlen(new_scheme);
if (scheme_len == 0)
return;
@@ -335,14 +384,16 @@ void AddStandardScheme(const char* new_scheme) {
standard_schemes->push_back(dup_scheme);
}
-bool IsStandard(const char* spec, int spec_len,
- const url_parse::Component& scheme) {
- return DoIsStandard(spec, spec_len, scheme);
+void LockStandardSchemes() {
+ standard_schemes_locked = true;
+}
+
+bool IsStandard(const char* spec, const url_parse::Component& scheme) {
+ return DoIsStandard(spec, scheme);
}
-bool IsStandard(const char16* spec, int spec_len,
- const url_parse::Component& scheme) {
- return DoIsStandard(spec, spec_len, scheme);
+bool IsStandard(const char16* spec, const url_parse::Component& scheme) {
+ return DoIsStandard(spec, scheme);
}
bool FindAndCompareScheme(const char* str,
@@ -450,4 +501,53 @@ bool LowerCaseEqualsASCII(const char16* a_begin,
return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}
+void DecodeURLEscapeSequences(const char* input, int length,
+ url_canon::CanonOutputW* output) {
+ url_canon::RawCanonOutputT<char> unescaped_chars;
+ for (int i = 0; i < length; i++) {
+ if (input[i] == '%') {
+ unsigned char ch;
+ if (url_canon::DecodeEscaped(input, &i, length, &ch)) {
+ unescaped_chars.push_back(ch);
+ } else {
+ // Invalid escape sequence, copy the percent literal.
+ unescaped_chars.push_back('%');
+ }
+ } else {
+ // Regular non-escaped 8-bit character.
+ unescaped_chars.push_back(input[i]);
+ }
+ }
+
+ // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
+ // JavaScript URLs, but Firefox and Safari do.
+ for (int i = 0; i < unescaped_chars.length(); i++) {
+ unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
+ if (uch < 0x80) {
+ // ASCII (single-byte) character, just append directly
+ output->push_back(uch);
+ } else {
+ // next_character will point to the last character of the decoded
+ // character.
+ int next_character = i;
+ unsigned code_point;
+ if (url_canon::ReadUTFChar(unescaped_chars.data(), &next_character,
+ unescaped_chars.length(), &code_point)) {
+ // Valid UTF-8 character, convert to UTF-16.
+ url_canon::AppendUTF16Value(code_point, output);
+ i = next_character;
+ } else {
+ // If there are any sequences that are not valid UTF-8, we keep
+ // invalid code points and promote to UTF-16. We copy all characters
+ // from the current position to the end of the identified sequence.
+ while (i < next_character) {
+ output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
+ i++;
+ }
+ output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
+ }
+ }
+ }
+}
+
} // namespace url_util
diff --git a/googleurl/src/url_util.h b/googleurl/src/url_util.h
index 62813a6..ec4cf9e 100644
--- a/googleurl/src/url_util.h
+++ b/googleurl/src/url_util.h
@@ -33,29 +33,69 @@
#include <string>
#include "base/string16.h"
+#include "googleurl/src/url_common.h"
#include "googleurl/src/url_parse.h"
#include "googleurl/src/url_canon.h"
namespace url_util {
+// Init ------------------------------------------------------------------------
+
+// Initialization is NOT required, it will be implicitly initialized when first
+// used. However, this implicit initialization is NOT threadsafe. If you are
+// using this library in a threaded environment and don't have a consistent
+// "first call" (an example might be calling "AddStandardScheme" with your
+// special application-specific schemes) then you will want to call Initialize()
+// before spawning any threads.
+//
+// It is OK to call this function more than once, subsequent calls will simply
+// "noop", unless Shutdown() was called in the meantime. This will also be a
+// "noop" if other calls to the library have forced an initialization
+// beforehand.
+GURL_API void Initialize();
+
+// Cleanup is not required, except some strings may leak. For most user
+// applications, this is fine. If you're using it in a library that may get
+// loaded and unloaded, you'll want to call Shutdown() to properly clean up
+// your library.
+GURL_API void Shutdown();
+
// Schemes --------------------------------------------------------------------
// Adds an application-defined scheme to the internal list of "standard" URL
-// schemes.
-void AddStandardScheme(const char* new_scheme);
+// schemes. This function is not threadsafe and can not be called concurrently
+// with any other url_util function. It will assert if the list of standard
+// schemes has been locked (see LockStandardSchemes).
+GURL_API void AddStandardScheme(const char* new_scheme);
+
+// Sets a flag to prevent future calls to AddStandardScheme from succeeding.
+//
+// This is designed to help prevent errors for multithreaded applications.
+// Normal usage would be to call AddStandardScheme for your custom schemes at
+// the beginning of program initialization, and then LockStandardSchemes. This
+// prevents future callers from mistakenly calling AddStandardScheme when the
+// program is running with multiple threads, where such usage would be
+// dangerous.
+//
+// We could have had AddStandardScheme use a lock instead, but that would add
+// some platform-specific dependencies we don't otherwise have now, and is
+// overkill considering the normal usage is so simple.
+GURL_API void LockStandardSchemes();
// Locates the scheme in the given string and places it into |found_scheme|,
// which may be NULL to indicate the caller does not care about the range.
+//
// Returns whether the given |compare| scheme matches the scheme found in the
-// input (if any).
-bool FindAndCompareScheme(const char* str,
- int str_len,
- const char* compare,
- url_parse::Component* found_scheme);
-bool FindAndCompareScheme(const char16* str,
- int str_len,
- const char* compare,
- url_parse::Component* found_scheme);
+// input (if any). The |compare| scheme must be a valid canonical scheme or
+// the result of the comparison is undefined.
+GURL_API bool FindAndCompareScheme(const char* str,
+ int str_len,
+ const char* compare,
+ url_parse::Component* found_scheme);
+GURL_API bool FindAndCompareScheme(const char16* str,
+ int str_len,
+ const char* compare,
+ url_parse::Component* found_scheme);
inline bool FindAndCompareScheme(const std::string& str,
const char* compare,
url_parse::Component* found_scheme) {
@@ -70,12 +110,18 @@ inline bool FindAndCompareScheme(const string16& str,
}
// Returns true if the given string represents a standard URL. This means that
-// either the scheme is in the list of known standard schemes, or there is a
-// "://" following the scheme.
-bool IsStandard(const char* spec, int spec_len,
- const url_parse::Component& scheme);
-bool IsStandard(const char16* spec, int spec_len,
- const url_parse::Component& scheme);
+// the scheme is in the list of known standard schemes.
+GURL_API bool IsStandard(const char* spec,
+ const url_parse::Component& scheme);
+GURL_API bool IsStandard(const char16* spec,
+ const url_parse::Component& scheme);
+
+// TODO(brettw) remove this. This is a temporary compatibility hack to avoid
+// breaking the WebKit build when this version is synced via Chrome.
+inline bool IsStandard(const char* spec, int spec_len,
+ const url_parse::Component& scheme) {
+ return IsStandard(spec, scheme);
+}
// URL library wrappers -------------------------------------------------------
@@ -89,16 +135,16 @@ bool IsStandard(const char16* spec, int spec_len,
// Returns true if a valid URL was produced, false if not. On failure, the
// output and parsed structures will still be filled and will be consistent,
// but they will not represent a loadable URL.
-bool Canonicalize(const char* spec,
- int spec_len,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed);
-bool Canonicalize(const char16* spec,
- int spec_len,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed);
+GURL_API bool Canonicalize(const char* spec,
+ int spec_len,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* output_parsed);
+GURL_API bool Canonicalize(const char16* spec,
+ int spec_len,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* output_parsed);
// Resolves a potentially relative URL relative to the given parsed base URL.
// The base MUST be valid. The resulting canonical URL and parsed information
@@ -110,41 +156,43 @@ bool Canonicalize(const char16* spec,
//
// Returns true if the output is valid, false if the input could not produce
// a valid URL.
-bool ResolveRelative(const char* base_spec,
- int base_spec_len,
- const url_parse::Parsed& base_parsed,
- const char* relative,
- int relative_length,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed);
-bool ResolveRelative(const char* base_spec,
- int base_spec_len,
- const url_parse::Parsed& base_parsed,
- const char16* relative,
- int relative_length,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* output_parsed);
+GURL_API bool ResolveRelative(const char* base_spec,
+ int base_spec_len,
+ const url_parse::Parsed& base_parsed,
+ const char* relative,
+ int relative_length,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* output_parsed);
+GURL_API bool ResolveRelative(const char* base_spec,
+ int base_spec_len,
+ const url_parse::Parsed& base_parsed,
+ const char16* relative,
+ int relative_length,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* output_parsed);
// Replaces components in the given VALID input url. The new canonical URL info
// is written to output and out_parsed.
//
// Returns true if the resulting URL is valid.
-bool ReplaceComponents(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- const url_canon::Replacements<char>& replacements,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* out_parsed);
-bool ReplaceComponents(const char* spec,
- int spec_len,
- const url_parse::Parsed& parsed,
- const url_canon::Replacements<char16>& replacements,
- url_canon::CharsetConverter* charset_converter,
- url_canon::CanonOutput* output,
- url_parse::Parsed* out_parsed);
+GURL_API bool ReplaceComponents(
+ const char* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ const url_canon::Replacements<char>& replacements,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* out_parsed);
+GURL_API bool ReplaceComponents(
+ const char* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ const url_canon::Replacements<char16>& replacements,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* out_parsed);
// String helper functions ----------------------------------------------------
@@ -154,16 +202,20 @@ bool ReplaceComponents(const char* spec,
//
// The versions of this function that don't take a b_end assume that the b
// string is NULL terminated.
-bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b);
-bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b_begin,
- const char* b_end);
-bool LowerCaseEqualsASCII(const char16* a_begin,
- const char16* a_end,
- const char* b);
+GURL_API bool LowerCaseEqualsASCII(const char* a_begin,
+ const char* a_end,
+ const char* b);
+GURL_API bool LowerCaseEqualsASCII(const char* a_begin,
+ const char* a_end,
+ const char* b_begin,
+ const char* b_end);
+GURL_API bool LowerCaseEqualsASCII(const char16* a_begin,
+ const char16* a_end,
+ const char* b);
+
+// Unescapes the given string using URL escaping rules.
+GURL_API void DecodeURLEscapeSequences(const char* input, int length,
+ url_canon::CanonOutputW* output);
} // namespace url_util
diff --git a/googleurl/src/url_util_unittest.cc b/googleurl/src/url_util_unittest.cc
index 12e5254..442b2ec 100644
--- a/googleurl/src/url_util_unittest.cc
+++ b/googleurl/src/url_util_unittest.cc
@@ -30,6 +30,7 @@
#include "googleurl/src/url_canon.h"
#include "googleurl/src/url_canon_stdstring.h"
#include "googleurl/src/url_parse.h"
+#include "googleurl/src/url_test_utils.h"
#include "googleurl/src/url_util.h"
#include "testing/gtest/include/gtest/gtest.h"
@@ -64,6 +65,22 @@ TEST(URLUtilTest, FindAndCompareScheme) {
// But when there is no scheme, it should fail.
EXPECT_FALSE(url_util::FindAndCompareScheme("", 0, "", &found_scheme));
EXPECT_TRUE(found_scheme == url_parse::Component());
+
+ // When there is a whitespace char in scheme, it should canonicalize the url
+ // before comparison.
+ const char whtspc_str[] = " \r\n\tjav\ra\nscri\tpt:alert(1)";
+ EXPECT_TRUE(url_util::FindAndCompareScheme(
+ whtspc_str, static_cast<int>(strlen(whtspc_str)), "javascript",
+ &found_scheme));
+ EXPECT_TRUE(found_scheme == url_parse::Component(1, 10));
+
+ // Control characters should be stripped out on the ends, and kept in the
+ // middle.
+ const char ctrl_str[] = "\02jav\02scr\03ipt:alert(1)";
+ EXPECT_FALSE(url_util::FindAndCompareScheme(
+ ctrl_str, static_cast<int>(strlen(ctrl_str)), "javascript",
+ &found_scheme));
+ EXPECT_TRUE(found_scheme == url_parse::Component(1, 11));
}
TEST(URLUtilTest, ReplaceComponents) {
@@ -96,3 +113,106 @@ TEST(URLUtilTest, ReplaceComponents) {
&new_parsed);
}
+static std::string CheckReplaceScheme(const char* base_url,
+ const char* scheme) {
+ // Make sure the input is canonicalized.
+ url_canon::RawCanonOutput<32> original;
+ url_parse::Parsed original_parsed;
+ url_util::Canonicalize(base_url, strlen(base_url), NULL,
+ &original, &original_parsed);
+
+ url_canon::Replacements<char> replacements;
+ replacements.SetScheme(scheme, url_parse::Component(0, strlen(scheme)));
+
+ std::string output_string;
+ url_canon::StdStringCanonOutput output(&output_string);
+ url_parse::Parsed output_parsed;
+ url_util::ReplaceComponents(original.data(), original.length(),
+ original_parsed, replacements, NULL,
+ &output, &output_parsed);
+
+ output.Complete();
+ return output_string;
+}
+
+TEST(URLUtilTest, ReplaceScheme) {
+ EXPECT_EQ("https://google.com/",
+ CheckReplaceScheme("http://google.com/", "https"));
+ EXPECT_EQ("file://google.com/",
+ CheckReplaceScheme("http://google.com/", "file"));
+ EXPECT_EQ("http://home/Build",
+ CheckReplaceScheme("file:///Home/Build", "http"));
+ EXPECT_EQ("javascript:foo",
+ CheckReplaceScheme("about:foo", "javascript"));
+ EXPECT_EQ("://google.com/",
+ CheckReplaceScheme("http://google.com/", ""));
+ EXPECT_EQ("http://google.com/",
+ CheckReplaceScheme("about:google.com", "http"));
+ EXPECT_EQ("http:", CheckReplaceScheme("", "http"));
+
+#ifdef WIN32
+ // Magic Windows drive letter behavior when converting to a file URL.
+ EXPECT_EQ("file:///E:/foo/",
+ CheckReplaceScheme("http://localhost/e:foo/", "file"));
+#endif
+
+ // This will probably change to "about://google.com/" when we fix
+ // http://crbug.com/160 which should also be an acceptable result.
+ EXPECT_EQ("about://google.com/",
+ CheckReplaceScheme("http://google.com/", "about"));
+}
+
+TEST(URLUtilTest, DecodeURLEscapeSequences) {
+ struct DecodeCase {
+ const char* input;
+ const char* output;
+ } decode_cases[] = {
+ {"hello, world", "hello, world"},
+ {"%01%02%03%04%05%06%07%08%09%0a%0B%0C%0D%0e%0f/",
+ "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0B\x0C\x0D\x0e\x0f/"},
+ {"%10%11%12%13%14%15%16%17%18%19%1a%1B%1C%1D%1e%1f/",
+ "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1B\x1C\x1D\x1e\x1f/"},
+ {"%20%21%22%23%24%25%26%27%28%29%2a%2B%2C%2D%2e%2f/",
+ " !\"#$%&'()*+,-.//"},
+ {"%30%31%32%33%34%35%36%37%38%39%3a%3B%3C%3D%3e%3f/",
+ "0123456789:;<=>?/"},
+ {"%40%41%42%43%44%45%46%47%48%49%4a%4B%4C%4D%4e%4f/",
+ "@ABCDEFGHIJKLMNO/"},
+ {"%50%51%52%53%54%55%56%57%58%59%5a%5B%5C%5D%5e%5f/",
+ "PQRSTUVWXYZ[\\]^_/"},
+ {"%60%61%62%63%64%65%66%67%68%69%6a%6B%6C%6D%6e%6f/",
+ "`abcdefghijklmno/"},
+ {"%70%71%72%73%74%75%76%77%78%79%7a%7B%7C%7D%7e%7f/",
+ "pqrstuvwxyz{|}~\x7f/"},
+ // Test un-UTF-8-ization.
+ {"%e4%bd%a0%e5%a5%bd", "\xe4\xbd\xa0\xe5\xa5\xbd"},
+ };
+
+ for (size_t i = 0; i < ARRAYSIZE_UNSAFE(decode_cases); i++) {
+ const char* input = decode_cases[i].input;
+ url_canon::RawCanonOutputT<char16> output;
+ url_util::DecodeURLEscapeSequences(input, strlen(input), &output);
+ EXPECT_EQ(decode_cases[i].output,
+ url_test_utils::ConvertUTF16ToUTF8(
+ string16(output.data(), output.length())));
+ }
+
+ // Our decode should decode %00
+ const char zero_input[] = "%00";
+ url_canon::RawCanonOutputT<char16> zero_output;
+ url_util::DecodeURLEscapeSequences(zero_input, strlen(zero_input),
+ &zero_output);
+ EXPECT_NE("%00",
+ url_test_utils::ConvertUTF16ToUTF8(
+ string16(zero_output.data(), zero_output.length())));
+
+ // Test the error behavior for invalid UTF-8.
+ const char invalid_input[] = "%e4%a0%e5%a5%bd";
+ const char16 invalid_expected[4] = {0x00e4, 0x00a0, 0x597d, 0};
+ url_canon::RawCanonOutputT<char16> invalid_output;
+ url_util::DecodeURLEscapeSequences(invalid_input, strlen(invalid_input),
+ &invalid_output);
+ EXPECT_EQ(string16(invalid_expected),
+ string16(invalid_output.data(), invalid_output.length()));
+}
+