diff options
Diffstat (limited to 'googleurl')
-rw-r--r-- | googleurl/src/gurl.cc | 3 | ||||
-rw-r--r-- | googleurl/src/gurl.h | 61 | ||||
-rw-r--r-- | googleurl/src/gurl_unittest.cc | 59 | ||||
-rw-r--r-- | googleurl/src/url_canon.h | 435 | ||||
-rw-r--r-- | googleurl/src/url_canon_etc.cc | 5 | ||||
-rw-r--r-- | googleurl/src/url_canon_icu.h | 10 | ||||
-rw-r--r-- | googleurl/src/url_canon_ip.cc | 13 | ||||
-rw-r--r-- | googleurl/src/url_canon_ip.h | 43 | ||||
-rw-r--r-- | googleurl/src/url_canon_path.cc | 2 | ||||
-rw-r--r-- | googleurl/src/url_canon_relative.cc | 7 | ||||
-rw-r--r-- | googleurl/src/url_canon_stdstring.h | 11 | ||||
-rw-r--r-- | googleurl/src/url_canon_stdurl.cc | 9 | ||||
-rw-r--r-- | googleurl/src/url_canon_unittest.cc | 25 | ||||
-rw-r--r-- | googleurl/src/url_common.h | 48 | ||||
-rw-r--r-- | googleurl/src/url_parse.cc | 98 | ||||
-rw-r--r-- | googleurl/src/url_parse.h | 84 | ||||
-rw-r--r-- | googleurl/src/url_util.cc | 232 | ||||
-rw-r--r-- | googleurl/src/url_util.h | 186 | ||||
-rw-r--r-- | googleurl/src/url_util_unittest.cc | 120 |
19 files changed, 908 insertions, 543 deletions
diff --git a/googleurl/src/gurl.cc b/googleurl/src/gurl.cc index 2dab0b2..a0bfd26 100644 --- a/googleurl/src/gurl.cc +++ b/googleurl/src/gurl.cc @@ -304,8 +304,7 @@ GURL GURL::GetWithEmptyPath() const { } bool GURL::IsStandard() const { - return url_util::IsStandard(spec_.data(), static_cast<int>(spec_.length()), - parsed_.scheme); + return url_util::IsStandard(spec_.data(), parsed_.scheme); } bool GURL::SchemeIs(const char* lower_ascii_scheme) const { diff --git a/googleurl/src/gurl.h b/googleurl/src/gurl.h index 36cd14c..29fea81 100644 --- a/googleurl/src/gurl.h +++ b/googleurl/src/gurl.h @@ -36,6 +36,7 @@ #include "base/string16.h" #include "googleurl/src/url_canon.h" #include "googleurl/src/url_canon_stdstring.h" +#include "googleurl/src/url_common.h" #include "googleurl/src/url_parse.h" class GURL { @@ -44,11 +45,11 @@ class GURL { typedef url_canon::StdStringReplacements<string16> ReplacementsW; // Creates an empty, invalid URL. - GURL(); + GURL_API GURL(); // Copy construction is relatively inexpensive, with most of the time going // to reallocating the string. It does not re-parse. - GURL(const GURL& other); + GURL_API GURL(const GURL& other); // The narrow version requires the input be UTF-8. Invalid UTF-8 input will // result in an invalid URL. @@ -57,14 +58,16 @@ class GURL { // encode the query parameters. It is probably sufficient for the narrow // version to assume the query parameter encoding should be the same as the // input encoding. - explicit GURL(const std::string& url_string /*, output_param_encoding*/); - explicit GURL(const string16& url_string /*, output_param_encoding*/); + GURL_API explicit GURL(const std::string& url_string + /*, output_param_encoding*/); + GURL_API explicit GURL(const string16& url_string + /*, output_param_encoding*/); // Constructor for URLs that have already been parsed and canonicalized. This // is used for conversions from KURL, for example. The caller must supply all // information associated with the URL, which must be correct and consistent. - GURL(const char* canonical_spec, size_t canonical_spec_len, - const url_parse::Parsed& parsed, bool is_valid); + GURL_API GURL(const char* canonical_spec, size_t canonical_spec_len, + const url_parse::Parsed& parsed, bool is_valid); // Returns true when this object represents a valid parsed URL. When not // valid, other functions will still succeed, but you will not get canonical @@ -96,7 +99,7 @@ class GURL { // Used invalid_spec() below to get the unusable spec of an invalid URL. This // separation is designed to prevent errors that may cause security problems // that could result from the mistaken use of an invalid URL. - const std::string& spec() const; + GURL_API const std::string& spec() const; // Returns the potentially invalid spec for a the URL. This spec MUST NOT be // modified or sent over the network. It is designed to be displayed in error @@ -148,8 +151,8 @@ class GURL { // // It is an error to resolve a URL relative to an invalid URL. The result // will be the empty URL. - GURL Resolve(const std::string& relative) const; - GURL Resolve(const string16& relative) const; + GURL_API GURL Resolve(const std::string& relative) const; + GURL_API GURL Resolve(const string16& relative) const; // Like Resolve() above but takes a character set encoder which will be used // for any query text specified in the input. The charset converter parameter @@ -158,10 +161,10 @@ class GURL { // TODO(brettw): These should be replaced with versions that take something // more friendly than a raw CharsetConverter (maybe like an ICU character set // name). - GURL ResolveWithCharsetConverter( + GURL_API GURL ResolveWithCharsetConverter( const std::string& relative, url_canon::CharsetConverter* charset_converter) const; - GURL ResolveWithCharsetConverter( + GURL_API GURL ResolveWithCharsetConverter( const string16& relative, url_canon::CharsetConverter* charset_converter) const; @@ -176,9 +179,9 @@ class GURL { // // Note that we use the more general url_canon::Replacements type to give // callers extra flexibility rather than our override. - GURL ReplaceComponents( + GURL_API GURL ReplaceComponents( const url_canon::Replacements<char>& replacements) const; - GURL ReplaceComponents( + GURL_API GURL ReplaceComponents( const url_canon::Replacements<char16>& replacements) const; // A helper function that is equivalent to replacing the path with a slash @@ -190,7 +193,7 @@ class GURL { // // It is an error to get an empty path on an invalid URL. The result // will be the empty URL. - GURL GetWithEmptyPath() const; + GURL_API GURL GetWithEmptyPath() const; // A helper function to return a GURL containing just the scheme, host, // and port from a URL. Equivalent to clearing any username and password, @@ -201,19 +204,19 @@ class GURL { // // It is an error to get the origin of an invalid URL. The result // will be the empty URL. - GURL GetOrigin() const; + GURL_API GURL GetOrigin() const; // Returns true if the scheme for the current URL is a known "standard" - // scheme or there is a "://" after it. Standard schemes have an authority - // and a path section. This includes file:, which some callers may want to - // filter out explicitly by calling SchemeIsFile. - bool IsStandard() const; + // scheme. Standard schemes have an authority and a path section. This + // includes file:, which some callers may want to filter out explicitly by + // calling SchemeIsFile. + GURL_API bool IsStandard() const; // Returns true if the given parameter (should be lower-case ASCII to match // the canonicalized scheme) is the scheme for this URL. This call is more // efficient than getting the scheme and comparing it because no copies or // object constructions are done. - bool SchemeIs(const char* lower_ascii_scheme) const; + GURL_API bool SchemeIs(const char* lower_ascii_scheme) const; // We often need to know if this is a file URL. File URLs are "standard", but // are often treated separately by some programs. @@ -229,7 +232,7 @@ class GURL { // Returns true if the hostname is an IP address. Note: this function isn't // as cheap as a simple getter because it re-parses the hostname to verify. // This currently identifies only IPv4 addresses (bug 822685). - bool HostIsIPAddress() const; + GURL_API bool HostIsIPAddress() const; // Getters for various components of the URL. The returned string will be // empty if the component is empty or is not present. @@ -295,24 +298,24 @@ class GURL { // Returns a parsed version of the port. Can also be any of the special // values defined in Parsed for ExtractPort. - int IntPort() const; + GURL_API int IntPort() const; // Returns the port number of the url, or the default port number. // If the scheme has no concept of port (or unknown default) returns // PORT_UNSPECIFIED. - int EffectiveIntPort() const; + GURL_API int EffectiveIntPort() const; // Extracts the filename portion of the path and returns it. The filename // is everything after the last slash in the path. This may be empty. - std::string ExtractFileName() const; + GURL_API std::string ExtractFileName() const; // Returns the path that should be sent to the server. This is the path, // parameter, and query portions of the URL. It is guaranteed to be ASCII. - std::string PathForRequest() const; + GURL_API std::string PathForRequest() const; // Returns the host, excluding the square brackets surrounding IPv6 address // literals. This can be useful for passing to getaddrinfo(). - std::string HostNoBrackets() const; + GURL_API std::string HostNoBrackets() const; // Returns true if this URL's host matches or is in the same domain as // the given input string. For example if this URL was "www.google.com", @@ -324,7 +327,7 @@ class GURL { // // If function DomainIs has parameter domain_len, which means the parameter // lower_ascii_domain does not gurantee to terminate with NULL character. - bool DomainIs(const char* lower_ascii_domain, int domain_len) const; + GURL_API bool DomainIs(const char* lower_ascii_domain, int domain_len) const; // If function DomainIs only has parameter lower_ascii_domain, which means // domain string should be terminate with NULL character. @@ -335,12 +338,12 @@ class GURL { // Swaps the contents of this GURL object with the argument without doing // any memory allocations. - void Swap(GURL* other); + GURL_API void Swap(GURL* other); // Returns a reference to a singleton empty GURL. This object is for callers // who return references but don't have anything to return in some cases. // This function may be called from any thread. - static const GURL& EmptyGURL(); + GURL_API static const GURL& EmptyGURL(); private: // Returns the substring of the input identified by the given component. diff --git a/googleurl/src/gurl_unittest.cc b/googleurl/src/gurl_unittest.cc index 4e81de6..079e1ea 100644 --- a/googleurl/src/gurl_unittest.cc +++ b/googleurl/src/gurl_unittest.cc @@ -31,35 +31,36 @@ void SetupReplacement(void (url_canon::Replacements<CHAR>::*func)(const CHAR*, } } +// Returns the canonicalized string for the given URL string for the +// GURLTest.Types test. +std::string TypesTestCase(const char* src) { + GURL gurl(src); + return gurl.possibly_invalid_spec(); +} + } // namespace // Different types of URLs should be handled differently by url_util, and // handed off to different canonicalizers. TEST(GURLTest, Types) { - struct TypeTest { - const char* src; - const char* expected; - } type_cases[] = { - // URLs with "://" should be treated as standard and have a hostname, even - // when the scheme is unknown. - {"something:///HOSTNAME.com/", "something://hostname.com/"}, - // In the reverse, lacking a "://" means a path URL so no canonicalization - // should happen. - {"something:HOSTNAME.com/", "something:HOSTNAME.com/"}, - {"something:/HOSTNAME.com/", "something:/HOSTNAME.com/"}, + // URLs with unknown schemes should be treated as path URLs, even when they + // have things like "://". + EXPECT_EQ("something:///HOSTNAME.com/", + TypesTestCase("something:///HOSTNAME.com/")); + + // In the reverse, known schemes should always trigger standard URL handling. + EXPECT_EQ("http://hostname.com/", TypesTestCase("http:HOSTNAME.com")); + EXPECT_EQ("http://hostname.com/", TypesTestCase("http:/HOSTNAME.com")); + EXPECT_EQ("http://hostname.com/", TypesTestCase("http://HOSTNAME.com")); + EXPECT_EQ("http://hostname.com/", TypesTestCase("http:///HOSTNAME.com")); + #ifdef WIN32 - // URLs that look like absolute Windows drive specs. - {"c:\\foo.txt", "file:///C:/foo.txt"}, - {"Z|foo.txt", "file:///Z:/foo.txt"}, - {"\\\\server\\foo.txt", "file://server/foo.txt"}, - {"//server/foo.txt", "file://server/foo.txt"}, + // URLs that look like absolute Windows drive specs. + EXPECT_EQ("file:///C:/foo.txt", TypesTestCase("c:\\foo.txt")); + EXPECT_EQ("file:///Z:/foo.txt", TypesTestCase("Z|foo.txt")); + EXPECT_EQ("file://server/foo.txt", TypesTestCase("\\\\server\\foo.txt")); + EXPECT_EQ("file://server/foo.txt", TypesTestCase("//server/foo.txt")); #endif - }; - - for (size_t i = 0; i < ARRAYSIZE(type_cases); i++) { - GURL gurl(type_cases[i].src); - EXPECT_STREQ(type_cases[i].expected, gurl.spec().c_str()); - } } // Test the basic creation and querying of components in a GURL. We assume @@ -166,9 +167,7 @@ TEST(GURLTest, Resolve) { {"http://www.google.com/blah/bloo?c#d", "../../../hello/./world.html?a#b", true, "http://www.google.com/hello/world.html?a#b"}, {"http://www.google.com/foo#bar", "#com", true, "http://www.google.com/foo#com"}, {"http://www.google.com/", "Https:images.google.com", true, "https://images.google.com/"}, - // Unknown schemes with a "://" should be treated as standard. - {"somescheme://foo/", "bar", true, "somescheme://foo/bar"}, - // Unknown schemes with no "://" are not standard. + // Unknown schemes are not standard. {"data:blahblah", "http://google.com/", true, "http://google.com/"}, {"data:blahblah", "http:google.com", true, "http://google.com/"}, {"data:/blahblah", "file.html", false, ""}, @@ -178,15 +177,15 @@ TEST(GURLTest, Resolve) { // 8-bit code path. GURL input(resolve_cases[i].base); GURL output = input.Resolve(resolve_cases[i].relative); - EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid()); - EXPECT_EQ(resolve_cases[i].expected, output.spec()); + EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid()) << i; + EXPECT_EQ(resolve_cases[i].expected, output.spec()) << i; // Wide code path. GURL inputw(ConvertUTF8ToUTF16(resolve_cases[i].base)); GURL outputw = input.Resolve(ConvertUTF8ToUTF16(resolve_cases[i].relative)); - EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid()); - EXPECT_EQ(resolve_cases[i].expected, outputw.spec()); + EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid()) << i; + EXPECT_EQ(resolve_cases[i].expected, outputw.spec()) << i; } } @@ -429,5 +428,5 @@ TEST(GURLTest, IsStandard) { EXPECT_FALSE(b.IsStandard()); GURL c("foo://bar/baz"); - EXPECT_TRUE(c.IsStandard()); + EXPECT_FALSE(c.IsStandard()); } diff --git a/googleurl/src/url_canon.h b/googleurl/src/url_canon.h index 143574d..e2cfb55 100644 --- a/googleurl/src/url_canon.h +++ b/googleurl/src/url_canon.h @@ -33,6 +33,7 @@ #include <stdlib.h> #include "base/string16.h" +#include "googleurl/src/url_common.h" #include "googleurl/src/url_parse.h" namespace url_canon { @@ -248,12 +249,12 @@ class CharsetConverter { // // Therefore, callers should not use the buffer, since it may actuall be empty, // use the computed pointer and |*output_len| instead. -const char* RemoveURLWhitespace(const char* input, int input_len, - CanonOutputT<char>* buffer, - int* output_len); -const char16* RemoveURLWhitespace(const char16* input, int input_len, - CanonOutputT<char16>* buffer, - int* output_len); +GURL_API const char* RemoveURLWhitespace(const char* input, int input_len, + CanonOutputT<char>* buffer, + int* output_len); +GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len, + CanonOutputT<char16>* buffer, + int* output_len); // IDN ------------------------------------------------------------------------ @@ -266,7 +267,7 @@ const char16* RemoveURLWhitespace(const char16* input, int input_len, // the length of the output will be set to the length of the new host name. // // On error, returns false. The output in this case is undefined. -bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output); +GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output); // Piece-by-piece canonicalizers ---------------------------------------------- // @@ -292,14 +293,14 @@ bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output); // URLs. // // The 8-bit version requires UTF-8 encoding. -bool CanonicalizeScheme(const char* spec, - const url_parse::Component& scheme, - CanonOutput* output, - url_parse::Component* out_scheme); -bool CanonicalizeScheme(const char16* spec, - const url_parse::Component& scheme, - CanonOutput* output, - url_parse::Component* out_scheme); +GURL_API bool CanonicalizeScheme(const char* spec, + const url_parse::Component& scheme, + CanonOutput* output, + url_parse::Component* out_scheme); +GURL_API bool CanonicalizeScheme(const char16* spec, + const url_parse::Component& scheme, + CanonOutput* output, + url_parse::Component* out_scheme); // User info: username/password. If present, this will add the delimiters so // the output will be "<username>:<password>@" or "<username>@". Empty @@ -311,20 +312,20 @@ bool CanonicalizeScheme(const char16* spec, // is legal as long as the two components don't overlap. // // The 8-bit version requires UTF-8 encoding. -bool CanonicalizeUserInfo(const char* username_source, - const url_parse::Component& username, - const char* password_source, - const url_parse::Component& password, - CanonOutput* output, - url_parse::Component* out_username, - url_parse::Component* out_password); -bool CanonicalizeUserInfo(const char16* username_source, - const url_parse::Component& username, - const char16* password_source, - const url_parse::Component& password, - CanonOutput* output, - url_parse::Component* out_username, - url_parse::Component* out_password); +GURL_API bool CanonicalizeUserInfo(const char* username_source, + const url_parse::Component& username, + const char* password_source, + const url_parse::Component& password, + CanonOutput* output, + url_parse::Component* out_username, + url_parse::Component* out_password); +GURL_API bool CanonicalizeUserInfo(const char16* username_source, + const url_parse::Component& username, + const char16* password_source, + const url_parse::Component& password, + CanonOutput* output, + url_parse::Component* out_username, + url_parse::Component* out_password); // This structure holds detailed state exported from the IP/Host canonicalizers. @@ -366,27 +367,27 @@ struct CanonHostInfo { // // The 8-bit version requires UTF-8 encoding. Use this version when you only // need to know whether canonicalization succeeded. -bool CanonicalizeHost(const char* spec, - const url_parse::Component& host, - CanonOutput* output, - url_parse::Component* out_host); -bool CanonicalizeHost(const char16* spec, - const url_parse::Component& host, - CanonOutput* output, - url_parse::Component* out_host); +GURL_API bool CanonicalizeHost(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + url_parse::Component* out_host); +GURL_API bool CanonicalizeHost(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + url_parse::Component* out_host); // Extended version of CanonicalizeHost, which returns additional information. // Use this when you need to know whether the hostname was an IP address. // A successful return is indicated by host_info->family != BROKEN. See the // definition of CanonHostInfo above for details. -void CanonicalizeHostVerbose(const char* spec, - const url_parse::Component& host, - CanonOutput* output, - CanonHostInfo* host_info); -void CanonicalizeHostVerbose(const char16* spec, - const url_parse::Component& host, - CanonOutput* output, - CanonHostInfo* host_info); +GURL_API void CanonicalizeHostVerbose(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); +GURL_API void CanonicalizeHostVerbose(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); // IP addresses. @@ -399,34 +400,34 @@ void CanonicalizeHostVerbose(const char16* spec, // This is called AUTOMATICALLY from the host canonicalizer, which ensures that // the input is unescaped and name-prepped, etc. It should not normally be // necessary or wise to call this directly. -void CanonicalizeIPAddress(const char* spec, - const url_parse::Component& host, - CanonOutput* output, - CanonHostInfo* host_info); -void CanonicalizeIPAddress(const char16* spec, - const url_parse::Component& host, - CanonOutput* output, - CanonHostInfo* host_info); +GURL_API void CanonicalizeIPAddress(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); +GURL_API void CanonicalizeIPAddress(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); // Port: this function will add the colon for the port if a port is present. // The caller can pass url_parse::PORT_UNSPECIFIED as the // default_port_for_scheme argument if there is no default port. // // The 8-bit version requires UTF-8 encoding. -bool CanonicalizePort(const char* spec, - const url_parse::Component& port, - int default_port_for_scheme, - CanonOutput* output, - url_parse::Component* out_port); -bool CanonicalizePort(const char16* spec, - const url_parse::Component& port, - int default_port_for_scheme, - CanonOutput* output, - url_parse::Component* out_port); +GURL_API bool CanonicalizePort(const char* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port); +GURL_API bool CanonicalizePort(const char16* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port); // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED // if the scheme is unknown. -int DefaultPortForScheme(const char* scheme, int scheme_len); +GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len); // Path. If the input does not begin in a slash (including if the input is // empty), we'll prepend a slash to the path to make it canonical. @@ -437,14 +438,14 @@ int DefaultPortForScheme(const char* scheme, int scheme_len); // an issue. Somebody giving us an 8-bit path is responsible for generating // the path that the server expects (we'll escape high-bit characters), so // if something is invalid, it's their problem. -bool CanonicalizePath(const char* spec, - const url_parse::Component& path, - CanonOutput* output, - url_parse::Component* out_path); -bool CanonicalizePath(const char16* spec, - const url_parse::Component& path, - CanonOutput* output, - url_parse::Component* out_path); +GURL_API bool CanonicalizePath(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); +GURL_API bool CanonicalizePath(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); // Canonicalizes the input as a file path. This is like CanonicalizePath except // that it also handles Windows drive specs. For example, the path can begin @@ -452,14 +453,14 @@ bool CanonicalizePath(const char16* spec, // The string will be appended to |*output| and |*out_path| will be updated. // // The 8-bit version requires UTF-8 encoding. -bool FileCanonicalizePath(const char* spec, - const url_parse::Component& path, - CanonOutput* output, - url_parse::Component* out_path); -bool FileCanonicalizePath(const char16* spec, - const url_parse::Component& path, - CanonOutput* output, - url_parse::Component* out_path); +GURL_API bool FileCanonicalizePath(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); +GURL_API bool FileCanonicalizePath(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); // Query: Prepends the ? if needed. // @@ -473,16 +474,16 @@ bool FileCanonicalizePath(const char16* spec, // if necessary, for ASCII input, no conversions are necessary. // // The converter can be NULL. In this case, the output encoding will be UTF-8. -void CanonicalizeQuery(const char* spec, - const url_parse::Component& query, - CharsetConverter* converter, - CanonOutput* output, - url_parse::Component* out_query); -void CanonicalizeQuery(const char16* spec, - const url_parse::Component& query, - CharsetConverter* converter, - CanonOutput* output, - url_parse::Component* out_query); +GURL_API void CanonicalizeQuery(const char* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* out_query); +GURL_API void CanonicalizeQuery(const char16* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* out_query); // Ref: Prepends the # if needed. The output will be UTF-8 (this is the only // canonicalizer that does not produce ASCII output). The output is @@ -490,14 +491,14 @@ void CanonicalizeQuery(const char16* spec, // // This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use // the "Unicode replacement character" for the confusing bits and copy the rest. -void CanonicalizeRef(const char* spec, - const url_parse::Component& path, - CanonOutput* output, - url_parse::Component* out_path); -void CanonicalizeRef(const char16* spec, - const url_parse::Component& path, - CanonOutput* output, - url_parse::Component* out_path); +GURL_API void CanonicalizeRef(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); +GURL_API void CanonicalizeRef(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); // Full canonicalizer --------------------------------------------------------- // @@ -510,61 +511,61 @@ void CanonicalizeRef(const char16* spec, // The 8-bit versions require UTF-8 encoding. // Use for standard URLs with authorities and paths. -bool CanonicalizeStandardURL(const char* spec, - int spec_len, - const url_parse::Parsed& parsed, - CharsetConverter* query_converter, - CanonOutput* output, - url_parse::Parsed* new_parsed); -bool CanonicalizeStandardURL(const char16* spec, - int spec_len, - const url_parse::Parsed& parsed, - CharsetConverter* query_converter, - CanonOutput* output, - url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizeStandardURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizeStandardURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); // Use for file URLs. -bool CanonicalizeFileURL(const char* spec, - int spec_len, - const url_parse::Parsed& parsed, - CharsetConverter* query_converter, - CanonOutput* output, - url_parse::Parsed* new_parsed); -bool CanonicalizeFileURL(const char16* spec, - int spec_len, - const url_parse::Parsed& parsed, - CharsetConverter* query_converter, - CanonOutput* output, - url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizeFileURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizeFileURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); // Use for path URLs such as javascript. This does not modify the path in any // way, for example, by escaping it. -bool CanonicalizePathURL(const char* spec, - int spec_len, - const url_parse::Parsed& parsed, - CanonOutput* output, - url_parse::Parsed* new_parsed); -bool CanonicalizePathURL(const char16* spec, - int spec_len, - const url_parse::Parsed& parsed, - CanonOutput* output, - url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizePathURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizePathURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); // Use for mailto URLs. This "canonicalizes" the url into a path and query // component. It does not attempt to merge "to" fields. It uses UTF-8 for // the query encoding if there is a query. This is because a mailto URL is // really intended for an external mail program, and the encoding of a page, // etc. which would influence a query encoding normally are irrelevant. -bool CanonicalizeMailtoURL(const char* spec, - int spec_len, - const url_parse::Parsed& parsed, - CanonOutput* output, - url_parse::Parsed* new_parsed); -bool CanonicalizeMailtoURL(const char16* spec, - int spec_len, - const url_parse::Parsed& parsed, - CanonOutput* output, - url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizeMailtoURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool CanonicalizeMailtoURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); // Part replacer -------------------------------------------------------------- @@ -585,7 +586,7 @@ template<typename CHAR> struct URLComponentSource { // Constructor normally used by callers wishing to replace components. This // will make them all NULL, which is no replacement. The caller would then - // override the compoents they want to replace. + // override the components they want to replace. URLComponentSource() : scheme(NULL), username(NULL), @@ -749,59 +750,59 @@ class Replacements { }; // The base must be an 8-bit canonical URL. -bool ReplaceStandardURL(const char* base, - const url_parse::Parsed& base_parsed, - const Replacements<char>& replacements, - CharsetConverter* query_converter, - CanonOutput* output, - url_parse::Parsed* new_parsed); -bool ReplaceStandardURL(const char* base, - const url_parse::Parsed& base_parsed, - const Replacements<char16>& replacements, - CharsetConverter* query_converter, - CanonOutput* output, - url_parse::Parsed* new_parsed); +GURL_API bool ReplaceStandardURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool ReplaceStandardURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); // Replacing some parts of a file URL is not permitted. Everything except // the host, path, query, and ref will be ignored. -bool ReplaceFileURL(const char* base, - const url_parse::Parsed& base_parsed, - const Replacements<char>& replacements, - CharsetConverter* query_converter, - CanonOutput* output, - url_parse::Parsed* new_parsed); -bool ReplaceFileURL(const char* base, - const url_parse::Parsed& base_parsed, - const Replacements<char16>& replacements, - CharsetConverter* query_converter, - CanonOutput* output, - url_parse::Parsed* new_parsed); +GURL_API bool ReplaceFileURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool ReplaceFileURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); // Path URLs can only have the scheme and path replaced. All other components // will be ignored. -bool ReplacePathURL(const char* base, - const url_parse::Parsed& base_parsed, - const Replacements<char>& replacements, - CanonOutput* output, - url_parse::Parsed* new_parsed); -bool ReplacePathURL(const char* base, - const url_parse::Parsed& base_parsed, - const Replacements<char16>& replacements, - CanonOutput* output, - url_parse::Parsed* new_parsed); +GURL_API bool ReplacePathURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool ReplacePathURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); // Mailto URLs can only have the scheme, path, and query replaced. // All other components will be ignored. -bool ReplaceMailtoURL(const char* base, - const url_parse::Parsed& base_parsed, - const Replacements<char>& replacements, - CanonOutput* output, - url_parse::Parsed* new_parsed); -bool ReplaceMailtoURL(const char* base, - const url_parse::Parsed& base_parsed, - const Replacements<char16>& replacements, - CanonOutput* output, - url_parse::Parsed* new_parsed); +GURL_API bool ReplaceMailtoURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); +GURL_API bool ReplaceMailtoURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); // Relative URL --------------------------------------------------------------- @@ -816,20 +817,20 @@ bool ReplaceMailtoURL(const char* base, // not). Failure means that the combination of URLs doesn't make any sense. // // The base URL should always be canonical, therefore is ASCII. -bool IsRelativeURL(const char* base, - const url_parse::Parsed& base_parsed, - const char* fragment, - int fragment_len, - bool is_base_hierarchical, - bool* is_relative, - url_parse::Component* relative_component); -bool IsRelativeURL(const char* base, - const url_parse::Parsed& base_parsed, - const char16* fragment, - int fragment_len, - bool is_base_hierarchical, - bool* is_relative, - url_parse::Component* relative_component); +GURL_API bool IsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const char* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component); +GURL_API bool IsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const char16* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component); // Given a canonical parsed source URL, a URL fragment known to be relative, // and the identified relevant portion of the relative URL (computed by @@ -849,22 +850,22 @@ bool IsRelativeURL(const char* base, // Returns true on success. On failure, the output will be "something // reasonable" that will be consistent and valid, just probably not what // was intended by the web page author or caller. -bool ResolveRelativeURL(const char* base_url, - const url_parse::Parsed& base_parsed, - bool base_is_file, - const char* relative_url, - const url_parse::Component& relative_component, - CharsetConverter* query_converter, - CanonOutput* output, - url_parse::Parsed* out_parsed); -bool ResolveRelativeURL(const char* base_url, - const url_parse::Parsed& base_parsed, - bool base_is_file, - const char16* relative_url, - const url_parse::Component& relative_component, - CharsetConverter* query_converter, - CanonOutput* output, - url_parse::Parsed* out_parsed); +GURL_API bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed); +GURL_API bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char16* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed); } // namespace url_canon diff --git a/googleurl/src/url_canon_etc.cc b/googleurl/src/url_canon_etc.cc index 672b187..aea181a 100644 --- a/googleurl/src/url_canon_etc.cc +++ b/googleurl/src/url_canon_etc.cc @@ -120,6 +120,11 @@ bool DoScheme(const CHAR* spec, // The output scheme starts from the current position. out_scheme->begin = output->length(); + // Danger: it's important that this code does not strip any characters: it + // only emits the canonical version (be it valid or escaped) of each of + // the input characters. Stripping would put it out of sync with + // url_util::FindAndCompareScheme, which could cause some security checks on + // schemes to be incorrect. bool success = true; int end = scheme.end(); for (int i = scheme.begin; i < end; i++) { diff --git a/googleurl/src/url_canon_icu.h b/googleurl/src/url_canon_icu.h index 3980663..6bc52c3 100644 --- a/googleurl/src/url_canon_icu.h +++ b/googleurl/src/url_canon_icu.h @@ -45,13 +45,13 @@ class ICUCharsetConverter : public CharsetConverter { // Constructs a converter using an already-existing ICU character set // converter. This converter is NOT owned by this object; the lifetime must // be managed by the creator such that it is alive as long as this is. - ICUCharsetConverter(UConverter* converter); + GURL_API ICUCharsetConverter(UConverter* converter); - virtual ~ICUCharsetConverter() {} + GURL_API virtual ~ICUCharsetConverter() {} - virtual void ConvertFromUTF16(const char16* input, - int input_len, - CanonOutput* output); + GURL_API virtual void ConvertFromUTF16(const char16* input, + int input_len, + CanonOutput* output); private: // The ICU converter, not owned by this class. diff --git a/googleurl/src/url_canon_ip.cc b/googleurl/src/url_canon_ip.cc index d84ff7d..86f7c9c 100644 --- a/googleurl/src/url_canon_ip.cc +++ b/googleurl/src/url_canon_ip.cc @@ -58,11 +58,14 @@ template<typename CHAR, typename UCHAR> bool DoFindIPv4Components(const CHAR* spec, const url_parse::Component& host, url_parse::Component components[4]) { + if (!host.is_nonempty()) + return false; + int cur_component = 0; // Index of the component we're working on. int cur_component_begin = host.begin; // Start of the current component. int end = host.end(); for (int i = host.begin; /* nothing */; i++) { - if (i == end || spec[i] == '.') { + if (i >= end || spec[i] == '.') { // Found the end of the current component. int component_len = i - cur_component_begin; components[cur_component] = @@ -76,10 +79,10 @@ bool DoFindIPv4Components(const CHAR* spec, // allow an empty component at the end (this would indicate that the // input ends in a dot). We also want to error if the component is // empty and it's the only component (cur_component == 1). - if (component_len == 0 && (i != end || cur_component == 1)) + if (component_len == 0 && (i < end || cur_component == 1)) return false; - if (i == end) + if (i >= end) break; // End of the input. if (cur_component == 4) { @@ -537,8 +540,8 @@ bool DoIPv6AddressToNumber(const CHAR* spec, if (ipv6_parsed.ipv4_component.is_valid()) { // We only allow the embedded IPv4 syntax to be used for "compat" and // "mapped" formats: - // "compat" ==> 0:0:0:0:0:ffff:<IPv4-literal> - // "mapped" ==> 0:0:0:0:0:0000:<IPv4-literal> + // "mapped" ==> 0:0:0:0:0:ffff:<IPv4-literal> + // "compat" ==> 0:0:0:0:0:0000:<IPv4-literal> for (int j = 0; j < 10; ++j) { if (address[j] != 0) return false; diff --git a/googleurl/src/url_canon_ip.h b/googleurl/src/url_canon_ip.h index 6ce069d..0a01c9f 100644 --- a/googleurl/src/url_canon_ip.h +++ b/googleurl/src/url_canon_ip.h @@ -32,6 +32,7 @@ #include "base/string16.h" #include "googleurl/src/url_canon.h" +#include "googleurl/src/url_common.h" #include "googleurl/src/url_parse.h" namespace url_canon { @@ -54,12 +55,12 @@ namespace url_canon { // Mozilla), so this code path never gets hit. Our host canonicalization will // notice these spaces and escape them, which will make IP address finding // fail. This seems like better behavior than stripping after a space. -bool FindIPv4Components(const char* spec, - const url_parse::Component& host, - url_parse::Component components[4]); -bool FindIPv4Components(const char16* spec, - const url_parse::Component& host, - url_parse::Component components[4]); +GURL_API bool FindIPv4Components(const char* spec, + const url_parse::Component& host, + url_parse::Component components[4]); +GURL_API bool FindIPv4Components(const char16* spec, + const url_parse::Component& host, + url_parse::Component components[4]); // Converts an IPv4 address to a 32-bit number (network byte order). // @@ -72,26 +73,28 @@ bool FindIPv4Components(const char16* spec, // // On success, |num_ipv4_components| will be populated with the number of // components in the IPv4 address. -CanonHostInfo::Family IPv4AddressToNumber(const char* spec, - const url_parse::Component& host, - unsigned char address[4], - int* num_ipv4_components); -CanonHostInfo::Family IPv4AddressToNumber(const char16* spec, - const url_parse::Component& host, - unsigned char address[4], - int* num_ipv4_components); +GURL_API CanonHostInfo::Family IPv4AddressToNumber( + const char* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components); +GURL_API CanonHostInfo::Family IPv4AddressToNumber( + const char16* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components); // Converts an IPv6 address to a 128-bit number (network byte order), returning // true on success. False means that the input was not a valid IPv6 address. // // NOTE that |host| is expected to be surrounded by square brackets. // i.e. "[::1]" rather than "::1". -bool IPv6AddressToNumber(const char* spec, - const url_parse::Component& host, - unsigned char address[16]); -bool IPv6AddressToNumber(const char16* spec, - const url_parse::Component& host, - unsigned char address[16]); +GURL_API bool IPv6AddressToNumber(const char* spec, + const url_parse::Component& host, + unsigned char address[16]); +GURL_API bool IPv6AddressToNumber(const char16* spec, + const url_parse::Component& host, + unsigned char address[16]); } // namespace url_canon diff --git a/googleurl/src/url_canon_path.cc b/googleurl/src/url_canon_path.cc index 98ca40b..df97aad 100644 --- a/googleurl/src/url_canon_path.cc +++ b/googleurl/src/url_canon_path.cc @@ -84,7 +84,7 @@ const unsigned char kPathCharLookup[0x100] = { // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, PASS, ESCAPE, PASS, ESCAPE, ESCAPE, // @ A B C D E F G H I J K L M N O - UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE, + PASS, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE, // P Q R S T U V W X Y Z [ \ ] ^ _ UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, ESCAPE, PASS, ESCAPE, UNESCAPE, // ` a b c d e f g h i j k l m n o diff --git a/googleurl/src/url_canon_relative.cc b/googleurl/src/url_canon_relative.cc index 446b951..6bcc72f 100644 --- a/googleurl/src/url_canon_relative.cc +++ b/googleurl/src/url_canon_relative.cc @@ -457,10 +457,11 @@ bool DoResolveRelativeURL(const char* base_url, } if (relative_component.len <= 0) { - // Empty relative URL, make no changes. + // Empty relative URL, leave unchanged, only removing the ref component. int base_len = base_parsed.Length(); - for (int i = 0; i < base_len; i++) - output->push_back(base_url[i]); + base_len -= base_parsed.ref.len + 1; + out_parsed->ref.reset(); + output->Append(base_url, base_len); return true; } diff --git a/googleurl/src/url_canon_stdstring.h b/googleurl/src/url_canon_stdstring.h index 2241eb1..c43b777 100644 --- a/googleurl/src/url_canon_stdstring.h +++ b/googleurl/src/url_canon_stdstring.h @@ -31,15 +31,15 @@ // strings. Because the canonicalizer tries not to be dependent on the STL, // we have segregated it here. -#ifndef GOOGLEURL_SRC_URL_CANON_STRING_H__ -#define GOOGLEURL_SRC_URL_CANON_STRING_H__ +#ifndef GOOGLEURL_SRC_URL_CANON_STDSTRING_H__ +#define GOOGLEURL_SRC_URL_CANON_STDSTRING_H__ #include <string> #include "googleurl/src/url_canon.h" namespace url_canon { -// Write into a std::string given in the constructor. This object odes not own +// Write into a std::string given in the constructor. This object does not own // the string itself, and the user must ensure that the string stays alive // throughout the lifetime of this object. // @@ -82,7 +82,7 @@ class StdStringCanonOutput : public CanonOutput { } protected: - std::string* str_; + std::string* str_; }; // An extension of the Replacements class that allows the setters to use @@ -130,4 +130,5 @@ class StdStringReplacements : } // namespace url_canon -#endif // GOOGLEURL_SRC_URL_CANON_STRING_H__ +#endif // GOOGLEURL_SRC_URL_CANON_STDSTRING_H__ + diff --git a/googleurl/src/url_canon_stdurl.cc b/googleurl/src/url_canon_stdurl.cc index 41a8fa9..1e21a14 100644 --- a/googleurl/src/url_canon_stdurl.cc +++ b/googleurl/src/url_canon_stdurl.cc @@ -170,6 +170,15 @@ bool CanonicalizeStandardURL(const char16* spec, output, new_parsed); } +// It might be nice in the future to optimize this so unchanged components don't +// need to be recanonicalized. This is especially true since the common case for +// ReplaceComponents is removing things we don't want, like reference fragments +// and usernames. These cases can become more efficient if we can assume the +// rest of the URL is OK with these removed (or only the modified parts +// recanonicalized). This would be much more complex to implement, however. +// +// You would also need to update DoReplaceComponents in url_util.cc which +// relies on this re-checking everything (see the comment there for why). bool ReplaceStandardURL(const char* base, const url_parse::Parsed& base_parsed, const Replacements<char>& replacements, diff --git a/googleurl/src/url_canon_unittest.cc b/googleurl/src/url_canon_unittest.cc index c5be423..a3e43e2 100644 --- a/googleurl/src/url_canon_unittest.cc +++ b/googleurl/src/url_canon_unittest.cc @@ -766,6 +766,22 @@ TEST(URLCanonTest, IPv6) { } } +TEST(URLCanonTest, IPEmpty) { + std::string out_str1; + url_canon::StdStringCanonOutput output1(&out_str1); + url_canon::CanonHostInfo host_info; + + // This tests tests. + const char spec[] = "192.168.0.1"; + url_canon::CanonicalizeIPAddress(spec, url_parse::Component(), + &output1, &host_info); + EXPECT_FALSE(host_info.IsIPAddress()); + + url_canon::CanonicalizeIPAddress(spec, url_parse::Component(0, 0), + &output1, &host_info); + EXPECT_FALSE(host_info.IsIPAddress()); +} + TEST(URLCanonTest, UserInfo) { // Note that the canonicalizer should escape and treat empty components as // not being there. @@ -950,8 +966,8 @@ TEST(URLCanonTest, Path) { // %7f should be allowed and %3D should not be unescaped (these were wrong // in a previous version). {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd", "/%7Ffp3%3Eju%3Dduvgw%3Dd", url_parse::Component(0, 24), true}, - // @ should be unescaped. - {"/@asdf%40", L"/@asdf%40", "/@asdf@", url_parse::Component(0, 7), true}, + // @ should be passed through unchanged (escaped or unescaped). + {"/@asdf%40", L"/@asdf%40", "/@asdf%40", url_parse::Component(0, 9), true}, // ----- encoding tests ----- // Basic conversions @@ -1736,8 +1752,11 @@ TEST(URLCanonTest, ResolveRelativeURL) { // Basic absolute input. {"http://host/a", true, false, "http://another/", true, false, false, NULL}, {"http://host/a", true, false, "http:////another/", true, false, false, NULL}, - // Empty relative URLs shouldn't change the input. + // Empty relative URLs should only remove the ref part of the URL, + // leaving the rest unchanged. {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"}, + {"http://foo/bar#ref", true, false, "", true, true, true, "http://foo/bar"}, + {"http://foo/bar#", true, false, "", true, true, true, "http://foo/bar"}, // Spaces at the ends of the relative path should be ignored. {"http://foo/bar", true, false, " another ", true, true, true, "http://foo/another"}, {"http://foo/bar", true, false, " . ", true, true, true, "http://foo/"}, diff --git a/googleurl/src/url_common.h b/googleurl/src/url_common.h new file mode 100644 index 0000000..7e7e27a --- /dev/null +++ b/googleurl/src/url_common.h @@ -0,0 +1,48 @@ +// Copyright 2010, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_URL_COMMON_H__ +#define GOOGLEURL_SRC_URL_COMMON_H__ + +#if !defined(GURL_IMPLEMENTATION) +#define GURL_IMPLEMENTATION 0 +#endif + +#if defined(WIN32) && defined(GURL_DLL) +#if GURL_IMPLEMENTATION +#define GURL_API __declspec(dllexport) +#else +#define GURL_API __declspec(dllimport) +#endif +#else +#define GURL_API +#endif + +#endif // GOOGLEURL_SRC_URL_COMMON_H__ + diff --git a/googleurl/src/url_parse.cc b/googleurl/src/url_parse.cc index 7c37f13..a08c4da 100644 --- a/googleurl/src/url_parse.cc +++ b/googleurl/src/url_parse.cc @@ -64,54 +64,6 @@ int FindNextAuthorityTerminator(const CHAR* spec, return spec_len; // Not found. } -// Fills in all members of the Parsed structure except for the scheme. -// -// |spec| is the full spec being parsed, of length |spec_len|. -// |after_scheme| is the character immediately following the scheme (after the -// colon) where we'll begin parsing. -// -// Compatability data points. I list "host", "path" extracted: -// Input IE6 Firefox Us -// ----- -------------- -------------- -------------- -// http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" -// http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" -// http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/" -// http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/" -// http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" -// -// (*) Interestingly, although IE fails to load these URLs, its history -// canonicalizer handles them, meaning if you've been to the corresponding -// "http://foo.com/" link, it will be colored. -template <typename CHAR> -void DoParseAfterScheme(const CHAR* spec, - int spec_len, - int after_scheme, - Parsed* parsed) { - int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); - int after_slashes = after_scheme + num_slashes; - - // First split into two main parts, the authority (username, password, host, - // and port) and the full path (path, query, and reference). - Component authority; - Component full_path; - - // Found "//<some data>", looks like an authority section. Treat everything - // from there to the next slash (or end of spec) to be the authority. Note - // that we ignore the number of slashes and treat it as the authority. - int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len); - authority = Component(after_slashes, end_auth - after_slashes); - - if (end_auth == spec_len) // No beginning of path found. - full_path = Component(); - else // Everything starting from the slash to the end is the path. - full_path = Component(end_auth, spec_len - end_auth); - - // Now parse those two sub-parts. - DoParseAuthority(spec, authority, &parsed->username, &parsed->password, - &parsed->host, &parsed->port); - ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref); -} - template<typename CHAR> void ParseUserInfo(const CHAR* spec, const Component& user, @@ -310,6 +262,54 @@ bool DoExtractScheme(const CHAR* url, return false; // No colon found: no scheme } +// Fills in all members of the Parsed structure except for the scheme. +// +// |spec| is the full spec being parsed, of length |spec_len|. +// |after_scheme| is the character immediately following the scheme (after the +// colon) where we'll begin parsing. +// +// Compatability data points. I list "host", "path" extracted: +// Input IE6 Firefox Us +// ----- -------------- -------------- -------------- +// http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/" +// http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/" +// http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// +// (*) Interestingly, although IE fails to load these URLs, its history +// canonicalizer handles them, meaning if you've been to the corresponding +// "http://foo.com/" link, it will be colored. +template <typename CHAR> +void DoParseAfterScheme(const CHAR* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); + int after_slashes = after_scheme + num_slashes; + + // First split into two main parts, the authority (username, password, host, + // and port) and the full path (path, query, and reference). + Component authority; + Component full_path; + + // Found "//<some data>", looks like an authority section. Treat everything + // from there to the next slash (or end of spec) to be the authority. Note + // that we ignore the number of slashes and treat it as the authority. + int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len); + authority = Component(after_slashes, end_auth - after_slashes); + + if (end_auth == spec_len) // No beginning of path found. + full_path = Component(); + else // Everything starting from the slash to the end is the path. + full_path = Component(end_auth, spec_len - end_auth); + + // Now parse those two sub-parts. + DoParseAuthority(spec, authority, &parsed->username, &parsed->password, + &parsed->host, &parsed->port); + ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref); +} + // The main parsing function for standard URLs. Standard URLs have a scheme, // host, path, etc. template<typename CHAR> @@ -683,7 +683,7 @@ void ParseAuthority(const char* spec, DoParseAuthority(spec, auth, username, password, hostname, port_num); } -void ParseAuthority(char16* spec, +void ParseAuthority(const char16* spec, const Component& auth, Component* username, Component* password, diff --git a/googleurl/src/url_parse.h b/googleurl/src/url_parse.h index bea2766..134b445 100644 --- a/googleurl/src/url_parse.h +++ b/googleurl/src/url_parse.h @@ -34,6 +34,7 @@ #include "base/basictypes.h" #include "base/string16.h" +#include "googleurl/src/url_common.h" namespace url_parse { @@ -127,7 +128,7 @@ struct Parsed { // of the string. For example "http://": the parsed structure will only // contain an entry for the four-character scheme, and it doesn't know about // the "://". For all other last-components, it will return the real length. - int Length() const; + GURL_API int Length() const; // Returns the number of characters before the given component if it exists, // or where the component would be if it did exist. This will return the @@ -155,7 +156,8 @@ struct Parsed { // *QUERY: 14 15 <- // *REF: 20 20 // - int CountCharactersBefore(ComponentType type, bool include_delimiter) const; + GURL_API int CountCharactersBefore(ComponentType type, + bool include_delimiter) const; // Scheme without the colon: "http://foo"/ would have a scheme of "http". // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there @@ -215,24 +217,24 @@ struct Parsed { // StandardURL is for when the scheme is known to be one that has an // authority (host) like "http". This function will not handle weird ones // like "about:" and "javascript:", or do the right thing for "file:" URLs. -void ParseStandardURL(const char* url, int url_len, Parsed* parsed); -void ParseStandardURL(const char16* url, int url_len, Parsed* parsed); +GURL_API void ParseStandardURL(const char* url, int url_len, Parsed* parsed); +GURL_API void ParseStandardURL(const char16* url, int url_len, Parsed* parsed); // PathURL is for when the scheme is known not to have an authority (host) // section but that aren't file URLs either. The scheme is parsed, and // everything after the scheme is considered as the path. This is used for // things like "about:" and "javascript:" -void ParsePathURL(const char* url, int url_len, Parsed* parsed); -void ParsePathURL(const char16* url, int url_len, Parsed* parsed); +GURL_API void ParsePathURL(const char* url, int url_len, Parsed* parsed); +GURL_API void ParsePathURL(const char16* url, int url_len, Parsed* parsed); // FileURL is for file URLs. There are some special rules for interpreting // these. -void ParseFileURL(const char* url, int url_len, Parsed* parsed); -void ParseFileURL(const char16* url, int url_len, Parsed* parsed); +GURL_API void ParseFileURL(const char* url, int url_len, Parsed* parsed); +GURL_API void ParseFileURL(const char16* url, int url_len, Parsed* parsed); // MailtoURL is for mailto: urls. They are made up scheme,path,query -void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); -void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed); +GURL_API void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); +GURL_API void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed); // Helper functions ----------------------------------------------------------- @@ -256,27 +258,27 @@ void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed); // end of the string). // // The 8-bit version requires UTF-8 encoding. -bool ExtractScheme(const char* url, int url_len, Component* scheme); -bool ExtractScheme(const char16* url, int url_len, Component* scheme); +GURL_API bool ExtractScheme(const char* url, int url_len, Component* scheme); +GURL_API bool ExtractScheme(const char16* url, int url_len, Component* scheme); // Returns true if ch is a character that terminates the authority segment // of a URL. -bool IsAuthorityTerminator(char16 ch); +GURL_API bool IsAuthorityTerminator(char16 ch); // Does a best effort parse of input |spec|, in range |auth|. If a particular // component is not found, it will be set to invalid. -void ParseAuthority(const char* spec, - const Component& auth, - Component* username, - Component* password, - Component* hostname, - Component* port_num); -void ParseAuthority(char16* spec, - const Component& auth, - Component* username, - Component* password, - Component* hostname, - Component* port_num); +GURL_API void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); +GURL_API void ParseAuthority(const char16* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); // Computes the integer port value from the given port component. The port // component should have been identified by one of the init functions on @@ -285,8 +287,8 @@ void ParseAuthority(char16* spec, // The return value will be a positive integer between 0 and 64K, or one of // the two special values below. enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 }; -int ParsePort(const char* url, const Component& port); -int ParsePort(const char16* url, const Component& port); +GURL_API int ParsePort(const char* url, const Component& port); +GURL_API int ParsePort(const char16* url, const Component& port); // Extracts the range of the file name in the given url. The path must // already have been computed by the parse function, and the matching URL @@ -298,12 +300,12 @@ int ParsePort(const char16* url, const Component& port); // following the last slash. // // The 8-bit version requires UTF-8 encoding. -void ExtractFileName(const char* url, - const Component& path, - Component* file_name); -void ExtractFileName(const char16* url, - const Component& path, - Component* file_name); +GURL_API void ExtractFileName(const char* url, + const Component& path, + Component* file_name); +GURL_API void ExtractFileName(const char16* url, + const Component& path, + Component* file_name); // Extract the first key/value from the range defined by |*query|. Updates // |*query| to start at the end of the extracted key/value pair. This is @@ -320,14 +322,14 @@ void ExtractFileName(const char16* url, // // If no key/value are found |*key| and |*value| will be unchanged and it will // return false. -bool ExtractQueryKeyValue(const char* url, - Component* query, - Component* key, - Component* value); -bool ExtractQueryKeyValue(const char16* url, - Component* query, - Component* key, - Component* value); +GURL_API bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value); +GURL_API bool ExtractQueryKeyValue(const char16* url, + Component* query, + Component* key, + Component* value); } // namespace url_parse diff --git a/googleurl/src/url_util.cc b/googleurl/src/url_util.cc index d623b45..7e100aa 100644 --- a/googleurl/src/url_util.cc +++ b/googleurl/src/url_util.cc @@ -33,6 +33,7 @@ #include "googleurl/src/url_util.h" #include "base/logging.h" +#include "googleurl/src/url_canon_internal.h" #include "googleurl/src/url_file.h" namespace url_util { @@ -58,13 +59,15 @@ inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) { const char kFileScheme[] = "file"; // Used in a number of places. const char kMailtoScheme[] = "mailto"; -const int kNumStandardURLSchemes = 5; +const int kNumStandardURLSchemes = 7; const char* kStandardURLSchemes[kNumStandardURLSchemes] = { "http", "https", kFileScheme, // Yes, file urls can have a hostname! "ftp", "gopher", + "ws", // WebSocket. + "wss", // WebSocket secure. }; // List of the currently installed standard schemes. This list is lazily @@ -72,6 +75,9 @@ const char* kStandardURLSchemes[kNumStandardURLSchemes] = { // any destructors from being called that will slow us down or cause problems. std::vector<const char*>* standard_schemes = NULL; +// See the LockStandardSchemes declaration in the header. +bool standard_schemes_locked = false; + // Ensures that the standard_schemes list is initialized, does nothing if it // already has values. void InitStandardSchemes() { @@ -96,10 +102,9 @@ inline bool CompareSchemeComponent(const CHAR* spec, } // Returns true if the given scheme identified by |scheme| within |spec| is one -// of the registered "standard" schemes. Note that this does not check for -// "://", use IsStandard for that. +// of the registered "standard" schemes. template<typename CHAR> -bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) { +bool DoIsStandard(const CHAR* spec, const url_parse::Component& scheme) { if (!scheme.is_nonempty()) return false; // Empty or invalid schemes are non-standard. @@ -112,34 +117,20 @@ bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) { return false; } -// Returns true if the stuff following the scheme in the given spec indicates -// a "standard" URL. The presence of "://" after the scheme indicates that -// there is a hostname, etc. which we call a standard URL. -template<typename CHAR> -bool HasStandardSchemeSeparator(const CHAR* spec, int spec_len, - const url_parse::Component& scheme) { - int after_scheme = scheme.end(); - if (spec_len < after_scheme + 3) - return false; - return spec[after_scheme] == ':' && - spec[after_scheme + 1] == '/' && - spec[after_scheme + 2] == '/'; -} - -template<typename CHAR> -bool DoIsStandard(const CHAR* spec, int spec_len, - const url_parse::Component& scheme) { - return HasStandardSchemeSeparator(spec, spec_len, scheme) || - IsStandardScheme(spec, scheme); -} - template<typename CHAR> bool DoFindAndCompareScheme(const CHAR* str, int str_len, const char* compare, url_parse::Component* found_scheme) { + // Before extracting scheme, canonicalize the URL to remove any whitespace. + // This matches the canonicalization done in DoCanonicalize function. + url_canon::RawCanonOutputT<CHAR> whitespace_buffer; + int spec_len; + const CHAR* spec = RemoveURLWhitespace(str, str_len, + &whitespace_buffer, &spec_len); + url_parse::Component our_scheme; - if (!url_parse::ExtractScheme(str, str_len, &our_scheme)) { + if (!url_parse::ExtractScheme(spec, spec_len, &our_scheme)) { // No scheme. if (found_scheme) *found_scheme = url_parse::Component(); @@ -147,7 +138,7 @@ bool DoFindAndCompareScheme(const CHAR* str, } if (found_scheme) *found_scheme = our_scheme; - return CompareSchemeComponent(str, our_scheme, compare); + return CompareSchemeComponent(spec, our_scheme, compare); } template<typename CHAR> @@ -184,7 +175,7 @@ bool DoCanonicalize(const CHAR* in_spec, int in_spec_len, #endif url_parse::Component scheme; - if(!url_parse::ExtractScheme(spec, spec_len, &scheme)) + if (!url_parse::ExtractScheme(spec, spec_len, &scheme)) return false; // This is the parsed version of the input URL, we have to canonicalize it @@ -197,7 +188,7 @@ bool DoCanonicalize(const CHAR* in_spec, int in_spec_len, charset_converter, output, output_parsed); - } else if (IsStandard(spec, spec_len, scheme)) { + } else if (DoIsStandard(spec, scheme)) { // All "normal" URLs. url_parse::ParseStandardURL(spec, spec_len, &parsed_input); success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input, @@ -239,7 +230,7 @@ bool DoResolveRelative(const char* base_spec, // See if our base URL should be treated as "standard". bool standard_base_scheme = base_parsed.scheme.is_nonempty() && - IsStandard(base_spec, base_spec_len, base_parsed.scheme); + DoIsStandard(base_spec, base_parsed.scheme); bool is_relative; url_parse::Component relative_component; @@ -275,53 +266,111 @@ bool DoReplaceComponents(const char* spec, url_canon::CharsetConverter* charset_converter, url_canon::CanonOutput* output, url_parse::Parsed* out_parsed) { - // Note that we dispatch to the parser according the the scheme type of - // the OUTPUT URL. Normally, this is the same as our scheme, but if the - // scheme is being overridden, we need to test that. - - if (// Either the scheme is not replaced and the old one is a file, - (!replacements.IsSchemeOverridden() && - CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) || - // ...or it is being replaced and the new one is a file. - (replacements.IsSchemeOverridden() && - CompareSchemeComponent(replacements.sources().scheme, - replacements.components().scheme, - kFileScheme))) { + // If the scheme is overridden, just do a simple string substitution and + // reparse the whole thing. There are lots of edge cases that we really don't + // want to deal with. Like what happens if I replace "http://e:8080/foo" + // with a file. Does it become "file:///E:/8080/foo" where the port number + // becomes part of the path? Parsing that string as a file URL says "yes" + // but almost no sane rule for dealing with the components individually would + // come up with that. + // + // Why allow these crazy cases at all? Programatically, there is almost no + // case for replacing the scheme. The most common case for hitting this is + // in JS when building up a URL using the location object. In this case, the + // JS code expects the string substitution behavior: + // http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3 + if (replacements.IsSchemeOverridden()) { + // Canonicalize the new scheme so it is 8-bit and can be concatenated with + // the existing spec. + url_canon::RawCanonOutput<128> scheme_replaced; + url_parse::Component scheme_replaced_parsed; + url_canon::CanonicalizeScheme( + replacements.sources().scheme, + replacements.components().scheme, + &scheme_replaced, &scheme_replaced_parsed); + + // We can assume that the input is canonicalized, which means it always has + // a colon after the scheme (or where the scheme would be). + int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1 + : 1; + if (spec_len - spec_after_colon > 0) { + scheme_replaced.Append(&spec[spec_after_colon], + spec_len - spec_after_colon); + } + + // We now need to completely re-parse the resulting string since its meaning + // may have changed with the different scheme. + url_canon::RawCanonOutput<128> recanonicalized; + url_parse::Parsed recanonicalized_parsed; + DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), + charset_converter, + &recanonicalized, &recanonicalized_parsed); + + // Recurse using the version with the scheme already replaced. This will now + // use the replacement rules for the new scheme. + // + // Warning: this code assumes that ReplaceComponents will re-check all + // components for validity. This is because we can't fail if DoCanonicalize + // failed above since theoretically the thing making it fail could be + // getting replaced here. If ReplaceComponents didn't re-check everything, + // we wouldn't know if something *not* getting replaced is a problem. + // If the scheme-specific replacers are made more intelligent so they don't + // re-check everything, we should instead recanonicalize the whole thing + // after this call to check validity (this assumes replacing the scheme is + // much much less common than other types of replacements, like clearing the + // ref). + url_canon::Replacements<CHAR> replacements_no_scheme = replacements; + replacements_no_scheme.SetScheme(NULL, url_parse::Component()); + return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(), + recanonicalized_parsed, replacements_no_scheme, + charset_converter, output, out_parsed); + } + + // If we get here, then we know the scheme doesn't need to be replaced, so can + // just key off the scheme in the spec to know how to do the replacements. + if (CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) { return url_canon::ReplaceFileURL(spec, parsed, replacements, charset_converter, output, out_parsed); } - - if (// Either the scheme is not replaced and the old one is standard, - (!replacements.IsSchemeOverridden() && - IsStandard(spec, spec_len, parsed.scheme)) || - // ...or it is being replaced and the new one is standard. - (replacements.IsSchemeOverridden() && - IsStandardScheme(replacements.sources().scheme, - replacements.components().scheme))) { - // Standard URL with all parts. + if (DoIsStandard(spec, parsed.scheme)) { return url_canon::ReplaceStandardURL(spec, parsed, replacements, charset_converter, output, out_parsed); } - - if (// Either the scheme is not replaced and the old one is mailto, - (!replacements.IsSchemeOverridden() && - CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) || - // ...or it is being replaced and the new one is a mailto. - (replacements.IsSchemeOverridden() && - CompareSchemeComponent(replacements.sources().scheme, - replacements.components().scheme, - kMailtoScheme))) { + if (CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) { return url_canon::ReplaceMailtoURL(spec, parsed, replacements, output, out_parsed); } + // Default is a path URL. return url_canon::ReplacePathURL(spec, parsed, replacements, output, out_parsed); } } // namespace +void Initialize() { + InitStandardSchemes(); +} + +void Shutdown() { + if (standard_schemes) { + delete standard_schemes; + standard_schemes = NULL; + } +} + void AddStandardScheme(const char* new_scheme) { + // If this assert triggers, it means you've called AddStandardScheme after + // LockStandardSchemes have been called (see the header file for + // LockStandardSchemes for more). + // + // This normally means you're trying to set up a new standard scheme too late + // in your application's init process. Locate where your app does this + // initialization and calls LockStandardScheme, and add your new standard + // scheme there. + DCHECK(!standard_schemes_locked) << + "Trying to add a standard scheme after the list has been locked."; + size_t scheme_len = strlen(new_scheme); if (scheme_len == 0) return; @@ -335,14 +384,16 @@ void AddStandardScheme(const char* new_scheme) { standard_schemes->push_back(dup_scheme); } -bool IsStandard(const char* spec, int spec_len, - const url_parse::Component& scheme) { - return DoIsStandard(spec, spec_len, scheme); +void LockStandardSchemes() { + standard_schemes_locked = true; +} + +bool IsStandard(const char* spec, const url_parse::Component& scheme) { + return DoIsStandard(spec, scheme); } -bool IsStandard(const char16* spec, int spec_len, - const url_parse::Component& scheme) { - return DoIsStandard(spec, spec_len, scheme); +bool IsStandard(const char16* spec, const url_parse::Component& scheme) { + return DoIsStandard(spec, scheme); } bool FindAndCompareScheme(const char* str, @@ -450,4 +501,53 @@ bool LowerCaseEqualsASCII(const char16* a_begin, return DoLowerCaseEqualsASCII(a_begin, a_end, b); } +void DecodeURLEscapeSequences(const char* input, int length, + url_canon::CanonOutputW* output) { + url_canon::RawCanonOutputT<char> unescaped_chars; + for (int i = 0; i < length; i++) { + if (input[i] == '%') { + unsigned char ch; + if (url_canon::DecodeEscaped(input, &i, length, &ch)) { + unescaped_chars.push_back(ch); + } else { + // Invalid escape sequence, copy the percent literal. + unescaped_chars.push_back('%'); + } + } else { + // Regular non-escaped 8-bit character. + unescaped_chars.push_back(input[i]); + } + } + + // Convert that 8-bit to UTF-16. It's not clear IE does this at all to + // JavaScript URLs, but Firefox and Safari do. + for (int i = 0; i < unescaped_chars.length(); i++) { + unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i)); + if (uch < 0x80) { + // Non-UTF-8, just append directly + output->push_back(uch); + } else { + // next_ch will point to the last character of the decoded + // character. + int next_character = i; + unsigned code_point; + if (url_canon::ReadUTFChar(unescaped_chars.data(), &next_character, + unescaped_chars.length(), &code_point)) { + // Valid UTF-8 character, convert to UTF-16. + url_canon::AppendUTF16Value(code_point, output); + i = next_character; + } else { + // If there are any sequences that are not valid UTF-8, we keep + // invalid code points and promote to UTF-16. We copy all characters + // from the current position to the end of the identified sequence. + while (i < next_character) { + output->push_back(static_cast<unsigned char>(unescaped_chars.at(i))); + i++; + } + output->push_back(static_cast<unsigned char>(unescaped_chars.at(i))); + } + } + } +} + } // namespace url_util diff --git a/googleurl/src/url_util.h b/googleurl/src/url_util.h index 62813a6..ec4cf9e 100644 --- a/googleurl/src/url_util.h +++ b/googleurl/src/url_util.h @@ -33,29 +33,69 @@ #include <string> #include "base/string16.h" +#include "googleurl/src/url_common.h" #include "googleurl/src/url_parse.h" #include "googleurl/src/url_canon.h" namespace url_util { +// Init ------------------------------------------------------------------------ + +// Initialization is NOT required, it will be implicitly initialized when first +// used. However, this implicit initialization is NOT threadsafe. If you are +// using this library in a threaded environment and don't have a consistent +// "first call" (an example might be calling "AddStandardScheme" with your +// special application-specific schemes) then you will want to call initialize +// before spawning any threads. +// +// It is OK to call this function more than once, subsequent calls will simply +// "noop", unless Shutdown() was called in the mean time. This will also be a +// "noop" if other calls to the library have forced an initialization +// beforehand. +GURL_API void Initialize(); + +// Cleanup is not required, except some strings may leak. For most user +// applications, this is fine. If you're using it in a library that may get +// loaded and unloaded, you'll want to unload to properly clean up your +// library. +GURL_API void Shutdown(); + // Schemes -------------------------------------------------------------------- // Adds an application-defined scheme to the internal list of "standard" URL -// schemes. -void AddStandardScheme(const char* new_scheme); +// schemes. This function is not threadsafe and can not be called concurrently +// with any other url_util function. It will assert if the list of standard +// schemes has been locked (see LockStandardSchemes). +GURL_API void AddStandardScheme(const char* new_scheme); + +// Sets a flag to prevent future calls to AddStandardScheme from succeeding. +// +// This is designed to help prevent errors for multithreaded applications. +// Normal usage would be to call AddStandardScheme for your custom schemes at +// the beginning of program initialization, and then LockStandardSchemes. This +// prevents future callers from mistakenly calling AddStandardScheme when the +// program is running with multiple threads, where such usage would be +// dangerous. +// +// We could have had AddStandardScheme use a lock instead, but that would add +// some platform-specific dependencies we don't otherwise have now, and is +// overkill considering the normal usage is so simple. +GURL_API void LockStandardSchemes(); // Locates the scheme in the given string and places it into |found_scheme|, // which may be NULL to indicate the caller does not care about the range. +// // Returns whether the given |compare| scheme matches the scheme found in the -// input (if any). -bool FindAndCompareScheme(const char* str, - int str_len, - const char* compare, - url_parse::Component* found_scheme); -bool FindAndCompareScheme(const char16* str, - int str_len, - const char* compare, - url_parse::Component* found_scheme); +// input (if any). The |compare| scheme must be a valid canonical scheme or +// the result of the comparison is undefined. +GURL_API bool FindAndCompareScheme(const char* str, + int str_len, + const char* compare, + url_parse::Component* found_scheme); +GURL_API bool FindAndCompareScheme(const char16* str, + int str_len, + const char* compare, + url_parse::Component* found_scheme); inline bool FindAndCompareScheme(const std::string& str, const char* compare, url_parse::Component* found_scheme) { @@ -70,12 +110,18 @@ inline bool FindAndCompareScheme(const string16& str, } // Returns true if the given string represents a standard URL. This means that -// either the scheme is in the list of known standard schemes, or there is a -// "://" following the scheme. -bool IsStandard(const char* spec, int spec_len, - const url_parse::Component& scheme); -bool IsStandard(const char16* spec, int spec_len, - const url_parse::Component& scheme); +// either the scheme is in the list of known standard schemes. +GURL_API bool IsStandard(const char* spec, + const url_parse::Component& scheme); +GURL_API bool IsStandard(const char16* spec, + const url_parse::Component& scheme); + +// TODO(brettw) remove this. This is a temporary compatibility hack to avoid +// breaking the WebKit build when this version is synced via Chrome. +inline bool IsStandard(const char* spec, int spec_len, + const url_parse::Component& scheme) { + return IsStandard(spec, scheme); +} // URL library wrappers ------------------------------------------------------- @@ -89,16 +135,16 @@ bool IsStandard(const char16* spec, int spec_len, // Returns true if a valid URL was produced, false if not. On failure, the // output and parsed structures will still be filled and will be consistent, // but they will not represent a loadable URL. -bool Canonicalize(const char* spec, - int spec_len, - url_canon::CharsetConverter* charset_converter, - url_canon::CanonOutput* output, - url_parse::Parsed* output_parsed); -bool Canonicalize(const char16* spec, - int spec_len, - url_canon::CharsetConverter* charset_converter, - url_canon::CanonOutput* output, - url_parse::Parsed* output_parsed); +GURL_API bool Canonicalize(const char* spec, + int spec_len, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); +GURL_API bool Canonicalize(const char16* spec, + int spec_len, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); // Resolves a potentially relative URL relative to the given parsed base URL. // The base MUST be valid. The resulting canonical URL and parsed information @@ -110,41 +156,43 @@ bool Canonicalize(const char16* spec, // // Returns true if the output is valid, false if the input could not produce // a valid URL. -bool ResolveRelative(const char* base_spec, - int base_spec_len, - const url_parse::Parsed& base_parsed, - const char* relative, - int relative_length, - url_canon::CharsetConverter* charset_converter, - url_canon::CanonOutput* output, - url_parse::Parsed* output_parsed); -bool ResolveRelative(const char* base_spec, - int base_spec_len, - const url_parse::Parsed& base_parsed, - const char16* relative, - int relative_length, - url_canon::CharsetConverter* charset_converter, - url_canon::CanonOutput* output, - url_parse::Parsed* output_parsed); +GURL_API bool ResolveRelative(const char* base_spec, + int base_spec_len, + const url_parse::Parsed& base_parsed, + const char* relative, + int relative_length, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); +GURL_API bool ResolveRelative(const char* base_spec, + int base_spec_len, + const url_parse::Parsed& base_parsed, + const char16* relative, + int relative_length, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); // Replaces components in the given VALID input url. The new canonical URL info // is written to output and out_parsed. // // Returns true if the resulting URL is valid. -bool ReplaceComponents(const char* spec, - int spec_len, - const url_parse::Parsed& parsed, - const url_canon::Replacements<char>& replacements, - url_canon::CharsetConverter* charset_converter, - url_canon::CanonOutput* output, - url_parse::Parsed* out_parsed); -bool ReplaceComponents(const char* spec, - int spec_len, - const url_parse::Parsed& parsed, - const url_canon::Replacements<char16>& replacements, - url_canon::CharsetConverter* charset_converter, - url_canon::CanonOutput* output, - url_parse::Parsed* out_parsed); +GURL_API bool ReplaceComponents( + const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + const url_canon::Replacements<char>& replacements, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* out_parsed); +GURL_API bool ReplaceComponents( + const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + const url_canon::Replacements<char16>& replacements, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* out_parsed); // String helper functions ---------------------------------------------------- @@ -154,16 +202,20 @@ bool ReplaceComponents(const char* spec, // // The versions of this function that don't take a b_end assume that the b // string is NULL terminated. -bool LowerCaseEqualsASCII(const char* a_begin, - const char* a_end, - const char* b); -bool LowerCaseEqualsASCII(const char* a_begin, - const char* a_end, - const char* b_begin, - const char* b_end); -bool LowerCaseEqualsASCII(const char16* a_begin, - const char16* a_end, - const char* b); +GURL_API bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b); +GURL_API bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b_begin, + const char* b_end); +GURL_API bool LowerCaseEqualsASCII(const char16* a_begin, + const char16* a_end, + const char* b); + +// Unescapes the given string using URL escaping rules. +GURL_API void DecodeURLEscapeSequences(const char* input, int length, + url_canon::CanonOutputW* output); } // namespace url_util diff --git a/googleurl/src/url_util_unittest.cc b/googleurl/src/url_util_unittest.cc index 12e5254..442b2ec 100644 --- a/googleurl/src/url_util_unittest.cc +++ b/googleurl/src/url_util_unittest.cc @@ -30,6 +30,7 @@ #include "googleurl/src/url_canon.h" #include "googleurl/src/url_canon_stdstring.h" #include "googleurl/src/url_parse.h" +#include "googleurl/src/url_test_utils.h" #include "googleurl/src/url_util.h" #include "testing/gtest/include/gtest/gtest.h" @@ -64,6 +65,22 @@ TEST(URLUtilTest, FindAndCompareScheme) { // But when there is no scheme, it should fail. EXPECT_FALSE(url_util::FindAndCompareScheme("", 0, "", &found_scheme)); EXPECT_TRUE(found_scheme == url_parse::Component()); + + // When there is a whitespace char in scheme, it should canonicalize the url + // before comparison. + const char whtspc_str[] = " \r\n\tjav\ra\nscri\tpt:alert(1)"; + EXPECT_TRUE(url_util::FindAndCompareScheme( + whtspc_str, static_cast<int>(strlen(whtspc_str)), "javascript", + &found_scheme)); + EXPECT_TRUE(found_scheme == url_parse::Component(1, 10)); + + // Control characters should be stripped out on the ends, and kept in the + // middle. + const char ctrl_str[] = "\02jav\02scr\03ipt:alert(1)"; + EXPECT_FALSE(url_util::FindAndCompareScheme( + ctrl_str, static_cast<int>(strlen(ctrl_str)), "javascript", + &found_scheme)); + EXPECT_TRUE(found_scheme == url_parse::Component(1, 11)); } TEST(URLUtilTest, ReplaceComponents) { @@ -96,3 +113,106 @@ TEST(URLUtilTest, ReplaceComponents) { &new_parsed); } +static std::string CheckReplaceScheme(const char* base_url, + const char* scheme) { + // Make sure the input is canonicalized. + url_canon::RawCanonOutput<32> original; + url_parse::Parsed original_parsed; + url_util::Canonicalize(base_url, strlen(base_url), NULL, + &original, &original_parsed); + + url_canon::Replacements<char> replacements; + replacements.SetScheme(scheme, url_parse::Component(0, strlen(scheme))); + + std::string output_string; + url_canon::StdStringCanonOutput output(&output_string); + url_parse::Parsed output_parsed; + url_util::ReplaceComponents(original.data(), original.length(), + original_parsed, replacements, NULL, + &output, &output_parsed); + + output.Complete(); + return output_string; +} + +TEST(URLUtilTest, ReplaceScheme) { + EXPECT_EQ("https://google.com/", + CheckReplaceScheme("http://google.com/", "https")); + EXPECT_EQ("file://google.com/", + CheckReplaceScheme("http://google.com/", "file")); + EXPECT_EQ("http://home/Build", + CheckReplaceScheme("file:///Home/Build", "http")); + EXPECT_EQ("javascript:foo", + CheckReplaceScheme("about:foo", "javascript")); + EXPECT_EQ("://google.com/", + CheckReplaceScheme("http://google.com/", "")); + EXPECT_EQ("http://google.com/", + CheckReplaceScheme("about:google.com", "http")); + EXPECT_EQ("http:", CheckReplaceScheme("", "http")); + +#ifdef WIN32 + // Magic Windows drive letter behavior when converting to a file URL. + EXPECT_EQ("file:///E:/foo/", + CheckReplaceScheme("http://localhost/e:foo/", "file")); +#endif + + // This will probably change to "about://google.com/" when we fix + // http://crbug.com/160 which should also be an acceptable result. + EXPECT_EQ("about://google.com/", + CheckReplaceScheme("http://google.com/", "about")); +} + +TEST(URLUtilTest, DecodeURLEscapeSequences) { + struct DecodeCase { + const char* input; + const char* output; + } decode_cases[] = { + {"hello, world", "hello, world"}, + {"%01%02%03%04%05%06%07%08%09%0a%0B%0C%0D%0e%0f/", + "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0B\x0C\x0D\x0e\x0f/"}, + {"%10%11%12%13%14%15%16%17%18%19%1a%1B%1C%1D%1e%1f/", + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1B\x1C\x1D\x1e\x1f/"}, + {"%20%21%22%23%24%25%26%27%28%29%2a%2B%2C%2D%2e%2f/", + " !\"#$%&'()*+,-.//"}, + {"%30%31%32%33%34%35%36%37%38%39%3a%3B%3C%3D%3e%3f/", + "0123456789:;<=>?/"}, + {"%40%41%42%43%44%45%46%47%48%49%4a%4B%4C%4D%4e%4f/", + "@ABCDEFGHIJKLMNO/"}, + {"%50%51%52%53%54%55%56%57%58%59%5a%5B%5C%5D%5e%5f/", + "PQRSTUVWXYZ[\\]^_/"}, + {"%60%61%62%63%64%65%66%67%68%69%6a%6B%6C%6D%6e%6f/", + "`abcdefghijklmno/"}, + {"%70%71%72%73%74%75%76%77%78%79%7a%7B%7C%7D%7e%7f/", + "pqrstuvwxyz{|}~\x7f/"}, + // Test un-UTF-8-ization. + {"%e4%bd%a0%e5%a5%bd", "\xe4\xbd\xa0\xe5\xa5\xbd"}, + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(decode_cases); i++) { + const char* input = decode_cases[i].input; + url_canon::RawCanonOutputT<char16> output; + url_util::DecodeURLEscapeSequences(input, strlen(input), &output); + EXPECT_EQ(decode_cases[i].output, + url_test_utils::ConvertUTF16ToUTF8( + string16(output.data(), output.length()))); + } + + // Our decode should decode %00 + const char zero_input[] = "%00"; + url_canon::RawCanonOutputT<char16> zero_output; + url_util::DecodeURLEscapeSequences(zero_input, strlen(zero_input), + &zero_output); + EXPECT_NE("%00", + url_test_utils::ConvertUTF16ToUTF8( + string16(zero_output.data(), zero_output.length()))); + + // Test the error behavior for invalid UTF-8. + const char invalid_input[] = "%e4%a0%e5%a5%bd"; + const char16 invalid_expected[4] = {0x00e4, 0x00a0, 0x597d, 0}; + url_canon::RawCanonOutputT<char16> invalid_output; + url_util::DecodeURLEscapeSequences(invalid_input, strlen(invalid_input), + &invalid_output); + EXPECT_EQ(string16(invalid_expected), + string16(invalid_output.data(), invalid_output.length())); +} + |